""" 字段排查脚本 — 数据流字段补全 Spec Task 1.1 对 12 张目标表执行排查流程: 1. 查 DWD 现有列 2. 查 ODS 现有列 3. 解析 FACT_MAPPINGS 现状(从 dwd_load_task.py 源码导入) 4. 判断自动映射(ODS 列名 == DWD 列名) 5. 输出排查记录表(markdown),标注每个字段的排查结论和建议操作 用法: cd C:\\NeoZQYY python scripts/ops/field_audit.py python scripts/ops/field_audit.py --output path/to/output.md """ from __future__ import annotations import argparse import os import sys from datetime import datetime from pathlib import Path import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv # ── 项目根目录 & 路径设置 ── ROOT = Path(__file__).resolve().parents[2] ETL_ROOT = ROOT / "apps" / "etl" / "connectors" / "feiqiu" sys.path.insert(0, str(ETL_ROOT)) # 导入 FACT_MAPPINGS / TABLE_MAP(仅读取类属性,不实例化) from tasks.dwd.dwd_load_task import DwdLoadTask # ── SCD2 列集合(排查时忽略) ── SCD2_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"} # ── 需要排查的表及其疑似缺失字段 ── AUDIT_TARGETS: list[dict] = [ { "ods_table": "assistant_accounts_master", "dwd_tables": ["dim_assistant", "dim_assistant_ex"], "suspect_ods_cols": ["system_role_id", "job_num", "cx_unit_price", "pd_unit_price"], "category": "A", "notes": "4 个 ODS→DWD 未映射", }, { "ods_table": "assistant_service_records", "dwd_tables": ["dwd_assistant_service_log", "dwd_assistant_service_log_ex"], "suspect_ods_cols": ["site_assistant_id", "operator_id", "operator_name"], "category": "A", "notes": "3 个 ODS→DWD 未映射(site_assistant_id 可能已映射为 order_assistant_id)", }, { "ods_table": "store_goods_sales_records", "dwd_tables": ["dwd_store_goods_sale", "dwd_store_goods_sale_ex"], "suspect_ods_cols": ["discount_price"], "category": "A", "notes": "1 个 ODS→DWD 未映射(可能已映射为 discount_money)", }, { "ods_table": "member_balance_changes", "dwd_tables": ["dwd_member_balance_change", "dwd_member_balance_change_ex"], "suspect_ods_cols": ["relate_id"], "category": "A", "notes": "1 个 ODS→DWD 未映射", }, { "ods_table": "tenant_goods_master", "dwd_tables": ["dim_tenant_goods", "dim_tenant_goods_ex"], "suspect_ods_cols": ["commoditycode"], "category": "A", "notes": "1 个 ODS→DWD 未映射(可能已映射为 commodity_code_list)", }, { "ods_table": "site_tables_master", "dwd_tables": ["dim_table", "dim_table_ex"], "suspect_ods_cols": [ "sitename", "appletqrcodeurl", "audit_status", "charge_free", "create_time", "delay_lights_time", "is_rest_area", "light_status", "only_allow_groupon", "order_delay_time", "self_table", "tablestatusname", "temporary_light_second", "virtual_table", ], "category": "A", "notes": "14 个 ODS→DWD 未映射", }, { "ods_table": "recharge_settlements", "dwd_tables": ["dwd_recharge_order", "dwd_recharge_order_ex"], "suspect_ods_cols": [ "electricityadjustmoney", "electricitymoney", "mervousalesamount", "plcouponsaleamount", "realelectricitymoney", ], "category": "B", "notes": "5 个 ODS→DWD 未映射 + 5 个 DWD 无 ODS 源(驼峰/蛇形命名差异)", }, { "ods_table": "store_goods_master", "dwd_tables": ["dim_store_goods", "dim_store_goods_ex"], "suspect_ods_cols": [ "time_slot_sale", "batch_stock_quantity", "provisional_total_cost", ], "category": "B", "notes": "平层 + 嵌套展开 + ODS→DWD 补全", }, { "ods_table": "goods_stock_summary", "dwd_tables": [], # 无 DWD 表,需新建 "suspect_ods_cols": [ "sitegoodsid", "goodsname", "goodsunit", "goodscategoryid", "goodscategorysecondid", "categoryname", "rangestartstock", "rangeendstock", "rangein", "rangeout", "rangesale", "rangesalemoney", "rangeinventory", "currentstock", ], "category": "C", "notes": "14 个 ODS 字段,无 DWD 目标表,需新建", }, { "ods_table": "goods_stock_movements", "dwd_tables": [], # 无 DWD 表,需新建 "suspect_ods_cols": [ # ODS 实际列名为驼峰式(无下划线) "sitegoodsstockid", "tenantid", "siteid", "sitegoodsid", "goodsname", "goodscategoryid", "goodssecondcategoryid", "unit", "price", "stocktype", "changenum", "startnum", "endnum", "changenuma", "startnuma", "endnuma", "remark", "operatorname", "createtime", ], "category": "C", "notes": "19 个 ODS 字段,无 DWD 目标表,需新建", }, ] # ── recharge_settlements 已知的 DWD 无 ODS 源字段(用于交叉比对) ── RECHARGE_DWD_ORPHANS = [ "pl_coupon_sale_amount", "mervou_sales_amount", "electricity_money", "real_electricity_money", "electricity_adjust_money", ] def get_db_columns(cur, schema: str, table: str) -> list[str]: """查询数据库表的列名列表(小写)。""" cur.execute( "SELECT column_name FROM information_schema.columns " "WHERE table_schema = %s AND table_name = %s ORDER BY ordinal_position", (schema, table), ) return [r["column_name"].lower() for r in cur.fetchall()] def get_sample_values(conn, schema: str, table: str, column: str, limit: int = 5) -> list: """获取指定列的非空采样值(最多 limit 个)。失败时回滚并返回空列表。""" cur = conn.cursor(cursor_factory=RealDictCursor) try: cur.execute( f'SELECT DISTINCT "{column}" FROM "{schema}"."{table}" ' f'WHERE "{column}" IS NOT NULL LIMIT %s', (limit,), ) return [r[column] for r in cur.fetchall()] except Exception: conn.rollback() return [] finally: cur.close() def parse_fact_mappings() -> dict[str, dict[str, str]]: """ 解析 FACT_MAPPINGS,返回 {dwd_full_table: {dwd_col: ods_expr}} 的映射。 同时构建反向索引 {dwd_full_table: {ods_expr_lower: dwd_col}}。 """ forward: dict[str, dict[str, str]] = {} reverse: dict[str, dict[str, str]] = {} for dwd_table, entries in DwdLoadTask.FACT_MAPPINGS.items(): fwd = {} rev = {} for dwd_col, ods_expr, _cast in entries: fwd[dwd_col.lower()] = ods_expr # 反向索引:ods 表达式 → dwd 列名 # 处理简单列名和 JSON 表达式 ods_key = ods_expr.lower().strip('"') rev[ods_key] = dwd_col.lower() forward[dwd_table] = fwd reverse[dwd_table] = rev return forward, reverse def audit_one_table( conn, target: dict, fm_forward: dict, fm_reverse: dict, ) -> list[dict]: """ 对单张表执行排查,返回排查记录列表。 每条记录: {ods_col, dwd_table, dwd_col_match, fm_status, conclusion, action, samples} """ cur = conn.cursor(cursor_factory=RealDictCursor) ods_table = target["ods_table"] dwd_tables = target["dwd_tables"] suspect_cols = target["suspect_ods_cols"] # 查 ODS 现有列 ods_cols = set(get_db_columns(cur, "ods", ods_table)) # 查各 DWD 表现有列 dwd_cols_map: dict[str, set[str]] = {} for dt in dwd_tables: dwd_cols_map[dt] = set(get_db_columns(cur, "dwd", dt)) records = [] for ods_col in suspect_cols: ods_col_lower = ods_col.lower() record = { "ods_col": ods_col_lower, "ods_exists": ods_col_lower in ods_cols, "dwd_matches": [], "fm_status": "未配置", "conclusion": "", "action": "", "samples": [], } # 采样值 if record["ods_exists"]: record["samples"] = get_sample_values(conn, "ods", ods_table, ods_col_lower) # 遍历所有关联 DWD 表检查 for dt in dwd_tables: dwd_full = f"dwd.{dt}" dwd_cols = dwd_cols_map.get(dt, set()) fm_fwd = fm_forward.get(dwd_full, {}) fm_rev = fm_reverse.get(dwd_full, {}) # 检查 1: FACT_MAPPINGS 反向索引 — ODS 列是否已被映射 if ods_col_lower in fm_rev: mapped_to = fm_rev[ods_col_lower] record["dwd_matches"].append(f"{dt}.{mapped_to}") record["fm_status"] = f"已映射 → {dt}.{mapped_to}" record["conclusion"] = "已映射(FACT_MAPPINGS 显式配置)" record["action"] = "无需变更" break # 检查 2: DWD 表中是否有同名列(自动映射) if ods_col_lower in dwd_cols: record["dwd_matches"].append(f"{dt}.{ods_col_lower}") record["fm_status"] = "自动映射(同名列)" record["conclusion"] = "已映射(自动匹配)" record["action"] = "无需变更" break # 检查 3: DWD 表中是否有近似列名(蛇形/驼峰转换) snake = _camel_to_snake(ods_col_lower) if snake != ods_col_lower and snake in dwd_cols: record["dwd_matches"].append(f"{dt}.{snake}") # 还需检查 FACT_MAPPINGS 是否已配置此映射 if snake in fm_fwd: record["fm_status"] = f"已映射 → {dt}.{snake}(命名转换)" record["conclusion"] = "已映射(命名差异,FACT_MAPPINGS 已覆盖)" record["action"] = "无需变更" else: record["fm_status"] = f"DWD 列存在 {dt}.{snake},但 FACT_MAPPINGS 未配置" record["conclusion"] = "映射遗漏(DWD 列已存在,缺 FACT_MAPPINGS)" record["action"] = "仅补充 FACT_MAPPINGS" break else: # 所有 DWD 表都没找到匹配 if not record["ods_exists"]: record["conclusion"] = "ODS 列不存在" record["action"] = "需确认 API 是否返回该字段" elif not dwd_tables: record["conclusion"] = "无 DWD 目标表" record["action"] = "需新建 DWD 表" else: record["conclusion"] = "确实缺失" record["action"] = "需新增 DWD 列 + FACT_MAPPINGS" records.append(record) # 额外排查:recharge_settlements 的 DWD 无 ODS 源字段 if ods_table == "recharge_settlements": for dwd_orphan in RECHARGE_DWD_ORPHANS: orphan_record = { "ods_col": f"(DWD orphan) {dwd_orphan}", "ods_exists": False, "dwd_matches": [], "fm_status": "", "conclusion": "", "action": "", "samples": [], } # 检查是否已在 FACT_MAPPINGS 中被映射 for dt in dwd_tables: dwd_full = f"dwd.{dt}" fm_fwd = fm_forward.get(dwd_full, {}) if dwd_orphan in fm_fwd: src = fm_fwd[dwd_orphan] orphan_record["fm_status"] = f"已映射 ← {src}" orphan_record["conclusion"] = "已映射(FACT_MAPPINGS 已覆盖)" orphan_record["action"] = "无需变更" orphan_record["dwd_matches"].append(f"{dt}.{dwd_orphan}") break else: orphan_record["conclusion"] = "DWD 列存在但无 ODS 映射" orphan_record["action"] = "需补充 FACT_MAPPINGS" records.append(orphan_record) return records def _camel_to_snake(name: str) -> str: """简易驼峰转蛇形:在大写字母前插入下划线。""" import re s1 = re.sub(r"([A-Z])", r"_\1", name) return s1.lower().lstrip("_") def generate_report(all_results: dict[str, list[dict]]) -> str: """生成 Markdown 排查报告。""" lines: list[str] = [] now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") lines.append(f"# 字段排查报告\n") lines.append(f"> 生成时间:{now_str}\n") lines.append(f"> 排查范围:{len(all_results)} 张表\n") # 汇总统计 total_fields = 0 already_mapped = 0 need_fm_only = 0 need_new_col = 0 need_new_table = 0 ods_missing = 0 for table, records in all_results.items(): for r in records: total_fields += 1 action = r["action"] if "无需变更" in action: already_mapped += 1 elif "仅补充" in action: need_fm_only += 1 elif "新增 DWD 列" in action: need_new_col += 1 elif "新建 DWD 表" in action: need_new_table += 1 elif "需确认" in action: ods_missing += 1 lines.append("\n## 汇总\n") lines.append(f"| 指标 | 数量 |") lines.append(f"|------|------|") lines.append(f"| 排查字段总数 | {total_fields} |") lines.append(f"| 已映射(无需变更) | {already_mapped} |") lines.append(f"| 映射遗漏(仅补 FACT_MAPPINGS) | {need_fm_only} |") lines.append(f"| 确实缺失(需新增 DWD 列) | {need_new_col} |") lines.append(f"| 无 DWD 表(需新建) | {need_new_table} |") lines.append(f"| ODS 列不存在(需确认 API) | {ods_missing} |") # 逐表详情 for target_info, records in all_results.items(): ods_table, category, notes = target_info lines.append(f"\n---\n") lines.append(f"## {ods_table}({category} 类)\n") lines.append(f"> {notes}\n") lines.append(f"| # | ODS 列 | ODS 存在 | DWD 匹配 | FACT_MAPPINGS 状态 | 排查结论 | 建议操作 | 采样值 |") lines.append(f"|---|--------|---------|---------|-------------------|---------|---------|--------|") for i, r in enumerate(records, 1): ods_exists = "✅" if r["ods_exists"] else "❌" dwd_match = ", ".join(r["dwd_matches"]) if r["dwd_matches"] else "—" samples_str = ", ".join(str(s)[:30] for s in r["samples"][:3]) if r["samples"] else "—" lines.append( f"| {i} | `{r['ods_col']}` | {ods_exists} | {dwd_match} " f"| {r['fm_status']} | {r['conclusion']} | **{r['action']}** | {samples_str} |" ) # TABLE_MAP 覆盖检查 lines.append(f"\n---\n") lines.append(f"## TABLE_MAP 注册状态\n") lines.append(f"| DWD 表 | ODS 源表 | 已注册 |") lines.append(f"|--------|---------|--------|") for target in AUDIT_TARGETS: for dt in target["dwd_tables"]: dwd_full = f"dwd.{dt}" ods_full = f"ods.{target['ods_table']}" registered = dwd_full in DwdLoadTask.TABLE_MAP reg_str = "✅" if registered else "❌ 未注册" if registered: actual_ods = DwdLoadTask.TABLE_MAP[dwd_full] if actual_ods != ods_full: reg_str = f"⚠️ 映射到 {actual_ods}" lines.append(f"| `{dwd_full}` | `{ods_full}` | {reg_str} |") # C 类无 DWD 表的 for target in AUDIT_TARGETS: if not target["dwd_tables"]: lines.append(f"| (待新建) | `ods.{target['ods_table']}` | ❌ 无 DWD 表 |") return "\n".join(lines) def main(): parser = argparse.ArgumentParser(description="字段排查脚本") parser.add_argument( "--output", type=str, default=None, help="输出文件路径(默认 $FIELD_AUDIT_ROOT/field_audit_report.md)", ) args = parser.parse_args() # 加载环境变量 load_dotenv(ROOT / ".env") load_dotenv(ROOT / ".env.local", override=True) dsn = os.environ.get("PG_DSN") if not dsn: print("错误:未配置 PG_DSN 环境变量", file=sys.stderr) sys.exit(1) print(f"连接数据库...") conn = psycopg2.connect(dsn) conn.autocommit = True print(f"解析 FACT_MAPPINGS...") fm_forward, fm_reverse = parse_fact_mappings() # 执行排查 # key = (ods_table, category, notes) 用于报告分组 all_results: dict[tuple, list[dict]] = {} for target in AUDIT_TARGETS: key = (target["ods_table"], target["category"], target["notes"]) print(f"排查 {target['ods_table']}({target['category']} 类)...") records = audit_one_table(conn, target, fm_forward, fm_reverse) all_results[key] = records # 打印简要结果 for r in records: icon = "✅" if "无需变更" in r["action"] else "⚠️" print(f" {icon} {r['ods_col']}: {r['conclusion']} → {r['action']}") conn.close() # 生成报告 report = generate_report(all_results) # 从 .env 读取 FIELD_AUDIT_ROOT from _env_paths import get_output_path default_dir = get_output_path("FIELD_AUDIT_ROOT") output_path = Path(args.output) if args.output else default_dir / "field_audit_report.md" output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(report, encoding="utf-8") print(f"\n排查报告已生成:{output_path}") if __name__ == "__main__": main()