""" 数据流结构分析报告生成器(v3) 读取 analyze_dataflow.py 采集的数据,生成带锚点链接、上下游映射列、 业务描述、多示例值、字段差异报告的 Markdown 报告。 增强内容(v3): - 总览表增加 API JSON 字段数列 - 覆盖率表增加业务描述列 - 逐表详情增加业务描述列(来自 BD_manual 文档) - 说明+示例值合并,多示例展示,枚举值解释 - 总览章节增加 API↔ODS↔DWD 字段对比差异报告 用法: python scripts/ops/gen_dataflow_report.py python scripts/ops/gen_dataflow_report.py --output-dir /path/to/output """ from __future__ import annotations import argparse import json import os from datetime import datetime from pathlib import Path from dotenv import load_dotenv # noqa: F401 — _env_paths 负责加载,此处保留以防其他模块间接引用 # ── 白名单定义 ────────────────────────────────────────────────────────── # 白名单字段仍然参与检查和统计,但在报告的 1.1 差异明细表格和 3. 逐表详情表格中 # 折叠显示(不展开详细行),并注明白名单原因。 # CHANGE 2026-02-21 | 重构白名单逻辑:统一术语为"白名单",字段仍正常检查,仅报告展示折叠 # ODS 层 ETL 元数据列(非业务字段,ETL 流程自动生成) WHITELIST_ETL_META_COLS = { "source_file", "source_endpoint", "fetched_at", "payload", "content_hash", } # DWD 维表 SCD2 管理列(ETL 框架自动维护,非业务映射) WHITELIST_DWD_SCD2_COLS = { "valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id", } # API 嵌套对象前缀(上游 API 的门店信息嵌套结构,已通过 site_id 关联,不逐字段映射) WHITELIST_API_NESTED_PREFIXES = ("siteProfile.",) def is_whitelist_etl_meta(col_name: str) -> bool: """判断是否为 ETL 元数据白名单列""" return col_name in WHITELIST_ETL_META_COLS def is_whitelist_scd2(col_name: str) -> bool: """判断是否为 DWD SCD2 管理白名单列""" return col_name in WHITELIST_DWD_SCD2_COLS def is_whitelist_api_nested(json_path: str) -> bool: """判断是否为 API 嵌套对象白名单字段""" return any(json_path.startswith(p) for p in WHITELIST_API_NESTED_PREFIXES) def whitelist_reason(col_name: str, json_path: str = "", layer: str = "") -> str: """返回白名单原因描述,非白名单返回空字符串""" if is_whitelist_etl_meta(col_name): return "ETL 元数据列" if is_whitelist_scd2(col_name): return "SCD2 管理列" if json_path and is_whitelist_api_nested(json_path): return "API 嵌套对象(siteProfile)" return "" def load_json(path: Path) -> dict | list | None: if not path.exists(): return None with open(path, "r", encoding="utf-8") as f: return json.load(f) def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="生成数据流结构分析 Markdown 报告") parser.add_argument("--output-dir", type=str, default=None, help="输出目录(默认读取 .env 中的 SYSTEM_ANALYZE_ROOT)") return parser def resolve_data_dir(override: str | None = None) -> Path: if override: return Path(override) from _env_paths import get_output_path return get_output_path("SYSTEM_ANALYZE_ROOT") def _esc(s: str) -> str: """转义 Markdown 表格中的管道符""" return str(s).replace("|", "\\|").replace("\n", " ") if s else "" def _format_samples(samples: list[str], max_show: int = 5) -> str: """格式化多示例值,截断过长的值""" if not samples: return "" shown = [] for s in samples[:max_show]: s = _esc(s) if len(s) > 30: s = s[:27] + "..." shown.append(f"`{s}`") result = ", ".join(shown) if len(samples) > max_show: result += f" …共{len(samples)}种" return result def _is_enum_like(samples: list[str], total_records: int) -> bool: """判断字段是否像枚举(不同值少且记录数足够多)""" if total_records < 5: return False return 1 < len(samples) <= 8 def _write_source_file_manifest(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None): """在报告开头输出本次分析用到的所有 JSON 数据源文件清单""" if fm_dir is None: fm_dir = data_dir / "field_mappings" w("## 数据源文件清单") w() w("本报告基于以下 JSON 数据文件生成:") w() categories = [ ("collection_manifest.json", "采集元数据(表清单、日期范围、记录数)"), ("json_trees/", "API JSON 字段结构(递归展开后的字段路径、类型、示例值)"), ("field_mappings/", "三层字段映射(API→ODS→DWD 映射关系)"), ("db_schemas/", "数据库表结构(ODS/DWD 列定义,来自 PostgreSQL)"), ("bd_descriptions/", "业务描述(来自 BD_manual 文档)"), ] for cat_path, cat_desc in categories: if cat_path.endswith("/"): # 子目录:列出实际存在的文件 # CHANGE 2026-02-21 | field_mappings 使用传入的 fm_dir(可能是 field_mappings_new) if cat_path.rstrip("/") == "field_mappings": sub_dir = fm_dir else: sub_dir = data_dir / cat_path.rstrip("/") if sub_dir.is_dir(): try: files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json") except PermissionError: w(f"**{cat_path}** — {cat_desc}(目录权限拒绝)") w() continue if sub_dir.is_dir(): files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json") w(f"**{cat_path}** — {cat_desc}({len(files)} 个文件)") w() for fn in files: w(f"- `{cat_path}{fn}`") w() else: w(f"**{cat_path}** — {cat_desc}(目录不存在)") w() else: # 单文件 fp = data_dir / cat_path status = "✓" if fp.exists() else "✗ 缺失" w(f"- `{cat_path}` — {cat_desc}({status})") w() w("---") w() def generate_report(data_dir: Path) -> str: """生成完整的 Markdown 报告""" manifest = load_json(data_dir / "collection_manifest.json") if not manifest: raise FileNotFoundError(f"找不到 collection_manifest.json: {data_dir}") # CHANGE 2026-02-21 | Windows 文件锁 fallback:field_mappings_new 优先于被锁的 field_mappings _fm_dir = data_dir / "field_mappings" _fm_new = data_dir / "field_mappings_new" if _fm_new.exists() and any(_fm_new.iterdir()): _fm_dir = _fm_new tables = manifest["tables"] now = datetime.now() lines: list[str] = [] def w(s: str = ""): lines.append(s) # ── 从 manifest 读取 API 请求日期范围 ── api_date_from = manifest.get("date_from") api_date_to = manifest.get("date_to") total_records_all = sum(t.get("record_count", 0) for t in tables) # ── 报告头 ── w("# 飞球连接器 — 数据流结构分析报告") w() w(f"> 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} CST") w(f"> 分析范围:飞球(feiqiu)连接器,共 {len(tables)} 张 ODS 表") w("> 数据来源:API JSON 采样 + PostgreSQL ODS/DWD 表结构 + 三层字段映射 + BD_manual 业务文档") if api_date_from or api_date_to: w(f"> API 请求日期范围:{api_date_from or '—'} ~ {api_date_to or '—'}") w(f"> JSON 数据总量:{total_records_all} 条记录") w() # ── 数据源文件清单 ── _write_source_file_manifest(w, data_dir, tables, fm_dir=_fm_dir) # ── 1. 总览表(增加 API JSON 字段数列) ── w("## 1. 总览") w() w("| # | ODS 表名 | 业务描述 | 采样记录数 | API JSON 字段数 | ODS 列数 | DWD 目标表 | DWD 总列数 |") w("|---|---------|---------|-----------|---------------|---------|-----------|-----------|") total_records = 0 total_ods_cols = 0 total_dwd_cols = 0 total_json_fields = 0 for i, t in enumerate(tables, 1): dwd_names = ", ".join(t["dwd_tables"]) if t["dwd_tables"] else "—" json_fc = t.get("json_field_count", 0) w(f"| {i} | `{t['table']}` | {t['description']} | {t['record_count']} | {json_fc} | {t['ods_column_count']} | {dwd_names} | {t['dwd_column_count']} |") total_records += t["record_count"] total_ods_cols += t["ods_column_count"] total_dwd_cols += t["dwd_column_count"] total_json_fields += json_fc w(f"| | **合计** | | **{total_records}** | **{total_json_fields}** | **{total_ods_cols}** | | **{total_dwd_cols}** |") w() # ── 1.1 字段对比差异报告 ── _write_field_diff_report(w, data_dir, tables, fm_dir=_fm_dir) # ── 2. 全局统计 ── w("## 2. 全局统计") w() # 2.1 JSON→ODS 映射覆盖 total_json = 0 total_mapped = 0 per_table_stats: list[dict] = [] for t in tables: fm = load_json(_fm_dir / f"{t['table']}.json") if not fm or "json_to_ods" not in fm: per_table_stats.append({ "table": t["table"], "description": t["description"], "json_count": 0, "mapped": 0, "unmapped": 0, "pct": "—", }) continue j2o = fm["json_to_ods"] json_count = len(j2o) mapped = sum(1 for m in j2o if m.get("ods_col") is not None) unmapped = json_count - mapped pct = f"{mapped / json_count * 100:.1f}%" if json_count > 0 else "—" per_table_stats.append({ "table": t["table"], "description": t["description"], "json_count": json_count, "mapped": mapped, "unmapped": unmapped, "pct": pct, }) total_json += json_count total_mapped += mapped total_unmapped = total_json - total_mapped w("### 2.1 JSON→ODS 映射覆盖") w() w(f"- JSON 字段总数:{total_json}") if total_json > 0: w(f"- 已映射到 ODS 列:{total_mapped}({total_mapped / total_json * 100:.1f}%)") w(f"- 仅存于 payload:{total_unmapped}({total_unmapped / total_json * 100:.1f}%)") else: w("- 已映射到 ODS 列:0") w("- 仅存于 payload:0") w() # 2.2 ODS→DWD 映射覆盖 w("### 2.2 ODS→DWD 映射覆盖") w() w(f"- DWD 列总数:{total_dwd_cols}") w() # 2.3 各表覆盖率(增加业务描述列) w("### 2.3 各表 JSON→ODS 映射覆盖率") w() w("| ODS 表名 | 业务描述 | JSON 字段数 | 已映射 | 仅 payload | 覆盖率 |") w("|---------|---------|-----------|-------|-----------|-------|") sorted_stats = sorted(per_table_stats, key=lambda x: (0 if x["pct"] == "—" else -float(x["pct"].rstrip("%")))) for s in sorted_stats: w(f"| `{s['table']}` | {s['description']} | {s['json_count']} | {s['mapped']} | {s['unmapped']} | {s['pct']} |") w() # ── 3. 逐表详情 ── w("## 3. 逐表详情") w() for idx, t in enumerate(tables, 1): table_name = t["table"] fm = load_json(_fm_dir / f"{table_name}.json") jt = load_json(data_dir / "json_trees" / f"{table_name}.json") ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json") bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json") # 锚点 ID anchors = fm.get("anchors", {}) if fm else {} api_anchor = anchors.get("api", f"api-{table_name}") ods_anchor = anchors.get("ods", f"ods-{table_name}") dwd_anchors = anchors.get("dwd", {}) dwd_tables_list = t.get("dwd_tables", []) json_fc = t.get("json_field_count", 0) w(f"### 3.{idx} {table_name}({t['description']})") w() w(f"- 任务代码:`{t['task_code']}`") w(f"- 采样记录数:{t['record_count']}") w(f"- API JSON 字段数:{json_fc}") w(f"- ODS 列数:{t['ods_column_count']}") if dwd_tables_list: w(f"- DWD 目标表:{', '.join(dwd_tables_list)}") else: w("- DWD 目标表:—(仅 ODS 落地)") w() # ── API 源字段区块 ── _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor) # ── ODS 表结构区块 ── _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors) # ── DWD 表结构区块 ── for dwd_name in dwd_tables_list: dwd_anchor = dwd_anchors.get(dwd_name, f"dwd-{dwd_name}") dwd_schema = load_json(data_dir / "db_schemas" / f"dwd_{dwd_name}.json") _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name) return "\n".join(lines) def _write_field_diff_report(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None): """生成 API↔ODS↔DWD 字段对比差异报告(汇总表 + 逐表分表)""" if fm_dir is None: fm_dir = data_dir / "field_mappings" w("### 1.1 API↔ODS↔DWD 字段对比差异") w() w("以下汇总各表在三层之间的字段差异(点击数字跳转至分表详情):") w() w("| ODS 表名 | API→ODS 未映射 | ODS 无 JSON 源 | ODS→DWD 未映射 | DWD 无 ODS 源 | 主要差异原因 |") w("|---------|--------------|--------------|--------------|-------------|------------|") # CHANGE 2026-02-21 | 重构白名单逻辑:字段仍正常检查计数,白名单字段在分表详情中折叠 # 收集每表差异数据,用于汇总表和分表 diff_rows: list[dict] = [] for t in tables: table_name = t["table"] fm = load_json(fm_dir / f"{table_name}.json") if not fm: w(f"| `{table_name}` | — | — | — | — | 无映射数据 |") diff_rows.append(None) continue anchors = fm.get("anchors", {}) api_anchor = anchors.get("api", f"api-{table_name.replace('_', '-')}") ods_anchor = anchors.get("ods", f"ods-{table_name.replace('_', '-')}") dwd_anchors = anchors.get("dwd", {}) diff_anchor = f"diff-{table_name.replace('_', '-')}" j2o = fm.get("json_to_ods", []) o2d = fm.get("ods_to_dwd", {}) d2o = fm.get("dwd_to_ods", {}) # ── API→ODS 未映射字段(全部检查,含白名单) ── api_unmapped_flat: list[str] = [] api_unmapped_nested: list[str] = [] api_unmapped_whitelist: list[tuple[str, str]] = [] # (json_path, reason) for m in j2o: if m.get("ods_col") is None: jp = m.get("json_path", "") wl_reason = whitelist_reason("", json_path=jp) if wl_reason: api_unmapped_whitelist.append((jp, wl_reason)) elif "." in jp: api_unmapped_nested.append(jp) else: api_unmapped_flat.append(jp) api_unmapped_total = len(api_unmapped_flat) + len(api_unmapped_nested) + len(api_unmapped_whitelist) # ── ODS 无 JSON 源(全部检查,含白名单) ── ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json") ods_mapped_cols = {m["ods_col"] for m in j2o if m.get("ods_col")} ods_no_json_fields: list[str] = [] ods_no_json_whitelist: list[tuple[str, str]] = [] # (col_name, reason) if ods_schema and "columns" in ods_schema: for col in ods_schema["columns"]: if col["name"] not in ods_mapped_cols: wl_reason = whitelist_reason(col["name"]) if wl_reason: ods_no_json_whitelist.append((col["name"], wl_reason)) else: ods_no_json_fields.append(col["name"]) # ── ODS→DWD 未映射(全部检查,含白名单) ── ods_cols_with_dwd = set(o2d.keys()) ods_no_dwd_fields: list[str] = [] ods_no_dwd_whitelist: list[tuple[str, str]] = [] if ods_schema and "columns" in ods_schema: for col in ods_schema["columns"]: if col["name"] not in ods_cols_with_dwd: wl_reason = whitelist_reason(col["name"]) if wl_reason: ods_no_dwd_whitelist.append((col["name"], wl_reason)) else: ods_no_dwd_fields.append(col["name"]) # ── DWD 无 ODS 源(全部检查,含白名单) ── dwd_no_ods_fields: list[tuple[str, str]] = [] # (dwd_table, dwd_col) dwd_no_ods_whitelist: list[tuple[str, str, str]] = [] # (dwd_table, dwd_col, reason) for dwd_name, entries in d2o.items(): for entry in entries: if entry.get("ods_source") == "—": wl_reason = whitelist_reason(entry["dwd_col"]) if wl_reason: dwd_no_ods_whitelist.append((dwd_name, entry["dwd_col"], wl_reason)) else: dwd_no_ods_fields.append((dwd_name, entry["dwd_col"])) # 差异原因(含白名单统计) reasons: list[str] = [] if api_unmapped_nested: reasons.append(f"嵌套对象 {len(api_unmapped_nested)} 个") if api_unmapped_flat: reasons.append(f"平层未映射 {len(api_unmapped_flat)} 个") if dwd_no_ods_fields: reasons.append(f"SCD2/派生列 {len(dwd_no_ods_fields)} 个") wl_total = len(api_unmapped_whitelist) + len(ods_no_json_whitelist) + len(ods_no_dwd_whitelist) + len(dwd_no_ods_whitelist) if wl_total: reasons.append(f"白名单 {wl_total} 个") reason_str = ";".join(reasons) if reasons else "—" # 汇总表单元格:数量 + 跳转链接(白名单字段也计入总数) def _cell(count: int) -> str: if count == 0: return "0" return f"[{count}](#{diff_anchor})" w(f"| `{table_name}` | {_cell(api_unmapped_total)} | {_cell(len(ods_no_json_fields) + len(ods_no_json_whitelist))} | {_cell(len(ods_no_dwd_fields) + len(ods_no_dwd_whitelist))} | {_cell(len(dwd_no_ods_fields) + len(dwd_no_ods_whitelist))} | {reason_str} |") diff_rows.append({ "table_name": table_name, "diff_anchor": diff_anchor, "api_anchor": api_anchor, "ods_anchor": ods_anchor, "dwd_anchors": dwd_anchors, "api_unmapped_flat": api_unmapped_flat, "api_unmapped_nested": api_unmapped_nested, "api_unmapped_whitelist": api_unmapped_whitelist, "ods_no_json_fields": ods_no_json_fields, "ods_no_json_whitelist": ods_no_json_whitelist, "ods_no_dwd_fields": ods_no_dwd_fields, "ods_no_dwd_whitelist": ods_no_dwd_whitelist, "dwd_no_ods_fields": dwd_no_ods_fields, "dwd_no_ods_whitelist": dwd_no_ods_whitelist, }) w() # ── 逐表差异分表 ── # CHANGE 2026-02-21 | 白名单字段折叠显示,不展开详细表格行,注明白名单原因 sub_idx = 0 for row in diff_rows: if row is None: continue has_any = (row["api_unmapped_flat"] or row["api_unmapped_nested"] or row["api_unmapped_whitelist"] or row["ods_no_json_fields"] or row["ods_no_json_whitelist"] or row["ods_no_dwd_fields"] or row["ods_no_dwd_whitelist"] or row["dwd_no_ods_fields"] or row["dwd_no_ods_whitelist"]) if not has_any: continue sub_idx += 1 table_name = row["table_name"] w(f'') w() w(f"#### 1.1.{sub_idx} {table_name} 字段差异明细") w() api_anchor = row["api_anchor"] ods_anchor = row["ods_anchor"] dwd_anchors = row["dwd_anchors"] # 加载辅助数据:json_trees(示例值)、bd_descriptions(业务说明) jt = load_json(data_dir / "json_trees" / f"{table_name}.json") bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json") jt_lookup: dict[str, dict] = {} if jt and "fields" in jt: for fld in jt["fields"]: jt_lookup[fld["path"]] = fld ods_descs = bd.get("ods_fields", {}) if bd else {} dwd_descs_all = bd.get("dwd_fields", {}) if bd else {} def _sample_str(field_name: str, layer: str, dwd_tbl: str = "") -> str: """从 json_trees 或 bd_descriptions 获取示例值字符串""" if layer == "API": entry = jt_lookup.get(field_name, {}) samples = entry.get("samples", []) total_recs = entry.get("total_records", 0) if not samples: single = entry.get("sample", "") if single: samples = [str(single)] if _is_enum_like(samples, total_recs): return ", ".join(f"`{_esc(s)}`" for s in samples[:5]) if samples: return _format_samples(samples, max_show=3) return "" def _desc_str(field_name: str, layer: str, dwd_tbl: str = "") -> str: """从 bd_descriptions 获取业务说明""" key = field_name.split(".")[-1].replace("[]", "").lower() if layer in ("ODS", "API"): desc = ods_descs.get(key, "") elif layer == "DWD" and dwd_tbl: desc = dwd_descs_all.get(dwd_tbl, {}).get(key, "") else: desc = "" if desc and len(desc) > 40: desc = desc[:37] + "..." return _esc(desc) def _write_whitelist_summary(w, items: list, category: str): """白名单字段折叠汇总(不展开详细表格行)""" if not items: return # 按原因分组 by_reason: dict[str, list[str]] = {} for item in items: if isinstance(item, tuple) and len(item) == 3: name, _, reason = item # (dwd_table, dwd_col, reason) elif isinstance(item, tuple) and len(item) == 2: name, reason = item else: name, reason = str(item), "白名单" by_reason.setdefault(reason, []).append(name) parts = [] for reason, names in by_reason.items(): parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)") w(f"> ℹ️ {category}白名单字段(已检查,不展开详情):{';'.join(parts)}") w() # ── API→ODS 未映射(平层) ── if row["api_unmapped_flat"]: w(f"**API→ODS 未映射(平层)** — {len(row['api_unmapped_flat'])} 个") w() w("| # | JSON 字段 | 示例值 | 说明 | 状态 |") w("|---|----------|-------|------|------|") for i, f in enumerate(row["api_unmapped_flat"], 1): sample = _sample_str(f, "API") desc = _desc_str(f, "API") w(f"| {i} | **[`{_esc(f)}`](#{api_anchor})** | {sample} | {desc} | **⚠️ 未映射** |") w() # ── API→ODS 未映射(嵌套对象,非白名单) ── if row["api_unmapped_nested"]: w(f"
API→ODS 未映射(嵌套对象)— {len(row['api_unmapped_nested'])} 个") w() w("| # | JSON 字段 | 示例值 | 说明 | 状态 |") w("|---|----------|-------|------|------|") for i, f in enumerate(row["api_unmapped_nested"], 1): sample = _sample_str(f, "API") desc = _desc_str(f, "API") w(f"| {i} | [`{_esc(f)}`](#{api_anchor}) | {sample} | {desc} | 📦 嵌套 |") w() w("
") w() # ── API 白名单字段汇总 ── _write_whitelist_summary(w, row["api_unmapped_whitelist"], "API→ODS ") # ── ODS 无 JSON 源 ── if row["ods_no_json_fields"]: w(f"**ODS 无 JSON 源** — {len(row['ods_no_json_fields'])} 个") w() w("| # | ODS 列 | 说明 | 状态 |") w("|---|-------|------|------|") for i, f in enumerate(row["ods_no_json_fields"], 1): desc = _desc_str(f, "ODS") w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 JSON 源** |") w() # ── ODS 无 JSON 源 白名单汇总 ── _write_whitelist_summary(w, row["ods_no_json_whitelist"], "ODS 无 JSON 源 ") # ── ODS→DWD 未映射 ── if row["ods_no_dwd_fields"]: w(f"**ODS→DWD 未映射** — {len(row['ods_no_dwd_fields'])} 个") w() w("| # | ODS 列 | 说明 | 状态 |") w("|---|-------|------|------|") for i, f in enumerate(row["ods_no_dwd_fields"], 1): desc = _desc_str(f, "ODS") w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 DWD 目标** |") w() # ── ODS→DWD 白名单汇总 ── _write_whitelist_summary(w, row["ods_no_dwd_whitelist"], "ODS→DWD ") # ── DWD 无 ODS 源 ── if row["dwd_no_ods_fields"]: w(f"**DWD 无 ODS 源** — {len(row['dwd_no_ods_fields'])} 个") w() w("| # | DWD 表 | DWD 列 | 说明 | 状态 |") w("|---|-------|-------|------|------|") for i, (dwd_name, dwd_col) in enumerate(row["dwd_no_ods_fields"], 1): dwd_a = dwd_anchors.get(dwd_name, f"dwd-{dwd_name.replace('_', '-')}") desc = _desc_str(dwd_col, "DWD", dwd_tbl=dwd_name) w(f"| {i} | {dwd_name} | **[`{_esc(dwd_col)}`](#{dwd_a})** | {desc} | **⚠️ 无 ODS 源** |") w() # ── DWD 无 ODS 源 白名单汇总 ── _write_whitelist_summary(w, row["dwd_no_ods_whitelist"], "DWD 无 ODS 源 ") w() def _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor): """生成 API 源字段区块(增加业务描述列,合并说明+示例值,白名单字段折叠)""" w(f'') w() w(f"#### API 源字段 — {table_name} [🔗 ODS](#{ods_anchor})") w() if not fm or "json_to_ods" not in fm: w("_无 field_mappings 数据_") w() return j2o = fm["json_to_ods"] # 构建 json_tree 查找表(含 samples) jt_lookup: dict[str, dict] = {} if jt and "fields" in jt: for f in jt["fields"]: jt_lookup[f["path"]] = f # BD_manual ODS 描述(用于交叉引用 JSON 字段的业务含义) ods_descs = bd.get("ods_fields", {}) if bd else {} # CHANGE 2026-02-21 | 白名单字段从表格中排除,折叠汇总 normal_items: list[dict] = [] whitelist_items: list[tuple[str, str]] = [] # (json_path, reason) for m in j2o: jp = m.get("json_path", "") wl_reason = whitelist_reason("", json_path=jp) if wl_reason: whitelist_items.append((jp, wl_reason)) else: normal_items.append(m) mapped_count = sum(1 for m in j2o if m.get("ods_col") is not None) total_count = len(j2o) if total_count > 0: w(f"已映射 {mapped_count}/{total_count},覆盖率 {mapped_count / total_count * 100:.1f}%") if whitelist_items: w(f"(其中 {len(whitelist_items)} 个白名单字段已折叠)") else: w("无字段") w() w("| # | JSON 字段 | 类型 | → ODS 列 | 业务描述 | 示例值与说明 |") w("|---|----------|------|---------|---------|------------|") for i, m in enumerate(normal_items, 1): json_path = m["json_path"] json_type = m.get("json_type", "") ods_col = m.get("ods_col") match_type = m.get("match_type", "") occurrence_pct = m.get("occurrence_pct", 0) # 从 json_tree 获取示例值(优先用 samples 多示例) jt_entry = jt_lookup.get(json_path, {}) samples = jt_entry.get("samples", []) total_recs = jt_entry.get("total_records", 0) if not samples: single = jt_entry.get("sample", "") if single: samples = [str(single)] # 构建 ODS 列链接 if ods_col: ods_link = f"[`{ods_col}`](#{ods_anchor})" else: ods_link = "⚠️ 未映射" # 业务描述(从 BD_manual 查找,用 ODS 列名或 JSON 叶子名) leaf = json_path.split(".")[-1].replace("[]", "").lower() biz_desc = ods_descs.get(leaf, "") if biz_desc and len(biz_desc) > 60: biz_desc = biz_desc[:57] + "..." biz_desc = _esc(biz_desc) # 合并说明+示例值 notes_parts: list[str] = [] if "." in json_path and match_type == "unmapped": notes_parts.append("📦 嵌套对象") if match_type == "case_insensitive": notes_parts.append("大小写匹配") if occurrence_pct < 100: notes_parts.append(f"出现率 {occurrence_pct:.0f}%") # 示例值展示 if _is_enum_like(samples, total_recs): notes_parts.append(f"枚举值: {', '.join(f'`{_esc(s)}`' for s in samples[:8])}") elif samples: notes_parts.append(f"示例: {_format_samples(samples)}") note_str = ";".join(notes_parts) if notes_parts else "" w(f"| {i} | `{_esc(json_path)}` | {json_type} | {ods_link} | {biz_desc} | {note_str} |") w() # 白名单字段折叠汇总 if whitelist_items: by_reason: dict[str, list[str]] = {} for jp, reason in whitelist_items: by_reason.setdefault(reason, []).append(jp) parts = [] for reason, names in by_reason.items(): parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)") w(f"> ℹ️ 白名单字段(已检查,不展开详情):{';'.join(parts)}") w() def _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors): """生成 ODS 表结构区块(含上下游双向映射列 + 业务描述,白名单字段折叠)""" w(f'') w() w(f"#### ODS 表结构 — ods.{table_name} [🔗 API](#{api_anchor})") w() if not ods_schema or "columns" not in ods_schema: w("_无 DB schema 数据_") w() return # 构建 json_to_ods 反向查找:ods_col → json_path ods_to_json: dict[str, str] = {} if fm and "json_to_ods" in fm: for m in fm["json_to_ods"]: if m.get("ods_col"): ods_to_json.setdefault(m["ods_col"], m["json_path"]) # 构建 ods_to_dwd 查找 ods_to_dwd: dict[str, list[dict]] = {} if fm and "ods_to_dwd" in fm: ods_to_dwd = fm["ods_to_dwd"] # BD_manual ODS 描述 ods_descs = bd.get("ods_fields", {}) if bd else {} cols = ods_schema["columns"] # CHANGE 2026-02-21 | 白名单字段从表格中排除,折叠汇总 normal_cols: list[dict] = [] whitelist_cols: list[tuple[str, str]] = [] # (col_name, reason) for col in cols: wl_reason = whitelist_reason(col["name"]) if wl_reason: whitelist_cols.append((col["name"], wl_reason)) else: normal_cols.append(col) w(f"共 {len(cols)} 列") if whitelist_cols: w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)") w() w("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |") w("|---|---------|------|----------|-----------|---------|") for i, col in enumerate(normal_cols, 1): col_name = col["name"] col_type = col["data_type"] # ← JSON 源 json_src = ods_to_json.get(col_name) if json_src: json_link = f"[`{_esc(json_src)}`](#{api_anchor})" else: json_link = "—" # → DWD 目标 dwd_targets = ods_to_dwd.get(col_name, []) if dwd_targets: dwd_links = [] for dt in dwd_targets: dwd_tbl = dt["dwd_table"] dwd_col = dt["dwd_col"] dwd_anc = dwd_anchors.get(dwd_tbl, f"dwd-{dwd_tbl}") dwd_links.append(f"[`{dwd_tbl}.{dwd_col}`](#{dwd_anc})") dwd_link = ", ".join(dwd_links) else: dwd_link = "—" # 业务描述 biz_desc = ods_descs.get(col_name.lower(), "") if biz_desc and len(biz_desc) > 60: biz_desc = biz_desc[:57] + "..." biz_desc = _esc(biz_desc) w(f"| {i} | `{col_name}` | {col_type} | {json_link} | {dwd_link} | {biz_desc} |") w() # 白名单列折叠汇总 if whitelist_cols: by_reason: dict[str, list[str]] = {} for cn, reason in whitelist_cols: by_reason.setdefault(reason, []).append(cn) parts = [] for reason, names in by_reason.items(): parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)") w(f"> ℹ️ 白名单列(已检查,不展开详情):{';'.join(parts)}") w() def _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name): """生成 DWD 表结构区块(增加业务描述列,白名单字段折叠)""" w(f'') w() w(f"#### DWD 表结构 — dwd.{dwd_name} [🔗 ODS](#{ods_anchor})") w() if not dwd_schema or "columns" not in dwd_schema: w("_无 DB schema 数据_") w() return # 构建 dwd_to_ods 查找 dwd_to_ods_map: dict[str, dict] = {} if fm and "dwd_to_ods" in fm and dwd_name in fm["dwd_to_ods"]: for entry in fm["dwd_to_ods"][dwd_name]: dwd_to_ods_map[entry["dwd_col"]] = entry # BD_manual DWD 描述 dwd_descs = {} if bd and "dwd_fields" in bd: dwd_descs = bd["dwd_fields"].get(dwd_name, {}) cols = dwd_schema["columns"] # CHANGE 2026-02-21 | 白名单字段从表格中排除,折叠汇总 normal_cols: list[dict] = [] whitelist_cols: list[tuple[str, str]] = [] # (col_name, reason) for col in cols: wl_reason = whitelist_reason(col["name"]) if wl_reason: whitelist_cols.append((col["name"], wl_reason)) else: normal_cols.append(col) w(f"共 {len(cols)} 列") if whitelist_cols: w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)") w() w("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |") w("|---|---------|------|----------|------|---------|") for i, col in enumerate(normal_cols, 1): col_name = col["name"] col_type = col["data_type"] mapping = dwd_to_ods_map.get(col_name) if mapping: ods_src = mapping.get("ods_source", "") ods_link = f"[`{ods_src}`](#{ods_anchor})" if ods_src and ods_src != "—" else "—" transform = mapping.get("mapping_type", "") note = mapping.get("note", "") else: ods_link = "—" transform = "" note = "" # 业务描述(优先 BD_manual,其次 mapping note,最后 DB comment) biz_desc = dwd_descs.get(col_name.lower(), "") if not biz_desc and note: biz_desc = note if not biz_desc: db_comment = col.get("comment", "") if db_comment: if "【说明】" in db_comment: desc_part = db_comment.split("【说明】")[1] if "【" in desc_part: desc_part = desc_part.split("【")[0] biz_desc = desc_part.strip().rstrip("。").strip() else: biz_desc = db_comment if biz_desc and len(biz_desc) > 60: biz_desc = biz_desc[:57] + "..." biz_desc = _esc(biz_desc) w(f"| {i} | `{col_name}` | {col_type} | {ods_link} | {_esc(transform)} | {biz_desc} |") w() # 白名单列折叠汇总 if whitelist_cols: by_reason: dict[str, list[str]] = {} for cn, reason in whitelist_cols: by_reason.setdefault(reason, []).append(cn) parts = [] for reason, names in by_reason.items(): parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)") w(f"> ℹ️ 白名单列(已检查,不展开详情):{';'.join(parts)}") w() def main() -> None: # _env_paths 在 import 时已通过绝对路径加载根 .env,无需相对路径 load_dotenv # CHANGE 2026-02-21 | 移除 load_dotenv(Path(".env")),避免 cwd 不在项目根时失效 from _env_paths import get_output_path # noqa: F401 — 触发 .env 加载 parser = build_parser() args = parser.parse_args() data_dir = resolve_data_dir(args.output_dir) if not data_dir.exists(): print(f"错误:数据目录不存在: {data_dir}") return print(f"读取数据目录: {data_dir}") report = generate_report(data_dir) now = datetime.now() filename = f"dataflow_{now.strftime('%Y-%m-%d_%H%M%S')}.md" output_path = data_dir / filename with open(output_path, "w", encoding="utf-8") as f: f.write(report) print(f"\n{'='*60}") print(f"报告生成完成") print(f"{'='*60}") print(f" 输出路径: {output_path}") print(f" 文件大小: {output_path.stat().st_size / 1024:.1f} KB") print(f"{'='*60}") if __name__ == "__main__": main()