Files
Neo-ZQYY/scripts/ops/gen_dataflow_report.py

955 lines
38 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据流结构分析报告生成器v3
读取 analyze_dataflow.py 采集的数据,生成带锚点链接、上下游映射列、
业务描述、多示例值、字段差异报告的 Markdown 报告。
增强内容v3
- 总览表增加 API JSON 字段数列
- 覆盖率表增加业务描述列
- 逐表详情增加业务描述列(来自 BD_manual 文档)
- 说明+示例值合并,多示例展示,枚举值解释
- 总览章节增加 API↔ODS↔DWD 字段对比差异报告
用法:
python scripts/ops/gen_dataflow_report.py
python scripts/ops/gen_dataflow_report.py --output-dir /path/to/output
"""
from __future__ import annotations
import argparse
import json
import os
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv # noqa: F401 — _env_paths 负责加载,此处保留以防其他模块间接引用
# ── Whitelist definitions ──────────────────────────────────────────────
# Whitelisted fields are still checked and counted; the report's section 1.1
# difference tables and section 3 detail tables merely collapse them into a
# one-line summary (with the whitelist reason) instead of expanding rows.
# CHANGE 2026-02-21 | whitelist rework: unified terminology, fields still checked, only the display is collapsed

# ODS-layer ETL metadata columns (generated by the ETL pipeline, not business data).
WHITELIST_ETL_META_COLS = {
    "source_file", "source_endpoint", "fetched_at", "payload", "content_hash",
}

# DWD dimension-table SCD2 bookkeeping columns (maintained by the ETL framework,
# not part of the business mapping).
WHITELIST_DWD_SCD2_COLS = {
    "valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id",
}

# Nested API object prefixes (upstream store-profile structure; joined via
# site_id, so individual nested fields are not mapped one by one).
WHITELIST_API_NESTED_PREFIXES = ("siteProfile.",)


def is_whitelist_etl_meta(col_name: str) -> bool:
    """Return True when *col_name* is a whitelisted ETL metadata column."""
    return col_name in WHITELIST_ETL_META_COLS


def is_whitelist_scd2(col_name: str) -> bool:
    """Return True when *col_name* is a whitelisted DWD SCD2 bookkeeping column."""
    return col_name in WHITELIST_DWD_SCD2_COLS


def is_whitelist_api_nested(json_path: str) -> bool:
    """Return True when *json_path* falls under a whitelisted nested-API prefix."""
    # str.startswith accepts a tuple of prefixes directly.
    return json_path.startswith(WHITELIST_API_NESTED_PREFIXES)


def whitelist_reason(col_name: str, json_path: str = "", layer: str = "") -> str:
    """Describe why a field is whitelisted; return "" when it is not."""
    if is_whitelist_etl_meta(col_name):
        return "ETL 元数据列"
    if is_whitelist_scd2(col_name):
        return "SCD2 管理列"
    if json_path and is_whitelist_api_nested(json_path):
        return "API 嵌套对象siteProfile"
    return ""
def load_json(path: Path) -> dict | list | None:
if not path.exists():
return None
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser for the report generator (single optional flag)."""
    parser = argparse.ArgumentParser(description="生成数据流结构分析 Markdown 报告")
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="输出目录(默认读取 .env 中的 SYSTEM_ANALYZE_ROOT",
    )
    return parser
def resolve_data_dir(override: str | None = None) -> Path:
if override:
return Path(override)
from _env_paths import get_output_path
return get_output_path("SYSTEM_ANALYZE_ROOT")
def _esc(s: str) -> str:
"""转义 Markdown 表格中的管道符"""
return str(s).replace("|", "\\|").replace("\n", " ") if s else ""
def _format_samples(samples: list[str], max_show: int = 5) -> str:
"""格式化多示例值,截断过长的值"""
if not samples:
return ""
shown = []
for s in samples[:max_show]:
s = _esc(s)
if len(s) > 30:
s = s[:27] + "..."
shown.append(f"`{s}`")
result = ", ".join(shown)
if len(samples) > max_show:
result += f" …共{len(samples)}"
return result
def _is_enum_like(samples: list[str], total_records: int) -> bool:
"""判断字段是否像枚举(不同值少且记录数足够多)"""
if total_records < 5:
return False
return 1 < len(samples) <= 8
def _write_source_file_manifest(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None):
"""在报告开头输出本次分析用到的所有 JSON 数据源文件清单"""
if fm_dir is None:
fm_dir = data_dir / "field_mappings"
w("## 数据源文件清单")
w()
w("本报告基于以下 JSON 数据文件生成:")
w()
categories = [
("collection_manifest.json", "采集元数据(表清单、日期范围、记录数)"),
("json_trees/", "API JSON 字段结构(递归展开后的字段路径、类型、示例值)"),
("field_mappings/", "三层字段映射API→ODS→DWD 映射关系)"),
("db_schemas/", "数据库表结构ODS/DWD 列定义,来自 PostgreSQL"),
("bd_descriptions/", "业务描述(来自 BD_manual 文档)"),
]
for cat_path, cat_desc in categories:
if cat_path.endswith("/"):
# 子目录:列出实际存在的文件
# CHANGE 2026-02-21 | field_mappings 使用传入的 fm_dir可能是 field_mappings_new
if cat_path.rstrip("/") == "field_mappings":
sub_dir = fm_dir
else:
sub_dir = data_dir / cat_path.rstrip("/")
if sub_dir.is_dir():
try:
files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json")
except PermissionError:
w(f"**{cat_path}** — {cat_desc}(目录权限拒绝)")
w()
continue
if sub_dir.is_dir():
files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json")
w(f"**{cat_path}** — {cat_desc}{len(files)} 个文件)")
w()
for fn in files:
w(f"- `{cat_path}{fn}`")
w()
else:
w(f"**{cat_path}** — {cat_desc}(目录不存在)")
w()
else:
# 单文件
fp = data_dir / cat_path
status = "" if fp.exists() else "✗ 缺失"
w(f"- `{cat_path}` — {cat_desc}{status}")
w()
w("---")
w()
def generate_report(data_dir: Path) -> str:
    """Build the complete Markdown report from the collected analysis artifacts.

    Reads collection_manifest.json plus the per-table JSON files under
    *data_dir* (json_trees/, field_mappings/, db_schemas/, bd_descriptions/)
    and returns the whole report as one newline-joined string.

    Raises:
        FileNotFoundError: when collection_manifest.json is missing.
    """
    manifest = load_json(data_dir / "collection_manifest.json")
    if not manifest:
        raise FileNotFoundError(f"找不到 collection_manifest.json: {data_dir}")
    # CHANGE 2026-02-21 | Windows file-lock fallback: prefer a non-empty
    # field_mappings_new over a possibly-locked field_mappings.
    _fm_dir = data_dir / "field_mappings"
    _fm_new = data_dir / "field_mappings_new"
    if _fm_new.exists() and any(_fm_new.iterdir()):
        _fm_dir = _fm_new
    tables = manifest["tables"]
    now = datetime.now()
    lines: list[str] = []

    def w(s: str = ""):
        # Line collector: every report line is appended here, joined at the end.
        lines.append(s)

    # ── API request date range, read from the manifest ──
    api_date_from = manifest.get("date_from")
    api_date_to = manifest.get("date_to")
    total_records_all = sum(t.get("record_count", 0) for t in tables)
    # ── report header ──
    w("# 飞球连接器 — 数据流结构分析报告")
    w()
    w(f"> 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} CST")
    w(f"> 分析范围飞球feiqiu连接器{len(tables)} 张 ODS 表")
    w("> 数据来源API JSON 采样 + PostgreSQL ODS/DWD 表结构 + 三层字段映射 + BD_manual 业务文档")
    if api_date_from or api_date_to:
        w(f"> API 请求日期范围:{api_date_from or ''} ~ {api_date_to or ''}")
    w(f"> JSON 数据总量:{total_records_all} 条记录")
    w()
    # ── data source file manifest ──
    _write_source_file_manifest(w, data_dir, tables, fm_dir=_fm_dir)
    # ── 1. overview table (includes the API JSON field-count column) ──
    w("## 1. 总览")
    w()
    w("| # | ODS 表名 | 业务描述 | 采样记录数 | API JSON 字段数 | ODS 列数 | DWD 目标表 | DWD 总列数 |")
    w("|---|---------|---------|-----------|---------------|---------|-----------|-----------|")
    total_records = 0
    total_ods_cols = 0
    total_dwd_cols = 0
    total_json_fields = 0
    for i, t in enumerate(tables, 1):
        dwd_names = ", ".join(t["dwd_tables"]) if t["dwd_tables"] else ""
        json_fc = t.get("json_field_count", 0)
        w(f"| {i} | `{t['table']}` | {t['description']} | {t['record_count']} | {json_fc} | {t['ods_column_count']} | {dwd_names} | {t['dwd_column_count']} |")
        total_records += t["record_count"]
        total_ods_cols += t["ods_column_count"]
        total_dwd_cols += t["dwd_column_count"]
        total_json_fields += json_fc
    w(f"| | **合计** | | **{total_records}** | **{total_json_fields}** | **{total_ods_cols}** | | **{total_dwd_cols}** |")
    w()
    # ── 1.1 field-difference report ──
    _write_field_diff_report(w, data_dir, tables, fm_dir=_fm_dir)
    # ── 2. global statistics ──
    w("## 2. 全局统计")
    w()
    # 2.1 JSON→ODS mapping coverage, accumulated per table.
    total_json = 0
    total_mapped = 0
    per_table_stats: list[dict] = []
    for t in tables:
        fm = load_json(_fm_dir / f"{t['table']}.json")
        if not fm or "json_to_ods" not in fm:
            # No mapping data for this table — record an empty stats row.
            per_table_stats.append({
                "table": t["table"], "description": t["description"],
                "json_count": 0, "mapped": 0, "unmapped": 0, "pct": "",
            })
            continue
        j2o = fm["json_to_ods"]
        json_count = len(j2o)
        mapped = sum(1 for m in j2o if m.get("ods_col") is not None)
        unmapped = json_count - mapped
        pct = f"{mapped / json_count * 100:.1f}%" if json_count > 0 else ""
        per_table_stats.append({
            "table": t["table"], "description": t["description"],
            "json_count": json_count, "mapped": mapped, "unmapped": unmapped, "pct": pct,
        })
        total_json += json_count
        total_mapped += mapped
    total_unmapped = total_json - total_mapped
    w("### 2.1 JSON→ODS 映射覆盖")
    w()
    w(f"- JSON 字段总数:{total_json}")
    if total_json > 0:
        w(f"- 已映射到 ODS 列:{total_mapped}{total_mapped / total_json * 100:.1f}%")
        w(f"- 仅存于 payload{total_unmapped}{total_unmapped / total_json * 100:.1f}%")
    else:
        w("- 已映射到 ODS 列0")
        w("- 仅存于 payload0")
    w()
    # 2.2 ODS→DWD mapping coverage.
    w("### 2.2 ODS→DWD 映射覆盖")
    w()
    w(f"- DWD 列总数:{total_dwd_cols}")
    w()
    # 2.3 per-table coverage table (with the business-description column).
    w("### 2.3 各表 JSON→ODS 映射覆盖率")
    w()
    w("| ODS 表名 | 业务描述 | JSON 字段数 | 已映射 | 仅 payload | 覆盖率 |")
    w("|---------|---------|-----------|-------|-----------|-------|")
    # Sort by coverage descending; rows without a pct get key 0 and therefore
    # sort after any positive coverage.
    sorted_stats = sorted(per_table_stats, key=lambda x: (0 if x["pct"] == "" else -float(x["pct"].rstrip("%"))))
    for s in sorted_stats:
        w(f"| `{s['table']}` | {s['description']} | {s['json_count']} | {s['mapped']} | {s['unmapped']} | {s['pct']} |")
    w()
    # ── 3. per-table detail ──
    w("## 3. 逐表详情")
    w()
    for idx, t in enumerate(tables, 1):
        table_name = t["table"]
        fm = load_json(_fm_dir / f"{table_name}.json")
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        # Anchor ids, with deterministic fallbacks when the mapping lacks them.
        anchors = fm.get("anchors", {}) if fm else {}
        api_anchor = anchors.get("api", f"api-{table_name}")
        ods_anchor = anchors.get("ods", f"ods-{table_name}")
        dwd_anchors = anchors.get("dwd", {})
        dwd_tables_list = t.get("dwd_tables", [])
        json_fc = t.get("json_field_count", 0)
        w(f"### 3.{idx} {table_name}{t['description']}")
        w()
        w(f"- 任务代码:`{t['task_code']}`")
        w(f"- 采样记录数:{t['record_count']}")
        w(f"- API JSON 字段数:{json_fc}")
        w(f"- ODS 列数:{t['ods_column_count']}")
        if dwd_tables_list:
            w(f"- DWD 目标表:{', '.join(dwd_tables_list)}")
        else:
            w("- DWD 目标表:—(仅 ODS 落地)")
        w()
        # ── API source-field section ──
        _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor)
        # ── ODS table-structure section ──
        _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors)
        # ── DWD table-structure sections, one per downstream table ──
        for dwd_name in dwd_tables_list:
            dwd_anchor = dwd_anchors.get(dwd_name, f"dwd-{dwd_name}")
            dwd_schema = load_json(data_dir / "db_schemas" / f"dwd_{dwd_name}.json")
            _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name)
    return "\n".join(lines)
def _write_field_diff_report(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None):
    """Emit section 1.1: the API↔ODS↔DWD field-difference report.

    Writes a summary table (one row per ODS table, counts linked to per-table
    detail anchors) followed by a detail sub-section for every table that has
    differences. Whitelisted fields are still counted everywhere but rendered
    only as a collapsed one-line summary in the detail part.
    """
    if fm_dir is None:
        fm_dir = data_dir / "field_mappings"
    w("### 1.1 API↔ODS↔DWD 字段对比差异")
    w()
    w("以下汇总各表在三层之间的字段差异(点击数字跳转至分表详情):")
    w()
    w("| ODS 表名 | API→ODS 未映射 | ODS 无 JSON 源 | ODS→DWD 未映射 | DWD 无 ODS 源 | 主要差异原因 |")
    w("|---------|--------------|--------------|--------------|-------------|------------|")
    # CHANGE 2026-02-21 | whitelist rework: fields are still checked and
    # counted; whitelisted ones are collapsed in the detail sub-sections.
    # Collect per-table difference data once, reused by summary and detail.
    diff_rows: list[dict] = []
    for t in tables:
        table_name = t["table"]
        fm = load_json(fm_dir / f"{table_name}.json")
        if not fm:
            w(f"| `{table_name}` | — | — | — | — | 无映射数据 |")
            diff_rows.append(None)
            continue
        anchors = fm.get("anchors", {})
        api_anchor = anchors.get("api", f"api-{table_name.replace('_', '-')}")
        ods_anchor = anchors.get("ods", f"ods-{table_name.replace('_', '-')}")
        dwd_anchors = anchors.get("dwd", {})
        diff_anchor = f"diff-{table_name.replace('_', '-')}"
        j2o = fm.get("json_to_ods", [])
        o2d = fm.get("ods_to_dwd", {})
        d2o = fm.get("dwd_to_ods", {})
        # ── API→ODS unmapped fields (all checked, whitelist included) ──
        api_unmapped_flat: list[str] = []
        api_unmapped_nested: list[str] = []
        api_unmapped_whitelist: list[tuple[str, str]] = []  # (json_path, reason)
        for m in j2o:
            if m.get("ods_col") is None:
                jp = m.get("json_path", "")
                wl_reason = whitelist_reason("", json_path=jp)
                if wl_reason:
                    api_unmapped_whitelist.append((jp, wl_reason))
                elif "." in jp:
                    # A dot in the path marks a nested-object field.
                    api_unmapped_nested.append(jp)
                else:
                    api_unmapped_flat.append(jp)
        api_unmapped_total = len(api_unmapped_flat) + len(api_unmapped_nested) + len(api_unmapped_whitelist)
        # ── ODS columns without a JSON source (all checked, whitelist included) ──
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        ods_mapped_cols = {m["ods_col"] for m in j2o if m.get("ods_col")}
        ods_no_json_fields: list[str] = []
        ods_no_json_whitelist: list[tuple[str, str]] = []  # (col_name, reason)
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_mapped_cols:
                    wl_reason = whitelist_reason(col["name"])
                    if wl_reason:
                        ods_no_json_whitelist.append((col["name"], wl_reason))
                    else:
                        ods_no_json_fields.append(col["name"])
        # ── ODS→DWD unmapped columns (all checked, whitelist included) ──
        ods_cols_with_dwd = set(o2d.keys())
        ods_no_dwd_fields: list[str] = []
        ods_no_dwd_whitelist: list[tuple[str, str]] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_cols_with_dwd:
                    wl_reason = whitelist_reason(col["name"])
                    if wl_reason:
                        ods_no_dwd_whitelist.append((col["name"], wl_reason))
                    else:
                        ods_no_dwd_fields.append(col["name"])
        # ── DWD columns without an ODS source (all checked, whitelist included) ──
        dwd_no_ods_fields: list[tuple[str, str]] = []  # (dwd_table, dwd_col)
        dwd_no_ods_whitelist: list[tuple[str, str, str]] = []  # (dwd_table, dwd_col, reason)
        for dwd_name, entries in d2o.items():
            for entry in entries:
                if entry.get("ods_source") == "":
                    wl_reason = whitelist_reason(entry["dwd_col"])
                    if wl_reason:
                        dwd_no_ods_whitelist.append((dwd_name, entry["dwd_col"], wl_reason))
                    else:
                        dwd_no_ods_fields.append((dwd_name, entry["dwd_col"]))
        # Difference-reason summary for the table row (whitelist counted too).
        reasons: list[str] = []
        if api_unmapped_nested:
            reasons.append(f"嵌套对象 {len(api_unmapped_nested)}")
        if api_unmapped_flat:
            reasons.append(f"平层未映射 {len(api_unmapped_flat)}")
        if dwd_no_ods_fields:
            reasons.append(f"SCD2/派生列 {len(dwd_no_ods_fields)}")
        wl_total = len(api_unmapped_whitelist) + len(ods_no_json_whitelist) + len(ods_no_dwd_whitelist) + len(dwd_no_ods_whitelist)
        if wl_total:
            reasons.append(f"白名单 {wl_total}")
        reason_str = "".join(reasons) if reasons else ""

        # Summary-table cell: the count, linked to this table's detail anchor
        # when non-zero (whitelisted fields are included in the totals).
        def _cell(count: int) -> str:
            if count == 0:
                return "0"
            return f"[{count}](#{diff_anchor})"

        w(f"| `{table_name}` | {_cell(api_unmapped_total)} | {_cell(len(ods_no_json_fields) + len(ods_no_json_whitelist))} | {_cell(len(ods_no_dwd_fields) + len(ods_no_dwd_whitelist))} | {_cell(len(dwd_no_ods_fields) + len(dwd_no_ods_whitelist))} | {reason_str} |")
        diff_rows.append({
            "table_name": table_name,
            "diff_anchor": diff_anchor,
            "api_anchor": api_anchor,
            "ods_anchor": ods_anchor,
            "dwd_anchors": dwd_anchors,
            "api_unmapped_flat": api_unmapped_flat,
            "api_unmapped_nested": api_unmapped_nested,
            "api_unmapped_whitelist": api_unmapped_whitelist,
            "ods_no_json_fields": ods_no_json_fields,
            "ods_no_json_whitelist": ods_no_json_whitelist,
            "ods_no_dwd_fields": ods_no_dwd_fields,
            "ods_no_dwd_whitelist": ods_no_dwd_whitelist,
            "dwd_no_ods_fields": dwd_no_ods_fields,
            "dwd_no_ods_whitelist": dwd_no_ods_whitelist,
        })
    w()
    # ── per-table difference detail sub-sections ──
    # CHANGE 2026-02-21 | whitelisted fields are collapsed: no detail rows,
    # only a one-line summary with the whitelist reason.
    sub_idx = 0
    for row in diff_rows:
        if row is None:
            continue
        has_any = (row["api_unmapped_flat"] or row["api_unmapped_nested"]
                   or row["api_unmapped_whitelist"]
                   or row["ods_no_json_fields"] or row["ods_no_json_whitelist"]
                   or row["ods_no_dwd_fields"] or row["ods_no_dwd_whitelist"]
                   or row["dwd_no_ods_fields"] or row["dwd_no_ods_whitelist"])
        if not has_any:
            continue
        sub_idx += 1
        table_name = row["table_name"]
        w(f'<a id="{row["diff_anchor"]}"></a>')
        w()
        w(f"#### 1.1.{sub_idx} {table_name} 字段差异明细")
        w()
        api_anchor = row["api_anchor"]
        ods_anchor = row["ods_anchor"]
        dwd_anchors = row["dwd_anchors"]
        # Auxiliary data: json_trees for sample values, bd_descriptions for
        # business descriptions.
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        jt_lookup: dict[str, dict] = {}
        if jt and "fields" in jt:
            for fld in jt["fields"]:
                jt_lookup[fld["path"]] = fld
        ods_descs = bd.get("ods_fields", {}) if bd else {}
        dwd_descs_all = bd.get("dwd_fields", {}) if bd else {}

        def _sample_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Return a sample-value string from json_trees (API layer only)."""
            if layer == "API":
                entry = jt_lookup.get(field_name, {})
                samples = entry.get("samples", [])
                total_recs = entry.get("total_records", 0)
                if not samples:
                    # Fall back to the legacy single-sample key.
                    single = entry.get("sample", "")
                    if single:
                        samples = [str(single)]
                if _is_enum_like(samples, total_recs):
                    return ", ".join(f"`{_esc(s)}`" for s in samples[:5])
                if samples:
                    return _format_samples(samples, max_show=3)
            return ""

        def _desc_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Return the business description from bd_descriptions, ≤40 chars."""
            # Lookup key: lower-cased leaf name with array markers stripped.
            key = field_name.split(".")[-1].replace("[]", "").lower()
            if layer in ("ODS", "API"):
                desc = ods_descs.get(key, "")
            elif layer == "DWD" and dwd_tbl:
                desc = dwd_descs_all.get(dwd_tbl, {}).get(key, "")
            else:
                desc = ""
            if desc and len(desc) > 40:
                desc = desc[:37] + "..."
            return _esc(desc)

        def _write_whitelist_summary(w, items: list, category: str):
            """Render whitelisted fields as one collapsed note line (no rows)."""
            if not items:
                return
            # Group names by whitelist reason.
            by_reason: dict[str, list[str]] = {}
            for item in items:
                if isinstance(item, tuple) and len(item) == 3:
                    # NOTE(review): for 3-tuples this keeps the dwd_table and
                    # discards the column name — possibly the column was
                    # intended; confirm against the report output.
                    name, _, reason = item  # (dwd_table, dwd_col, reason)
                elif isinstance(item, tuple) and len(item) == 2:
                    name, reason = item
                else:
                    name, reason = str(item), "白名单"
                by_reason.setdefault(reason, []).append(name)
            parts = []
            for reason, names in by_reason.items():
                parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)")
            w(f"> {category}白名单字段(已检查,不展开详情):{''.join(parts)}")
            w()

        # ── API→ODS unmapped (flat fields) ──
        if row["api_unmapped_flat"]:
            w(f"**API→ODS 未映射(平层)** — {len(row['api_unmapped_flat'])}")
            w()
            w("| # | JSON 字段 | 示例值 | 说明 | 状态 |")
            w("|---|----------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_flat"], 1):
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | **[`{_esc(f)}`](#{api_anchor})** | {sample} | {desc} | **⚠️ 未映射** |")
            w()
        # ── API→ODS unmapped (nested objects, non-whitelist) ──
        if row["api_unmapped_nested"]:
            w(f"<details><summary>API→ODS 未映射(嵌套对象)— {len(row['api_unmapped_nested'])} 个</summary>")
            w()
            w("| # | JSON 字段 | 示例值 | 说明 | 状态 |")
            w("|---|----------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_nested"], 1):
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | [`{_esc(f)}`](#{api_anchor}) | {sample} | {desc} | 📦 嵌套 |")
            w()
            w("</details>")
            w()
        # ── API whitelist summary ──
        _write_whitelist_summary(w, row["api_unmapped_whitelist"], "API→ODS ")
        # ── ODS columns without a JSON source ──
        if row["ods_no_json_fields"]:
            w(f"**ODS 无 JSON 源** — {len(row['ods_no_json_fields'])}")
            w()
            w("| # | ODS 列 | 说明 | 状态 |")
            w("|---|-------|------|------|")
            for i, f in enumerate(row["ods_no_json_fields"], 1):
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 JSON 源** |")
            w()
        # ── whitelist summary for ODS columns without a JSON source ──
        _write_whitelist_summary(w, row["ods_no_json_whitelist"], "ODS 无 JSON 源 ")
        # ── ODS→DWD unmapped columns ──
        if row["ods_no_dwd_fields"]:
            w(f"**ODS→DWD 未映射** — {len(row['ods_no_dwd_fields'])}")
            w()
            w("| # | ODS 列 | 说明 | 状态 |")
            w("|---|-------|------|------|")
            for i, f in enumerate(row["ods_no_dwd_fields"], 1):
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 DWD 目标** |")
            w()
        # ── whitelist summary for ODS→DWD ──
        _write_whitelist_summary(w, row["ods_no_dwd_whitelist"], "ODS→DWD ")
        # ── DWD columns without an ODS source ──
        if row["dwd_no_ods_fields"]:
            w(f"**DWD 无 ODS 源** — {len(row['dwd_no_ods_fields'])}")
            w()
            w("| # | DWD 表 | DWD 列 | 说明 | 状态 |")
            w("|---|-------|-------|------|------|")
            for i, (dwd_name, dwd_col) in enumerate(row["dwd_no_ods_fields"], 1):
                dwd_a = dwd_anchors.get(dwd_name, f"dwd-{dwd_name.replace('_', '-')}")
                desc = _desc_str(dwd_col, "DWD", dwd_tbl=dwd_name)
                w(f"| {i} | {dwd_name} | **[`{_esc(dwd_col)}`](#{dwd_a})** | {desc} | **⚠️ 无 ODS 源** |")
            w()
        # ── whitelist summary for DWD columns without an ODS source ──
        _write_whitelist_summary(w, row["dwd_no_ods_whitelist"], "DWD 无 ODS 源 ")
        w()
def _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor):
    """Emit one table's API source-field section.

    Combines the JSON→ODS mapping (fm), sample values (jt / json_trees) and
    business descriptions (bd / BD_manual). Whitelisted fields are excluded
    from the table and collapsed into a one-line summary at the end.
    """
    w(f'<a id="{api_anchor}"></a>')
    w()
    w(f"#### API 源字段 — {table_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not fm or "json_to_ods" not in fm:
        w("_无 field_mappings 数据_")
        w()
        return
    j2o = fm["json_to_ods"]
    # json_tree lookup table, keyed by field path (carries samples).
    jt_lookup: dict[str, dict] = {}
    if jt and "fields" in jt:
        for f in jt["fields"]:
            jt_lookup[f["path"]] = f
    # BD_manual ODS descriptions, cross-referenced for JSON fields' meaning.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    # CHANGE 2026-02-21 | whitelisted fields excluded from the table, collapsed below.
    normal_items: list[dict] = []
    whitelist_items: list[tuple[str, str]] = []  # (json_path, reason)
    for m in j2o:
        jp = m.get("json_path", "")
        wl_reason = whitelist_reason("", json_path=jp)
        if wl_reason:
            whitelist_items.append((jp, wl_reason))
        else:
            normal_items.append(m)
    mapped_count = sum(1 for m in j2o if m.get("ods_col") is not None)
    total_count = len(j2o)
    if total_count > 0:
        w(f"已映射 {mapped_count}/{total_count},覆盖率 {mapped_count / total_count * 100:.1f}%")
        if whitelist_items:
            w(f"(其中 {len(whitelist_items)} 个白名单字段已折叠)")
    else:
        w("无字段")
    w()
    w("| # | JSON 字段 | 类型 | → ODS 列 | 业务描述 | 示例值与说明 |")
    w("|---|----------|------|---------|---------|------------|")
    for i, m in enumerate(normal_items, 1):
        json_path = m["json_path"]
        json_type = m.get("json_type", "")
        ods_col = m.get("ods_col")
        match_type = m.get("match_type", "")
        occurrence_pct = m.get("occurrence_pct", 0)
        # Sample values from json_tree (prefer the multi-sample list).
        jt_entry = jt_lookup.get(json_path, {})
        samples = jt_entry.get("samples", [])
        total_recs = jt_entry.get("total_records", 0)
        if not samples:
            # Fall back to the legacy single-sample key.
            single = jt_entry.get("sample", "")
            if single:
                samples = [str(single)]
        # Link to the ODS column, or flag the field as unmapped.
        if ods_col:
            ods_link = f"[`{ods_col}`](#{ods_anchor})"
        else:
            ods_link = "⚠️ 未映射"
        # Business description from BD_manual, looked up by the JSON leaf name.
        leaf = json_path.split(".")[-1].replace("[]", "").lower()
        biz_desc = ods_descs.get(leaf, "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        # Merge notes and sample values into one cell.
        notes_parts: list[str] = []
        if "." in json_path and match_type == "unmapped":
            notes_parts.append("📦 嵌套对象")
        if match_type == "case_insensitive":
            notes_parts.append("大小写匹配")
        if occurrence_pct < 100:
            notes_parts.append(f"出现率 {occurrence_pct:.0f}%")
        # Enum-like fields list their distinct values; others show samples.
        if _is_enum_like(samples, total_recs):
            notes_parts.append(f"枚举值: {', '.join(f'`{_esc(s)}`' for s in samples[:8])}")
        elif samples:
            notes_parts.append(f"示例: {_format_samples(samples)}")
        note_str = "".join(notes_parts) if notes_parts else ""
        w(f"| {i} | `{_esc(json_path)}` | {json_type} | {ods_link} | {biz_desc} | {note_str} |")
    w()
    # Collapsed summary of whitelisted fields, grouped by reason.
    if whitelist_items:
        by_reason: dict[str, list[str]] = {}
        for jp, reason in whitelist_items:
            by_reason.setdefault(reason, []).append(jp)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)")
        w(f"> 白名单字段(已检查,不展开详情):{''.join(parts)}")
        w()
def _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors):
    """Emit one ODS table-structure section with bidirectional mapping columns
    (← JSON source, → DWD target) and business descriptions.

    Whitelisted ETL-metadata columns are excluded from the table and collapsed
    into a one-line summary at the end.
    """
    w(f'<a id="{ods_anchor}"></a>')
    w()
    w(f"#### ODS 表结构 — ods.{table_name} [🔗 API](#{api_anchor})")
    w()
    if not ods_schema or "columns" not in ods_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # Reverse lookup from json_to_ods: ods_col → json_path (first match wins
    # via setdefault).
    ods_to_json: dict[str, str] = {}
    if fm and "json_to_ods" in fm:
        for m in fm["json_to_ods"]:
            if m.get("ods_col"):
                ods_to_json.setdefault(m["ods_col"], m["json_path"])
    # ods_col → list of DWD target entries.
    ods_to_dwd: dict[str, list[dict]] = {}
    if fm and "ods_to_dwd" in fm:
        ods_to_dwd = fm["ods_to_dwd"]
    # BD_manual descriptions for the ODS layer.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    cols = ods_schema["columns"]
    # CHANGE 2026-02-21 | whitelisted columns excluded from the table, collapsed below.
    normal_cols: list[dict] = []
    whitelist_cols: list[tuple[str, str]] = []  # (col_name, reason)
    for col in cols:
        wl_reason = whitelist_reason(col["name"])
        if wl_reason:
            whitelist_cols.append((col["name"], wl_reason))
        else:
            normal_cols.append(col)
    w(f"{len(cols)}")
    if whitelist_cols:
        w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)")
    w()
    w("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |")
    w("|---|---------|------|----------|-----------|---------|")
    for i, col in enumerate(normal_cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        # ← JSON source link (empty when the column has no JSON origin).
        json_src = ods_to_json.get(col_name)
        if json_src:
            json_link = f"[`{_esc(json_src)}`](#{api_anchor})"
        else:
            json_link = ""
        # → DWD target links (one per downstream table.column).
        dwd_targets = ods_to_dwd.get(col_name, [])
        if dwd_targets:
            dwd_links = []
            for dt in dwd_targets:
                dwd_tbl = dt["dwd_table"]
                dwd_col = dt["dwd_col"]
                dwd_anc = dwd_anchors.get(dwd_tbl, f"dwd-{dwd_tbl}")
                dwd_links.append(f"[`{dwd_tbl}.{dwd_col}`](#{dwd_anc})")
            dwd_link = ", ".join(dwd_links)
        else:
            dwd_link = ""
        # Business description, truncated to 60 chars for the table cell.
        biz_desc = ods_descs.get(col_name.lower(), "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {json_link} | {dwd_link} | {biz_desc} |")
    w()
    # Collapsed summary of whitelisted columns, grouped by reason.
    if whitelist_cols:
        by_reason: dict[str, list[str]] = {}
        for cn, reason in whitelist_cols:
            by_reason.setdefault(reason, []).append(cn)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)")
        w(f"> 白名单列(已检查,不展开详情):{''.join(parts)}")
        w()
def _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name):
    """Emit one DWD table-structure section with ODS provenance, transform
    type and business descriptions.

    Parameters:
        w: line writer — ``w(text)`` appends one Markdown line, ``w()`` a blank one.
        fm: field-mapping dict for the source ODS table (may be None).
        dwd_schema: db_schemas JSON for this DWD table (may be None).
        bd: bd_descriptions JSON for the source table (may be None).
        dwd_name: DWD table name without the schema prefix.
        dwd_anchor / ods_anchor: HTML anchor ids for cross-section links.
        table_name: source ODS table name (unused in the body; kept for
            call-site interface parity).

    Whitelisted SCD2 bookkeeping columns are excluded from the table and
    collapsed into a one-line summary at the end.
    """
    w(f'<a id="{dwd_anchor}"></a>')
    w()
    w(f"#### DWD 表结构 — dwd.{dwd_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not dwd_schema or "columns" not in dwd_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # dwd_col → mapping-entry lookup from the dwd_to_ods section.
    dwd_to_ods_map: dict[str, dict] = {}
    if fm and "dwd_to_ods" in fm and dwd_name in fm["dwd_to_ods"]:
        for entry in fm["dwd_to_ods"][dwd_name]:
            dwd_to_ods_map[entry["dwd_col"]] = entry
    # Business descriptions for this DWD table from BD_manual.
    dwd_descs = {}
    if bd and "dwd_fields" in bd:
        dwd_descs = bd["dwd_fields"].get(dwd_name, {})
    cols = dwd_schema["columns"]
    # CHANGE 2026-02-21 | whitelisted columns excluded from the table, collapsed below.
    normal_cols: list[dict] = []
    whitelist_cols: list[tuple[str, str]] = []  # (col_name, reason)
    for col in cols:
        wl_reason = whitelist_reason(col["name"])
        if wl_reason:
            whitelist_cols.append((col["name"], wl_reason))
        else:
            normal_cols.append(col)
    w(f"{len(cols)}")
    if whitelist_cols:
        w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)")
    w()
    w("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |")
    w("|---|---------|------|----------|------|---------|")
    for i, col in enumerate(normal_cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        mapping = dwd_to_ods_map.get(col_name)
        if mapping:
            ods_src = mapping.get("ods_source", "")
            # FIX: `ods_src and ods_src != ""` — the second test was redundant
            # (a truthy string is never "").
            ods_link = f"[`{ods_src}`](#{ods_anchor})" if ods_src else ""
            transform = mapping.get("mapping_type", "")
            note = mapping.get("note", "")
        else:
            ods_link = ""
            transform = ""
            note = ""
        # Business description priority: BD_manual, then mapping note, then
        # the DB column comment.
        biz_desc = dwd_descs.get(col_name.lower(), "")
        if not biz_desc and note:
            biz_desc = note
        if not biz_desc:
            db_comment = col.get("comment", "")
            if db_comment:
                if "【说明】" in db_comment:
                    # FIX: the previous code split desc_part on "" (an empty
                    # separator), which unconditionally raised ValueError —
                    # the real delimiter character was apparently lost to
                    # encoding corruption. Keep everything after 【说明】
                    # until the intended separator is confirmed.
                    # TODO(review): restore the original delimiter split.
                    biz_desc = db_comment.split("【说明】")[1].strip()
                else:
                    biz_desc = db_comment
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {ods_link} | {_esc(transform)} | {biz_desc} |")
    w()
    # Collapsed summary of whitelisted columns, grouped by reason.
    if whitelist_cols:
        by_reason: dict[str, list[str]] = {}
        for cn, reason in whitelist_cols:
            by_reason.setdefault(reason, []).append(cn)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)")
        w(f"> 白名单列(已检查,不展开详情):{''.join(parts)}")
        w()
def main() -> None:
    """CLI entry point: resolve the data directory, render and save the report.

    Exits early (with a message) when the data directory does not exist.
    """
    # _env_paths loads the root .env via an absolute path at import time, so
    # no cwd-relative load_dotenv is needed here.
    # CHANGE 2026-02-21 | removed load_dotenv(Path(".env")), which failed when
    # cwd was not the project root.
    from _env_paths import get_output_path  # noqa: F401 — imported for its .env-loading side effect
    parser = build_parser()
    args = parser.parse_args()
    data_dir = resolve_data_dir(args.output_dir)
    if not data_dir.exists():
        print(f"错误:数据目录不存在: {data_dir}")
        return
    print(f"读取数据目录: {data_dir}")
    report = generate_report(data_dir)
    now = datetime.now()
    filename = f"dataflow_{now.strftime('%Y-%m-%d_%H%M%S')}.md"
    output_path = data_dir / filename
    # Write explicitly as UTF-8 so the report renders on any platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report)
    # FIX: dropped f-prefixes on placeholder-free strings (flake8 F541) and
    # hoisted the repeated banner.
    banner = "=" * 60
    print(f"\n{banner}")
    print("报告生成完成")
    print(banner)
    print(f" 输出路径: {output_path}")
    print(f" 文件大小: {output_path.stat().st_size / 1024:.1f} KB")
    print(banner)


if __name__ == "__main__":
    main()