Files
Neo-ZQYY/scripts/ops/gen_dataflow_report.py

788 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据流结构分析报告生成器v3
读取 analyze_dataflow.py 采集的数据,生成带锚点链接、上下游映射列、
业务描述、多示例值、字段差异报告的 Markdown 报告。
增强内容v3
- 总览表增加 API JSON 字段数列
- 覆盖率表增加业务描述列
- 逐表详情增加业务描述列(来自 BD_manual 文档)
- 说明+示例值合并,多示例展示,枚举值解释
- 总览章节增加 API↔ODS↔DWD 字段对比差异报告
用法:
python scripts/ops/gen_dataflow_report.py
python scripts/ops/gen_dataflow_report.py --output-dir export/dataflow_analysis
"""
from __future__ import annotations
import argparse
import json
import os
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
def load_json(path: Path) -> dict | list | None:
if not path.exists():
return None
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the report generator.

    Returns:
        Parser with a single optional ``--output-dir`` flag (default None);
        when omitted, the data directory is resolved elsewhere from the
        SYSTEM_ANALYZE_ROOT environment variable or the built-in default.
    """
    parser = argparse.ArgumentParser(description="生成数据流结构分析 Markdown 报告")
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        # Fix: the help text previously ended with an unbalanced "(".
        help="输出目录(默认读取 SYSTEM_ANALYZE_ROOT 或 export/dataflow_analysis)",
    )
    return parser
def resolve_data_dir(override: str | None = None) -> Path:
if override:
return Path(override)
env_root = os.environ.get("SYSTEM_ANALYZE_ROOT")
if env_root:
return Path(env_root)
return Path("export/dataflow_analysis")
def _esc(s: str) -> str:
"""转义 Markdown 表格中的管道符"""
return str(s).replace("|", "\\|").replace("\n", " ") if s else ""
# ── Field-purpose guessing rules ──
# Infer a field's likely business meaning from name patterns plus table context.
# Confidence tiers: high (>=80%) / mid (50-79%) / low (<50%)
import re as _re
# Ordered rule table: (field-name regex, guessed purpose, confidence).
# First matching rule wins (see _guess_field_purpose), so specific patterns
# are deliberately placed before generic ones (e.g. site_id before _id$).
# NOTE(review): every confidence value below is the empty string — the tier
# labels look like they were stripped along with other non-ASCII characters
# (cf. the "ambiguous Unicode" warning on this file); confirm against history.
_FIELD_GUESS_RULES: list[tuple[str, str, str]] = [
    # ── SCD2 / ETL metadata ──
    (r"^scd2_", "SCD2 缓慢变化维度元数据", ""),
    (r"^etl_", "ETL 流程元数据", ""),
    (r"^dw_insert", "数仓装载时间戳", ""),
    (r"^content_hash$", "数据变更检测哈希", ""),
    (r"^source_file$", "ETL 来源文件标识", ""),
    (r"^source_endpoint$", "ETL 来源接口标识", ""),
    (r"^fetched_at$", "ETL 抓取时间", ""),
    (r"^payload$", "原始 JSON 全量存储", ""),
    # ── Primary / foreign keys ──
    (r"^id$", "主键标识", ""),
    # ── Store / organization (before the generic _id$ rule) ──
    (r"^(site_id|shop_id|store_id)$", "门店标识", ""),
    (r"^(tenant_id|org_id)$", "租户/组织标识", ""),
    (r"(shop_name|site_name|store_name)", "门店名称", ""),
    # ── Timestamps / dates ──
    (r"(^|_)(create|created)(_at|_time|_date)$", "记录创建时间", ""),
    (r"(^|_)(update|updated|modify)(_at|_time|_date)$", "记录更新时间", ""),
    (r"(^|_)(delete|deleted)(_at|_time|_date)$", "逻辑删除时间", ""),
    (r"(^|_)(start|begin)(_at|_time|_date)$", "起始时间", ""),
    (r"(^|_)(end|expire)(_at|_time|_date)$", "结束/过期时间", ""),
    (r"(^|_)entry_time$", "入职/入场时间", ""),
    (r"(^|_)resign_time$", "离职时间", ""),
    (r"_time$", "时间戳字段", ""),
    (r"_date$", "日期字段", ""),
    # ── Generic derived columns (before flags, so derived_flag etc. match here) ──
    (r"^derived_", "ETL 派生计算列", ""),
    (r"^calc_", "计算字段", ""),
    # ── Status / flags ──
    (r"(^|_)is_delete$", "逻辑删除标志", ""),
    (r"^is_", "布尔标志位", ""),
    (r"(^|_)status$", "状态码", ""),
    (r"_status$", "状态字段", ""),
    (r"_enabled$", "启用/禁用开关", ""),
    (r"_flag$", "标志位", ""),
    # ── Money / price ──
    (r"(price|amount|fee|cost|money|balance|total)", "金额/价格相关", ""),
    (r"(discount|coupon|refund)", "优惠/退款相关", ""),
    # ── People ──
    (r"(real_name|nickname|^name$)", "姓名/昵称", ""),
    (r"(mobile|phone|tel)", "联系电话", ""),
    (r"(avatar|photo|image)", "头像/图片 URL", ""),
    (r"(gender|sex)", "性别", ""),
    (r"(birth|birthday)", "出生日期", ""),
    (r"(height|weight)", "身高/体重", ""),
    # ── Common nested-object prefixes ──
    (r"^siteProfile\.", "门店档案嵌套属性", ""),
    (r"^memberInfo\.", "会员信息嵌套属性", ""),
    (r"^assistantInfo\.", "助教信息嵌套属性", ""),
    (r"^tableInfo\.", "台桌信息嵌套属性", ""),
    (r"^orderInfo\.", "订单信息嵌套属性", ""),
    (r"^payInfo\.", "支付信息嵌套属性", ""),
    # ── Sorting / display ──
    (r"(sort|order|rank|seq)", "排序/序号", ""),
    (r"(remark|memo|note|comment|introduce)", "备注/说明文本", ""),
    (r"(url|link|qrcode|qr_code)", "链接/二维码", ""),
    # ── Generic ID suffix (after the specific ID rules above) ──
    (r"_id$", "关联实体 ID外键", ""),
]
def _guess_field_purpose(field_name: str, table_name: str, layer: str) -> tuple[str, str]:
    """Guess a field's business purpose from its (lowercased) name.

    Scans _FIELD_GUESS_RULES in order and returns the first matching
    (purpose, confidence) pair; falls back to a layer-tagged placeholder.
    ``table_name`` is accepted for context but not currently consulted.
    """
    lowered = field_name.lower()
    hit = next(
        (
            (purpose, confidence)
            for pattern, purpose, confidence in _FIELD_GUESS_RULES
            if _re.search(pattern, lowered)
        ),
        None,
    )
    if hit is not None:
        return hit
    return f"待分析({layer}层字段)", ""
def _format_samples(samples: list[str], max_show: int = 5) -> str:
    """Render up to ``max_show`` sample values as inline code spans.

    Each value is pipe-escaped and clipped to 30 characters; when more
    samples exist than are shown, a total-count suffix is appended.
    """
    if not samples:
        return ""

    def clip(raw: str) -> str:
        # Escape first, then truncate on the escaped form (matches column width).
        escaped = _esc(raw)
        return escaped[:27] + "..." if len(escaped) > 30 else escaped

    rendered = ", ".join(f"`{clip(s)}`" for s in samples[:max_show])
    if len(samples) > max_show:
        rendered += f" …共{len(samples)}"
    return rendered
def _is_enum_like(samples: list[str], total_records: int) -> bool:
"""判断字段是否像枚举(不同值少且记录数足够多)"""
if total_records < 5:
return False
return 1 < len(samples) <= 8
def generate_report(data_dir: Path) -> str:
    """Build the full Markdown report from the artifacts under ``data_dir``.

    Reads collection_manifest.json (required), plus per-table files under
    field_mappings/, json_trees/, db_schemas/ and bd_descriptions/ (each
    optional — missing files degrade to empty sections).

    Raises:
        FileNotFoundError: if collection_manifest.json is absent.
    """
    manifest = load_json(data_dir / "collection_manifest.json")
    if not manifest:
        raise FileNotFoundError(f"找不到 collection_manifest.json: {data_dir}")
    tables = manifest["tables"]
    now = datetime.now()
    lines: list[str] = []

    def w(s: str = ""):
        # Local writer: append one output line (default: blank separator line).
        lines.append(s)

    # ── Report header ──
    w("# 飞球连接器 — 数据流结构分析报告")
    w()
    w(f"> 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} CST")
    w(f"> 分析范围飞球feiqiu连接器{len(tables)} 张 ODS 表")
    w("> 数据来源API JSON 采样 + PostgreSQL ODS/DWD 表结构 + 三层字段映射 + BD_manual 业务文档")
    w()
    # ── 1. Overview table (includes the API JSON field-count column) ──
    w("## 1. 总览")
    w()
    w("| # | ODS 表名 | 业务描述 | 采样记录数 | API JSON 字段数 | ODS 列数 | DWD 目标表 | DWD 总列数 |")
    w("|---|---------|---------|-----------|---------------|---------|-----------|-----------|")
    total_records = 0
    total_ods_cols = 0
    total_dwd_cols = 0
    total_json_fields = 0
    for i, t in enumerate(tables, 1):
        dwd_names = ", ".join(t["dwd_tables"]) if t["dwd_tables"] else ""
        json_fc = t.get("json_field_count", 0)
        w(f"| {i} | `{t['table']}` | {t['description']} | {t['record_count']} | {json_fc} | {t['ods_column_count']} | {dwd_names} | {t['dwd_column_count']} |")
        total_records += t["record_count"]
        total_ods_cols += t["ods_column_count"]
        total_dwd_cols += t["dwd_column_count"]
        total_json_fields += json_fc
    # Totals row of the overview table.
    w(f"| | **合计** | | **{total_records}** | **{total_json_fields}** | **{total_ods_cols}** | | **{total_dwd_cols}** |")
    w()
    # ── 1.1 Field-difference report (API↔ODS↔DWD) ──
    _write_field_diff_report(w, data_dir, tables)
    # ── 2. Global statistics ──
    w("## 2. 全局统计")
    w()
    # 2.1 JSON→ODS mapping coverage, accumulated per table.
    total_json = 0
    total_mapped = 0
    per_table_stats: list[dict] = []
    for t in tables:
        fm = load_json(data_dir / "field_mappings" / f"{t['table']}.json")
        if not fm or "json_to_ods" not in fm:
            # No mapping file: record a zeroed row so the table stays complete.
            per_table_stats.append({
                "table": t["table"], "description": t["description"],
                "json_count": 0, "mapped": 0, "unmapped": 0, "pct": "",
            })
            continue
        j2o = fm["json_to_ods"]
        json_count = len(j2o)
        mapped = sum(1 for m in j2o if m.get("ods_col") is not None)
        unmapped = json_count - mapped
        pct = f"{mapped / json_count * 100:.1f}%" if json_count > 0 else ""
        per_table_stats.append({
            "table": t["table"], "description": t["description"],
            "json_count": json_count, "mapped": mapped, "unmapped": unmapped, "pct": pct,
        })
        total_json += json_count
        total_mapped += mapped
    total_unmapped = total_json - total_mapped
    w("### 2.1 JSON→ODS 映射覆盖")
    w()
    w(f"- JSON 字段总数:{total_json}")
    if total_json > 0:
        w(f"- 已映射到 ODS 列:{total_mapped}{total_mapped / total_json * 100:.1f}%")
        w(f"- 仅存于 payload{total_unmapped}{total_unmapped / total_json * 100:.1f}%")
    else:
        # Guard against division by zero when nothing was mapped at all.
        w("- 已映射到 ODS 列0")
        w("- 仅存于 payload0")
    w()
    # 2.2 ODS→DWD mapping coverage (total DWD column count only).
    w("### 2.2 ODS→DWD 映射覆盖")
    w()
    w(f"- DWD 列总数:{total_dwd_cols}")
    w()
    # 2.3 Per-table coverage table (includes the business-description column).
    w("### 2.3 各表 JSON→ODS 映射覆盖率")
    w()
    w("| ODS 表名 | 业务描述 | JSON 字段数 | 已映射 | 仅 payload | 覆盖率 |")
    w("|---------|---------|-----------|-------|-----------|-------|")
    # Sort by coverage descending; tables without a pct ("" → key 0) sort last,
    # since real coverages map to negative keys.
    sorted_stats = sorted(per_table_stats, key=lambda x: (0 if x["pct"] == "" else -float(x["pct"].rstrip("%"))))
    for s in sorted_stats:
        w(f"| `{s['table']}` | {s['description']} | {s['json_count']} | {s['mapped']} | {s['unmapped']} | {s['pct']} |")
    w()
    # ── 3. Per-table detail sections ──
    w("## 3. 逐表详情")
    w()
    for idx, t in enumerate(tables, 1):
        table_name = t["table"]
        fm = load_json(data_dir / "field_mappings" / f"{table_name}.json")
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        # Anchor IDs for intra-document links; fall back to derived names.
        anchors = fm.get("anchors", {}) if fm else {}
        api_anchor = anchors.get("api", f"api-{table_name}")
        ods_anchor = anchors.get("ods", f"ods-{table_name}")
        dwd_anchors = anchors.get("dwd", {})
        dwd_tables_list = t.get("dwd_tables", [])
        json_fc = t.get("json_field_count", 0)
        w(f"### 3.{idx} {table_name}{t['description']}")
        w()
        w(f"- 任务代码:`{t['task_code']}`")
        w(f"- 采样记录数:{t['record_count']}")
        w(f"- API JSON 字段数:{json_fc}")
        w(f"- ODS 列数:{t['ods_column_count']}")
        if dwd_tables_list:
            w(f"- DWD 目标表:{', '.join(dwd_tables_list)}")
        else:
            w("- DWD 目标表:—(仅 ODS 落地)")
        w()
        # ── API source-field section ──
        _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor)
        # ── ODS table-structure section ──
        _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors)
        # ── DWD table-structure section(s), one per downstream table ──
        for dwd_name in dwd_tables_list:
            dwd_anchor = dwd_anchors.get(dwd_name, f"dwd-{dwd_name}")
            dwd_schema = load_json(data_dir / "db_schemas" / f"dwd_{dwd_name}.json")
            _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name)
    return "\n".join(lines)
def _write_field_diff_report(w, data_dir: Path, tables: list[dict]):
    """Emit the API↔ODS↔DWD field-difference report (summary + per-table detail).

    First writes a summary table with one row per ODS table, where non-zero
    counts link to per-table detail subsections; then writes those detail
    subsections for every table that has at least one difference.

    NOTE(review): several string literals below contain empty "" where a
    visible character (e.g. an em dash or a list separator) seems to have
    been stripped with the file's non-ASCII characters — confirm against
    version history before relying on the rendered output.
    """
    w("### 1.1 API↔ODS↔DWD 字段对比差异")
    w()
    w("以下汇总各表在三层之间的字段差异(点击数字跳转至分表详情):")
    w()
    w("| ODS 表名 | API→ODS 未映射 | ODS 无 JSON 源 | ODS→DWD 未映射 | DWD 无 ODS 源 | 主要差异原因 |")
    w("|---------|--------------|--------------|--------------|-------------|------------|")
    # Per-table diff data, collected for both the summary row and the
    # detail subsection. ETL bookkeeping columns are excluded from diffs.
    etl_meta_cols = {"source_file", "source_endpoint", "fetched_at", "payload", "content_hash"}
    diff_rows: list[dict] = []
    for t in tables:
        table_name = t["table"]
        fm = load_json(data_dir / "field_mappings" / f"{table_name}.json")
        if not fm:
            # No mapping file: emit a placeholder row and skip the detail pass.
            w(f"| `{table_name}` | — | — | — | — | 无映射数据 |")
            diff_rows.append(None)
            continue
        anchors = fm.get("anchors", {})
        api_anchor = anchors.get("api", f"api-{table_name.replace('_', '-')}")
        ods_anchor = anchors.get("ods", f"ods-{table_name.replace('_', '-')}")
        dwd_anchors = anchors.get("dwd", {})
        diff_anchor = f"diff-{table_name.replace('_', '-')}"
        j2o = fm.get("json_to_ods", [])
        o2d = fm.get("ods_to_dwd", {})
        d2o = fm.get("dwd_to_ods", {})
        # ── API→ODS unmapped fields, split flat vs nested (dotted paths) ──
        api_unmapped_flat: list[str] = []
        api_unmapped_nested: list[str] = []
        for m in j2o:
            if m.get("ods_col") is None:
                jp = m.get("json_path", "")
                if "." in jp:
                    api_unmapped_nested.append(jp)
                else:
                    api_unmapped_flat.append(jp)
        api_unmapped_total = len(api_unmapped_flat) + len(api_unmapped_nested)
        # ── ODS columns with no JSON source (and not ETL metadata) ──
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        ods_mapped_cols = {m["ods_col"] for m in j2o if m.get("ods_col")}
        ods_no_json_fields: list[str] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_mapped_cols and col["name"] not in etl_meta_cols:
                    ods_no_json_fields.append(col["name"])
        # ── ODS columns with no DWD target ──
        ods_cols_with_dwd = set(o2d.keys())
        ods_no_dwd_fields: list[str] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_cols_with_dwd and col["name"] not in etl_meta_cols:
                    ods_no_dwd_fields.append(col["name"])
        # ── DWD columns with no ODS source (e.g. SCD2 / derived columns) ──
        # NOTE(review): the "" compared against ods_source was probably a
        # placeholder character (e.g. "—") before Unicode stripping — confirm.
        dwd_no_ods_fields: list[tuple[str, str]] = []  # (dwd_table, dwd_col)
        for dwd_name, entries in d2o.items():
            for entry in entries:
                if entry.get("ods_source") == "":
                    dwd_no_ods_fields.append((dwd_name, entry["dwd_col"]))
        # Short human-readable summary of why differences exist.
        reasons: list[str] = []
        if api_unmapped_nested:
            reasons.append(f"嵌套对象 {len(api_unmapped_nested)}")
        if api_unmapped_flat:
            reasons.append(f"平层未映射 {len(api_unmapped_flat)}")
        if dwd_no_ods_fields:
            reasons.append(f"SCD2/派生列 {len(dwd_no_ods_fields)}")
        reason_str = "".join(reasons) if reasons else ""

        def _cell(count: int) -> str:
            # Summary cell: "0" or a count linking to this table's detail anchor.
            if count == 0:
                return "0"
            return f"[{count}](#{diff_anchor})"

        w(f"| `{table_name}` | {_cell(api_unmapped_total)} | {_cell(len(ods_no_json_fields))} | {_cell(len(ods_no_dwd_fields))} | {_cell(len(dwd_no_ods_fields))} | {reason_str} |")
        diff_rows.append({
            "table_name": table_name,
            "diff_anchor": diff_anchor,
            "api_anchor": api_anchor,
            "ods_anchor": ods_anchor,
            "dwd_anchors": dwd_anchors,
            "api_unmapped_flat": api_unmapped_flat,
            "api_unmapped_nested": api_unmapped_nested,
            "ods_no_json_fields": ods_no_json_fields,
            "ods_no_dwd_fields": ods_no_dwd_fields,
            "dwd_no_ods_fields": dwd_no_ods_fields,
        })
    w()
    # ── Per-table detail subsections (only tables with differences) ──
    sub_idx = 0
    for row in diff_rows:
        if row is None:
            continue
        has_any = (row["api_unmapped_flat"] or row["api_unmapped_nested"]
                   or row["ods_no_json_fields"] or row["ods_no_dwd_fields"]
                   or row["dwd_no_ods_fields"])
        if not has_any:
            continue
        sub_idx += 1
        table_name = row["table_name"]
        w(f'<a id="{row["diff_anchor"]}"></a>')
        w()
        w(f"#### 1.1.{sub_idx} {table_name} 字段差异明细")
        w()
        api_anchor = row["api_anchor"]
        ods_anchor = row["ods_anchor"]
        dwd_anchors = row["dwd_anchors"]
        # Auxiliary data: json_trees (sample values), bd_descriptions (docs).
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        jt_lookup: dict[str, dict] = {}
        if jt and "fields" in jt:
            for fld in jt["fields"]:
                jt_lookup[fld["path"]] = fld
        ods_descs = bd.get("ods_fields", {}) if bd else {}
        dwd_descs_all = bd.get("dwd_fields", {}) if bd else {}

        def _sample_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Render sample values from json_trees for an API-layer field.

            Returns "" for non-API layers (``dwd_tbl`` is currently unused).
            """
            if layer == "API":
                entry = jt_lookup.get(field_name, {})
                samples = entry.get("samples", [])
                total_recs = entry.get("total_records", 0)
                if not samples:
                    # Fall back to the legacy single-sample key.
                    single = entry.get("sample", "")
                    if single:
                        samples = [str(single)]
                if _is_enum_like(samples, total_recs):
                    return ", ".join(f"`{_esc(s)}`" for s in samples[:5])
                if samples:
                    return _format_samples(samples, max_show=3)
            return ""

        def _desc_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Look up a business description in bd_descriptions, clipped to 40 chars."""
            # Normalize: last dotted segment, array markers removed, lowercased.
            key = field_name.split(".")[-1].replace("[]", "").lower()
            if layer in ("ODS", "API"):
                desc = ods_descs.get(key, "")
            elif layer == "DWD" and dwd_tbl:
                desc = dwd_descs_all.get(dwd_tbl, {}).get(key, "")
            else:
                desc = ""
            if desc and len(desc) > 40:
                desc = desc[:37] + "..."
            return _esc(desc)

        # ── API→ODS unmapped (flat fields) ──
        if row["api_unmapped_flat"]:
            w(f"**API→ODS 未映射(平层)** — {len(row['api_unmapped_flat'])}")
            w()
            w("| # | JSON 字段 | 推测用途 | 置信度 | 示例值 | 说明 | 状态 |")
            w("|---|----------|---------|-------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_flat"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "API")
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | **[`{_esc(f)}`](#{api_anchor})** | {_esc(purpose)} | {conf} | {sample} | {desc} | **⚠️ 未映射** |")
            w()
        # ── API→ODS unmapped (nested objects), collapsed in a <details> block ──
        if row["api_unmapped_nested"]:
            w(f"<details><summary>API→ODS 未映射(嵌套对象)— {len(row['api_unmapped_nested'])} 个</summary>")
            w()
            w("| # | JSON 字段 | 推测用途 | 置信度 | 示例值 | 说明 | 状态 |")
            w("|---|----------|---------|-------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_nested"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "API")
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | [`{_esc(f)}`](#{api_anchor}) | {_esc(purpose)} | {conf} | {sample} | {desc} | 📦 嵌套 |")
            w()
            w("</details>")
            w()
        # ── ODS columns with no JSON source ──
        if row["ods_no_json_fields"]:
            w(f"**ODS 无 JSON 源** — {len(row['ods_no_json_fields'])}")
            w()
            w("| # | ODS 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|---------|-------|------|------|")
            for i, f in enumerate(row["ods_no_json_fields"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "ODS")
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 JSON 源** |")
            w()
        # ── ODS columns with no DWD target ──
        if row["ods_no_dwd_fields"]:
            w(f"**ODS→DWD 未映射** — {len(row['ods_no_dwd_fields'])}")
            w()
            w("| # | ODS 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|---------|-------|------|------|")
            for i, f in enumerate(row["ods_no_dwd_fields"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "ODS")
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 DWD 目标** |")
            w()
        # ── DWD columns with no ODS source ──
        if row["dwd_no_ods_fields"]:
            w(f"**DWD 无 ODS 源** — {len(row['dwd_no_ods_fields'])}")
            w()
            w("| # | DWD 表 | DWD 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|-------|---------|-------|------|------|")
            for i, (dwd_name, dwd_col) in enumerate(row["dwd_no_ods_fields"], 1):
                dwd_a = dwd_anchors.get(dwd_name, f"dwd-{dwd_name.replace('_', '-')}")
                purpose, conf = _guess_field_purpose(dwd_col, table_name, "DWD")
                desc = _desc_str(dwd_col, "DWD", dwd_tbl=dwd_name)
                w(f"| {i} | {dwd_name} | **[`{_esc(dwd_col)}`](#{dwd_a})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 ODS 源** |")
            w()
        # Trailing blank line after each table's detail subsection.
        w()
def _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor):
    """Emit the API source-field section for one table.

    Writes an anchor, a coverage summary, and one table row per JSON field
    with its ODS mapping link, business description (from bd_descriptions),
    and merged notes + sample values (from json_trees).
    """
    w(f'<a id="{api_anchor}"></a>')
    w()
    w(f"#### API 源字段 — {table_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not fm or "json_to_ods" not in fm:
        w("_无 field_mappings 数据_")
        w()
        return
    j2o = fm["json_to_ods"]
    # json_tree lookup keyed by JSON path (entries may carry multi-value samples).
    jt_lookup: dict[str, dict] = {}
    if jt and "fields" in jt:
        for f in jt["fields"]:
            jt_lookup[f["path"]] = f
    # BD_manual ODS descriptions, cross-referenced for JSON fields by leaf name.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    mapped_count = sum(1 for m in j2o if m.get("ods_col") is not None)
    total_count = len(j2o)
    if total_count > 0:
        w(f"已映射 {mapped_count}/{total_count},覆盖率 {mapped_count / total_count * 100:.1f}%")
    else:
        w("无字段")
    w()
    w("| # | JSON 字段 | 类型 | → ODS 列 | 业务描述 | 示例值与说明 |")
    w("|---|----------|------|---------|---------|------------|")
    for i, m in enumerate(j2o, 1):
        json_path = m["json_path"]
        json_type = m.get("json_type", "")
        ods_col = m.get("ods_col")
        match_type = m.get("match_type", "")
        occurrence_pct = m.get("occurrence_pct", 0)
        # Sample values from json_tree (prefer the multi-sample list).
        jt_entry = jt_lookup.get(json_path, {})
        samples = jt_entry.get("samples", [])
        total_recs = jt_entry.get("total_records", 0)
        if not samples:
            single = jt_entry.get("sample", "")
            if single:
                samples = [str(single)]
        # ODS column cell: anchor link, or an unmapped warning.
        if ods_col:
            ods_link = f"[`{ods_col}`](#{ods_anchor})"
        else:
            ods_link = "⚠️ 未映射"
        # Business description looked up by the JSON leaf name (lowercased,
        # array markers stripped), clipped to 60 chars.
        leaf = json_path.split(".")[-1].replace("[]", "").lower()
        biz_desc = ods_descs.get(leaf, "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        # Merge notes (nesting, match quality, occurrence) with sample values.
        notes_parts: list[str] = []
        if json_path.startswith("siteProfile.") or ("." in json_path and match_type == "unmapped"):
            notes_parts.append("📦 嵌套对象")
        if match_type == "case_insensitive":
            notes_parts.append("大小写匹配")
        if occurrence_pct < 100:
            notes_parts.append(f"出现率 {occurrence_pct:.0f}%")
        # Sample rendering: enum-like fields show their value set, others a few samples.
        if _is_enum_like(samples, total_recs):
            notes_parts.append(f"枚举值: {', '.join(f'`{_esc(s)}`' for s in samples[:8])}")
        elif samples:
            notes_parts.append(f"示例: {_format_samples(samples)}")
        # NOTE(review): joining on "" leaves no separator between note parts —
        # likely a stripped non-ASCII separator character; confirm.
        note_str = "".join(notes_parts) if notes_parts else ""
        w(f"| {i} | `{_esc(json_path)}` | {json_type} | {ods_link} | {biz_desc} | {note_str} |")
    w()
def _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors):
    """Emit the ODS table-structure section for one table.

    Each column row shows its upstream JSON source and downstream DWD
    target(s) as anchor links, plus the BD_manual business description.
    """
    w(f'<a id="{ods_anchor}"></a>')
    w()
    w(f"#### ODS 表结构 — ods.{table_name} [🔗 API](#{api_anchor})")
    w()
    if not ods_schema or "columns" not in ods_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # Reverse lookup from json_to_ods: ods_col → first json_path that maps to it.
    ods_to_json: dict[str, str] = {}
    if fm and "json_to_ods" in fm:
        for m in fm["json_to_ods"]:
            if m.get("ods_col"):
                # setdefault keeps the first mapping when several paths share a column.
                ods_to_json.setdefault(m["ods_col"], m["json_path"])
    # Forward lookup: ods_col → list of DWD targets.
    ods_to_dwd: dict[str, list[dict]] = {}
    if fm and "ods_to_dwd" in fm:
        ods_to_dwd = fm["ods_to_dwd"]
    # BD_manual ODS descriptions.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    cols = ods_schema["columns"]
    # NOTE(review): this line renders the bare column count — surrounding label
    # text may have been stripped with the file's non-ASCII characters; confirm.
    w(f"{len(cols)}")
    w()
    w("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |")
    w("|---|---------|------|----------|-----------|---------|")
    for i, col in enumerate(cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        # ← JSON source link (empty cell when no JSON field maps here).
        json_src = ods_to_json.get(col_name)
        if json_src:
            json_link = f"[`{_esc(json_src)}`](#{api_anchor})"
        else:
            json_link = ""
        # → DWD target links, one per downstream column.
        dwd_targets = ods_to_dwd.get(col_name, [])
        if dwd_targets:
            dwd_links = []
            for dt in dwd_targets:
                dwd_tbl = dt["dwd_table"]
                dwd_col = dt["dwd_col"]
                dwd_anc = dwd_anchors.get(dwd_tbl, f"dwd-{dwd_tbl}")
                dwd_links.append(f"[`{dwd_tbl}.{dwd_col}`](#{dwd_anc})")
            dwd_link = ", ".join(dwd_links)
        else:
            dwd_link = ""
        # Business description, clipped to 60 chars.
        biz_desc = ods_descs.get(col_name.lower(), "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {json_link} | {dwd_link} | {biz_desc} |")
    w()
def _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name):
    """Emit the DWD table-structure section for one downstream table.

    Each column row shows its ODS source link, the transform/mapping type,
    and a business description resolved in priority order:
    BD_manual > mapping note > DB column comment.
    """
    w(f'<a id="{dwd_anchor}"></a>')
    w()
    w(f"#### DWD 表结构 — dwd.{dwd_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not dwd_schema or "columns" not in dwd_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # dwd_to_ods lookup: dwd_col → mapping entry for this DWD table.
    dwd_to_ods_map: dict[str, dict] = {}
    if fm and "dwd_to_ods" in fm and dwd_name in fm["dwd_to_ods"]:
        for entry in fm["dwd_to_ods"][dwd_name]:
            dwd_to_ods_map[entry["dwd_col"]] = entry
    # BD_manual DWD descriptions for this table.
    dwd_descs = {}
    if bd and "dwd_fields" in bd:
        dwd_descs = bd["dwd_fields"].get(dwd_name, {})
    cols = dwd_schema["columns"]
    # NOTE(review): renders the bare column count — label text may have been
    # stripped with the file's non-ASCII characters; confirm.
    w(f"{len(cols)}")
    w()
    w("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |")
    w("|---|---------|------|----------|------|---------|")
    for i, col in enumerate(cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        mapping = dwd_to_ods_map.get(col_name)
        if mapping:
            ods_src = mapping.get("ods_source", "")
            # NOTE(review): `ods_src != ""` is redundant after `ods_src` — the
            # comparison likely targeted a placeholder (e.g. "—") before
            # Unicode stripping; confirm against history.
            ods_link = f"[`{ods_src}`](#{ods_anchor})" if ods_src and ods_src != "" else ""
            transform = mapping.get("mapping_type", "")
            note = mapping.get("note", "")
        else:
            ods_link = ""
            transform = ""
            note = ""
        # SCD2 / ETL bookkeeping columns are labelled as ETL-generated.
        if col_name in ("valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id"):
            transform = "ETL 生成"
        # Business description: BD_manual first, then mapping note, then DB comment.
        biz_desc = dwd_descs.get(col_name.lower(), "")
        if not biz_desc and note:
            biz_desc = note
        if not biz_desc:
            db_comment = col.get("comment", "")
            if db_comment:
                # Extract the text following the 【说明】 marker in DB comments.
                if "【说明】" in db_comment:
                    desc_part = db_comment.split("【说明】")[1]
                    # NOTE(review): `"" in desc_part` is always True and
                    # `split("")` raises ValueError — the separator character
                    # was almost certainly stripped with the file's non-ASCII
                    # characters. This branch is broken as-is; restore the
                    # original separator from history.
                    if "" in desc_part:
                        desc_part = desc_part.split("")[0]
                    biz_desc = desc_part.strip().rstrip("").strip()
                else:
                    biz_desc = db_comment
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {ods_link} | {_esc(transform)} | {biz_desc} |")
    w()
def main() -> None:
    """CLI entry point: load .env, resolve the data directory, write the report.

    Exits early (with a message, no exception) when the data directory does
    not exist. The report file name embeds a timestamp and is written into
    the same directory the data was read from.
    """
    # .env values must not override variables already set in the environment.
    load_dotenv(Path(".env"), override=False)
    parser = build_parser()
    args = parser.parse_args()
    data_dir = resolve_data_dir(args.output_dir)
    if not data_dir.exists():
        print(f"错误:数据目录不存在: {data_dir}")
        return
    print(f"读取数据目录: {data_dir}")
    report = generate_report(data_dir)
    now = datetime.now()
    filename = f"dataflow_{now.strftime('%Y-%m-%d_%H%M%S')}.md"
    output_path = data_dir / filename
    # pathlib handles open/close; equivalent to open(..., "w", encoding="utf-8").
    output_path.write_text(report, encoding="utf-8")
    separator = "=" * 60
    print(f"\n{separator}")
    print("报告生成完成")  # fixed: was an f-string with no placeholders
    print(separator)
    print(f" 输出路径: {output_path}")
    print(f" 文件大小: {output_path.stat().st_size / 1024:.1f} KB")
    print(separator)


if __name__ == "__main__":
    main()