feat: cumulative feature changes — chat integration, tenant management, miniprogram updates, ETL enhancements, migration scripts

Cumulative code changes from multiple sessions:
- backend: AI chat service, trigger scheduling, auth hardening, WebSocket, scheduler minimum interval
- admin-web: ETL status page, task management, schedule configuration, login polish
- miniprogram: board pages, chat integration, UI components, navigation updates
- etl: new DWS tasks (finance_area_daily/board_cache), connector enhancements
- tenant-admin: project scaffolding
- db: 19 migration scripts (etl_feiqiu 11 + zqyy_app 8)
- packages/shared: enum and utility-function updates
- tools: database utilities, report generation, health checks
- docs: PRD/architecture/deployment/contract doc updates

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1255
tools/db/etl_consistency_check.py
Normal file
File diff suppressed because it is too large
280
tools/db/gen_consolidated_ddl.py
Normal file
@@ -0,0 +1,280 @@
"""
Export the complete DDL from the test databases and write one file per schema
into docs/database/ddl/.

The live database state is authoritative: this consolidates every
schema/table/constraint/index/view/materialized view/sequence plus the FDW setup.

Output files:
    docs/database/ddl/etl_feiqiu__meta.sql
    docs/database/ddl/etl_feiqiu__ods.sql
    docs/database/ddl/etl_feiqiu__dwd.sql
    docs/database/ddl/etl_feiqiu__core.sql
    docs/database/ddl/etl_feiqiu__dws.sql
    docs/database/ddl/etl_feiqiu__app.sql
    docs/database/ddl/zqyy_app__public.sql
    docs/database/ddl/zqyy_app__auth.sql
    docs/database/ddl/zqyy_app__biz.sql
    docs/database/ddl/fdw.sql

Usage: cd C:\\NeoZQYY && python tools/db/gen_consolidated_ddl.py
"""
import os, sys
from pathlib import Path
from datetime import date

import psycopg2

# ── Environment ───────────────────────────────────────────────────────────
from dotenv import load_dotenv
ROOT = Path(__file__).resolve().parent.parent.parent
load_dotenv(ROOT / ".env")

ETL_DSN = os.environ.get("TEST_DB_DSN") or os.environ.get("PG_DSN")
APP_DSN = os.environ.get("TEST_APP_DB_DSN") or os.environ.get("APP_DB_DSN")
if not ETL_DSN:
    sys.exit("ERROR: TEST_DB_DSN / PG_DSN is not configured")
if not APP_DSN:
    sys.exit("ERROR: TEST_APP_DB_DSN / APP_DB_DSN is not configured")

OUTPUT_DIR = ROOT / "docs" / "database" / "ddl"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FDW_FILE = ROOT / "db" / "fdw" / "setup_fdw.sql"
TODAY = date.today().isoformat()

# ── SQL templates ─────────────────────────────────────────────────────────
SQL_TABLES = """
WITH cols AS (
  SELECT table_schema, table_name,
         string_agg(
           format(E'    %%I %%s%%s%%s',
             column_name,
             CASE WHEN data_type = 'USER-DEFINED' THEN udt_name
                  WHEN data_type = 'ARRAY' THEN udt_name
                  WHEN character_maximum_length IS NOT NULL THEN data_type || '(' || character_maximum_length || ')'
                  WHEN numeric_precision IS NOT NULL AND data_type IN ('numeric','decimal') THEN data_type || '(' || numeric_precision || ',' || numeric_scale || ')'
                  ELSE data_type END,
             CASE WHEN column_default IS NOT NULL THEN ' DEFAULT ' || column_default ELSE '' END,
             CASE WHEN is_nullable = 'NO' THEN ' NOT NULL' ELSE '' END
           ), E',\\n' ORDER BY ordinal_position
         ) as col_defs
  FROM information_schema.columns
  WHERE table_schema = %s
    AND table_name IN (SELECT table_name FROM information_schema.tables WHERE table_schema = %s AND table_type = 'BASE TABLE')
  GROUP BY table_schema, table_name
)
SELECT format(E'CREATE TABLE %%I.%%I (\\n%%s\\n);', table_schema, table_name, col_defs) as ddl
FROM cols ORDER BY table_name;
"""

SQL_CONSTRAINTS = """
SELECT n.nspname as schema, conrelid::regclass as tbl, conname,
       pg_get_constraintdef(c.oid) as def, contype
FROM pg_constraint c
JOIN pg_namespace n ON n.oid = c.connamespace
WHERE n.nspname = %s AND contype IN ('p','u','f')
ORDER BY conrelid::regclass::text, contype, conname;
"""

SQL_INDEXES = """
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = %s
  AND indexname NOT IN (SELECT conname FROM pg_constraint WHERE contype IN ('p','u'))
ORDER BY tablename, indexname;
"""

SQL_SEQUENCES = """
SELECT sequence_name, data_type
FROM information_schema.sequences
WHERE sequence_schema = %s
ORDER BY sequence_name;
"""

SQL_VIEWS = """
SELECT viewname, definition
FROM pg_views
WHERE schemaname = %s
ORDER BY viewname;
"""

SQL_MATVIEWS = """
SELECT matviewname, definition
FROM pg_matviews
WHERE schemaname = %s
ORDER BY matviewname;
"""

SQL_MV_INDEXES = """
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = %s
  AND tablename LIKE 'mv_%%'
ORDER BY tablename, indexname;
"""

SQL_TABLE_COUNT = """
SELECT count(*) FROM information_schema.tables
WHERE table_schema = %s AND table_type = 'BASE TABLE';
"""

# ── Helpers ───────────────────────────────────────────────────────────────
def query(conn, sql, params=None):
    with conn.cursor() as cur:
        cur.execute(sql, params)
        return cur.fetchall()

def section(f, title, level=1):
    sep = "=" * 77 if level == 1 else "-" * 77
    f.write(f"\n-- {sep}\n-- {title}\n-- {sep}\n\n")

def write_sequences(f, conn, schema):
    rows = query(conn, SQL_SEQUENCES, (schema,))
    if not rows:
        return
    f.write("-- Sequences\n")
    for name, dtype in rows:
        f.write(f"CREATE SEQUENCE IF NOT EXISTS {schema}.{name} AS {dtype};\n")
    f.write("\n")

def write_tables(f, conn, schema):
    rows = query(conn, SQL_TABLES, (schema, schema))
    if not rows:
        return
    f.write("-- Tables\n")
    for (ddl,) in rows:
        f.write(ddl + "\n\n")

def write_constraints(f, conn, schema):
    rows = query(conn, SQL_CONSTRAINTS, (schema,))
    if not rows:
        return
    f.write("-- Constraints (primary key / unique / foreign key)\n")
    for _, tbl, conname, condef, _ in rows:
        f.write(f"ALTER TABLE {tbl} ADD CONSTRAINT {conname} {condef};\n")
    f.write("\n")

def write_indexes(f, conn, schema):
    rows = query(conn, SQL_INDEXES, (schema,))
    if not rows:
        return
    f.write("-- Indexes\n")
    for _, indexdef in rows:
        f.write(indexdef + ";\n")
    f.write("\n")

def write_views(f, conn, schema):
    rows = query(conn, SQL_VIEWS, (schema,))
    if not rows:
        return
    f.write("-- Views\n")
    for vname, vdef in rows:
        f.write(f"CREATE OR REPLACE VIEW {schema}.{vname} AS\n{vdef.strip()}\n;\n\n")

def write_matviews(f, conn, schema):
    rows = query(conn, SQL_MATVIEWS, (schema,))
    if not rows:
        return
    f.write("-- Materialized views\n")
    for mvname, mvdef in rows:
        f.write(f"CREATE MATERIALIZED VIEW {schema}.{mvname} AS\n{mvdef.strip()}\n;\n\n")
    # Materialized view indexes
    idx_rows = query(conn, SQL_MV_INDEXES, (schema,))
    if idx_rows:
        f.write("-- Materialized view indexes\n")
        for _, indexdef in idx_rows:
            f.write(indexdef + ";\n")
        f.write("\n")

def write_schema_file(conn, db_name, schema, label, views_only=False):
    """Write a standalone DDL file for a single schema."""
    filename = f"{db_name}__{schema}.sql"
    filepath = OUTPUT_DIR / filename

    # Table count for the summary line
    table_count = query(conn, SQL_TABLE_COUNT, (schema,))[0][0]

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"""\
-- =============================================================================
-- {db_name} / {schema} ({label})
-- Generated: {TODAY}
-- Source: test database (auto-exported by this script)
-- =============================================================================

CREATE SCHEMA IF NOT EXISTS {schema};

""")
        if views_only:
            write_views(f, conn, schema)
        else:
            write_sequences(f, conn, schema)
            write_tables(f, conn, schema)
            write_constraints(f, conn, schema)
            write_indexes(f, conn, schema)
            write_views(f, conn, schema)
            write_matviews(f, conn, schema)

    size_kb = filepath.stat().st_size / 1024
    obj_desc = "views only" if views_only else f"{table_count} tables"
    print(f"  ✅ {filename:<35s} {size_kb:>6.1f} KB ({obj_desc})")
    return filepath


def write_fdw_file():
    """Write the FDW configuration file."""
    filepath = OUTPUT_DIR / "fdw.sql"
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"""\
-- =============================================================================
-- FDW cross-database mapping (run inside zqyy_app)
-- Generated: {TODAY}
-- Source: db/fdw/setup_fdw.sql
-- =============================================================================

""")
        if FDW_FILE.exists():
            f.write(FDW_FILE.read_text(encoding="utf-8"))
            f.write("\n")
        else:
            f.write("-- FDW config file not found: db/fdw/setup_fdw.sql\n")

    size_kb = filepath.stat().st_size / 1024
    print(f"  ✅ {'fdw.sql':<35s} {size_kb:>6.1f} KB")
    return filepath


# ── Main ──────────────────────────────────────────────────────────────────
def main():
    etl_conn = psycopg2.connect(ETL_DSN)
    app_conn = psycopg2.connect(APP_DSN)

    print(f"Output directory: {OUTPUT_DIR}\n")

    # etl_feiqiu: the six-layer schemas
    write_schema_file(etl_conn, "etl_feiqiu", "meta", "ETL scheduling metadata")
    write_schema_file(etl_conn, "etl_feiqiu", "ods", "raw data layer")
    write_schema_file(etl_conn, "etl_feiqiu", "dwd", "detail data layer")
    write_schema_file(etl_conn, "etl_feiqiu", "core", "cross-site standardized dimensions/facts")
    write_schema_file(etl_conn, "etl_feiqiu", "dws", "aggregate data layer")
    write_schema_file(etl_conn, "etl_feiqiu", "app", "RLS view layer", views_only=True)

    # zqyy_app
    write_schema_file(app_conn, "zqyy_app", "public", "miniprogram business tables")
    write_schema_file(app_conn, "zqyy_app", "auth", "user authentication and permissions")
    write_schema_file(app_conn, "zqyy_app", "biz", "core business tables (tasks/notes/triggers)")

    # FDW
    write_fdw_file()

    etl_conn.close()
    app_conn.close()

    # Remove the old single consolidated file
    old_file = ROOT / "docs" / "database" / "consolidated_ddl.sql"
    if old_file.exists():
        old_file.unlink()
        print(f"\n🗑️ Removed old file: {old_file.name}")

    print("\n✅ Done — 10 files written")


if __name__ == "__main__":
    main()
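Reviewer note: a minimal sketch of replaying the emitted per-schema files into a scratch database in layer order, to sanity-check that each file is self-applicable. SCRATCH_DSN is a hypothetical environment variable, not part of this commit; the file names mirror the docstring above.

import os
from pathlib import Path

import psycopg2

DDL_DIR = Path("docs/database/ddl")
LAYER_ORDER = ["meta", "ods", "dwd", "core", "dws", "app"]  # dependencies flow downstream

conn = psycopg2.connect(os.environ["SCRATCH_DSN"])  # assumption: a disposable database
conn.autocommit = True
with conn.cursor() as cur:
    for schema in LAYER_ORDER:
        sql = (DDL_DIR / f"etl_feiqiu__{schema}.sql").read_text(encoding="utf-8")
        cur.execute(sql)  # each file is plain DDL; psycopg2 accepts the multi-statement batch
        print(f"applied etl_feiqiu__{schema}.sql")
conn.close()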
83
tools/db/setup_fdw_both.py
Normal file
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Apply the FDW setup in both zqyy_app and test_zqyy_app.
- zqyy_app      -> setup_fdw.sql      (points at etl_feiqiu)
- test_zqyy_app -> setup_fdw_test.sql (points at test_etl_feiqiu)
"""
import os
import psycopg2

CONN = dict(host="100.64.0.4", port=5432, user="local-Python", password="Neo-local-1991125")
BASE = r"C:\NeoZQYY"

# Real password substituted for the '***' placeholder in the SQL files
APP_READER_PWD = "AppR3ad_2026!"

TARGETS = [
    ("zqyy_app", os.path.join(BASE, "db", "fdw", "setup_fdw.sql")),
    ("test_zqyy_app", os.path.join(BASE, "db", "fdw", "setup_fdw_test.sql")),
]

for dbname, sql_path in TARGETS:
    print(f"\n{'='*60}")
    print(f"Applying FDW config: {dbname} <- {os.path.basename(sql_path)}")
    print(f"{'='*60}")

    sql = open(sql_path, encoding="utf-8").read()
    # Fill in the password placeholder
    sql = sql.replace("password '***'", f"password '{APP_READER_PWD}'")

    conn = psycopg2.connect(**CONN, dbname=dbname)
    conn.autocommit = True
    cur = conn.cursor()

    # Execute statement by statement (split on semicolons, skipping comments and
    # blank lines; note this naive splitter assumes no semicolons inside string
    # literals or dollar-quoted blocks)
    statements = []
    current = []
    for line in sql.split("\n"):
        stripped = line.strip()
        if stripped.startswith("--") or not stripped:
            continue
        current.append(line)
        if stripped.endswith(";"):
            statements.append("\n".join(current))
            current = []

    success = 0
    skip = 0
    fail = 0
    for stmt in statements:
        try:
            cur.execute(stmt)
            first_line = stmt.strip().split("\n")[0][:80]
            print(f"  [OK] {first_line}")
            success += 1
        except psycopg2.errors.DuplicateObject as e:
            conn.rollback()
            print(f"  [SKIP] already exists: {str(e).strip().split(chr(10))[0]}")
            skip += 1
        except Exception as e:
            conn.rollback()
            print(f"  [FAIL] {str(e).strip().split(chr(10))[0]}")
            print(f"    SQL: {stmt[:100]}")
            fail += 1

    # Verify
    cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'postgres_fdw'")
    fdw_ext = cur.fetchone() is not None

    cur.execute("SELECT srvname FROM pg_foreign_server")
    servers = [r[0] for r in cur.fetchall()]

    cur.execute(
        "SELECT count(*) FROM information_schema.tables "
        "WHERE table_schema = 'fdw_etl'"
    )
    fdw_tables = cur.fetchone()[0]

    print(f"\n  Result: {success} OK, {skip} SKIP, {fail} FAIL")
    print(f"  Verify: fdw extension={fdw_ext}, servers={servers}, fdw_etl tables={fdw_tables}")

    conn.close()

print("\nDone!")
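Reviewer note: a quick smoke test to run after the setup above — count rows through one foreign table. The table name fdw_etl.dim_site is illustrative only; substitute any table that setup_fdw.sql actually imports, and use the real credentials in place of '***'.

import psycopg2

conn = psycopg2.connect(host="100.64.0.4", port=5432, user="local-Python",
                        password="***", dbname="test_zqyy_app")
with conn.cursor() as cur:
    cur.execute("SELECT count(*) FROM fdw_etl.dim_site")  # hypothetical foreign table
    print("rows visible through FDW:", cur.fetchone()[0])
conn.close()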
41
tools/db/verify_all_dbs.py
Normal file
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""Verify the state of all four databases: table counts and schema distribution."""
import psycopg2

CONN = dict(host="100.64.0.4", port=5432, user="local-Python", password="Neo-local-1991125")
DBS = ["etl_feiqiu", "test_etl_feiqiu", "zqyy_app", "test_zqyy_app"]

for db in DBS:
    try:
        c = psycopg2.connect(**CONN, dbname=db)
        cur = c.cursor()
        cur.execute(
            "SELECT schemaname, count(*) FROM pg_tables "
            "WHERE schemaname NOT IN ('pg_catalog','information_schema') "
            "GROUP BY schemaname ORDER BY schemaname"
        )
        rows = cur.fetchall()
        total = sum(r[1] for r in rows)
        schemas = ", ".join(f"{r[0]}({r[1]})" for r in rows)
        print(f"[OK] {db}: {total} tables | {schemas}")

        # Materialized view count
        cur.execute(
            "SELECT count(*) FROM pg_matviews "
            "WHERE schemaname NOT IN ('pg_catalog','information_schema')"
        )
        mv_count = cur.fetchone()[0]
        if mv_count:
            print(f"     matviews: {mv_count}")

        c.close()
    except Exception as e:
        print(f"[FAIL] {db}: {e}")

print("\n--- Config file targets ---")
print("ETL .env PG_DSN -> test_etl_feiqiu (confirmed)")
print("Root .env -> PG_DSN=test_etl_feiqiu, APP_DB_DSN=test_zqyy_app")
print("Backend .env.local -> APP_DB_NAME=test_zqyy_app, ETL_DB_NAME=test_etl_feiqiu")
print("Backend config.py defaults -> test_zqyy_app / test_etl_feiqiu")
print("FDW production -> setup_fdw.sql (etl_feiqiu)")
print("FDW test -> setup_fdw_test.sql (test_etl_feiqiu)")
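Reviewer note: a sketch of wrapping this reachability check for CI, failing the job when any database is unreachable. The constants repeat the ones above (password redacted here); this is illustrative, not part of the commit.

import sys
import psycopg2

CONN = dict(host="100.64.0.4", port=5432, user="local-Python", password="***")
DBS = ["etl_feiqiu", "test_etl_feiqiu", "zqyy_app", "test_zqyy_app"]

failures = []
for db in DBS:
    try:
        psycopg2.connect(**CONN, dbname=db).close()
    except Exception as e:
        failures.append((db, e))

if failures:
    for db, e in failures:
        print(f"unreachable: {db}: {e}", file=sys.stderr)
    sys.exit(1)  # non-zero exit fails the CI step
print("all four databases reachable")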
160
tools/db/verify_ddl_migration.py
Normal file
@@ -0,0 +1,160 @@
"""
DDL migration verification script — admin-web-enhancement spec (Task 17.2)

Verifies that the NS4.1 + P16 migrations applied correctly on test_zqyy_app:
1. biz.connectors / biz.tenants / biz.sites / biz.site_code_history exist with the expected columns
2. scheduled_tasks gained its 3 new columns
3. auth._archived_site_code_mapping exists
"""

import os
import sys

from dotenv import load_dotenv

load_dotenv()

dsn = os.environ.get("APP_DB_DSN", "")
if not dsn:
    print("ERROR: APP_DB_DSN is not set")
    sys.exit(1)
if "test_" not in dsn:
    print(f"ERROR: APP_DB_DSN does not contain 'test_'; refusing to connect to a non-test database: {dsn}")
    sys.exit(1)

import psycopg2  # noqa: E402

conn = psycopg2.connect(dsn)
cur = conn.cursor()

results: list[tuple[str, bool, str]] = []


def check(name: str, sql: str, expected_fn):
    """Run the SQL, judge the rows with expected_fn, and record pass/fail."""
    try:
        cur.execute(sql)
        rows = cur.fetchall()
        ok, detail = expected_fn(rows)
        results.append((name, ok, detail))
    except Exception as e:
        results.append((name, False, f"exception: {e}"))


# ── 1. biz.connectors columns ──
check(
    "biz.connectors exists with the expected columns",
    """
    SELECT column_name FROM information_schema.columns
    WHERE table_schema = 'biz' AND table_name = 'connectors'
    ORDER BY ordinal_position
    """,
    lambda rows: (
        (cols := [r[0] for r in rows])
        and set(cols) == {"id", "connector_key", "display_name", "is_active", "created_at"},
        f"columns: {cols}",
    ),
)

# ── 2. biz.tenants columns ──
check(
    "biz.tenants exists with the expected columns",
    """
    SELECT column_name FROM information_schema.columns
    WHERE table_schema = 'biz' AND table_name = 'tenants'
    ORDER BY ordinal_position
    """,
    lambda rows: (
        (cols := [r[0] for r in rows])
        and set(cols)
        == {"id", "connector_id", "tenant_id", "tenant_name", "is_active", "created_at", "updated_at"},
        f"columns: {cols}",
    ),
)

# ── 3. biz.sites columns ──
check(
    "biz.sites exists with the expected columns",
    """
    SELECT column_name FROM information_schema.columns
    WHERE table_schema = 'biz' AND table_name = 'sites'
    ORDER BY ordinal_position
    """,
    lambda rows: (
        (cols := [r[0] for r in rows])
        and set(cols)
        == {
            "id", "tenant_id", "site_id", "site_name", "site_code",
            "site_label", "is_active", "created_at", "updated_at",
        },
        f"columns: {cols}",
    ),
)

# ── 4. biz.site_code_history columns ──
check(
    "biz.site_code_history exists with the expected columns",
    """
    SELECT column_name FROM information_schema.columns
    WHERE table_schema = 'biz' AND table_name = 'site_code_history'
    ORDER BY ordinal_position
    """,
    lambda rows: (
        (cols := [r[0] for r in rows])
        and set(cols) == {"id", "site_id", "site_code", "is_current", "created_at", "retired_at"},
        f"columns: {cols}",
    ),
)

# ── 5. scheduled_tasks: the 3 new columns ──
check(
    "scheduled_tasks new columns exist",
    """
    SELECT column_name, data_type, column_default
    FROM information_schema.columns
    WHERE table_schema = 'public' AND table_name = 'scheduled_tasks'
      AND column_name IN ('min_run_interval_value', 'min_run_interval_unit', 'last_success_at')
    ORDER BY ordinal_position
    """,
    lambda rows: (
        len(rows) == 3,
        f"found {len(rows)} columns: {[r[0] for r in rows]}",
    ),
)

# ── 6. auth._archived_site_code_mapping exists ──
check(
    "auth._archived_site_code_mapping exists",
    """
    SELECT table_name FROM information_schema.tables
    WHERE table_schema = 'auth' AND table_name = '_archived_site_code_mapping'
    """,
    lambda rows: (
        len(rows) == 1,
        f"found {len(rows)} table(s)",
    ),
)

cur.close()
conn.close()

# ── Print results ──
print("\n" + "=" * 60)
print("DDL migration verification — admin-web-enhancement")
print("=" * 60)

passed = 0
failed = 0
for name, ok, detail in results:
    status = "✅ PASS" if ok else "❌ FAIL"
    print(f"  {status} {name}")
    print(f"        {detail}")
    if ok:
        passed += 1
    else:
        failed += 1

print(f"\nTotal: {passed} passed, {failed} failed")
if failed > 0:
    sys.exit(1)
print("All checks passed ✅")
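Reviewer note: the check() helper generalizes to other DDL assertions beyond columns. A sketch of an index check written in the same shape — the index name ix_sites_tenant_id is hypothetical, shown only to illustrate the pattern (add before the cur.close()):

check(
    "biz.sites has its tenant lookup index",
    """
    SELECT indexname FROM pg_indexes
    WHERE schemaname = 'biz' AND tablename = 'sites'
    """,
    lambda rows: (
        "ix_sites_tenant_id" in {r[0] for r in rows},  # hypothetical index name
        f"indexes: {[r[0] for r in rows]}",
    ),
)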
119
tools/health/api_health_check.py
Normal file
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
"""API health-check script

Task 1.3: log in for a JWT → verify the task registry → run sync-check
"""

import sys

import requests

BASE_URL = "http://localhost:8000"
ADMIN_USER = "admin"
ADMIN_PASS = "admin123"


def main() -> int:
    ok = True

    # ── 1. Log in for a JWT ──────────────────────────────────
    print("=" * 60)
    print("[1/3] POST /api/auth/login — log in for a JWT")
    print("=" * 60)
    try:
        resp = requests.post(
            f"{BASE_URL}/api/auth/login",
            json={"username": ADMIN_USER, "password": ADMIN_PASS},
            timeout=10,
        )
    except requests.ConnectionError:
        print("✗ Cannot reach the backend; make sure uvicorn is running on :8000")
        return 1

    if resp.status_code != 200:
        print(f"✗ Login failed: HTTP {resp.status_code}")
        print(f"  response: {resp.text[:500]}")
        return 1

    tokens = resp.json()
    jwt = tokens["access_token"]
    print(f"✓ Logged in; JWT (first 40 chars): {jwt[:40]}...")
    print(f"  token_type: {tokens['token_type']}")
    print()

    headers = {"Authorization": f"Bearer {jwt}"}

    # ── 2. Fetch the task registry ───────────────────────────
    print("=" * 60)
    print("[2/3] GET /api/tasks/registry — verify the task registry")
    print("=" * 60)
    resp = requests.get(f"{BASE_URL}/api/tasks/registry", headers=headers, timeout=10)

    if resp.status_code != 200:
        print(f"✗ Failed to fetch the registry: HTTP {resp.status_code}")
        print(f"  response: {resp.text[:500]}")
        ok = False
    else:
        data = resp.json()
        groups = data.get("groups", {})
        total_tasks = sum(len(tasks) for tasks in groups.values())
        common_tasks = sum(
            1 for tasks in groups.values() for t in tasks if t.get("is_common")
        )

        if total_tasks == 0:
            print("✗ Task registry is empty!")
            ok = False
        else:
            print("✓ Task registry is non-empty")
            print(f"  business domains: {len(groups)}")
            print(f"  total tasks: {total_tasks}")
            print(f"  common tasks: {common_tasks}")
            print(f"  domains: {', '.join(sorted(groups.keys()))}")
            # Per-domain task counts
            for domain in sorted(groups.keys()):
                tasks = groups[domain]
                n_common = sum(1 for t in tasks if t.get("is_common"))
                print(f"    {domain}: {len(tasks)} tasks ({n_common} common)")
    print()

    # ── 3. Sync-check ────────────────────────────────────────
    print("=" * 60)
    print("[3/3] GET /api/tasks/sync-check — backend vs ETL registry sync check")
    print("=" * 60)
    resp = requests.get(f"{BASE_URL}/api/tasks/sync-check", headers=headers, timeout=30)

    if resp.status_code != 200:
        print(f"✗ sync-check request failed: HTTP {resp.status_code}")
        print(f"  response: {resp.text[:500]}")
        ok = False
    else:
        sc = resp.json()
        if sc.get("error"):
            print(f"⚠ sync-check returned an error: {sc['error']}")
            ok = False
        elif sc.get("in_sync"):
            print("✓ Backend registry fully in sync with the real ETL registry (in_sync=true)")
        else:
            print("✗ Backend registry out of sync with the real ETL registry (in_sync=false)")
            if sc.get("backend_only"):
                print(f"  backend only (missing in ETL): {sc['backend_only']}")
            if sc.get("etl_only"):
                print(f"  ETL only (missing in backend): {sc['etl_only']}")
            ok = False
    print()

    # ── Summary ──────────────────────────────────────────────
    print("=" * 60)
    if ok:
        print("✓ All API health checks passed")
    else:
        print("✗ API health check found problems; see details above")
    print("=" * 60)

    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())
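Reviewer note: the response shape this script assumes for /api/tasks/registry, reconstructed from the field accesses above — not an authoritative schema, and the entries are illustrative:

example_registry = {
    "groups": {
        "finance": [
            {"code": "dws_finance_area_daily", "is_common": True},   # illustrative entry
            {"code": "dwd_load", "is_common": False},                # illustrative entry
        ],
    },
}
# total tasks = sum of group lengths; "common" tasks are those with is_common=True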
209
tools/health/etl_monitor.py
Normal file
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""ETL end-to-end integration monitoring script

Polls execution status and logs every 30 seconds, flagging errors/warnings,
waiting at most 30 idle minutes. Writes the monitoring result as JSON for the
downstream report generator.

Usage: python tools/health/etl_monitor.py <execution_id> <jwt_token>
"""
import json
import re
import sys
import time
import urllib.request
from datetime import datetime, timezone

BASE_URL = "http://localhost:8000"
POLL_INTERVAL = 30  # seconds
MAX_IDLE_MINUTES = 30
MAX_IDLE_SECONDS = MAX_IDLE_MINUTES * 60

# Match genuine error log lines; false positives from JSON stats lines such as
# "'errors': 0" are filtered out in classify_log_lines() below.
ERROR_PATTERN = re.compile(r"\b(ERROR|CRITICAL)\b", re.IGNORECASE)
TRACEBACK_PATTERN = re.compile(r"Traceback \(most recent call last\)")
EXCEPTION_PATTERN = re.compile(r"^\[stderr\].*Exception:", re.IGNORECASE)
WARNING_PATTERN = re.compile(r"\bWARNING\b", re.IGNORECASE)


def api_get(path: str, token: str) -> dict:
    req = urllib.request.Request(
        f"{BASE_URL}{path}",
        headers={"Authorization": f"Bearer {token}"},
    )
    with urllib.request.urlopen(req, timeout=15) as resp:
        return json.loads(resp.read().decode())


def classify_log_lines(lines: list[str]) -> dict:
    """Classify log lines into errors and warnings."""
    errors, warnings = [], []
    for i, line in enumerate(lines):
        if TRACEBACK_PATTERN.search(line) or EXCEPTION_PATTERN.search(line):
            # Collect context (5 lines before and after)
            ctx_start = max(0, i - 5)
            ctx_end = min(len(lines), i + 6)
            errors.append({
                "line_no": i + 1,
                "text": line.strip(),
                "context": [l.strip() for l in lines[ctx_start:ctx_end]],
            })
        elif ERROR_PATTERN.search(line) and "'errors':" not in line:
            errors.append({"line_no": i + 1, "text": line.strip(), "context": []})
        elif WARNING_PATTERN.search(line) and "'errors':" not in line:
            warnings.append({"line_no": i + 1, "text": line.strip()})
    return {"errors": errors, "warnings": warnings}


def monitor(execution_id: str, token: str) -> dict:
    """Main monitoring loop. Returns the full monitoring result."""
    print(f"[monitor] watching execution_id={execution_id}")
    print(f"[monitor] poll interval={POLL_INTERVAL}s, max idle={MAX_IDLE_MINUTES}min")

    start_time = datetime.now(timezone.utc)
    last_log_count = 0
    last_new_log_time = time.time()
    poll_count = 0
    all_log_text = ""
    final_status = "unknown"
    final_exit_code = None
    poll_history = []

    while True:
        poll_count += 1
        now = datetime.now(timezone.utc).isoformat()

        # Fetch logs
        try:
            logs_data = api_get(f"/api/execution/{execution_id}/logs", token)
        except Exception as e:
            print(f"[monitor] #{poll_count} {now} failed to fetch logs: {e}")
            time.sleep(POLL_INTERVAL)
            continue

        log_text = logs_data.get("output_log") or ""
        lines = log_text.split("\n") if log_text else []
        current_count = len(lines)
        new_lines = current_count - last_log_count

        if new_lines > 0:
            last_new_log_time = time.time()
            all_log_text = log_text

        # Fetch execution status
        try:
            hist_data = api_get("/api/execution/history?limit=10", token)
            this_exec = next((h for h in hist_data if h["id"] == execution_id), None)
            status = this_exec["status"] if this_exec else "unknown"
            exit_code = this_exec.get("exit_code") if this_exec else None
            duration_ms = this_exec.get("duration_ms") if this_exec else None
        except Exception as e:
            print(f"[monitor] #{poll_count} {now} failed to fetch status: {e}")
            status = "unknown"
            exit_code = None
            duration_ms = None

        poll_record = {
            "poll": poll_count,
            "time": now,
            "log_lines": current_count,
            "new_lines": new_lines,
            "status": status,
        }
        poll_history.append(poll_record)

        # Print the most recent few log lines
        if new_lines > 0:
            recent = lines[last_log_count:current_count]
            for line in recent[-3:]:
                print(f"    {line.strip()}")

        print(
            f"[monitor] #{poll_count} {now} | "
            f"log lines={current_count}(+{new_lines}) | status={status}"
        )

        last_log_count = current_count

        # Completion check
        if status in ("success", "failed", "cancelled"):
            final_status = status
            final_exit_code = exit_code
            print(f"[monitor] finished: status={status}, exit_code={exit_code}, duration_ms={duration_ms}")
            break

        # Idle timeout check
        idle_seconds = time.time() - last_new_log_time
        if idle_seconds > MAX_IDLE_SECONDS:
            print(f"[monitor] timeout warning: no new logs for {MAX_IDLE_MINUTES} minutes")
            final_status = "timeout_warning"
            break

        time.sleep(POLL_INTERVAL)

    end_time = datetime.now(timezone.utc)

    # Classify the accumulated log
    all_lines = all_log_text.split("\n") if all_log_text else []
    classified = classify_log_lines(all_lines)

    result = {
        "execution_id": execution_id,
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
        "monitor_duration_s": (end_time - start_time).total_seconds(),
        "final_status": final_status,
        "final_exit_code": final_exit_code,
        "total_log_lines": len(all_lines),
        "total_polls": poll_count,
        "errors": classified["errors"],
        "warnings": classified["warnings"],
        "error_count": len(classified["errors"]),
        "warning_count": len(classified["warnings"]),
        "poll_history": poll_history,
        "full_log": all_log_text,
    }

    return result


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python tools/health/etl_monitor.py <execution_id> <jwt_token>")
        sys.exit(1)

    exec_id = sys.argv[1]
    jwt = sys.argv[2]
    result = monitor(exec_id, jwt)

    # Write the result to JSON (temporary location; the report script reads it later)
    import os
    from pathlib import Path

    from dotenv import load_dotenv

    # Load environment variables
    root_env = Path(__file__).resolve().parent.parent.parent / ".env"
    load_dotenv(root_env)

    log_root = os.environ.get("SYSTEM_LOG_ROOT")
    if not log_root:
        raise RuntimeError("SYSTEM_LOG_ROOT environment variable is not set")

    out_dir = Path(log_root)
    out_dir.mkdir(parents=True, exist_ok=True)
    date_str = datetime.now().strftime("%Y-%m-%d")
    out_path = out_dir / f"{date_str}__etl_monitor_result.json"

    with open(out_path, "w", encoding="utf-8") as f:
        # Keep the full log out of the JSON (too large); it is saved separately
        result_slim = {k: v for k, v in result.items() if k != "full_log"}
        json.dump(result_slim, f, ensure_ascii=False, indent=2)

    log_path = out_dir / f"{date_str}__etl_full_log.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        f.write(result["full_log"])

    print(f"[monitor] result saved: {out_path}")
    print(f"[monitor] full log saved: {log_path}")
    print(f"[monitor] final status: {result['final_status']}, errors: {result['error_count']}, warnings: {result['warning_count']}")
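Reviewer note: a minimal consumer of the saved monitor result — it prints the final status and the first few detected errors. It assumes the same SYSTEM_LOG_ROOT variable and date-stamped file name the script writes above.

import json
import os
from datetime import datetime
from pathlib import Path

date_str = datetime.now().strftime("%Y-%m-%d")
path = Path(os.environ["SYSTEM_LOG_ROOT"]) / f"{date_str}__etl_monitor_result.json"
data = json.loads(path.read_text(encoding="utf-8"))

print(data["final_status"], f"({data['error_count']} errors)")
for err in data["errors"][:5]:
    print(f"  line {err['line_no']}: {err['text'][:100]}")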
1399
tools/reporting/dataflow_analyzer.py
Normal file
File diff suppressed because it is too large
340
tools/reporting/gen_dataflow_doc.py
Normal file
@@ -0,0 +1,340 @@
# -*- coding: utf-8 -*-
"""
Extract the API → ODS → DWD dataflow mapping from source code and DDL, and emit a Markdown document.
Usage: python tools/reporting/gen_dataflow_doc.py
Output: $FULL_DATAFLOW_DOC_ROOT/dataflow_api_ods_dwd.md (configured in .env)
"""
import re
from pathlib import Path

ROOT = Path(__file__).resolve().parents[2]
ETL = ROOT / "apps" / "etl" / "pipelines" / "feiqiu"
DB = ROOT / "db" / "etl_feiqiu" / "schemas"
from _env_paths import get_output_path as _get_path
OUT = _get_path("FULL_DATAFLOW_DOC_ROOT") / "dataflow_api_ods_dwd.md"


# ── 1. Parse table structure from the DDL ─────────────────────────
def parse_ddl_tables(sql_path: Path, schema: str) -> dict[str, list[dict]]:
    """Parse CREATE TABLE statements; returns {schema.table: [{col, type}, ...]}"""
    text = sql_path.read_text(encoding="utf-8")
    tables: dict[str, list[dict]] = {}
    # Match CREATE TABLE IF NOT EXISTS table_name (...)
    pattern = re.compile(
        r"CREATE\s+TABLE\s+IF\s+NOT\s+EXISTS\s+"
        r"(?:(\w+)\.)?(\w+)\s*\((.*?)\)\s*;",
        re.DOTALL | re.IGNORECASE,
    )
    for m in pattern.finditer(text):
        s = m.group(1) or schema
        tname = m.group(2)
        body = m.group(3)
        cols = []
        for line in body.split("\n"):
            line = line.strip().rstrip(",")
            if not line or line.upper().startswith("PRIMARY") or line.startswith("--"):
                continue
            # Skip constraint lines
            if re.match(r"^(CONSTRAINT|UNIQUE|CHECK|FOREIGN|EXCLUDE)\b", line, re.I):
                continue
            parts = line.split()
            if len(parts) >= 2:
                col_name = parts[0].strip('"')
                col_type = parts[1]
                # Merge a detached type modifier such as "numeric (10,2)"
                if len(parts) > 2 and parts[2].startswith("("):
                    col_type += parts[2]
                cols.append({"col": col_name, "type": col_type})
        full = f"{s}.{tname}"
        tables[full] = cols
    return tables


# ── 2. Parse TABLE_MAP from the Python source ─────────────────────
def parse_table_map(py_path: Path) -> dict[str, str]:
    """Parse TABLE_MAP: dict[str, str] = {...}"""
    text = py_path.read_text(encoding="utf-8")
    # Locate the TABLE_MAP dict
    match = re.search(
        r"TABLE_MAP\s*(?::\s*dict\[.*?\])?\s*=\s*\{(.*?)\}",
        text,
        re.DOTALL,
    )
    if not match:
        return {}
    body = match.group(1)
    result = {}
    for m in re.finditer(r'"([^"]+)"\s*:\s*"([^"]+)"', body):
        result[m.group(1)] = m.group(2)
    return result


# ── 3. Parse FACT_MAPPINGS from the Python source ─────────────────
def parse_fact_mappings(py_path: Path) -> dict[str, list[tuple]]:
    """Parse the FACT_MAPPINGS dict; returns {dwd_table: [(dwd_col, ods_expr, cast), ...]}"""
    text = py_path.read_text(encoding="utf-8")
    # Locate the FACT_MAPPINGS block
    start = text.find("FACT_MAPPINGS")
    if start < 0:
        return {}
    # Find the content after the first {
    brace_start = text.find("{", start)
    if brace_start < 0:
        return {}

    # Match braces manually
    depth = 0
    end = brace_start
    for i in range(brace_start, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                end = i + 1
                break

    block = text[brace_start:end]
    result = {}
    # Match each table's mapping list
    table_pattern = re.compile(r'"([^"]+)"\s*:\s*\[', re.DOTALL)
    for tm in table_pattern.finditer(block):
        table_name = tm.group(1)
        list_start = tm.end()
        # Find the matching ]
        bracket_depth = 1
        list_end = list_start
        for i in range(list_start, len(block)):
            if block[i] == "[":
                bracket_depth += 1
            elif block[i] == "]":
                bracket_depth -= 1
                if bracket_depth == 0:
                    list_end = i
                    break
        list_body = block[list_start:list_end]
        # Match (dwd_col, ods_expr, cast|None)
        tuples = []
        tuple_pattern = re.compile(
            r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(?:"([^"]+)"|None)\s*\)'
        )
        for tp in tuple_pattern.finditer(list_body):
            tuples.append((tp.group(1), tp.group(2), tp.group(3)))
        result[table_name] = tuples
    return result


# ── 4. Parse ODS_TASK_SPECS from the Python source ────────────────
def parse_ods_specs(py_path: Path) -> list[dict]:
    """Parse ODS_TASK_SPECS, extracting code, table_name, endpoint, list_key, description"""
    text = py_path.read_text(encoding="utf-8")
    specs = []
    # Match each OdsTaskSpec(...)
    pattern = re.compile(r"OdsTaskSpec\s*\((.*?)\)\s*,", re.DOTALL)
    for m in pattern.finditer(text):
        body = m.group(1)
        spec = {}
        for key in ("code", "table_name", "endpoint", "list_key", "description"):
            km = re.search(rf'{key}\s*=\s*"([^"]*)"', body)
            if km:
                spec[key] = km.group(1)
        if "code" in spec:
            specs.append(spec)
    return specs


# ── 5. Generate the document ──────────────────────────────────────
def generate_doc():
    ods_ddl = parse_ddl_tables(DB / "ods.sql", "ods")
    dwd_ddl = parse_ddl_tables(DB / "dwd.sql", "dwd")

    dwd_task_py = ETL / "tasks" / "dwd" / "dwd_load_task.py"
    table_map = parse_table_map(dwd_task_py)
    fact_mappings = parse_fact_mappings(dwd_task_py)

    ods_specs = parse_ods_specs(ETL / "tasks" / "ods" / "ods_tasks.py")
    # ODS table → API endpoint mapping
    ods_to_api: dict[str, dict] = {}
    for spec in ods_specs:
        tn = spec.get("table_name", "")
        ods_to_api[tn] = spec

    lines = []
    lines.append("# API → ODS → DWD Dataflow Comparison")
    lines.append("")
    lines.append("> Auto-generated by `tools/reporting/gen_dataflow_doc.py` from the DDL and ETL source code.")
    lines.append("")
    lines.append("## Overview")
    lines.append("")
    lines.append(f"- ODS tables: {len(ods_ddl)}")
    lines.append(f"- DWD tables: {len(dwd_ddl)}")
    lines.append(f"- TABLE_MAP entries: {len(table_map)}")
    lines.append(f"- ODS tasks: {len(ods_specs)}")
    lines.append("")

    # ── Group by ODS table ──
    # First build the reverse map: ODS table → list of DWD tables
    ods_to_dwd: dict[str, list[str]] = {}
    for dwd_t, ods_t in table_map.items():
        ods_to_dwd.setdefault(ods_t, []).append(dwd_t)

    # Collect every ODS table involved (deduplicated, sorted)
    all_ods = sorted(set(list(ods_to_dwd.keys()) + [s.get("table_name", "") for s in ods_specs]))

    lines.append("## Contents")
    lines.append("")
    for i, ods_t in enumerate(all_ods, 1):
        anchor = ods_t.replace(".", "").replace("_", "-")
        short = ods_t.split(".")[-1] if "." in ods_t else ods_t
        lines.append(f"{i}. [{short}](#{anchor})")
    lines.append("")
    lines.append("---")
    lines.append("")

    # ── Per-table detail ──
    for ods_t in all_ods:
        short = ods_t.split(".")[-1] if "." in ods_t else ods_t
        lines.append(f"## {short}")
        lines.append("")

        # API info
        api_info = ods_to_api.get(ods_t, {})
        if api_info:
            lines.append("### API endpoint")
            lines.append("")
            lines.append(f"- task code: `{api_info.get('code', 'N/A')}`")
            lines.append(f"- endpoint: `{api_info.get('endpoint', 'N/A')}`")
            lk = api_info.get("list_key")
            if lk:
                lines.append(f"- data path: `data.{lk}`")
            desc = api_info.get("description", "")
            if desc:
                lines.append(f"- description: {desc}")
            lines.append("")

        # ODS table columns
        ods_cols = ods_ddl.get(ods_t, [])
        if ods_cols:
            lines.append(f"### ODS table: `{ods_t}` ({len(ods_cols)} columns)")
            lines.append("")
            lines.append("| # | Column | Type |")
            lines.append("|---|------|------|")
            for idx, c in enumerate(ods_cols, 1):
                lines.append(f"| {idx} | `{c['col']}` | {c['type']} |")
            lines.append("")

        # DWD tables
        dwd_tables = ods_to_dwd.get(ods_t, [])
        if dwd_tables:
            for dwd_t in sorted(dwd_tables):
                dwd_cols = dwd_ddl.get(dwd_t, [])
                is_dim = "dim_" in dwd_t
                is_ex = dwd_t.endswith("_ex")
                table_type = "dimension" if is_dim else "fact"
                if is_ex:
                    table_type += " (extension)"

                mappings = fact_mappings.get(dwd_t, [])

                lines.append(f"### DWD table: `{dwd_t}` — {table_type} ({len(dwd_cols)} columns)")
                lines.append("")

                # Column comparison table
                lines.append("| # | DWD column | DWD type | ODS source expression | Conversion | Note |")
                lines.append("|---|----------|----------|----------------|------|------|")

                # Build the mapping lookup
                mapping_dict = {m[0]: (m[1], m[2]) for m in mappings}

                for idx, c in enumerate(dwd_cols, 1):
                    col_name = c["col"]
                    col_type = c["type"]

                    # SCD2 columns
                    scd2_cols = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
                    if col_name.lower() in scd2_cols:
                        lines.append(f"| {idx} | `{col_name}` | {col_type} | — | — | DWD slowly-changing metadata |")
                        continue

                    if col_name in mapping_dict:
                        ods_expr, cast = mapping_dict[col_name]
                        cast_str = f"CAST → {cast}" if cast else "direct"
                        # Flag JSONB extraction / derivation / rename
                        note = ""
                        if "->>" in ods_expr:
                            note = "JSONB extraction"
                        elif "CASE" in ods_expr.upper():
                            note = "derived"
                        elif ods_expr != col_name:
                            note = "renamed"
                        lines.append(f"| {idx} | `{col_name}` | {col_type} | `{ods_expr}` | {cast_str} | {note} |")
                    else:
                        # Same-name passthrough
                        ods_col_names = {oc["col"].lower() for oc in ods_cols}
                        if col_name.lower() in ods_col_names:
                            lines.append(f"| {idx} | `{col_name}` | {col_type} | `{col_name}` | direct | same-name passthrough |")
                        else:
                            lines.append(f"| {idx} | `{col_name}` | {col_type} | — | — | not explicitly mapped in FACT_MAPPINGS |")

                lines.append("")
        else:
            lines.append("*This ODS table has no DWD mapping yet (used only by DWS or other downstream consumers)*")
            lines.append("")

        lines.append("---")
        lines.append("")

    # ── Appendix: ETL metadata columns ──
    lines.append("## Appendix: ETL metadata columns")
    lines.append("")
    lines.append("Every ODS table carries these ETL metadata columns; they are not mapped into DWD:")
    lines.append("")
    lines.append("| Column | Type | Description |")
    lines.append("|------|------|------|")
    lines.append("| `content_hash` | TEXT | content hash of the record, for dedup and change detection |")
    lines.append("| `source_file` | TEXT | original export file name, for data traceability |")
    lines.append("| `source_endpoint` | TEXT | source API/file path |")
    lines.append("| `fetched_at` | TIMESTAMPTZ | fetch/load timestamp |")
    lines.append("| `payload` | JSONB | full raw JSON record snapshot |")
    lines.append("")

    lines.append("## Appendix: DWD dimension SCD2 columns")
    lines.append("")
    lines.append("Every DWD dimension table (`dim_*`) carries these SCD2 slowly-changing-dimension columns:")
    lines.append("")
    lines.append("| Column | Type | Description |")
    lines.append("|------|------|------|")
    lines.append("| `scd2_start_time` | TIMESTAMPTZ | version effective-from |")
    lines.append("| `scd2_end_time` | TIMESTAMPTZ | version effective-to (9999-12-31 = current) |")
    lines.append("| `scd2_is_current` | INT | current-version flag (1=current, 0=history) |")
    lines.append("| `scd2_version` | INT | version number (incrementing) |")
    lines.append("")

    lines.append("## Appendix: DWD fact-table incremental strategy")
    lines.append("")
    lines.append("Fact tables are loaded incrementally by time window, filtering on these business time columns in priority order:")
    lines.append("")
    lines.append("1. `pay_time` — payment time")
    lines.append("2. `create_time` — creation time")
    lines.append("3. `update_time` — update time")
    lines.append("4. `occur_time` — occurrence time")
    lines.append("5. `settle_time` — settlement time")
    lines.append("6. `start_use_time` — start-of-use time")
    lines.append("7. `fetched_at` — load time (fallback)")
    lines.append("")

    # Write the file
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text("\n".join(lines), encoding="utf-8")
    print(f"Document generated: {OUT}")
    print(f"  ODS tables: {len(ods_ddl)}, DWD tables: {len(dwd_ddl)}")
    print(f"  TABLE_MAP: {len(table_map)} entries, FACT_MAPPINGS: {len(fact_mappings)} entries")
    print(f"  ODS tasks: {len(ods_specs)}")


if __name__ == "__main__":
    generate_doc()
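Reviewer note: a quick self-contained check of parse_ddl_tables() against a tiny inline DDL, showing the expected return shape. Paste it into this module (e.g. under the __main__ guard) rather than importing the module, since the module-level _env_paths lookup needs the real .env.

import tempfile
from pathlib import Path

ddl = """
CREATE TABLE IF NOT EXISTS ods.demo (
    id bigint NOT NULL,
    amount numeric(10,2),
    payload jsonb
);
"""
with tempfile.NamedTemporaryFile("w", suffix=".sql", delete=False, encoding="utf-8") as tf:
    tf.write(ddl)
tables = parse_ddl_tables(Path(tf.name), "ods")
assert "ods.demo" in tables
assert tables["ods.demo"][0] == {"col": "id", "type": "bigint"}
print(tables)  # {'ods.demo': [{'col': 'id', ...}, {'col': 'amount', ...}, {'col': 'payload', ...}]}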
956
tools/reporting/gen_dataflow_report.py
Normal file
956
tools/reporting/gen_dataflow_report.py
Normal file
@@ -0,0 +1,956 @@
|
||||
"""
|
||||
数据流结构分析报告生成器(v3)
|
||||
|
||||
读取 analyze_dataflow.py 采集的数据,生成带锚点链接、上下游映射列、
|
||||
业务描述、多示例值、字段差异报告的 Markdown 报告。
|
||||
|
||||
增强内容(v3):
|
||||
- 总览表增加 API JSON 字段数列
|
||||
- 覆盖率表增加业务描述列
|
||||
- 逐表详情增加业务描述列(来自 BD_manual 文档)
|
||||
- 说明+示例值合并,多示例展示,枚举值解释
|
||||
- 总览章节增加 API↔ODS↔DWD 字段对比差异报告
|
||||
|
||||
用法:
|
||||
python scripts/ops/gen_dataflow_report.py
|
||||
python scripts/ops/gen_dataflow_report.py --output-dir /path/to/output
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv # noqa: F401 — _env_paths 负责加载,此处保留以防其他模块间接引用
|
||||
|
||||
# ── 白名单定义 ──────────────────────────────────────────────────────────
|
||||
# 白名单字段仍然参与检查和统计,但在报告的 1.1 差异明细表格和 3. 逐表详情表格中
|
||||
# 折叠显示(不展开详细行),并注明白名单原因。
|
||||
# CHANGE 2026-02-21 | 重构白名单逻辑:统一术语为"白名单",字段仍正常检查,仅报告展示折叠
|
||||
|
||||
# ODS 层 ETL 元数据列(非业务字段,ETL 流程自动生成)
|
||||
WHITELIST_ETL_META_COLS = {
|
||||
"source_file", "source_endpoint", "fetched_at", "payload", "content_hash",
|
||||
}
|
||||
|
||||
# DWD 维表 SCD2 管理列(ETL 框架自动维护,非业务映射)
|
||||
# CHANGE 2026-03-26 | 补充 scd2_* 新版列名,与 etl_consistency_check.py 保持一致
|
||||
WHITELIST_DWD_SCD2_COLS = {
|
||||
"valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id",
|
||||
"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version",
|
||||
}
|
||||
|
||||
# API 嵌套对象前缀(上游 API 的门店信息嵌套结构,已通过 site_id 关联,不逐字段映射)
|
||||
WHITELIST_API_NESTED_PREFIXES = ("siteProfile.",)
|
||||
|
||||
|
||||
def is_whitelist_etl_meta(col_name: str) -> bool:
|
||||
"""判断是否为 ETL 元数据白名单列"""
|
||||
return col_name in WHITELIST_ETL_META_COLS
|
||||
|
||||
|
||||
def is_whitelist_scd2(col_name: str) -> bool:
|
||||
"""判断是否为 DWD SCD2 管理白名单列"""
|
||||
return col_name in WHITELIST_DWD_SCD2_COLS
|
||||
|
||||
|
||||
def is_whitelist_api_nested(json_path: str) -> bool:
|
||||
"""判断是否为 API 嵌套对象白名单字段"""
|
||||
return any(json_path.startswith(p) for p in WHITELIST_API_NESTED_PREFIXES)
|
||||
|
||||
|
||||
def whitelist_reason(col_name: str, json_path: str = "", layer: str = "") -> str:
|
||||
"""返回白名单原因描述,非白名单返回空字符串"""
|
||||
if is_whitelist_etl_meta(col_name):
|
||||
return "ETL 元数据列"
|
||||
if is_whitelist_scd2(col_name):
|
||||
return "SCD2 管理列"
|
||||
if json_path and is_whitelist_api_nested(json_path):
|
||||
return "API 嵌套对象(siteProfile)"
|
||||
return ""
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict | list | None:
|
||||
if not path.exists():
|
||||
return None
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(description="生成数据流结构分析 Markdown 报告")
|
||||
parser.add_argument("--output-dir", type=str, default=None,
|
||||
help="输出目录(默认读取 .env 中的 SYSTEM_ANALYZE_ROOT)")
|
||||
return parser
|
||||
|
||||
|
||||
def resolve_data_dir(override: str | None = None) -> Path:
|
||||
if override:
|
||||
return Path(override)
|
||||
from _env_paths import get_output_path
|
||||
return get_output_path("SYSTEM_ANALYZE_ROOT")
|
||||
|
||||
|
||||
def _esc(s: str) -> str:
|
||||
"""转义 Markdown 表格中的管道符"""
|
||||
return str(s).replace("|", "\\|").replace("\n", " ") if s else ""
|
||||
|
||||
|
||||
|
||||
|
||||
def _format_samples(samples: list[str], max_show: int = 5) -> str:
|
||||
"""格式化多示例值,截断过长的值"""
|
||||
if not samples:
|
||||
return ""
|
||||
shown = []
|
||||
for s in samples[:max_show]:
|
||||
s = _esc(s)
|
||||
if len(s) > 30:
|
||||
s = s[:27] + "..."
|
||||
shown.append(f"`{s}`")
|
||||
result = ", ".join(shown)
|
||||
if len(samples) > max_show:
|
||||
result += f" …共{len(samples)}种"
|
||||
return result
|
||||
|
||||
|
||||
def _is_enum_like(samples: list[str], total_records: int) -> bool:
|
||||
"""判断字段是否像枚举(不同值少且记录数足够多)"""
|
||||
if total_records < 5:
|
||||
return False
|
||||
return 1 < len(samples) <= 8
|
||||
|
||||
|
||||
def _write_source_file_manifest(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None):
|
||||
"""在报告开头输出本次分析用到的所有 JSON 数据源文件清单"""
|
||||
if fm_dir is None:
|
||||
fm_dir = data_dir / "field_mappings"
|
||||
w("## 数据源文件清单")
|
||||
w()
|
||||
w("本报告基于以下 JSON 数据文件生成:")
|
||||
w()
|
||||
|
||||
categories = [
|
||||
("collection_manifest.json", "采集元数据(表清单、日期范围、记录数)"),
|
||||
("json_trees/", "API JSON 字段结构(递归展开后的字段路径、类型、示例值)"),
|
||||
("field_mappings/", "三层字段映射(API→ODS→DWD 映射关系)"),
|
||||
("db_schemas/", "数据库表结构(ODS/DWD 列定义,来自 PostgreSQL)"),
|
||||
("bd_descriptions/", "业务描述(来自 BD_manual 文档)"),
|
||||
]
|
||||
|
||||
for cat_path, cat_desc in categories:
|
||||
if cat_path.endswith("/"):
|
||||
# 子目录:列出实际存在的文件
|
||||
# CHANGE 2026-02-21 | field_mappings 使用传入的 fm_dir(可能是 field_mappings_new)
|
||||
if cat_path.rstrip("/") == "field_mappings":
|
||||
sub_dir = fm_dir
|
||||
else:
|
||||
sub_dir = data_dir / cat_path.rstrip("/")
|
||||
if sub_dir.is_dir():
|
||||
try:
|
||||
files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json")
|
||||
except PermissionError:
|
||||
w(f"**{cat_path}** — {cat_desc}(目录权限拒绝)")
|
||||
w()
|
||||
continue
|
||||
if sub_dir.is_dir():
|
||||
files = sorted(f.name for f in sub_dir.iterdir() if f.suffix == ".json")
|
||||
w(f"**{cat_path}** — {cat_desc}({len(files)} 个文件)")
|
||||
w()
|
||||
for fn in files:
|
||||
w(f"- `{cat_path}{fn}`")
|
||||
w()
|
||||
else:
|
||||
w(f"**{cat_path}** — {cat_desc}(目录不存在)")
|
||||
w()
|
||||
else:
|
||||
# 单文件
|
||||
fp = data_dir / cat_path
|
||||
status = "✓" if fp.exists() else "✗ 缺失"
|
||||
w(f"- `{cat_path}` — {cat_desc}({status})")
|
||||
w()
|
||||
|
||||
w("---")
|
||||
w()
|
||||
|
||||
|
||||
def generate_report(data_dir: Path) -> str:
|
||||
"""生成完整的 Markdown 报告"""
|
||||
manifest = load_json(data_dir / "collection_manifest.json")
|
||||
if not manifest:
|
||||
raise FileNotFoundError(f"找不到 collection_manifest.json: {data_dir}")
|
||||
|
||||
# CHANGE 2026-02-21 | Windows 文件锁 fallback:field_mappings_new 优先于被锁的 field_mappings
|
||||
_fm_dir = data_dir / "field_mappings"
|
||||
_fm_new = data_dir / "field_mappings_new"
|
||||
if _fm_new.exists() and any(_fm_new.iterdir()):
|
||||
_fm_dir = _fm_new
|
||||
|
||||
tables = manifest["tables"]
|
||||
now = datetime.now()
|
||||
lines: list[str] = []
|
||||
|
||||
def w(s: str = ""):
|
||||
lines.append(s)
|
||||
|
||||
# ── 从 manifest 读取 API 请求日期范围 ──
|
||||
api_date_from = manifest.get("date_from")
|
||||
api_date_to = manifest.get("date_to")
|
||||
total_records_all = sum(t.get("record_count", 0) for t in tables)
|
||||
|
||||
# ── 报告头 ──
|
||||
w("# 飞球连接器 — 数据流结构分析报告")
|
||||
w()
|
||||
w(f"> 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} CST")
|
||||
w(f"> 分析范围:飞球(feiqiu)连接器,共 {len(tables)} 张 ODS 表")
|
||||
w("> 数据来源:API JSON 采样 + PostgreSQL ODS/DWD 表结构 + 三层字段映射 + BD_manual 业务文档")
|
||||
if api_date_from or api_date_to:
|
||||
w(f"> API 请求日期范围:{api_date_from or '—'} ~ {api_date_to or '—'}")
|
||||
w(f"> JSON 数据总量:{total_records_all} 条记录")
|
||||
w()
|
||||
|
||||
# ── 数据源文件清单 ──
|
||||
_write_source_file_manifest(w, data_dir, tables, fm_dir=_fm_dir)
|
||||
|
||||
# ── 1. 总览表(增加 API JSON 字段数列) ──
|
||||
w("## 1. 总览")
|
||||
w()
|
||||
w("| # | ODS 表名 | 业务描述 | 采样记录数 | API JSON 字段数 | ODS 列数 | DWD 目标表 | DWD 总列数 |")
|
||||
w("|---|---------|---------|-----------|---------------|---------|-----------|-----------|")
|
||||
total_records = 0
|
||||
total_ods_cols = 0
|
||||
total_dwd_cols = 0
|
||||
total_json_fields = 0
|
||||
for i, t in enumerate(tables, 1):
|
||||
dwd_names = ", ".join(t["dwd_tables"]) if t["dwd_tables"] else "—"
|
||||
json_fc = t.get("json_field_count", 0)
|
||||
w(f"| {i} | `{t['table']}` | {t['description']} | {t['record_count']} | {json_fc} | {t['ods_column_count']} | {dwd_names} | {t['dwd_column_count']} |")
|
||||
total_records += t["record_count"]
|
||||
total_ods_cols += t["ods_column_count"]
|
||||
total_dwd_cols += t["dwd_column_count"]
|
||||
total_json_fields += json_fc
|
||||
w(f"| | **合计** | | **{total_records}** | **{total_json_fields}** | **{total_ods_cols}** | | **{total_dwd_cols}** |")
|
||||
w()
|
||||
|
||||
# ── 1.1 字段对比差异报告 ──
|
||||
_write_field_diff_report(w, data_dir, tables, fm_dir=_fm_dir)
|
||||
|
||||
# ── 2. 全局统计 ──
|
||||
w("## 2. 全局统计")
|
||||
w()
|
||||
|
||||
# 2.1 JSON→ODS 映射覆盖
|
||||
total_json = 0
|
||||
total_mapped = 0
|
||||
per_table_stats: list[dict] = []
|
||||
for t in tables:
|
||||
fm = load_json(_fm_dir / f"{t['table']}.json")
|
||||
if not fm or "json_to_ods" not in fm:
|
||||
per_table_stats.append({
|
||||
"table": t["table"], "description": t["description"],
|
||||
"json_count": 0, "mapped": 0, "unmapped": 0, "pct": "—",
|
||||
})
|
||||
continue
|
||||
j2o = fm["json_to_ods"]
|
||||
json_count = len(j2o)
|
||||
mapped = sum(1 for m in j2o if m.get("ods_col") is not None)
|
||||
unmapped = json_count - mapped
|
||||
pct = f"{mapped / json_count * 100:.1f}%" if json_count > 0 else "—"
|
||||
per_table_stats.append({
|
||||
"table": t["table"], "description": t["description"],
|
||||
"json_count": json_count, "mapped": mapped, "unmapped": unmapped, "pct": pct,
|
||||
})
|
||||
total_json += json_count
|
||||
total_mapped += mapped
|
||||
|
||||
total_unmapped = total_json - total_mapped
|
||||
w("### 2.1 JSON→ODS 映射覆盖")
|
||||
w()
|
||||
w(f"- JSON 字段总数:{total_json}")
|
||||
if total_json > 0:
|
||||
w(f"- 已映射到 ODS 列:{total_mapped}({total_mapped / total_json * 100:.1f}%)")
|
||||
w(f"- 仅存于 payload:{total_unmapped}({total_unmapped / total_json * 100:.1f}%)")
|
||||
else:
|
||||
w("- 已映射到 ODS 列:0")
|
||||
w("- 仅存于 payload:0")
|
||||
w()
|
||||
|
||||
# 2.2 ODS→DWD 映射覆盖
|
||||
w("### 2.2 ODS→DWD 映射覆盖")
|
||||
w()
|
||||
w(f"- DWD 列总数:{total_dwd_cols}")
|
||||
w()
|
||||
|
||||
# 2.3 各表覆盖率(增加业务描述列)
|
||||
w("### 2.3 各表 JSON→ODS 映射覆盖率")
|
||||
w()
|
||||
w("| ODS 表名 | 业务描述 | JSON 字段数 | 已映射 | 仅 payload | 覆盖率 |")
|
||||
w("|---------|---------|-----------|-------|-----------|-------|")
|
||||
sorted_stats = sorted(per_table_stats, key=lambda x: (0 if x["pct"] == "—" else -float(x["pct"].rstrip("%"))))
|
||||
for s in sorted_stats:
|
||||
w(f"| `{s['table']}` | {s['description']} | {s['json_count']} | {s['mapped']} | {s['unmapped']} | {s['pct']} |")
|
||||
w()
|
||||
|
||||
# ── 3. 逐表详情 ──
|
||||
w("## 3. 逐表详情")
|
||||
w()
|
||||
|
||||
for idx, t in enumerate(tables, 1):
|
||||
table_name = t["table"]
|
||||
fm = load_json(_fm_dir / f"{table_name}.json")
|
||||
jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
|
||||
ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
|
||||
bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
|
||||
|
||||
# 锚点 ID
|
||||
anchors = fm.get("anchors", {}) if fm else {}
|
||||
api_anchor = anchors.get("api", f"api-{table_name}")
|
||||
ods_anchor = anchors.get("ods", f"ods-{table_name}")
|
||||
dwd_anchors = anchors.get("dwd", {})
|
||||
|
||||
dwd_tables_list = t.get("dwd_tables", [])
|
||||
json_fc = t.get("json_field_count", 0)
|
||||
|
||||
w(f"### 3.{idx} {table_name}({t['description']})")
|
||||
w()
|
||||
w(f"- 任务代码:`{t['task_code']}`")
|
||||
w(f"- 采样记录数:{t['record_count']}")
|
||||
w(f"- API JSON 字段数:{json_fc}")
|
||||
w(f"- ODS 列数:{t['ods_column_count']}")
|
||||
if dwd_tables_list:
|
||||
w(f"- DWD 目标表:{', '.join(dwd_tables_list)}")
|
||||
else:
|
||||
w("- DWD 目标表:—(仅 ODS 落地)")
|
||||
w()
|
||||
|
||||
# ── API 源字段区块 ──
|
||||
_write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor)
|
||||
|
||||
# ── ODS 表结构区块 ──
|
||||
_write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors)
|
||||
|
||||
# ── DWD 表结构区块 ──
|
||||
for dwd_name in dwd_tables_list:
|
||||
dwd_anchor = dwd_anchors.get(dwd_name, f"dwd-{dwd_name}")
|
||||
dwd_schema = load_json(data_dir / "db_schemas" / f"dwd_{dwd_name}.json")
|
||||
_write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
|
||||
def _write_field_diff_report(w, data_dir: Path, tables: list[dict], fm_dir: Path | None = None):
    """Generate the API↔ODS↔DWD field-diff report (summary table + per-table sections)."""
    if fm_dir is None:
        fm_dir = data_dir / "field_mappings"
    w("### 1.1 API↔ODS↔DWD 字段对比差异")
    w()
    w("以下汇总各表在三层之间的字段差异(点击数字跳转至分表详情):")
    w()
    w("| ODS 表名 | API→ODS 未映射 | ODS 无 JSON 源 | ODS→DWD 未映射 | DWD 无 ODS 源 | 主要差异原因 |")
    w("|---------|--------------|--------------|--------------|-------------|------------|")

    # CHANGE 2026-02-21 | Reworked whitelist handling: whitelisted fields are still
    # checked and counted here; they are only collapsed in the per-table details.
    # Collect per-table diff data for both the summary table and the detail sections.
    diff_rows: list[dict] = []

    for t in tables:
        table_name = t["table"]
        fm = load_json(fm_dir / f"{table_name}.json")
        if not fm:
            w(f"| `{table_name}` | — | — | — | — | 无映射数据 |")
            diff_rows.append(None)
            continue

        anchors = fm.get("anchors", {})
        api_anchor = anchors.get("api", f"api-{table_name.replace('_', '-')}")
        ods_anchor = anchors.get("ods", f"ods-{table_name.replace('_', '-')}")
        dwd_anchors = anchors.get("dwd", {})
        diff_anchor = f"diff-{table_name.replace('_', '-')}"

        j2o = fm.get("json_to_ods", [])
        o2d = fm.get("ods_to_dwd", {})
        d2o = fm.get("dwd_to_ods", {})

        # ── API→ODS unmapped fields (all checked, whitelist included) ──
        api_unmapped_flat: list[str] = []
        api_unmapped_nested: list[str] = []
        api_unmapped_whitelist: list[tuple[str, str]] = []  # (json_path, reason)
        for m in j2o:
            if m.get("ods_col") is None:
                jp = m.get("json_path", "")
                wl_reason = whitelist_reason("", json_path=jp)
                if wl_reason:
                    api_unmapped_whitelist.append((jp, wl_reason))
                elif "." in jp:
                    api_unmapped_nested.append(jp)
                else:
                    api_unmapped_flat.append(jp)
        api_unmapped_total = len(api_unmapped_flat) + len(api_unmapped_nested) + len(api_unmapped_whitelist)

        # ── ODS columns without a JSON source (all checked, whitelist included) ──
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        ods_mapped_cols = {m["ods_col"] for m in j2o if m.get("ods_col")}
        ods_no_json_fields: list[str] = []
        ods_no_json_whitelist: list[tuple[str, str]] = []  # (col_name, reason)
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_mapped_cols:
                    wl_reason = whitelist_reason(col["name"])
                    if wl_reason:
                        ods_no_json_whitelist.append((col["name"], wl_reason))
                    else:
                        ods_no_json_fields.append(col["name"])

        # ── ODS→DWD unmapped (all checked, whitelist included) ──
        ods_cols_with_dwd = set(o2d.keys())
        ods_no_dwd_fields: list[str] = []
        ods_no_dwd_whitelist: list[tuple[str, str]] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_cols_with_dwd:
                    wl_reason = whitelist_reason(col["name"])
                    if wl_reason:
                        ods_no_dwd_whitelist.append((col["name"], wl_reason))
                    else:
                        ods_no_dwd_fields.append(col["name"])

        # ── DWD columns without an ODS source (all checked, whitelist included) ──
        dwd_no_ods_fields: list[tuple[str, str]] = []  # (dwd_table, dwd_col)
        dwd_no_ods_whitelist: list[tuple[str, str, str]] = []  # (dwd_table, dwd_col, reason)
        for dwd_name, entries in d2o.items():
            for entry in entries:
                if entry.get("ods_source") == "—":
                    wl_reason = whitelist_reason(entry["dwd_col"])
                    if wl_reason:
                        dwd_no_ods_whitelist.append((dwd_name, entry["dwd_col"], wl_reason))
                    else:
                        dwd_no_ods_fields.append((dwd_name, entry["dwd_col"]))

        # Diff reasons (whitelist counts included)
        reasons: list[str] = []
        if api_unmapped_nested:
            reasons.append(f"嵌套对象 {len(api_unmapped_nested)} 个")
        if api_unmapped_flat:
            reasons.append(f"平层未映射 {len(api_unmapped_flat)} 个")
        if dwd_no_ods_fields:
            reasons.append(f"SCD2/派生列 {len(dwd_no_ods_fields)} 个")
        wl_total = len(api_unmapped_whitelist) + len(ods_no_json_whitelist) + len(ods_no_dwd_whitelist) + len(dwd_no_ods_whitelist)
        if wl_total:
            reasons.append(f"白名单 {wl_total} 个")
        reason_str = ";".join(reasons) if reasons else "—"

        # Summary-table cell: count + jump link (whitelisted fields count toward totals)
        def _cell(count: int) -> str:
            if count == 0:
                return "0"
            return f"[{count}](#{diff_anchor})"

        w(f"| `{table_name}` | {_cell(api_unmapped_total)} | {_cell(len(ods_no_json_fields) + len(ods_no_json_whitelist))} | {_cell(len(ods_no_dwd_fields) + len(ods_no_dwd_whitelist))} | {_cell(len(dwd_no_ods_fields) + len(dwd_no_ods_whitelist))} | {reason_str} |")

        diff_rows.append({
            "table_name": table_name,
            "diff_anchor": diff_anchor,
            "api_anchor": api_anchor,
            "ods_anchor": ods_anchor,
            "dwd_anchors": dwd_anchors,
            "api_unmapped_flat": api_unmapped_flat,
            "api_unmapped_nested": api_unmapped_nested,
            "api_unmapped_whitelist": api_unmapped_whitelist,
            "ods_no_json_fields": ods_no_json_fields,
            "ods_no_json_whitelist": ods_no_json_whitelist,
            "ods_no_dwd_fields": ods_no_dwd_fields,
            "ods_no_dwd_whitelist": ods_no_dwd_whitelist,
            "dwd_no_ods_fields": dwd_no_ods_fields,
            "dwd_no_ods_whitelist": dwd_no_ods_whitelist,
        })

    w()
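    # Each non-zero summary cell above renders as a jump link into the matching
    # per-table section, e.g. (illustrative anchor) "[3](#diff-sale-order)";
    # zero counts stay as a plain "0" so the summary table is easy to scan.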
    # ── Per-table diff detail sections ──
    # CHANGE 2026-02-21 | Whitelisted fields are collapsed (no detail table rows);
    # only the whitelist reason is noted.
    sub_idx = 0
    for row in diff_rows:
        if row is None:
            continue
        has_any = (row["api_unmapped_flat"] or row["api_unmapped_nested"]
                   or row["api_unmapped_whitelist"]
                   or row["ods_no_json_fields"] or row["ods_no_json_whitelist"]
                   or row["ods_no_dwd_fields"] or row["ods_no_dwd_whitelist"]
                   or row["dwd_no_ods_fields"] or row["dwd_no_ods_whitelist"])
        if not has_any:
            continue

        sub_idx += 1
        table_name = row["table_name"]
        w(f'<a id="{row["diff_anchor"]}"></a>')
        w()
        w(f"#### 1.1.{sub_idx} {table_name} 字段差异明细")
        w()

        api_anchor = row["api_anchor"]
        ods_anchor = row["ods_anchor"]
        dwd_anchors = row["dwd_anchors"]

        # Auxiliary data: json_trees (sample values), bd_descriptions (business notes)
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        jt_lookup: dict[str, dict] = {}
        if jt and "fields" in jt:
            for fld in jt["fields"]:
                jt_lookup[fld["path"]] = fld
        ods_descs = bd.get("ods_fields", {}) if bd else {}
        dwd_descs_all = bd.get("dwd_fields", {}) if bd else {}

        def _sample_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Fetch a sample-value string from json_trees or bd_descriptions."""
            if layer == "API":
                entry = jt_lookup.get(field_name, {})
                samples = entry.get("samples", [])
                total_recs = entry.get("total_records", 0)
                if not samples:
                    single = entry.get("sample", "")
                    if single:
                        samples = [str(single)]
                if _is_enum_like(samples, total_recs):
                    return ", ".join(f"`{_esc(s)}`" for s in samples[:5])
                if samples:
                    return _format_samples(samples, max_show=3)
            return ""

        def _desc_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Fetch the business description from bd_descriptions."""
            key = field_name.split(".")[-1].replace("[]", "").lower()
            if layer in ("ODS", "API"):
                desc = ods_descs.get(key, "")
            elif layer == "DWD" and dwd_tbl:
                desc = dwd_descs_all.get(dwd_tbl, {}).get(key, "")
            else:
                desc = ""
            if desc and len(desc) > 40:
                desc = desc[:37] + "..."
            return _esc(desc)

        def _write_whitelist_summary(w, items: list, category: str):
            """Collapsed summary for whitelisted fields (no detail table rows)."""
            if not items:
                return
            # Group by whitelist reason
            by_reason: dict[str, list[str]] = {}
            for item in items:
                if isinstance(item, tuple) and len(item) == 3:
                    name, _, reason = item  # (dwd_table, dwd_col, reason)
                elif isinstance(item, tuple) and len(item) == 2:
                    name, reason = item
                else:
                    name, reason = str(item), "白名单"
                by_reason.setdefault(reason, []).append(name)
            parts = []
            for reason, names in by_reason.items():
                parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)")
            w(f"> ℹ️ {category}白名单字段(已检查,不展开详情):{';'.join(parts)}")
            w()

        # ── API→ODS unmapped (flat fields) ──
        if row["api_unmapped_flat"]:
            w(f"**API→ODS 未映射(平层)** — {len(row['api_unmapped_flat'])} 个")
            w()
            w("| # | JSON 字段 | 示例值 | 说明 | 状态 |")
            w("|---|----------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_flat"], 1):
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | **[`{_esc(f)}`](#{api_anchor})** | {sample} | {desc} | **⚠️ 未映射** |")
            w()

        # ── API→ODS unmapped (nested objects, non-whitelist) ──
        if row["api_unmapped_nested"]:
            w(f"<details><summary>API→ODS 未映射(嵌套对象)— {len(row['api_unmapped_nested'])} 个</summary>")
            w()
            w("| # | JSON 字段 | 示例值 | 说明 | 状态 |")
            w("|---|----------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_nested"], 1):
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | [`{_esc(f)}`](#{api_anchor}) | {sample} | {desc} | 📦 嵌套 |")
            w()
            w("</details>")
            w()

        # ── API whitelist summary ──
        _write_whitelist_summary(w, row["api_unmapped_whitelist"], "API→ODS ")

        # ── ODS columns without a JSON source ──
        if row["ods_no_json_fields"]:
            w(f"**ODS 无 JSON 源** — {len(row['ods_no_json_fields'])} 个")
            w()
            w("| # | ODS 列 | 说明 | 状态 |")
            w("|---|-------|------|------|")
            for i, f in enumerate(row["ods_no_json_fields"], 1):
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 JSON 源** |")
            w()

        # ── Whitelist summary: ODS without JSON source ──
        _write_whitelist_summary(w, row["ods_no_json_whitelist"], "ODS 无 JSON 源 ")

        # ── ODS→DWD unmapped ──
        if row["ods_no_dwd_fields"]:
            w(f"**ODS→DWD 未映射** — {len(row['ods_no_dwd_fields'])} 个")
            w()
            w("| # | ODS 列 | 说明 | 状态 |")
            w("|---|-------|------|------|")
            for i, f in enumerate(row["ods_no_dwd_fields"], 1):
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {desc} | **⚠️ 无 DWD 目标** |")
            w()

        # ── Whitelist summary: ODS→DWD ──
        _write_whitelist_summary(w, row["ods_no_dwd_whitelist"], "ODS→DWD ")

        # ── DWD columns without an ODS source ──
        if row["dwd_no_ods_fields"]:
            w(f"**DWD 无 ODS 源** — {len(row['dwd_no_ods_fields'])} 个")
            w()
            w("| # | DWD 表 | DWD 列 | 说明 | 状态 |")
            w("|---|-------|-------|------|------|")
            for i, (dwd_name, dwd_col) in enumerate(row["dwd_no_ods_fields"], 1):
                dwd_a = dwd_anchors.get(dwd_name, f"dwd-{dwd_name.replace('_', '-')}")
                desc = _desc_str(dwd_col, "DWD", dwd_tbl=dwd_name)
                w(f"| {i} | {dwd_name} | **[`{_esc(dwd_col)}`](#{dwd_a})** | {desc} | **⚠️ 无 ODS 源** |")
            w()

        # ── Whitelist summary: DWD without ODS source ──
        _write_whitelist_summary(w, row["dwd_no_ods_whitelist"], "DWD 无 ODS 源 ")

        w()

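# For reference, one rendered row of the API source-field table emitted below
# looks like this (all values illustrative only):
#   | 1 | `OrderNo` | string | [`order_no`](#ods-sale-order) | 订单编号 | 示例: `SO-001` |
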
def _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor):
    """Emit the API source-field block (adds a business-description column,
    merges notes + sample values, collapses whitelisted fields)."""
    w(f'<a id="{api_anchor}"></a>')
    w()
    w(f"#### API 源字段 — {table_name} [🔗 ODS](#{ods_anchor})")
    w()

    if not fm or "json_to_ods" not in fm:
        w("_无 field_mappings 数据_")
        w()
        return

    j2o = fm["json_to_ods"]
    # Build the json_tree lookup table (includes samples)
    jt_lookup: dict[str, dict] = {}
    if jt and "fields" in jt:
        for f in jt["fields"]:
            jt_lookup[f["path"]] = f

    # BD_manual ODS descriptions (cross-referenced for the business meaning of JSON fields)
    ods_descs = bd.get("ods_fields", {}) if bd else {}

    # CHANGE 2026-02-21 | Whitelisted fields are excluded from the table and
    # collapsed into a summary line.
    normal_items: list[dict] = []
    whitelist_items: list[tuple[str, str]] = []  # (json_path, reason)
    for m in j2o:
        jp = m.get("json_path", "")
        wl_reason = whitelist_reason("", json_path=jp)
        if wl_reason:
            whitelist_items.append((jp, wl_reason))
        else:
            normal_items.append(m)

    mapped_count = sum(1 for m in j2o if m.get("ods_col") is not None)
    total_count = len(j2o)
    if total_count > 0:
        w(f"已映射 {mapped_count}/{total_count},覆盖率 {mapped_count / total_count * 100:.1f}%")
        if whitelist_items:
            w(f"(其中 {len(whitelist_items)} 个白名单字段已折叠)")
    else:
        w("无字段")
    w()
    w("| # | JSON 字段 | 类型 | → ODS 列 | 业务描述 | 示例值与说明 |")
    w("|---|----------|------|---------|---------|------------|")

    for i, m in enumerate(normal_items, 1):
        json_path = m["json_path"]
        json_type = m.get("json_type", "")
        ods_col = m.get("ods_col")
        match_type = m.get("match_type", "")
        occurrence_pct = m.get("occurrence_pct", 0)

        # Sample values from json_tree (prefer the multi-sample "samples" list)
        jt_entry = jt_lookup.get(json_path, {})
        samples = jt_entry.get("samples", [])
        total_recs = jt_entry.get("total_records", 0)
        if not samples:
            single = jt_entry.get("sample", "")
            if single:
                samples = [str(single)]

        # Build the ODS column link
        if ods_col:
            ods_link = f"[`{ods_col}`](#{ods_anchor})"
        else:
            ods_link = "⚠️ 未映射"

        # Business description (looked up in BD_manual by ODS column name or JSON leaf name)
        leaf = json_path.split(".")[-1].replace("[]", "").lower()
        biz_desc = ods_descs.get(leaf, "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)

        # Merge notes + sample values
        notes_parts: list[str] = []
        if "." in json_path and match_type == "unmapped":
            notes_parts.append("📦 嵌套对象")
        if match_type == "case_insensitive":
            notes_parts.append("大小写匹配")
        if occurrence_pct < 100:
            notes_parts.append(f"出现率 {occurrence_pct:.0f}%")

        # Sample-value display
        if _is_enum_like(samples, total_recs):
            notes_parts.append(f"枚举值: {', '.join(f'`{_esc(s)}`' for s in samples[:8])}")
        elif samples:
            notes_parts.append(f"示例: {_format_samples(samples)}")

        note_str = ";".join(notes_parts) if notes_parts else ""

        w(f"| {i} | `{_esc(json_path)}` | {json_type} | {ods_link} | {biz_desc} | {note_str} |")

    w()

    # Collapsed summary for whitelisted fields
    if whitelist_items:
        by_reason: dict[str, list[str]] = {}
        for jp, reason in whitelist_items:
            by_reason.setdefault(reason, []).append(jp)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names[:5])}`{'...' if len(names) > 5 else ''} ({len(names)} 个)")
        w(f"> ℹ️ 白名单字段(已检查,不展开详情):{';'.join(parts)}")
        w()

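# Illustrative shape of a db_schemas/*.json export, inferred from the column
# accesses in the two section writers below (assumption — "comment" is only read
# in the DWD section, and the real exports may carry additional keys):
#   {"columns": [{"name": "order_no", "data_type": "text", "comment": "..."}]}
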
def _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors):
    """Emit the ODS table-structure block (bidirectional upstream/downstream
    mapping columns + business descriptions, collapses whitelisted columns)."""
    w(f'<a id="{ods_anchor}"></a>')
    w()
    w(f"#### ODS 表结构 — ods.{table_name} [🔗 API](#{api_anchor})")
    w()

    if not ods_schema or "columns" not in ods_schema:
        w("_无 DB schema 数据_")
        w()
        return

    # Build the reverse json_to_ods lookup: ods_col → json_path
    ods_to_json: dict[str, str] = {}
    if fm and "json_to_ods" in fm:
        for m in fm["json_to_ods"]:
            if m.get("ods_col"):
                ods_to_json.setdefault(m["ods_col"], m["json_path"])

    # Build the ods_to_dwd lookup
    ods_to_dwd: dict[str, list[dict]] = {}
    if fm and "ods_to_dwd" in fm:
        ods_to_dwd = fm["ods_to_dwd"]

    # BD_manual ODS descriptions
    ods_descs = bd.get("ods_fields", {}) if bd else {}

    cols = ods_schema["columns"]

    # CHANGE 2026-02-21 | Whitelisted columns are excluded from the table and
    # collapsed into a summary line.
    normal_cols: list[dict] = []
    whitelist_cols: list[tuple[str, str]] = []  # (col_name, reason)
    for col in cols:
        wl_reason = whitelist_reason(col["name"])
        if wl_reason:
            whitelist_cols.append((col["name"], wl_reason))
        else:
            normal_cols.append(col)

    w(f"共 {len(cols)} 列")
    if whitelist_cols:
        w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)")
    w()
    w("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |")
    w("|---|---------|------|----------|-----------|---------|")

    for i, col in enumerate(normal_cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]

        # ← JSON source
        json_src = ods_to_json.get(col_name)
        if json_src:
            json_link = f"[`{_esc(json_src)}`](#{api_anchor})"
        else:
            json_link = "—"

        # → DWD targets
        dwd_targets = ods_to_dwd.get(col_name, [])
        if dwd_targets:
            dwd_links = []
            for dt in dwd_targets:
                dwd_tbl = dt["dwd_table"]
                dwd_col = dt["dwd_col"]
                dwd_anc = dwd_anchors.get(dwd_tbl, f"dwd-{dwd_tbl}")
                dwd_links.append(f"[`{dwd_tbl}.{dwd_col}`](#{dwd_anc})")
            dwd_link = ", ".join(dwd_links)
        else:
            dwd_link = "—"

        # Business description
        biz_desc = ods_descs.get(col_name.lower(), "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)

        w(f"| {i} | `{col_name}` | {col_type} | {json_link} | {dwd_link} | {biz_desc} |")

    w()

    # Collapsed summary for whitelisted columns
    if whitelist_cols:
        by_reason: dict[str, list[str]] = {}
        for cn, reason in whitelist_cols:
            by_reason.setdefault(reason, []).append(cn)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)")
        w(f"> ℹ️ 白名单列(已检查,不展开详情):{';'.join(parts)}")
        w()

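# The DWD section below falls back to parsing the database column comment when
# neither BD_manual nor the mapping note has a description. Comments are expected
# to embed a 【说明】 segment; e.g. an (illustrative) comment
# "【来源】ODS【说明】订单金额,单位元【口径】含税" yields "订单金额,单位元".
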
def _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name):
    """Emit the DWD table-structure block (adds a business-description column,
    collapses whitelisted columns)."""
    w(f'<a id="{dwd_anchor}"></a>')
    w()
    w(f"#### DWD 表结构 — dwd.{dwd_name} [🔗 ODS](#{ods_anchor})")
    w()

    if not dwd_schema or "columns" not in dwd_schema:
        w("_无 DB schema 数据_")
        w()
        return

    # Build the dwd_to_ods lookup
    dwd_to_ods_map: dict[str, dict] = {}
    if fm and "dwd_to_ods" in fm and dwd_name in fm["dwd_to_ods"]:
        for entry in fm["dwd_to_ods"][dwd_name]:
            dwd_to_ods_map[entry["dwd_col"]] = entry

    # BD_manual DWD descriptions
    dwd_descs = {}
    if bd and "dwd_fields" in bd:
        dwd_descs = bd["dwd_fields"].get(dwd_name, {})

    cols = dwd_schema["columns"]

    # CHANGE 2026-02-21 | Whitelisted columns are excluded from the table and
    # collapsed into a summary line.
    normal_cols: list[dict] = []
    whitelist_cols: list[tuple[str, str]] = []  # (col_name, reason)
    for col in cols:
        wl_reason = whitelist_reason(col["name"])
        if wl_reason:
            whitelist_cols.append((col["name"], wl_reason))
        else:
            normal_cols.append(col)

    w(f"共 {len(cols)} 列")
    if whitelist_cols:
        w(f"(其中 {len(whitelist_cols)} 个白名单列已折叠)")
    w()
    w("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |")
    w("|---|---------|------|----------|------|---------|")

    for i, col in enumerate(normal_cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]

        mapping = dwd_to_ods_map.get(col_name)
        if mapping:
            ods_src = mapping.get("ods_source", "")
            ods_link = f"[`{ods_src}`](#{ods_anchor})" if ods_src and ods_src != "—" else "—"
            transform = mapping.get("mapping_type", "")
            note = mapping.get("note", "")
        else:
            ods_link = "—"
            transform = ""
            note = ""

        # Business description (BD_manual first, then the mapping note, finally the DB comment)
        biz_desc = dwd_descs.get(col_name.lower(), "")
        if not biz_desc and note:
            biz_desc = note
        if not biz_desc:
            db_comment = col.get("comment", "")
            if db_comment:
                if "【说明】" in db_comment:
                    desc_part = db_comment.split("【说明】")[1]
                    if "【" in desc_part:
                        desc_part = desc_part.split("【")[0]
                    biz_desc = desc_part.strip().rstrip("。").strip()
                else:
                    biz_desc = db_comment
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)

        w(f"| {i} | `{col_name}` | {col_type} | {ods_link} | {_esc(transform)} | {biz_desc} |")

    w()

    # Collapsed summary for whitelisted columns
    if whitelist_cols:
        by_reason: dict[str, list[str]] = {}
        for cn, reason in whitelist_cols:
            by_reason.setdefault(reason, []).append(cn)
        parts = []
        for reason, names in by_reason.items():
            parts.append(f"{reason}: `{'`, `'.join(names)}` ({len(names)} 个)")
        w(f"> ℹ️ 白名单列(已检查,不展开详情):{';'.join(parts)}")
        w()

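# Typical invocation, assuming build_parser (defined earlier in this module)
# exposes an --output-dir option matching the args.output_dir access below
# (script path is a placeholder):
#   python <path-to-this-script> --output-dir <data-dir>
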
def main() -> None:
    # _env_paths already loads the root .env via an absolute path at import time,
    # so no relative-path load_dotenv is needed here.
    # CHANGE 2026-02-21 | Removed load_dotenv(Path(".env")), which silently failed
    # when the cwd was not the project root.
    from _env_paths import get_output_path  # noqa: F401 — imported to trigger .env loading

    parser = build_parser()
    args = parser.parse_args()

    data_dir = resolve_data_dir(args.output_dir)
    if not data_dir.exists():
        print(f"错误:数据目录不存在: {data_dir}")
        return

    print(f"读取数据目录: {data_dir}")
    report = generate_report(data_dir)

    now = datetime.now()
    filename = f"dataflow_{now.strftime('%Y-%m-%d_%H%M%S')}.md"
    output_path = data_dir / filename

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"\n{'='*60}")
    print("报告生成完成")
    print(f"{'='*60}")
    print(f" 输出路径: {output_path}")
    print(f" 文件大小: {output_path.stat().st_size / 1024:.1f} KB")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
1130
tools/reporting/gen_full_dataflow_doc.py
Normal file
File diff suppressed because it is too large
Load Diff