DWD完成

2025-12-09 04:57:05 +08:00
parent f301cc1fd5
commit 561c640700
46 changed files with 26181 additions and 3540 deletions
--- a/etl_billiards/scripts/check_ods_json_vs_table.py
+++ b/etl_billiards/scripts/check_ods_json_vs_table.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+ODS JSON 字段核对脚本：对照当前数据库中的 ODS 表字段，检查示例 JSON（默认目录 C:\\dev\\LLTQ\\export\\test-json-doc）
+是否包含同名键，并输出每表未命中的字段，便于补充映射或确认确实无源字段。
+
+使用方法：
+    set PG_DSN=postgresql://...               # 如 .env 中配置
+    python -m etl_billiards.scripts.check_ods_json_vs_table
+"""
+from __future__ import annotations
+
+import json
+import os
+import pathlib
+from typing import Dict, Iterable, Set, Tuple
+
+import psycopg2
+
+from etl_billiards.tasks.manual_ingest_task import ManualIngestTask
+
+
+def _flatten_keys(obj, prefix: str = "") -> Set[str]:
+    """Collect every dotted key path reachable inside a decoded JSON value.
+
+    Dict keys are joined to ``prefix`` with ``.`` (for example
+    ``data.assistantInfos.id``). Lists add no path segment of their own:
+    their elements are expanded under the path of the list itself, so list
+    indexes never appear in the result. Scalars contribute nothing.
+    """
+    collected: Set[str] = set()
+    # Explicit work list of (value, path-so-far) pairs instead of recursion.
+    pending = [(obj, prefix)]
+    while pending:
+        node, base = pending.pop()
+        if isinstance(node, dict):
+            for name, child in node.items():
+                path = f"{base}.{name}" if base else name
+                collected.add(path)
+                pending.append((child, path))
+        elif isinstance(node, list):
+            pending.extend((element, base) for element in node)
+    return collected
+
+
+def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
+    """Read one JSON file and index the key paths it contains.
+
+    Args:
+        path: Location of the sample JSON document.
+
+    Returns:
+        A pair ``(paths, last_map)``. ``paths`` is the set of dotted key
+        paths produced by ``_flatten_keys``. ``last_map`` maps the
+        lower-cased final segment of each path to the set of full paths that
+        end in that segment. ``(set(), {})`` is returned when the file does
+        not exist or its content cannot be decoded or parsed as JSON.
+    """
+    if not path.exists():
+        return set(), {}
+    try:
+        # "utf-8-sig" reads plain UTF-8 unchanged and also strips a leading
+        # BOM, which json.loads would otherwise reject.
+        data = json.loads(path.read_text(encoding="utf-8-sig"))
+    except ValueError:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
+        # subclasses; an unreadable document yields the documented empty
+        # result instead of aborting the whole report.
+        return set(), {}
+    paths = _flatten_keys(data)
+    last_map: dict[str, Set[str]] = {}
+    for p in paths:
+        # Compare on the lower-cased last segment only.
+        last = p.split(".")[-1].lower()
+        last_map.setdefault(last, set()).add(p)
+    return paths, last_map
+
+
+def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
+    """Read the column names of every table in the ``billiards_ods`` schema.
+
+    Args:
+        dsn: Connection string handed to ``psycopg2.connect``.
+
+    Returns:
+        Mapping of table name to the set of its lower-cased column names, as
+        listed in ``information_schema.columns``.
+
+    Raises:
+        Whatever ``psycopg2`` raises on connection or query failure; the
+        cursor and connection are closed in that case as well.
+    """
+    conn = psycopg2.connect(dsn)
+    try:
+        # The cursor context manager closes the cursor on exit; the
+        # connection itself is closed explicitly in the ``finally`` below.
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+        SELECT table_name, column_name
+        FROM information_schema.columns
+        WHERE table_schema='billiards_ods'
+        ORDER BY table_name, ordinal_position
+        """
+            )
+            rows = cur.fetchall()
+    finally:
+        conn.close()
+
+    result: Dict[str, Set[str]] = {}
+    for table, col in rows:
+        result.setdefault(table, set()).add(col.lower())
+    return result
+
+
+def _mask_dsn(dsn: str) -> str:
+    """Return ``dsn`` with any password replaced by ``***`` so it is safe to print."""
+    import re  # local import: only this helper needs it
+
+    # URI form: postgresql://user:secret@host/db
+    masked = re.sub(r"(://[^:/@\s]+:)[^@/\s]*@", r"\1***@", dsn)
+    # keyword/value form: host=... password=secret (optionally single-quoted)
+    return re.sub(
+        r"(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)",
+        r"\1***",
+        masked,
+        flags=re.IGNORECASE,
+    )
+
+
+def main() -> None:
+    """Print, per ODS table, how well the sample JSON keys cover its columns.
+
+    Reads the connection string from the ``PG_DSN`` environment variable and
+    the sample directory from ``JSON_DOC_DIR`` (falling back to a fixed
+    default path). For every ``(keywords, ods_table)`` entry of
+    ``ManualIngestTask.FILE_MAPPING`` it loads ``<keywords[0]>.json`` from
+    that directory and compares the lower-cased last segment of each JSON key
+    path with the table's column names, then prints:
+
+    * the columns with no matching JSON key, and
+    * the JSON keys (with their full paths) that match no column.
+
+    The DSN is printed with its password masked, and a warning line is
+    printed when the sample file for a table does not exist.
+
+    Raises:
+        SystemExit: if ``PG_DSN`` is not set.
+    """
+    dsn = os.environ.get("PG_DSN")
+    if dsn is None:
+        # Fail early with a clear message instead of an obscure driver error.
+        raise SystemExit("未设置环境变量 PG_DSN，无法连接数据库")
+    json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", r"C:\dev\LLTQ\export\test-json-doc"))
+
+    ods_cols_map = _load_ods_columns(dsn)
+
+    print(f"使用 JSON 目录: {json_dir}")
+    # Never echo credentials to the console or to captured logs.
+    print(f"连接 DSN: {_mask_dsn(dsn)}")
+    print("=" * 80)
+
+    for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
+        # Drop any schema qualifier so the name matches information_schema.
+        table = ods_table.split(".")[-1]
+        cols = ods_cols_map.get(table, set())
+        file_name = f"{keywords[0]}.json"
+        file_path = json_dir / file_name
+        _, path_map = _load_json_keys(file_path)
+        key_last_parts = set(path_map)
+
+        present = cols & key_last_parts
+        missing = cols - key_last_parts
+        extra_keys = key_last_parts - cols
+
+        print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
+        if not file_path.exists():
+            # Otherwise a missing sample looks identical to "no key matched".
+            print(f"  [警告] 未找到 JSON 文件: {file_path}")
+        if missing:
+            print("  未命中列:", ", ".join(sorted(missing)))
+        else:
+            print("  未命中列: 无")
+        if extra_keys:
+            extras = [
+                f"{k} ({', '.join(sorted(path_map[k]))})"
+                for k in sorted(extra_keys)
+            ]
+            print("  JSON 仅有(表无此列):", "; ".join(extras))
+        else:
+            print("  JSON 仅有(表无此列): 无")
+        print("-" * 80)
+
+
+# Run the report only when this module is executed as a script
+# (e.g. ``python -m etl_billiards.scripts.check_ods_json_vs_table``), not on import.
+if __name__ == "__main__":
+    main()