DWD完成
This commit is contained in:
117
etl_billiards/scripts/check_ods_json_vs_table.py
Normal file
117
etl_billiards/scripts/check_ods_json_vs_table.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 C:\\dev\\LLTQ\\export\\test-json-doc)
|
||||
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。
|
||||
|
||||
使用方法:
|
||||
set PG_DSN=postgresql://... # 如 .env 中配置
|
||||
python -m etl_billiards.scripts.check_ods_json_vs_table
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict, Iterable, Set, Tuple
|
||||
|
||||
import psycopg2
|
||||
|
||||
from etl_billiards.tasks.manual_ingest_task import ManualIngestTask
|
||||
|
||||
|
||||
def _flatten_keys(obj, prefix: str = "") -> Set[str]:
|
||||
"""递归展开 JSON 所有键路径,返回形如 data.assistantInfos.id 的集合。列表不保留索引,仅继续向下展开。"""
|
||||
keys: Set[str] = set()
|
||||
if isinstance(obj, dict):
|
||||
for k, v in obj.items():
|
||||
new_prefix = f"{prefix}.{k}" if prefix else k
|
||||
keys.add(new_prefix)
|
||||
keys |= _flatten_keys(v, new_prefix)
|
||||
elif isinstance(obj, list):
|
||||
for item in obj:
|
||||
keys |= _flatten_keys(item, prefix)
|
||||
return keys
|
||||
|
||||
|
||||
def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
|
||||
"""读取单个 JSON 文件并返回展开后的键集合以及末段->路径列表映射,若文件不存在或无法解析则返回空集合。"""
|
||||
if not path.exists():
|
||||
return set(), {}
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
paths = _flatten_keys(data)
|
||||
last_map: dict[str, Set[str]] = {}
|
||||
for p in paths:
|
||||
last = p.split(".")[-1].lower()
|
||||
last_map.setdefault(last, set()).add(p)
|
||||
return paths, last_map
|
||||
|
||||
|
||||
def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
|
||||
"""从数据库读取 billiards_ods.* 的列名集合,按表返回。"""
|
||||
conn = psycopg2.connect(dsn)
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT table_name, column_name
|
||||
FROM information_schema.columns
|
||||
WHERE table_schema='billiards_ods'
|
||||
ORDER BY table_name, ordinal_position
|
||||
"""
|
||||
)
|
||||
result: Dict[str, Set[str]] = {}
|
||||
for table, col in cur.fetchall():
|
||||
result.setdefault(table, set()).add(col.lower())
|
||||
cur.close()
|
||||
conn.close()
|
||||
return result
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""主流程:遍历 FILE_MAPPING 中的 ODS 表,检查 JSON 键覆盖情况并打印报告。"""
|
||||
dsn = os.environ.get("PG_DSN")
|
||||
json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", r"C:\dev\LLTQ\export\test-json-doc"))
|
||||
|
||||
ods_cols_map = _load_ods_columns(dsn)
|
||||
|
||||
print(f"使用 JSON 目录: {json_dir}")
|
||||
print(f"连接 DSN: {dsn}")
|
||||
print("=" * 80)
|
||||
|
||||
for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
|
||||
table = ods_table.split(".")[-1]
|
||||
cols = ods_cols_map.get(table, set())
|
||||
file_name = f"{keywords[0]}.json"
|
||||
file_path = json_dir / file_name
|
||||
keys_full, path_map = _load_json_keys(file_path)
|
||||
key_last_parts = set(path_map.keys())
|
||||
|
||||
missing: Set[str] = set()
|
||||
extra_keys: Set[str] = set()
|
||||
present: Set[str] = set()
|
||||
for col in sorted(cols):
|
||||
if col in key_last_parts:
|
||||
present.add(col)
|
||||
else:
|
||||
missing.add(col)
|
||||
for k in key_last_parts:
|
||||
if k not in cols:
|
||||
extra_keys.add(k)
|
||||
|
||||
print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
|
||||
if missing:
|
||||
print(" 未命中列:", ", ".join(sorted(missing)))
|
||||
else:
|
||||
print(" 未命中列: 无")
|
||||
if extra_keys:
|
||||
extras = []
|
||||
for k in sorted(extra_keys):
|
||||
paths = ", ".join(sorted(path_map.get(k, [])))
|
||||
extras.append(f"{k} ({paths})")
|
||||
print(" JSON 仅有(表无此列):", "; ".join(extras))
|
||||
else:
|
||||
print(" JSON 仅有(表无此列): 无")
|
||||
print("-" * 80)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user