# -*- coding: utf-8 -*-
"""
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 C:\\dev\\LLTQ\\export\\test-json-doc)
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。

使用方法:
    set PG_DSN=postgresql://...  # 如 .env 中配置
    python -m etl_billiards.scripts.check_ods_json_vs_table
"""
from __future__ import annotations

import json
import os
import pathlib
from typing import Dict, Iterable, Set, Tuple

import psycopg2

from etl_billiards.tasks.manual_ingest_task import ManualIngestTask

def _flatten_keys(obj, prefix: str = "") -> Set[str]:
|
||
"""递归展开 JSON 所有键路径,返回形如 data.assistantInfos.id 的集合。列表不保留索引,仅继续向下展开。"""
|
||
keys: Set[str] = set()
|
||
if isinstance(obj, dict):
|
||
for k, v in obj.items():
|
||
new_prefix = f"{prefix}.{k}" if prefix else k
|
||
keys.add(new_prefix)
|
||
keys |= _flatten_keys(v, new_prefix)
|
||
elif isinstance(obj, list):
|
||
for item in obj:
|
||
keys |= _flatten_keys(item, prefix)
|
||
return keys
def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
    """Read one sample JSON file and flatten its keys.

    Returns a tuple of:
      * the set of all flattened dotted key paths, and
      * a map from the lower-cased last path segment to the full paths
        that end in it (used for case-insensitive column matching).

    Returns empty containers when the file does not exist or cannot be
    read/parsed.  (The original docstring promised the parse-failure case,
    but ``json.loads`` errors were previously left to propagate.)
    """
    if not path.exists():
        return set(), {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        # Honor the documented contract: unreadable or unparseable sample
        # files are treated the same as missing ones.
        return set(), {}
    paths = _flatten_keys(data)
    last_map: dict[str, Set[str]] = {}
    for p in paths:
        # Index by final segment, lower-cased for case-insensitive lookup.
        last_map.setdefault(p.split(".")[-1].lower(), set()).add(p)
    return paths, last_map
def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
    """Read the column names of every table in schema ``billiards_ods``.

    Args:
        dsn: PostgreSQL connection string passed to ``psycopg2.connect``.

    Returns:
        Mapping of table name -> set of lower-cased column names.
    """
    conn = psycopg2.connect(dsn)
    try:
        # Cursor context manager guarantees cursor cleanup; the outer
        # try/finally closes the connection even when execute/fetch raises
        # (the original leaked both on error).
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT table_name, column_name
                FROM information_schema.columns
                WHERE table_schema='billiards_ods'
                ORDER BY table_name, ordinal_position
                """
            )
            result: Dict[str, Set[str]] = {}
            for table, col in cur.fetchall():
                result.setdefault(table, set()).add(col.lower())
    finally:
        conn.close()
    return result
def main() -> None:
    """Entry point: for each ODS table declared in
    ``ManualIngestTask.FILE_MAPPING``, compare the table's database columns
    against the keys present in the matching sample JSON file and print a
    per-table coverage report (missing columns and JSON-only keys).
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        # Fail fast with an actionable message instead of passing None to
        # psycopg2.connect and getting an opaque driver error.
        raise SystemExit("PG_DSN is not set; configure it (e.g. in .env) before running.")
    json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", r"C:\dev\LLTQ\export\test-json-doc"))

    ods_cols_map = _load_ods_columns(dsn)

    print(f"使用 JSON 目录: {json_dir}")
    print(f"连接 DSN: {dsn}")
    print("=" * 80)

    for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
        table = ods_table.split(".")[-1]  # drop the schema prefix
        cols = ods_cols_map.get(table, set())
        file_name = f"{keywords[0]}.json"
        # Only the last-segment -> full-paths map is needed for matching;
        # the full path set was previously bound but never used.
        _, path_map = _load_json_keys(json_dir / file_name)
        key_last_parts = set(path_map)

        # Set algebra replaces the original O(n*m) membership loops.
        present = cols & key_last_parts
        missing = cols - key_last_parts
        extra_keys = key_last_parts - cols

        print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
        if missing:
            print(" 未命中列:", ", ".join(sorted(missing)))
        else:
            print(" 未命中列: 无")
        if extra_keys:
            extras = [
                f"{k} ({', '.join(sorted(path_map.get(k, [])))})"
                for k in sorted(extra_keys)
            ]
            print(" JSON 仅有(表无此列):", "; ".join(extras))
        else:
            print(" JSON 仅有(表无此列): 无")
        print("-" * 80)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()