Clean up SQL comments
etl_billiards/database/seed_ods_tasks.sql

@@ -1,10 +1,10 @@
--- 灏嗘柊鐨?ODS 浠诲姟娉ㄥ唽鍒?etl_admin.etl_task锛堟牴鎹渶瑕佹浛鎹?store_id锛?
+-- 将新的 ODS 任务注册到 etl_admin.etl_task(按需替换 store_id)。
--- 浣跨敤鏂瑰紡锛堢ず渚嬶級锛?
+-- 使用方式(示例):
 -- psql "$PG_DSN" -f etl_billiards/database/seed_ods_tasks.sql
--- 鎴栬€呭湪 psql 涓墽琛屾湰鏂囦欢鍐呭銆?
+-- 或在 psql 中直接执行本文件内容。
 
 WITH target_store AS (
-    SELECT 2790685415443269::bigint AS store_id -- TODO: 鏇挎崲涓哄疄闄?store_id
+    SELECT 2790685415443269::bigint AS store_id -- TODO: 替换为实际 store_id
 ),
 task_codes AS (
     SELECT unnest(ARRAY[
@@ -37,5 +37,3 @@ SELECT t.task_code, s.store_id, TRUE
 FROM task_codes t CROSS JOIN target_store s
 ON CONFLICT (task_code, store_id) DO UPDATE
 SET enabled = EXCLUDED.enabled;
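For context, a quick way to confirm what this seed script upserted is to read the rows back; a minimal sketch, not part of the commit, assuming the etl_admin.etl_task table and the placeholder store_id shown above:

-- Illustrative check only; 2790685415443269 is the placeholder store_id from the seed script.
SELECT task_code, store_id, enabled
FROM etl_admin.etl_task
WHERE store_id = 2790685415443269::bigint
ORDER BY task_code;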
tmp/etl_billiards_misc/tmp & Delete/schema_ODS_doc.sql.bak (1927 lines, normal file; diff suppressed because it is too large)
tmp/etl_billiards_misc/tmp & Delete/schema_ODS_doc.sql.rewrite2.bak (1945 lines, normal file; diff suppressed because it is too large)
tmp/etl_billiards_misc/tmp & Delete/schema_dwd_doc.sql.bak (1878 lines, normal file; diff suppressed because it is too large)
tmp/rewrite_schema_dwd_doc_comments.py (634 lines, normal file)
@@ -0,0 +1,634 @@
# -*- coding: utf-8 -*-
import ast
import json
import re
from collections import deque
from pathlib import Path

ROOT = Path(r"C:\dev\LLTQ\ETL\feiqiu-ETL")
SQL_PATH = ROOT / "etl_billiards" / "database" / "schema_dwd_doc.sql"
DOC_DIR = Path(r"C:\dev\LLTQ\export\test-json-doc")
DWD_TASK_PATH = ROOT / "etl_billiards" / "tasks" / "dwd_load_task.py"

SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}

SITEPROFILE_FIELD_PURPOSE = {
    "id": "门店 ID,用于门店维度关联。",
    "org_id": "组织/机构 ID,用于组织维度归属。",
    "shop_name": "门店名称,用于展示与查询。",
    "site_label": "门店标签(如 A/B 店),用于展示与分组。",
    "full_address": "门店详细地址,用于展示与地理信息。",
    "address": "门店地址简称/快照,用于展示。",
    "longitude": "经度,用于定位与地图展示。",
    "latitude": "纬度,用于定位与地图展示。",
    "tenant_site_region_id": "租户下门店区域 ID,用于区域维度分析。",
    "business_tel": "门店电话,用于联系信息展示。",
    "site_type": "门店类型枚举,用于门店分类。",
    "shop_status": "门店状态枚举,用于营业状态标识。",
    "tenant_id": "租户/品牌 ID,用于商户维度过滤与关联。",
    "auto_light": "是否启用自动灯控配置,用于门店设备策略。",
    "attendance_enabled": "是否启用考勤功能,用于门店考勤配置。",
    "attendance_distance": "考勤允许距离(米),用于考勤打卡限制。",
    "prod_env": "环境标识(生产/测试),用于区分配置环境。",
    "light_status": "灯控状态/开关,用于灯控设备管理。",
    "light_type": "灯控类型,用于设备类型区分。",
    "light_token": "灯控控制令牌,用于对接灯控服务。",
    "avatar": "门店头像/图片 URL,用于展示。",
    "wifi_name": "门店 WiFi 名称,用于展示与引导。",
    "wifi_password": "门店 WiFi 密码,用于展示与引导。",
    "customer_service_qrcode": "客服二维码 URL,用于引导联系。",
    "customer_service_wechat": "客服微信号,用于引导联系。",
    "fixed_pay_qrCode": "固定收款码(二维码)URL,用于收款引导。",
    "create_time": "门店创建时间(快照字段)。",
    "update_time": "门店更新时间(快照字段)。",
}


def _escape_sql(s: str) -> str:
    return (s or "").replace("'", "''")


def _first_sentence(text: str, max_len: int = 140) -> str:
    s = re.sub(r"\s+", " ", (text or "").strip())
    if not s:
        return ""
    parts = re.split(r"[。;;]\s*", s)
    s = parts[0].strip() if parts else s
    if len(s) > max_len:
        s = s[: max_len - 1] + "…"
    return s


def normalize_key(s: str) -> str:
    return re.sub(r"[_\-\s]", "", (s or "").lower())


def snake_to_lower_camel(s: str) -> str:
    parts = re.split(r"[_\-\s]+", s)
    if not parts:
        return s
    first = parts[0].lower()
    rest = "".join(p[:1].upper() + p[1:] for p in parts[1:] if p)
    return first + rest


def snake_to_upper_camel(s: str) -> str:
    parts = re.split(r"[_\-\s]+", s)
    return "".join(p[:1].upper() + p[1:] for p in parts if p)


def find_key_in_record(record: dict, token: str) -> str | None:
    if not isinstance(record, dict):
        return None
    if token in record:
        return token
    norm_to_key = {normalize_key(k): k for k in record.keys()}

    candidates = [
        token,
        token.lower(),
        token.upper(),
        snake_to_lower_camel(token),
        snake_to_upper_camel(token),
    ]

    # 常见变体:siteProfile/siteprofile
    if normalize_key(token) == "siteprofile":
        candidates.extend(["siteProfile", "siteprofile"])

    for c in candidates:
        nk = normalize_key(c)
        if nk in norm_to_key:
            return norm_to_key[nk]

    return None


def parse_dwd_task_mappings(path: Path):
    mod = ast.parse(path.read_text(encoding="utf-8"))
    table_map = None
    fact_mappings = None

    for node in mod.body:
        if isinstance(node, ast.ClassDef) and node.name == "DwdLoadTask":
            for stmt in node.body:
                if isinstance(stmt, ast.Assign) and len(stmt.targets) == 1 and isinstance(stmt.targets[0], ast.Name):
                    name = stmt.targets[0].id
                    if name == "TABLE_MAP":
                        table_map = ast.literal_eval(stmt.value)
                    elif name == "FACT_MAPPINGS":
                        fact_mappings = ast.literal_eval(stmt.value)
                if isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name):
                    name = stmt.target.id
                    if name == "TABLE_MAP":
                        table_map = ast.literal_eval(stmt.value)
                    elif name == "FACT_MAPPINGS":
                        fact_mappings = ast.literal_eval(stmt.value)

    if not isinstance(table_map, dict) or not isinstance(fact_mappings, dict):
        raise RuntimeError("Failed to parse TABLE_MAP/FACT_MAPPINGS from dwd_load_task.py")

    return table_map, fact_mappings


def parse_columns_from_ddl(create_sql: str):
    start = create_sql.find("(")
    end = create_sql.rfind(")")
    body = create_sql[start + 1 : end]

    cols = []
    for line in body.splitlines():
        s = line.strip().rstrip(",")
        if not s:
            continue
        if s.upper().startswith("PRIMARY KEY"):
            continue
        if s.upper().startswith("CONSTRAINT "):
            continue
        m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s+", s)
        if not m:
            continue
        name = m.group(1)
        if name.upper() in {"PRIMARY", "UNIQUE", "FOREIGN", "CHECK"}:
            continue
        cols.append(name.lower())
    return cols


def _find_best_record_list(data, required_norm_keys: set[str]):
    best = None
    best_score = -1.0
    best_path: list[str] = []

    q = deque([(data, 0, [])])
    visited = 0

    while q and visited < 25000:
        node, depth, path = q.popleft()
        visited += 1
        if depth > 10:
            continue

        if isinstance(node, list):
            if node and all(isinstance(x, dict) for x in node[:3]):
                scores = []
                for x in node[:5]:
                    keys_norm = {normalize_key(k) for k in x.keys()}
                    scores.append(len(keys_norm & required_norm_keys))
                score = sum(scores) / max(1, len(scores))
                if score > best_score:
                    best_score = score
                    best = node
                    best_path = path
                for x in node[:10]:
                    q.append((x, depth + 1, path))
            else:
                for x in node[:120]:
                    q.append((x, depth + 1, path))
        elif isinstance(node, dict):
            for k, v in list(node.items())[:160]:
                q.append((v, depth + 1, path + [str(k)]))

    node_str = ".".join(best_path) if best_path else "$"
    return best or [], node_str


def _format_example(value, max_len: int = 120) -> str:
    if value is None:
        return "NULL"
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        s = value.strip()
        if len(s) > max_len:
            s = s[: max_len - 1] + "…"
        return s
    if isinstance(value, dict):
        keys = list(value)[:6]
        mini = {k: value.get(k) for k in keys}
        rendered = json.dumps(mini, ensure_ascii=False)
        if len(value) > len(keys):
            rendered = rendered[:-1] + ", …}"
        if len(rendered) > max_len:
            rendered = rendered[: max_len - 1] + "…"
        return rendered
    if isinstance(value, list):
        if not value:
            return "[]"
        rendered = json.dumps(value[0], ensure_ascii=False)
        if len(value) > 1:
            rendered = f"[{rendered}, …] (len={len(value)})"
        else:
            rendered = f"[{rendered}]"
        if len(rendered) > max_len:
            rendered = rendered[: max_len - 1] + "…"
        return rendered
    s = str(value)
    if len(s) > max_len:
        s = s[: max_len - 1] + "…"
    return s


def _infer_purpose(table: str, col: str, json_path: str | None) -> str:
    lcol = col.lower()

    if lcol in SCD_COLS:
        if lcol == "scd2_start_time":
            return "SCD2 开始时间(版本生效起点),用于维度慢变追踪。"
        if lcol == "scd2_end_time":
            return "SCD2 结束时间(默认 9999-12-31 表示当前版本),用于维度慢变追踪。"
        if lcol == "scd2_is_current":
            return "SCD2 当前版本标记(1=当前,0=历史),用于筛选最新维度记录。"
        if lcol == "scd2_version":
            return "SCD2 版本号(自增),用于与时间段一起避免版本重叠。"

    if json_path and json_path.startswith("siteProfile."):
        sf = json_path.split(".", 1)[1]
        return SITEPROFILE_FIELD_PURPOSE.get(sf, "门店快照字段,用于门店维度补充信息。")

    if lcol.endswith("_id"):
        return "标识类 ID 字段,用于关联/定位相关实体。"
    if lcol.endswith("_time") or lcol.endswith("time") or lcol.endswith("_date"):
        return "时间/日期字段,用于记录业务时间与统计口径对齐。"
    if any(k in lcol for k in ["amount", "money", "fee", "price", "deduct", "cost", "balance"]):
        return "金额字段,用于计费/结算/核算等金额计算。"
    if any(k in lcol for k in ["count", "num", "number", "seconds", "qty", "quantity"]):
        return "数量/时长字段,用于统计与计量。"
    if lcol.endswith("_name") or lcol.endswith("name"):
        return "名称字段,用于展示与辅助识别。"
    if lcol.endswith("_status") or lcol == "status":
        return "状态枚举字段,用于标识业务状态。"
    if lcol.startswith("is_") or lcol.startswith("can_"):
        return "布尔/开关字段,用于表示是否/可用性等业务开关。"

    # 表级兜底
    if table.startswith("dim_"):
        return "维度字段,用于补充维度属性。"
    return "明细字段,用于记录事实取值。"


def _parse_json_extract(expr: str):
    # e.g. siteprofile->>'org_id'
    m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s*->>\s*'([^']+)'\s*$", expr)
    if not m:
        return None
    base = m.group(1)
    field = m.group(2)
    if normalize_key(base) == "siteprofile":
        base = "siteProfile"
    return base, field


def build_table_comment(table: str, source_ods: str | None, source_json_base: str | None) -> str:
    table_l = table.lower()
    if table_l.startswith("dim_"):
        kind = "DWD 维度表"
    else:
        kind = "DWD 明细事实表"

    extra = "扩展字段表" if table_l.endswith("_ex") else ""

    if source_ods and source_json_base:
        src = (
            f"ODS 来源表:{source_ods}(对应 JSON:{source_json_base}.json;分析:{source_json_base}-Analysis.md)。"
            f"装载/清洗逻辑参考:etl_billiards/tasks/dwd_load_task.py(DwdLoadTask)。"
        )
    else:
        src = "来源:由 ODS 清洗装载生成(详见 DWD 装载任务)。"

    return f"{kind}{('(' + extra + ')') if extra else ''}:{table_l}。{src}"


def get_source_info(table_l: str, table_map: dict) -> tuple[str | None, str | None]:
    key = f"billiards_dwd.{table_l}"
    source_ods = table_map.get(key)
    if not source_ods:
        return None, None
    json_base = source_ods.split(".")[-1]
    return source_ods, json_base


def build_column_mappings(table_l: str, cols: list[str], fact_mappings: dict) -> dict[str, tuple[str | None, str | None]]:
    # return col -> (json_path, src_expr)
    mapping_list = fact_mappings.get(f"billiards_dwd.{table_l}") or []
    explicit = {dwd_col.lower(): src_expr for dwd_col, src_expr, _cast in mapping_list}
    casts = {dwd_col.lower(): cast for dwd_col, _src_expr, cast in mapping_list}

    out: dict[str, tuple[str | None, str | None]] = {}

    for c in cols:
        if c in SCD_COLS:
            out[c] = (None, None)
            continue

        src_expr = explicit.get(c, c)
        cast = casts.get(c)

        json_path = None
        parsed = _parse_json_extract(src_expr)
        if parsed:
            base, field = parsed
            json_path = f"{base}.{field}"
        else:
            # derived: pay_date uses pay_time + cast date
            if cast == "date":
                json_path = src_expr
            else:
                json_path = src_expr

        out[c] = (json_path, src_expr)

    return out


def load_json_records(json_base: str, required_norm_keys: set[str]):
    json_path = DOC_DIR / f"{json_base}.json"
    data = json.loads(json_path.read_text(encoding="utf-8"))
    return _find_best_record_list(data, required_norm_keys)


def pick_example_from_record(record: dict, json_path: str | None):
    if not json_path:
        return None
    if json_path.startswith("siteProfile."):
        base_key = find_key_in_record(record, "siteProfile")
        base = record.get(base_key) if base_key else None
        if isinstance(base, dict):
            field = json_path.split(".", 1)[1]
            return base.get(field)
        return None

    # plain key
    key = find_key_in_record(record, json_path)
    if key:
        return record.get(key)
    # fallback: try match by normalized name
    nk = normalize_key(json_path)
    for k in record.keys():
        if normalize_key(k) == nk:
            return record.get(k)
    return None


def resolve_json_field_display(records: list, json_path: str | None, cast: str | None = None) -> str:
    if not json_path:
        return "无"
    if json_path.startswith("siteProfile."):
        return json_path

    actual_key = None
    for r in records[:80]:
        if not isinstance(r, dict):
            continue
        k = find_key_in_record(r, json_path)
        if k:
            actual_key = k
            break

    base = actual_key or json_path
    if cast == "date":
        return f"{base}(派生:DATE({base}))"
    if cast == "boolean":
        return f"{base}(派生:BOOLEAN({base}))"
    if cast in {"numeric", "timestamptz"}:
        return f"{base}(派生:CAST({base} AS {cast}))"
    return base


def resolve_ods_source_field(records: list, src_expr: str | None, cast: str | None = None) -> str:
    if not src_expr:
        return "无"

    parsed = _parse_json_extract(src_expr)
    if parsed:
        base, field = parsed
        # 统一大小写展示
        if normalize_key(base) == "siteprofile":
            base = "siteProfile"
        return f"{base}.{field}"

    # 直接字段:尽量输出 JSON 实际键名(大小写/驼峰)
    actual = None
    for r in records[:80]:
        if not isinstance(r, dict):
            continue
        k = find_key_in_record(r, src_expr)
        if k:
            actual = k
            break

    base = actual or src_expr
    if cast == "date":
        return f"{base}(派生:DATE({base}))"
    if cast == "boolean":
        return f"{base}(派生:BOOLEAN({base}))"
    if cast in {"numeric", "timestamptz"}:
        return f"{base}(派生:CAST({base} AS {cast}))"
    return base


def resolve_json_field_triplet(
    json_file: str | None,
    record_node: str | None,
    records: list,
    json_path: str | None,
    cast: str | None = None,
) -> str:
    if not json_file:
        json_file = "无"
    node = record_node or "$"

    if not json_path:
        return f"{json_file} - 无 - 无"

    if json_path.startswith("siteProfile."):
        base_key = None
        field_key = None
        for r in records[:80]:
            if not isinstance(r, dict):
                continue
            base_key = find_key_in_record(r, "siteProfile")
            if base_key:
                base = r.get(base_key)
                if isinstance(base, dict):
                    raw_field = json_path.split(".", 1)[1]
                    # 尽量匹配子字段大小写
                    if raw_field in base:
                        field_key = raw_field
                    else:
                        nk = normalize_key(raw_field)
                        for k in base.keys():
                            if normalize_key(k) == nk:
                                field_key = k
                                break
                break
        base_key = base_key or "siteProfile"
        field_key = field_key or json_path.split(".", 1)[1]
        node = f"{node}.{base_key}" if node else base_key
        field = field_key
    else:
        actual = None
        for r in records[:80]:
            if isinstance(r, dict):
                actual = find_key_in_record(r, json_path)
                if actual:
                    break
        field = actual or json_path

    if cast == "date":
        field = f"{field}(派生:DATE({field}))"
    elif cast == "boolean":
        field = f"{field}(派生:BOOLEAN({field}))"
    elif cast in {"numeric", "timestamptz"}:
        field = f"{field}(派生:CAST({field} AS {cast}))"

    return f"{json_file} - {node} - {field}"


def main():
    table_map, fact_mappings = parse_dwd_task_mappings(DWD_TASK_PATH)

    raw = SQL_PATH.read_text(encoding="utf-8", errors="replace")
    newline = "\r\n" if "\r\n" in raw else "\n"

    # strip all sql comments and existing COMMENT ON statements, incl. DO-block comment exec lines
    kept_lines = []
    for line in raw.splitlines(True):
        if line.lstrip().startswith("--"):
            continue
        if re.match(r"^\s*COMMENT ON\s+(TABLE|COLUMN)\s+", line, re.I):
            continue
        if "COMMENT ON COLUMN" in line or "COMMENT ON TABLE" in line:
            # remove legacy execute format lines too
            continue
        kept_lines.append(line)
    clean = "".join(kept_lines)

    create_re = re.compile(
        r"(^\s*CREATE TABLE IF NOT EXISTS\s+(?P<table>[A-Za-z0-9_]+)\s*\([\s\S]*?\)\s*;)",
        re.M,
    )

    out_parts = []
    last = 0
    count_tables = 0

    for m in create_re.finditer(clean):
        stmt = m.group(1)
        table = m.group("table").lower()

        out_parts.append(clean[last : m.end()])

        cols = parse_columns_from_ddl(stmt)

        source_ods, json_base = get_source_info(table, table_map)

        # derive required keys
        required_norm = set()
        col_map = build_column_mappings(table, cols, fact_mappings)
        # cast map for json field display
        cast_map = {
            dwd_col.lower(): cast
            for dwd_col, _src_expr, cast in (fact_mappings.get(f"billiards_dwd.{table}") or [])
        }
        src_expr_map = {
            dwd_col.lower(): src_expr
            for dwd_col, src_expr, _cast in (fact_mappings.get(f"billiards_dwd.{table}") or [])
        }
        for c, (jp, _src) in col_map.items():
            if not jp:
                continue
            if jp.startswith("siteProfile."):
                required_norm.add(normalize_key("siteProfile"))
            else:
                required_norm.add(normalize_key(jp))

        records = []
        record_node = "$"
        if json_base and (DOC_DIR / f"{json_base}.json").exists():
            try:
                records, record_node = load_json_records(json_base, required_norm)
            except Exception:
                records = []
                record_node = "$"

        table_comment = build_table_comment(table, source_ods, json_base)
        comment_lines = [f"COMMENT ON TABLE billiards_dwd.{table} IS '{_escape_sql(table_comment)}';"]

        for c in cols:
            jp, _src = col_map.get(c, (None, None))

            if c in SCD_COLS:
                if c == "scd2_start_time":
                    ex = "2025-11-10T00:00:00+08:00"
                elif c == "scd2_end_time":
                    ex = "9999-12-31T00:00:00+00:00"
                elif c == "scd2_is_current":
                    ex = "1"
                else:
                    ex = "1"
                json_field = "无 - DWD慢变元数据 - 无"
                ods_src = "无(DWD慢变元数据)"
            else:
                # pick example from first records
                ex_val = None
                for r in records[:80]:
                    v = pick_example_from_record(r, jp)
                    if v not in (None, ""):
                        ex_val = v
                        break
                ex = _format_example(ex_val)
                json_field = resolve_json_field_triplet(
                    f"{json_base}.json" if json_base else None,
                    record_node,
                    records,
                    jp,
                    cast_map.get(c),
                )
                src_expr = src_expr_map.get(c, jp)
                ods_src = resolve_ods_source_field(records, src_expr, cast_map.get(c))

            purpose = _first_sentence(_infer_purpose(table, c, jp), 140)
            func = purpose
            if "用于" not in func:
                func = "用于" + func.rstrip("。")

            if source_ods:
                ods_table_only = source_ods.split(".")[-1]
                ods_src_display = f"{ods_table_only} - {ods_src}"
            else:
                ods_src_display = f"无 - {ods_src}"

            comment = (
                f"【说明】{purpose}。"
                f" 【示例】{ex}({func})。"
                f" 【ODS来源】{ods_src_display}。"
                f" 【JSON字段】{json_field}。"
            )
            comment_lines.append(
                f"COMMENT ON COLUMN billiards_dwd.{table}.{c} IS '{_escape_sql(comment)}';"
            )

        out_parts.append(newline + newline + (newline.join(comment_lines)) + newline + newline)
        last = m.end()
        count_tables += 1

    out_parts.append(clean[last:])
    result = "".join(out_parts)

    # collapse extra blank lines
    result = re.sub(r"(?:\r?\n){4,}", newline * 3, result)

    backup = SQL_PATH.with_suffix(SQL_PATH.suffix + ".bak")
    if not backup.exists():
        backup.write_text(raw, encoding="utf-8")

    SQL_PATH.write_text(result, encoding="utf-8")

    print(f"Rewrote comments for {count_tables} tables: {SQL_PATH}")


if __name__ == "__main__":
    main()
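For reference, main() assembles every column comment from the 【说明】/【示例】/【ODS来源】/【JSON字段】 template above. A statement of the shape this script emits for an SCD2 housekeeping column could look like the following; this is only a sketch, the table name billiards_dwd.dim_site is hypothetical, and the values are taken from the script's own SCD2 branch assuming a dimension table with no ODS mapping:

-- Illustrative only: dim_site is a made-up table name, not actual script output.
COMMENT ON COLUMN billiards_dwd.dim_site.scd2_is_current IS
    '【说明】SCD2 当前版本标记(1=当前,0=历史),用于筛选最新维度记录。 【示例】1(SCD2 当前版本标记(1=当前,0=历史),用于筛选最新维度记录)。 【ODS来源】无 - 无(DWD慢变元数据)。 【JSON字段】无 - DWD慢变元数据 - 无。';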
tmp/rewrite_schema_ods_doc_comments.py (560 lines, normal file)
@@ -0,0 +1,560 @@
# -*- coding: utf-8 -*-
import json
import re
from pathlib import Path
from collections import defaultdict

SQL_PATH = Path(r"C:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\database\schema_ODS_doc.sql")
DOC_DIR = Path(r"C:\dev\LLTQ\export\test-json-doc")

TABLE_CN = {
    "member_profiles": "会员档案/会员账户信息",
    "member_balance_changes": "会员余额变更流水",
    "member_stored_value_cards": "会员储值/卡券账户列表",
    "recharge_settlements": "充值结算记录",
    "settlement_records": "结账/结算记录",
    "assistant_cancellation_records": "助教作废/取消记录",
    "assistant_accounts_master": "助教档案主数据",
    "assistant_service_records": "助教服务流水",
    "site_tables_master": "门店桌台主数据",
    "table_fee_discount_records": "台费折扣记录",
    "table_fee_transactions": "台费流水",
    "goods_stock_movements": "商品库存变动流水",
    "stock_goods_category_tree": "商品分类树",
    "goods_stock_summary": "商品库存汇总",
    "payment_transactions": "支付流水",
    "refund_transactions": "退款流水",
    "platform_coupon_redemption_records": "平台券核销/使用记录",
    "tenant_goods_master": "租户商品主数据",
    "group_buy_packages": "团购套餐主数据",
    "group_buy_redemption_records": "团购核销记录",
    "settlement_ticket_details": "结算小票明细",
    "store_goods_master": "门店商品主数据",
    "store_goods_sales_records": "门店商品销售流水",
}

COMMON_FIELD_PURPOSE = {
    "tenant_id": "租户/品牌 ID,用于商户维度过滤与关联。",
    "site_id": "门店 ID,用于门店维度过滤与关联。",
    "register_site_id": "会员注册门店 ID,用于归属门店维度关联。",
    "site_name": "门店名称快照,用于直接展示。",
    "id": "本表主键 ID,用于唯一标识一条记录。",
    "system_member_id": "系统级会员 ID(跨门店/跨卡种统一到‘人’的维度)。",
    "order_trade_no": "订单交易号,用于串联同一订单下的各类消费明细。",
    "order_settle_id": "订单结算/结账主键,用于关联结算记录与小票明细。",
    "order_pay_id": "关联支付流水的主键 ID,用于追溯支付明细。",
    "point": "积分余额,用于记录会员积分取值。",
    "growth_value": "成长值/成长积分,用于会员成长与等级评估。",
    "referrer_member_id": "推荐人会员 ID,用于记录会员推荐/拉新关系。",
    "create_time": "记录创建时间(业务侧产生时间)。",
    "status": "状态枚举,用于标识记录当前业务状态。",
    "user_status": "用户状态枚举,用于标识会员账户/用户可用状态。",
    "is_delete": "逻辑删除标记(0=否,1=是)。",
    "payload": "完整原始 JSON 记录快照,用于回溯与二次解析。",
    "source_file": "ETL 元数据:原始导出文件名,用于数据追溯。",
    "source_endpoint": "ETL 元数据:采集来源(接口/文件路径),用于数据追溯。",
    "fetched_at": "ETL 元数据:采集/入库时间戳,用于口径对齐与增量处理。",
}

ETL_META_FIELDS = {"source_file", "source_endpoint", "fetched_at"}


def _first_sentence(text: str, max_len: int = 120) -> str:
    s = re.sub(r"\s+", " ", (text or "").strip())
    if not s:
        return ""
    parts = re.split(r"[。;;]\s*", s)
    s = parts[0].strip() if parts else s
    if len(s) > max_len:
        s = s[: max_len - 1] + "…"
    return s


def _escape_sql(s: str) -> str:
    return (s or "").replace("'", "''")


def normalize_key(s: str) -> str:
    return re.sub(r"[_\-\s]", "", (s or "").lower())


def snake_to_lower_camel(s: str) -> str:
    parts = re.split(r"[_\-\s]+", s)
    if not parts:
        return s
    first = parts[0].lower()
    rest = "".join(p[:1].upper() + p[1:] for p in parts[1:] if p)
    return first + rest


def snake_to_upper_camel(s: str) -> str:
    parts = re.split(r"[_\-\s]+", s)
    return "".join(p[:1].upper() + p[1:] for p in parts if p)


def find_key_in_record(record: dict, token: str) -> str | None:
    if not isinstance(record, dict) or not token:
        return None
    if token in record:
        return token
    norm_to_key = {normalize_key(k): k for k in record.keys()}
    candidates = [
        token,
        token.lower(),
        token.upper(),
        snake_to_lower_camel(token),
        snake_to_upper_camel(token),
    ]
    for c in candidates:
        nk = normalize_key(c)
        if nk in norm_to_key:
            return norm_to_key[nk]
    return None


def _infer_purpose(_table: str, col: str) -> str:
    if col in COMMON_FIELD_PURPOSE:
        return COMMON_FIELD_PURPOSE[col]

    lower = col.lower()
    if lower.endswith("_id"):
        return "标识类 ID 字段,用于关联/定位相关实体。"
    if lower.endswith("_time") or lower.endswith("time"):
        return "时间字段,用于记录业务时间点/发生时间。"
    if any(k in lower for k in ["amount", "money", "fee", "price", "deduct", "cost"]):
        return "金额字段,用于计费/结算/分摊等金额计算。"
    if any(k in lower for k in ["count", "num", "number", "seconds", "qty"]):
        return "数量/时长字段,用于统计与计量。"
    if lower.endswith("_name") or lower.endswith("name"):
        return "名称字段,用于展示与辅助识别。"
    if lower.endswith("_code") or lower.endswith("code"):
        return "编码/枚举字段,用于表示类型、等级或业务枚举。"
    if lower.startswith("is_") or lower.startswith("able_") or lower.startswith("can_"):
        return "布尔/开关字段,用于表示权限、可用性或状态开关。"

    return "来自 JSON 导出的原始字段,用于保留业务取值。"


def _format_example(value, max_len: int = 120) -> str:
    if value is None:
        return "NULL"
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        s = value.strip()
        if len(s) > max_len:
            s = s[: max_len - 1] + "…"
        return s
    if isinstance(value, list):
        if not value:
            return "[]"
        sample = value[0]
        rendered = json.dumps(sample, ensure_ascii=False)
        if len(value) > 1:
            rendered = f"[{rendered}, …] (len={len(value)})"
        else:
            rendered = f"[{rendered}]"
        if len(rendered) > max_len:
            rendered = rendered[: max_len - 1] + "…"
        return rendered
    if isinstance(value, dict):
        keys = list(value)[:6]
        mini = {k: value.get(k) for k in keys}
        rendered = json.dumps(mini, ensure_ascii=False)
        if len(value) > len(keys):
            rendered = rendered[:-1] + ", …}"
        if len(rendered) > max_len:
            rendered = rendered[: max_len - 1] + "…"
        return rendered
    rendered = str(value)
    if len(rendered) > max_len:
        rendered = rendered[: max_len - 1] + "…"
    return rendered


def _find_best_record_list(data, columns):
    cols = set(columns)
    best = None
    best_score = -1

    queue = [(data, 0)]
    visited = 0
    while queue and visited < 20000:
        node, depth = queue.pop(0)
        visited += 1
        if depth > 8:
            continue
        if isinstance(node, list):
            if node and all(isinstance(x, dict) for x in node[:3]):
                scores = []
                for x in node[:5]:
                    scores.append(len(set(x.keys()) & cols))
                score = sum(scores) / max(1, len(scores))
                if score > best_score:
                    best_score = score
                    best = node
                for x in node[:10]:
                    queue.append((x, depth + 1))
            else:
                for x in node[:50]:
                    queue.append((x, depth + 1))
        elif isinstance(node, dict):
            for v in list(node.values())[:80]:
                queue.append((v, depth + 1))

    return best


def _find_best_record_list_and_node(data, columns):
    cols = set(columns)
    best = None
    best_score = -1
    best_path = []

    queue = [(data, 0, [])]
    visited = 0
    while queue and visited < 25000:
        node, depth, path = queue.pop(0)
        visited += 1
        if depth > 10:
            continue

        if isinstance(node, list):
            if node and all(isinstance(x, dict) for x in node[:3]):
                scores = []
                for x in node[:5]:
                    scores.append(len(set(x.keys()) & cols))
                score = sum(scores) / max(1, len(scores))
                if score > best_score:
                    best_score = score
                    best = node
                    best_path = path
                for x in node[:10]:
                    queue.append((x, depth + 1, path))
            else:
                for x in node[:80]:
                    queue.append((x, depth + 1, path))
        elif isinstance(node, dict):
            for k, v in list(node.items())[:120]:
                queue.append((v, depth + 1, path + [str(k)]))

    node_str = ".".join(best_path) if best_path else "$"
    return best or [], node_str


def _choose_examples(records, columns):
    examples = {}
    if not records:
        return examples
    for col in columns:
        val = None
        for r in records[:120]:
            if isinstance(r, dict) and col in r and r[col] not in (None, ""):
                val = r[col]
                break
        examples[col] = val
    return examples


def _extract_header_fields(line: str, columns_set):
    s = line.strip()
    if not s:
        return []

    # 支持 1. id / 1.1 siteProfile / 8. tenant_id
    m = re.match(r"^\d+(?:\.\d+)*[\.)]?\s+(.+)$", s)
    if m:
        s = m.group(1).strip()

    parts = re.split(r"\s*[/、,,]\s*", s)
    fields = [p.strip() for p in parts if p.strip() in columns_set]

    if not fields and s in columns_set:
        fields = [s]

    if fields and len(line) <= 120:
        return fields
    return []


def _parse_field_purpose_from_block(block_lines):
    lines = [l.rstrip() for l in block_lines]

    def pick_after_label(labels):
        for i, l in enumerate(lines):
            for lab in labels:
                if lab in l:
                    after = l.split(lab, 1)[1].strip()
                    if after:
                        return after
                    buf = []
                    j = i + 1
                    while j < len(lines) and not lines[j].strip():
                        j += 1
                    for k in range(j, len(lines)):
                        if not lines[k].strip():
                            break
                        if re.match(r"^[\w\u4e00-\u9fff]+[::]", lines[k].strip()):
                            break
                        buf.append(lines[k].strip())
                    if buf:
                        return " ".join(buf)
        return ""

    # 兼容「含义(结合其它文件):」「含义(推测):」等变体
    picked = pick_after_label(["含义:", "含义:"])
    if not picked:
        for i, l in enumerate(lines):
            s = l.strip()
            m = re.match(r"^含义.*[::]\s*(.*)$", s)
            if m:
                after = m.group(1).strip()
                if after:
                    picked = after
                else:
                    buf = []
                    j = i + 1
                    while j < len(lines) and not lines[j].strip():
                        j += 1
                    for k in range(j, len(lines)):
                        if not lines[k].strip():
                            break
                        if re.match(r"^[\w\u4e00-\u9fff]+[::]", lines[k].strip()):
                            break
                        buf.append(lines[k].strip())
                    if buf:
                        picked = " ".join(buf)
                break

    if not picked:
        picked = pick_after_label(["作用:", "作用:"])
    if not picked:
        for i, l in enumerate(lines):
            s = l.strip()
            m = re.match(r"^作用.*[::]\s*(.*)$", s)
            if m:
                after = m.group(1).strip()
                if after:
                    picked = after
                break

    if not picked:
        # 兜底:尽量避开“类型:/唯一值个数:”这类描述
        for l in lines:
            s = l.strip()
            if not s:
                continue
            if any(
                s.startswith(prefix)
                for prefix in [
                    "类型:",
                    "非空:",
                    "唯一值",
                    "观测",
                    "特征",
                    "统计",
                    "分布",
                    "说明:",
                    "关联:",
                    "结构关系",
                    "和其它表",
                    "重复记录",
                    "全部为",
                ]
            ):
                continue
            picked = s
            break

    return _first_sentence(picked, 160)


def _is_poor_purpose(purpose: str) -> bool:
    s = (purpose or "").strip()
    if not s:
        return True
    if s.endswith(":") or s.endswith(":"):
        return True
    if s.startswith("全部为"):
        return True
    if s.startswith("含义") and (":" in s or ":" in s) and len(s) <= 12:
        return True
    return False


def parse_analysis(analysis_text: str, columns):
    columns_set = set(columns)
    blocks = defaultdict(list)

    current_fields = []
    buf = []

    for raw in analysis_text.splitlines():
        fields = _extract_header_fields(raw, columns_set)
        if fields:
            if current_fields and buf:
                for f in current_fields:
                    blocks[f].extend(buf)
            current_fields = fields
            buf = []
        else:
            if current_fields:
                buf.append(raw)

    if current_fields and buf:
        for f in current_fields:
            blocks[f].extend(buf)

    purposes = {}
    for col in columns:
        if col in blocks and blocks[col]:
            p = _parse_field_purpose_from_block(blocks[col])
            if p:
                purposes[col] = p
    return purposes


def parse_columns_from_ddl(create_sql: str):
    start = create_sql.find("(")
    end = create_sql.rfind(")")
    body = create_sql[start + 1 : end]

    cols = []
    for line in body.splitlines():
        s = line.strip().rstrip(",")
        if not s:
            continue
        if s.startswith(")"):
            continue
        if s.upper().startswith("CONSTRAINT "):
            continue
        m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s+", s)
        if not m:
            continue
        name = m.group(1)
        if name.upper() in {"PRIMARY", "UNIQUE", "FOREIGN", "CHECK"}:
            continue
        cols.append(name)
    return cols


def build_comment_block(table: str, columns, analysis_text: str, records):
    # records_node: 由外部确定,避免这里重复遍历 JSON
    records, records_node = records
    purposes = parse_analysis(analysis_text, columns)
    examples = _choose_examples(records, columns)

    table_cn = TABLE_CN.get(table, table)

    table_comment = (
        f"ODS 原始明细表:{table_cn}。"
        f"来源:C:/dev/LLTQ/export/test-json-doc/{table}.json;分析:{table}-Analysis.md。"
        f"字段以导出原样为主;ETL 补充 source_file/source_endpoint/fetched_at,并保留 payload 为原始记录快照。"
    )

    lines = []
    lines.append(f"COMMENT ON TABLE billiards_ods.{table} IS '{_escape_sql(table_comment)}';")

    for col in columns:
        json_file = f"{table}.json"
        if col in ETL_META_FIELDS:
            json_field = f"{json_file} - ETL元数据 - 无"
        elif col == "payload":
            json_field = f"{json_file} - {records_node} - $"
        else:
            actual = None
            for r in records[:50]:
                if isinstance(r, dict):
                    actual = find_key_in_record(r, col)
                    if actual:
                        break
            field_name = actual or col
            json_field = f"{json_file} - {records_node} - {field_name}"

        purpose = purposes.get(col) or _infer_purpose(table, col)
        purpose = _first_sentence(purpose, 140) or _infer_purpose(table, col)
        if _is_poor_purpose(purpose):
            purpose = COMMON_FIELD_PURPOSE.get(col) or _infer_purpose(table, col)

        if col in ETL_META_FIELDS:
            if col == "source_file":
                ex = f"{table}.json"
            elif col == "source_endpoint":
                ex = f"C:/dev/LLTQ/export/test-json-doc/{table}.json"
            else:
                ex = "2025-11-10T00:00:00+08:00"
        elif col == "payload":
            ex = "{...}"
        else:
            ex = _format_example(examples.get(col))

        func = purpose
        if "用于" not in func:
            func = "用于" + func.rstrip("。")

        # ODS来源:表名-字段名(ODS自身字段);ETL补充字段标记
        if col in ETL_META_FIELDS:
            ods_src = f"{table} - {col}(ETL补充)"
        else:
            ods_src = f"{table} - {col}"

        comment = (
            f"【说明】{purpose}。"
            f" 【示例】{ex}({func})。"
            f" 【ODS来源】{ods_src}。"
            f" 【JSON字段】{json_field}。"
        )
        lines.append(
            f"COMMENT ON COLUMN billiards_ods.{table}.{col} IS '{_escape_sql(comment)}';"
        )

    return "\n".join(lines)


text = SQL_PATH.read_text(encoding="utf-8")
newline = "\r\n" if "\r\n" in text else "\n"

kept = []
for raw_line in text.splitlines(True):
    stripped = raw_line.lstrip()
    if stripped.startswith("--"):
        continue
    if re.match(r"^\s*COMMENT ON\s+(TABLE|COLUMN)\s+", raw_line):
        continue
    kept.append(raw_line)

clean = "".join(kept)

create_re = re.compile(
    r"(CREATE TABLE IF NOT EXISTS\s+billiards_ods\.(?P<table>[A-Za-z0-9_]+)\s*\([\s\S]*?\)\s*;)",
    re.M,
)

out_parts = []
last = 0
count = 0
for m in create_re.finditer(clean):
    out_parts.append(clean[last : m.end()])
    table = m.group("table")
    create_sql = m.group(1)

    cols = parse_columns_from_ddl(create_sql)
    analysis_text = (DOC_DIR / f"{table}-Analysis.md").read_text(encoding="utf-8")
    data = json.loads((DOC_DIR / f"{table}.json").read_text(encoding="utf-8"))
    record_list, record_node = _find_best_record_list_and_node(data, cols)

    out_parts.append(newline + newline + build_comment_block(table, cols, analysis_text, (record_list, record_node)) + newline + newline)
    last = m.end()
    count += 1

out_parts.append(clean[last:])

result = "".join(out_parts)
result = re.sub(r"(?:\r?\n){4,}", newline * 3, result)

backup = SQL_PATH.with_suffix(SQL_PATH.suffix + ".rewrite2.bak")
backup.write_text(text, encoding="utf-8")
SQL_PATH.write_text(result, encoding="utf-8")

print(f"Rewrote comments for {count} tables. Backup: {backup}")
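After the generated schema_ODS_doc.sql has been applied to the database, the comments can be read back with PostgreSQL's built-in description functions; a minimal sketch, using billiards_ods.member_profiles and tenant_id only because they appear in TABLE_CN and COMMON_FIELD_PURPOSE above:

-- Read back a table comment and one column comment to confirm they were applied.
SELECT obj_description('billiards_ods.member_profiles'::regclass, 'pg_class') AS table_comment;
SELECT col_description('billiards_ods.member_profiles'::regclass, a.attnum) AS column_comment
FROM pg_attribute a
WHERE a.attrelid = 'billiards_ods.member_profiles'::regclass
  AND a.attname = 'tenant_id';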
tmp/schema_ODS_doc.sql (1907 lines, normal file; diff suppressed because it is too large)
tmp/schema_dwd_doc.sql (1878 lines, normal file; diff suppressed because it is too large)