Tidy up SQL comments

Neo
2025-12-13 08:26:09 +08:00
parent 0ab040b9fb
commit 90fb63feaf
11 changed files with 12634 additions and 1732 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,634 @@
# -*- coding: utf-8 -*-
import ast
import json
import re
from collections import deque
from pathlib import Path
ROOT = Path(r"C:\dev\LLTQ\ETL\feiqiu-ETL")
SQL_PATH = ROOT / "etl_billiards" / "database" / "schema_dwd_doc.sql"
DOC_DIR = Path(r"C:\dev\LLTQ\export\test-json-doc")
DWD_TASK_PATH = ROOT / "etl_billiards" / "tasks" / "dwd_load_task.py"
SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
SITEPROFILE_FIELD_PURPOSE = {
"id": "门店 ID用于门店维度关联。",
"org_id": "组织/机构 ID用于组织维度归属。",
"shop_name": "门店名称,用于展示与查询。",
"site_label": "门店标签(如 A/B 店),用于展示与分组。",
"full_address": "门店详细地址,用于展示与地理信息。",
"address": "门店地址简称/快照,用于展示。",
"longitude": "经度,用于定位与地图展示。",
"latitude": "纬度,用于定位与地图展示。",
"tenant_site_region_id": "租户下门店区域 ID用于区域维度分析。",
"business_tel": "门店电话,用于联系信息展示。",
"site_type": "门店类型枚举,用于门店分类。",
"shop_status": "门店状态枚举,用于营业状态标识。",
"tenant_id": "租户/品牌 ID用于商户维度过滤与关联。",
"auto_light": "是否启用自动灯控配置,用于门店设备策略。",
"attendance_enabled": "是否启用考勤功能,用于门店考勤配置。",
"attendance_distance": "考勤允许距离(米),用于考勤打卡限制。",
"prod_env": "环境标识(生产/测试),用于区分配置环境。",
"light_status": "灯控状态/开关,用于灯控设备管理。",
"light_type": "灯控类型,用于设备类型区分。",
"light_token": "灯控控制令牌,用于对接灯控服务。",
"avatar": "门店头像/图片 URL用于展示。",
"wifi_name": "门店 WiFi 名称,用于展示与引导。",
"wifi_password": "门店 WiFi 密码,用于展示与引导。",
"customer_service_qrcode": "客服二维码 URL用于引导联系。",
"customer_service_wechat": "客服微信号,用于引导联系。",
"fixed_pay_qrCode": "固定收款码二维码URL用于收款引导。",
"create_time": "门店创建时间(快照字段)。",
"update_time": "门店更新时间(快照字段)。",
}
def _escape_sql(s: str) -> str:
return (s or "").replace("'", "''")
def _first_sentence(text: str, max_len: int = 140) -> str:
s = re.sub(r"\s+", " ", (text or "").strip())
if not s:
return ""
parts = re.split(r"[。；;]\s*", s)
s = parts[0].strip() if parts else s
if len(s) > max_len:
s = s[: max_len - 1] + "…"
return s
def normalize_key(s: str) -> str:
return re.sub(r"[_\-\s]", "", (s or "").lower())
def snake_to_lower_camel(s: str) -> str:
parts = re.split(r"[_\-\s]+", s)
if not parts:
return s
first = parts[0].lower()
rest = "".join(p[:1].upper() + p[1:] for p in parts[1:] if p)
return first + rest
def snake_to_upper_camel(s: str) -> str:
parts = re.split(r"[_\-\s]+", s)
return "".join(p[:1].upper() + p[1:] for p in parts if p)
def find_key_in_record(record: dict, token: str) -> str | None:
if not isinstance(record, dict):
return None
if token in record:
return token
norm_to_key = {normalize_key(k): k for k in record.keys()}
candidates = [
token,
token.lower(),
token.upper(),
snake_to_lower_camel(token),
snake_to_upper_camel(token),
]
# Common variants: siteProfile / siteprofile
if normalize_key(token) == "siteprofile":
candidates.extend(["siteProfile", "siteprofile"])
for c in candidates:
nk = normalize_key(c)
if nk in norm_to_key:
return norm_to_key[nk]
return None
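# Illustrative lookup (hypothetical record):
#   find_key_in_record({"orderTradeNo": "T1"}, "order_trade_no") -> "orderTradeNo"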
def parse_dwd_task_mappings(path: Path):
mod = ast.parse(path.read_text(encoding="utf-8"))
table_map = None
fact_mappings = None
for node in mod.body:
if isinstance(node, ast.ClassDef) and node.name == "DwdLoadTask":
for stmt in node.body:
if isinstance(stmt, ast.Assign) and len(stmt.targets) == 1 and isinstance(stmt.targets[0], ast.Name):
name = stmt.targets[0].id
if name == "TABLE_MAP":
table_map = ast.literal_eval(stmt.value)
elif name == "FACT_MAPPINGS":
fact_mappings = ast.literal_eval(stmt.value)
if isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name):
name = stmt.target.id
if name == "TABLE_MAP":
table_map = ast.literal_eval(stmt.value)
elif name == "FACT_MAPPINGS":
fact_mappings = ast.literal_eval(stmt.value)
if not isinstance(table_map, dict) or not isinstance(fact_mappings, dict):
raise RuntimeError("Failed to parse TABLE_MAP/FACT_MAPPINGS from dwd_load_task.py")
return table_map, fact_mappings
def parse_columns_from_ddl(create_sql: str):
start = create_sql.find("(")
end = create_sql.rfind(")")
body = create_sql[start + 1 : end]
cols = []
for line in body.splitlines():
s = line.strip().rstrip(",")
if not s:
continue
if s.upper().startswith("PRIMARY KEY"):
continue
if s.upper().startswith("CONSTRAINT "):
continue
m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s+", s)
if not m:
continue
name = m.group(1)
if name.upper() in {"PRIMARY", "UNIQUE", "FOREIGN", "CHECK"}:
continue
cols.append(name.lower())
return cols
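# Illustrative parse (hypothetical DDL; note the body is split by lines):
#   parse_columns_from_ddl("CREATE TABLE t (\n  id BIGINT,\n  pay_time TIMESTAMPTZ,\n  PRIMARY KEY (id)\n);")
#   -> ["id", "pay_time"]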
def _find_best_record_list(data, required_norm_keys: set[str]):
best = None
best_score = -1.0
best_path: list[str] = []
q = deque([(data, 0, [])])
visited = 0
while q and visited < 25000:
node, depth, path = q.popleft()
visited += 1
if depth > 10:
continue
if isinstance(node, list):
if node and all(isinstance(x, dict) for x in node[:3]):
scores = []
for x in node[:5]:
keys_norm = {normalize_key(k) for k in x.keys()}
scores.append(len(keys_norm & required_norm_keys))
score = sum(scores) / max(1, len(scores))
if score > best_score:
best_score = score
best = node
best_path = path
for x in node[:10]:
q.append((x, depth + 1, path))
else:
for x in node[:120]:
q.append((x, depth + 1, path))
elif isinstance(node, dict):
for k, v in list(node.items())[:160]:
q.append((v, depth + 1, path + [str(k)]))
node_str = ".".join(best_path) if best_path else "$"
return best or [], node_str
def _format_example(value, max_len: int = 120) -> str:
if value is None:
return "NULL"
if isinstance(value, bool):
return "true" if value else "false"
if isinstance(value, (int, float)):
return str(value)
if isinstance(value, str):
s = value.strip()
if len(s) > max_len:
s = s[: max_len - 1] + "…"
return s
if isinstance(value, dict):
keys = list(value)[:6]
mini = {k: value.get(k) for k in keys}
rendered = json.dumps(mini, ensure_ascii=False)
if len(value) > len(keys):
rendered = rendered[:-1] + ", …}"
if len(rendered) > max_len:
rendered = rendered[: max_len - 1] + "…"
return rendered
if isinstance(value, list):
if not value:
return "[]"
rendered = json.dumps(value[0], ensure_ascii=False)
if len(value) > 1:
rendered = f"[{rendered}, …] (len={len(value)})"
else:
rendered = f"[{rendered}]"
if len(rendered) > max_len:
rendered = rendered[: max_len - 1] + "…"
return rendered
s = str(value)
if len(s) > max_len:
s = s[: max_len - 1] + "…"
return s
def _infer_purpose(table: str, col: str, json_path: str | None) -> str:
lcol = col.lower()
if lcol in SCD_COLS:
if lcol == "scd2_start_time":
return "SCD2 开始时间(版本生效起点),用于维度慢变追踪。"
if lcol == "scd2_end_time":
return "SCD2 结束时间(默认 9999-12-31 表示当前版本),用于维度慢变追踪。"
if lcol == "scd2_is_current":
return "SCD2 当前版本标记1=当前0=历史),用于筛选最新维度记录。"
if lcol == "scd2_version":
return "SCD2 版本号(自增),用于与时间段一起避免版本重叠。"
if json_path and json_path.startswith("siteProfile."):
sf = json_path.split(".", 1)[1]
return SITEPROFILE_FIELD_PURPOSE.get(sf, "门店快照字段,用于门店维度补充信息。")
if lcol.endswith("_id"):
return "标识类 ID 字段,用于关联/定位相关实体。"
if lcol.endswith("_time") or lcol.endswith("time") or lcol.endswith("_date"):
return "时间/日期字段,用于记录业务时间与统计口径对齐。"
if any(k in lcol for k in ["amount", "money", "fee", "price", "deduct", "cost", "balance"]):
return "金额字段,用于计费/结算/核算等金额计算。"
if any(k in lcol for k in ["count", "num", "number", "seconds", "qty", "quantity"]):
return "数量/时长字段,用于统计与计量。"
if lcol.endswith("_name") or lcol.endswith("name"):
return "名称字段,用于展示与辅助识别。"
if lcol.endswith("_status") or lcol == "status":
return "状态枚举字段,用于标识业务状态。"
if lcol.startswith("is_") or lcol.startswith("can_"):
return "布尔/开关字段,用于表示是否/可用性等业务开关。"
# Table-level fallback
if table.startswith("dim_"):
return "维度字段,用于补充维度属性。"
return "明细字段,用于记录事实取值。"
def _parse_json_extract(expr: str):
# e.g. siteprofile->>'org_id'
m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s*->>\s*'([^']+)'\s*$", expr)
if not m:
return None
base = m.group(1)
field = m.group(2)
if normalize_key(base) == "siteprofile":
base = "siteProfile"
return base, field
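# Illustrative parse (hypothetical expressions):
#   _parse_json_extract("siteprofile->>'org_id'") -> ("siteProfile", "org_id")
#   _parse_json_extract("pay_time")               -> None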
def build_table_comment(table: str, source_ods: str | None, source_json_base: str | None) -> str:
table_l = table.lower()
if table_l.startswith("dim_"):
kind = "DWD 维度表"
else:
kind = "DWD 明细事实表"
extra = "扩展字段表" if table_l.endswith("_ex") else ""
if source_ods and source_json_base:
src = (
f"ODS 来源表:{source_ods}(对应 JSON{source_json_base}.json分析{source_json_base}-Analysis.md"
f"装载/清洗逻辑参考etl_billiards/tasks/dwd_load_task.pyDwdLoadTask"
)
else:
src = "来源:由 ODS 清洗装载生成(详见 DWD 装载任务)。"
return f"{kind}{('' + extra + '') if extra else ''}{table_l}{src}"
def get_source_info(table_l: str, table_map: dict) -> tuple[str | None, str | None]:
key = f"billiards_dwd.{table_l}"
source_ods = table_map.get(key)
if not source_ods:
return None, None
json_base = source_ods.split(".")[-1]
return source_ods, json_base
def build_column_mappings(table_l: str, cols: list[str], fact_mappings: dict) -> dict[str, tuple[str | None, str | None]]:
# return col -> (json_path, src_expr)
mapping_list = fact_mappings.get(f"billiards_dwd.{table_l}") or []
explicit = {dwd_col.lower(): src_expr for dwd_col, src_expr, _cast in mapping_list}
casts = {dwd_col.lower(): cast for dwd_col, _src_expr, cast in mapping_list}
out: dict[str, tuple[str | None, str | None]] = {}
for c in cols:
if c in SCD_COLS:
out[c] = (None, None)
continue
src_expr = explicit.get(c, c)
cast = casts.get(c)
json_path = None
parsed = _parse_json_extract(src_expr)
if parsed:
base, field = parsed
json_path = f"{base}.{field}"
else:
# derived: pay_date uses pay_time + cast date
if cast == "date":
json_path = src_expr
else:
json_path = src_expr
out[c] = (json_path, src_expr)
return out
def load_json_records(json_base: str, required_norm_keys: set[str]):
json_path = DOC_DIR / f"{json_base}.json"
data = json.loads(json_path.read_text(encoding="utf-8"))
return _find_best_record_list(data, required_norm_keys)
def pick_example_from_record(record: dict, json_path: str | None):
if not json_path:
return None
if json_path.startswith("siteProfile."):
base_key = find_key_in_record(record, "siteProfile")
base = record.get(base_key) if base_key else None
if isinstance(base, dict):
field = json_path.split(".", 1)[1]
return base.get(field)
return None
# plain key
key = find_key_in_record(record, json_path)
if key:
return record.get(key)
# fallback: try match by normalized name
nk = normalize_key(json_path)
for k in record.keys():
if normalize_key(k) == nk:
return record.get(k)
return None
def resolve_json_field_display(records: list, json_path: str | None, cast: str | None = None) -> str:
if not json_path:
return ""
if json_path.startswith("siteProfile."):
return json_path
actual_key = None
for r in records[:80]:
if not isinstance(r, dict):
continue
k = find_key_in_record(r, json_path)
if k:
actual_key = k
break
base = actual_key or json_path
if cast == "date":
return f"{base}派生DATE({base})"
if cast == "boolean":
return f"{base}派生BOOLEAN({base})"
if cast in {"numeric", "timestamptz"}:
return f"{base}派生CAST({base} AS {cast})"
return base
def resolve_ods_source_field(records: list, src_expr: str | None, cast: str | None = None) -> str:
if not src_expr:
return ""
parsed = _parse_json_extract(src_expr)
if parsed:
base, field = parsed
# Normalize the base key's casing for display
if normalize_key(base) == "siteprofile":
base = "siteProfile"
return f"{base}.{field}"
# Plain field: prefer the actual JSON key name (casing / camelCase variants)
actual = None
for r in records[:80]:
if not isinstance(r, dict):
continue
k = find_key_in_record(r, src_expr)
if k:
actual = k
break
base = actual or src_expr
if cast == "date":
return f"{base}派生DATE({base})"
if cast == "boolean":
return f"{base}派生BOOLEAN({base})"
if cast in {"numeric", "timestamptz"}:
return f"{base}派生CAST({base} AS {cast})"
return base
def resolve_json_field_triplet(
json_file: str | None,
record_node: str | None,
records: list,
json_path: str | None,
cast: str | None = None,
) -> str:
if not json_file:
json_file = ""
node = record_node or "$"
if not json_path:
return f"{json_file} - 无 - 无"
if json_path.startswith("siteProfile."):
base_key = None
field_key = None
for r in records[:80]:
if not isinstance(r, dict):
continue
base_key = find_key_in_record(r, "siteProfile")
if base_key:
base = r.get(base_key)
if isinstance(base, dict):
raw_field = json_path.split(".", 1)[1]
# Try to match the sub-field's actual casing
if raw_field in base:
field_key = raw_field
else:
nk = normalize_key(raw_field)
for k in base.keys():
if normalize_key(k) == nk:
field_key = k
break
break
base_key = base_key or "siteProfile"
field_key = field_key or json_path.split(".", 1)[1]
node = f"{node}.{base_key}" if node else base_key
field = field_key
else:
actual = None
for r in records[:80]:
if isinstance(r, dict):
actual = find_key_in_record(r, json_path)
if actual:
break
field = actual or json_path
if cast == "date":
field = f"{field}派生DATE({field})"
elif cast == "boolean":
field = f"{field}派生BOOLEAN({field})"
elif cast in {"numeric", "timestamptz"}:
field = f"{field}派生CAST({field} AS {cast})"
return f"{json_file} - {node} - {field}"
def main():
table_map, fact_mappings = parse_dwd_task_mappings(DWD_TASK_PATH)
raw = SQL_PATH.read_text(encoding="utf-8", errors="replace")
newline = "\r\n" if "\r\n" in raw else "\n"
# strip all sql comments and existing COMMENT ON statements, incl. DO-block comment exec lines
kept_lines = []
for line in raw.splitlines(True):
if line.lstrip().startswith("--"):
continue
if re.match(r"^\s*COMMENT ON\s+(TABLE|COLUMN)\s+", line, re.I):
continue
if "COMMENT ON COLUMN" in line or "COMMENT ON TABLE" in line:
# remove legacy execute format lines too
continue
kept_lines.append(line)
clean = "".join(kept_lines)
create_re = re.compile(
r"(^\s*CREATE TABLE IF NOT EXISTS\s+(?P<table>[A-Za-z0-9_]+)\s*\([\s\S]*?\)\s*;)",
re.M,
)
out_parts = []
last = 0
count_tables = 0
for m in create_re.finditer(clean):
stmt = m.group(1)
table = m.group("table").lower()
out_parts.append(clean[last : m.end()])
cols = parse_columns_from_ddl(stmt)
source_ods, json_base = get_source_info(table, table_map)
# derive required keys
required_norm = set()
col_map = build_column_mappings(table, cols, fact_mappings)
# cast map for json field display
cast_map = {
dwd_col.lower(): cast
for dwd_col, _src_expr, cast in (fact_mappings.get(f"billiards_dwd.{table}") or [])
}
src_expr_map = {
dwd_col.lower(): src_expr
for dwd_col, src_expr, _cast in (fact_mappings.get(f"billiards_dwd.{table}") or [])
}
for c, (jp, _src) in col_map.items():
if not jp:
continue
if jp.startswith("siteProfile."):
required_norm.add(normalize_key("siteProfile"))
else:
required_norm.add(normalize_key(jp))
records = []
record_node = "$"
if json_base and (DOC_DIR / f"{json_base}.json").exists():
try:
records, record_node = load_json_records(json_base, required_norm)
except Exception:
records = []
record_node = "$"
table_comment = build_table_comment(table, source_ods, json_base)
comment_lines = [f"COMMENT ON TABLE billiards_dwd.{table} IS '{_escape_sql(table_comment)}';"]
for c in cols:
jp, _src = col_map.get(c, (None, None))
if c in SCD_COLS:
if c == "scd2_start_time":
ex = "2025-11-10T00:00:00+08:00"
elif c == "scd2_end_time":
ex = "9999-12-31T00:00:00+00:00"
elif c == "scd2_is_current":
ex = "1"
else:
ex = "1"
json_field = "无 - DWD慢变元数据 - 无"
ods_src = "DWD慢变元数据"
else:
# pick example from first records
ex_val = None
for r in records[:80]:
v = pick_example_from_record(r, jp)
if v not in (None, ""):
ex_val = v
break
ex = _format_example(ex_val)
json_field = resolve_json_field_triplet(
f"{json_base}.json" if json_base else None,
record_node,
records,
jp,
cast_map.get(c),
)
src_expr = src_expr_map.get(c, jp)
ods_src = resolve_ods_source_field(records, src_expr, cast_map.get(c))
purpose = _first_sentence(_infer_purpose(table, c, jp), 140)
func = purpose
if "用于" not in func:
func = "用于" + func.rstrip("")
if source_ods:
ods_table_only = source_ods.split(".")[-1]
ods_src_display = f"{ods_table_only} - {ods_src}"
else:
ods_src_display = f"无 - {ods_src}"
comment = (
f"【说明】{purpose}"
f" 【示例】{ex}{func})。"
f" 【ODS来源】{ods_src_display}"
f" 【JSON字段】{json_field}"
)
comment_lines.append(
f"COMMENT ON COLUMN billiards_dwd.{table}.{c} IS '{_escape_sql(comment)}';"
)
out_parts.append(newline + newline + (newline.join(comment_lines)) + newline + newline)
last = m.end()
count_tables += 1
out_parts.append(clean[last:])
result = "".join(out_parts)
# collapse extra blank lines
result = re.sub(r"(?:\r?\n){4,}", newline * 3, result)
backup = SQL_PATH.with_suffix(SQL_PATH.suffix + ".bak")
if not backup.exists():
backup.write_text(raw, encoding="utf-8")
SQL_PATH.write_text(result, encoding="utf-8")
print(f"Rewrote comments for {count_tables} tables: {SQL_PATH}")
if __name__ == "__main__":
main()
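# The statements emitted above follow this shape (illustrative only, not actual output):
#   COMMENT ON TABLE billiards_dwd.<table> IS 'DWD 维度表/明细事实表：<table>。ODS 来源表：...';
#   COMMENT ON COLUMN billiards_dwd.<table>.<col> IS
#     '【说明】... 【示例】...（...）。 【ODS来源】<ODS 表> - <字段> 【JSON字段】<file>.json - <node> - <字段>';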


@@ -0,0 +1,560 @@
# -*- coding: utf-8 -*-
import json
import re
from pathlib import Path
from collections import defaultdict
SQL_PATH = Path(r"C:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\database\schema_ODS_doc.sql")
DOC_DIR = Path(r"C:\dev\LLTQ\export\test-json-doc")
TABLE_CN = {
"member_profiles": "会员档案/会员账户信息",
"member_balance_changes": "会员余额变更流水",
"member_stored_value_cards": "会员储值/卡券账户列表",
"recharge_settlements": "充值结算记录",
"settlement_records": "结账/结算记录",
"assistant_cancellation_records": "助教作废/取消记录",
"assistant_accounts_master": "助教档案主数据",
"assistant_service_records": "助教服务流水",
"site_tables_master": "门店桌台主数据",
"table_fee_discount_records": "台费折扣记录",
"table_fee_transactions": "台费流水",
"goods_stock_movements": "商品库存变动流水",
"stock_goods_category_tree": "商品分类树",
"goods_stock_summary": "商品库存汇总",
"payment_transactions": "支付流水",
"refund_transactions": "退款流水",
"platform_coupon_redemption_records": "平台券核销/使用记录",
"tenant_goods_master": "租户商品主数据",
"group_buy_packages": "团购套餐主数据",
"group_buy_redemption_records": "团购核销记录",
"settlement_ticket_details": "结算小票明细",
"store_goods_master": "门店商品主数据",
"store_goods_sales_records": "门店商品销售流水",
}
COMMON_FIELD_PURPOSE = {
"tenant_id": "租户/品牌 ID用于商户维度过滤与关联。",
"site_id": "门店 ID用于门店维度过滤与关联。",
"register_site_id": "会员注册门店 ID用于归属门店维度关联。",
"site_name": "门店名称快照,用于直接展示。",
"id": "本表主键 ID用于唯一标识一条记录。",
"system_member_id": "系统级会员 ID跨门店/跨卡种统一到‘人’的维度)。",
"order_trade_no": "订单交易号,用于串联同一订单下的各类消费明细。",
"order_settle_id": "订单结算/结账主键,用于关联结算记录与小票明细。",
"order_pay_id": "关联支付流水的主键 ID用于追溯支付明细。",
"point": "积分余额,用于记录会员积分取值。",
"growth_value": "成长值/成长积分,用于会员成长与等级评估。",
"referrer_member_id": "推荐人会员 ID用于记录会员推荐/拉新关系。",
"create_time": "记录创建时间(业务侧产生时间)。",
"status": "状态枚举,用于标识记录当前业务状态。",
"user_status": "用户状态枚举,用于标识会员账户/用户可用状态。",
"is_delete": "逻辑删除标记0=否1=是)。",
"payload": "完整原始 JSON 记录快照,用于回溯与二次解析。",
"source_file": "ETL 元数据:原始导出文件名,用于数据追溯。",
"source_endpoint": "ETL 元数据:采集来源(接口/文件路径),用于数据追溯。",
"fetched_at": "ETL 元数据:采集/入库时间戳,用于口径对齐与增量处理。",
}
ETL_META_FIELDS = {"source_file", "source_endpoint", "fetched_at"}
def _first_sentence(text: str, max_len: int = 120) -> str:
s = re.sub(r"\s+", " ", (text or "").strip())
if not s:
return ""
parts = re.split(r"[。；;]\s*", s)
s = parts[0].strip() if parts else s
if len(s) > max_len:
s = s[: max_len - 1] + "…"
return s
def _escape_sql(s: str) -> str:
return (s or "").replace("'", "''")
def normalize_key(s: str) -> str:
return re.sub(r"[_\-\s]", "", (s or "").lower())
def snake_to_lower_camel(s: str) -> str:
parts = re.split(r"[_\-\s]+", s)
if not parts:
return s
first = parts[0].lower()
rest = "".join(p[:1].upper() + p[1:] for p in parts[1:] if p)
return first + rest
def snake_to_upper_camel(s: str) -> str:
parts = re.split(r"[_\-\s]+", s)
return "".join(p[:1].upper() + p[1:] for p in parts if p)
def find_key_in_record(record: dict, token: str) -> str | None:
if not isinstance(record, dict) or not token:
return None
if token in record:
return token
norm_to_key = {normalize_key(k): k for k in record.keys()}
candidates = [
token,
token.lower(),
token.upper(),
snake_to_lower_camel(token),
snake_to_upper_camel(token),
]
for c in candidates:
nk = normalize_key(c)
if nk in norm_to_key:
return norm_to_key[nk]
return None
def _infer_purpose(_table: str, col: str) -> str:
if col in COMMON_FIELD_PURPOSE:
return COMMON_FIELD_PURPOSE[col]
lower = col.lower()
if lower.endswith("_id"):
return "标识类 ID 字段,用于关联/定位相关实体。"
if lower.endswith("_time") or lower.endswith("time"):
return "时间字段,用于记录业务时间点/发生时间。"
if any(k in lower for k in ["amount", "money", "fee", "price", "deduct", "cost"]):
return "金额字段,用于计费/结算/分摊等金额计算。"
if any(k in lower for k in ["count", "num", "number", "seconds", "qty"]):
return "数量/时长字段,用于统计与计量。"
if lower.endswith("_name") or lower.endswith("name"):
return "名称字段,用于展示与辅助识别。"
if lower.endswith("_code") or lower.endswith("code"):
return "编码/枚举字段,用于表示类型、等级或业务枚举。"
if lower.startswith("is_") or lower.startswith("able_") or lower.startswith("can_"):
return "布尔/开关字段,用于表示权限、可用性或状态开关。"
return "来自 JSON 导出的原始字段,用于保留业务取值。"
def _format_example(value, max_len: int = 120) -> str:
if value is None:
return "NULL"
if isinstance(value, bool):
return "true" if value else "false"
if isinstance(value, (int, float)):
return str(value)
if isinstance(value, str):
s = value.strip()
if len(s) > max_len:
s = s[: max_len - 1] + "…"
return s
if isinstance(value, list):
if not value:
return "[]"
sample = value[0]
rendered = json.dumps(sample, ensure_ascii=False)
if len(value) > 1:
rendered = f"[{rendered}, …] (len={len(value)})"
else:
rendered = f"[{rendered}]"
if len(rendered) > max_len:
rendered = rendered[: max_len - 1] + "…"
return rendered
if isinstance(value, dict):
keys = list(value)[:6]
mini = {k: value.get(k) for k in keys}
rendered = json.dumps(mini, ensure_ascii=False)
if len(value) > len(keys):
rendered = rendered[:-1] + ", …}"
if len(rendered) > max_len:
rendered = rendered[: max_len - 1] + "…"
return rendered
rendered = str(value)
if len(rendered) > max_len:
rendered = rendered[: max_len - 1] + "…"
return rendered
def _find_best_record_list(data, columns):
cols = set(columns)
best = None
best_score = -1
queue = [(data, 0)]
visited = 0
while queue and visited < 20000:
node, depth = queue.pop(0)
visited += 1
if depth > 8:
continue
if isinstance(node, list):
if node and all(isinstance(x, dict) for x in node[:3]):
scores = []
for x in node[:5]:
scores.append(len(set(x.keys()) & cols))
score = sum(scores) / max(1, len(scores))
if score > best_score:
best_score = score
best = node
for x in node[:10]:
queue.append((x, depth + 1))
else:
for x in node[:50]:
queue.append((x, depth + 1))
elif isinstance(node, dict):
for v in list(node.values())[:80]:
queue.append((v, depth + 1))
return best
def _find_best_record_list_and_node(data, columns):
cols = set(columns)
best = None
best_score = -1
best_path = []
queue = [(data, 0, [])]
visited = 0
while queue and visited < 25000:
node, depth, path = queue.pop(0)
visited += 1
if depth > 10:
continue
if isinstance(node, list):
if node and all(isinstance(x, dict) for x in node[:3]):
scores = []
for x in node[:5]:
scores.append(len(set(x.keys()) & cols))
score = sum(scores) / max(1, len(scores))
if score > best_score:
best_score = score
best = node
best_path = path
for x in node[:10]:
queue.append((x, depth + 1, path))
else:
for x in node[:80]:
queue.append((x, depth + 1, path))
elif isinstance(node, dict):
for k, v in list(node.items())[:120]:
queue.append((v, depth + 1, path + [str(k)]))
node_str = ".".join(best_path) if best_path else "$"
return best or [], node_str
def _choose_examples(records, columns):
examples = {}
if not records:
return examples
for col in columns:
val = None
for r in records[:120]:
if isinstance(r, dict) and col in r and r[col] not in (None, ""):
val = r[col]
break
examples[col] = val
return examples
def _extract_header_fields(line: str, columns_set):
s = line.strip()
if not s:
return []
# Supports header lines like: 1. id / 1.1 siteProfile / 8. tenant_id
m = re.match(r"^\d+(?:\.\d+)*[\.)]?\s+(.+)$", s)
if m:
s = m.group(1).strip()
parts = re.split(r"\s*[/、,]\s*", s)
fields = [p.strip() for p in parts if p.strip() in columns_set]
if not fields and s in columns_set:
fields = [s]
if fields and len(line) <= 120:
return fields
return []
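# Illustrative match (hypothetical analysis line and column set):
#   _extract_header_fields("8. tenant_id", {"tenant_id", "site_id"}) -> ["tenant_id"]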
def _parse_field_purpose_from_block(block_lines):
lines = [l.rstrip() for l in block_lines]
def pick_after_label(labels):
for i, l in enumerate(lines):
for lab in labels:
if lab in l:
after = l.split(lab, 1)[1].strip()
if after:
return after
buf = []
j = i + 1
while j < len(lines) and not lines[j].strip():
j += 1
for k in range(j, len(lines)):
if not lines[k].strip():
break
if re.match(r"^[\w\u4e00-\u9fff]+[:]", lines[k].strip()):
break
buf.append(lines[k].strip())
if buf:
return " ".join(buf)
return ""
# Also accept variants such as 「含义(结合其它文件):」 and 「含义(推测):」
picked = pick_after_label(["含义：", "含义:"])
if not picked:
for i, l in enumerate(lines):
s = l.strip()
m = re.match(r"^含义.*[:]\s*(.*)$", s)
if m:
after = m.group(1).strip()
if after:
picked = after
else:
buf = []
j = i + 1
while j < len(lines) and not lines[j].strip():
j += 1
for k in range(j, len(lines)):
if not lines[k].strip():
break
if re.match(r"^[\w\u4e00-\u9fff]+[:]", lines[k].strip()):
break
buf.append(lines[k].strip())
if buf:
picked = " ".join(buf)
break
if not picked:
picked = pick_after_label(["作用:", "作用:"])
if not picked:
for i, l in enumerate(lines):
s = l.strip()
m = re.match(r"^作用.*[:]\s*(.*)$", s)
if m:
after = m.group(1).strip()
if after:
picked = after
break
if not picked:
# Fallback: try to skip descriptive lines such as “类型:” / “唯一值个数:”
for l in lines:
s = l.strip()
if not s:
continue
if any(
s.startswith(prefix)
for prefix in [
"类型:",
"非空:",
"唯一值",
"观测",
"特征",
"统计",
"分布",
"说明:",
"关联:",
"结构关系",
"和其它表",
"重复记录",
"全部为",
]
):
continue
picked = s
break
return _first_sentence(picked, 160)
def _is_poor_purpose(purpose: str) -> bool:
s = (purpose or "").strip()
if not s:
return True
if s.endswith("：") or s.endswith(":"):
return True
if s.startswith("全部为"):
return True
if s.startswith("含义") and ("" in s or ":" in s) and len(s) <= 12:
return True
return False
def parse_analysis(analysis_text: str, columns):
columns_set = set(columns)
blocks = defaultdict(list)
current_fields = []
buf = []
for raw in analysis_text.splitlines():
fields = _extract_header_fields(raw, columns_set)
if fields:
if current_fields and buf:
for f in current_fields:
blocks[f].extend(buf)
current_fields = fields
buf = []
else:
if current_fields:
buf.append(raw)
if current_fields and buf:
for f in current_fields:
blocks[f].extend(buf)
purposes = {}
for col in columns:
if col in blocks and blocks[col]:
p = _parse_field_purpose_from_block(blocks[col])
if p:
purposes[col] = p
return purposes
def parse_columns_from_ddl(create_sql: str):
start = create_sql.find("(")
end = create_sql.rfind(")")
body = create_sql[start + 1 : end]
cols = []
for line in body.splitlines():
s = line.strip().rstrip(",")
if not s:
continue
if s.startswith(")"):
continue
if s.upper().startswith("CONSTRAINT "):
continue
m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s+", s)
if not m:
continue
name = m.group(1)
if name.upper() in {"PRIMARY", "UNIQUE", "FOREIGN", "CHECK"}:
continue
cols.append(name)
return cols
def build_comment_block(table: str, columns, analysis_text: str, records):
# records_node is determined by the caller, to avoid re-walking the JSON here
records, records_node = records
purposes = parse_analysis(analysis_text, columns)
examples = _choose_examples(records, columns)
table_cn = TABLE_CN.get(table, table)
table_comment = (
f"ODS 原始明细表:{table_cn}"
f"来源C:/dev/LLTQ/export/test-json-doc/{table}.json分析{table}-Analysis.md。"
f"字段以导出原样为主ETL 补充 source_file/source_endpoint/fetched_at并保留 payload 为原始记录快照。"
)
lines = []
lines.append(f"COMMENT ON TABLE billiards_ods.{table} IS '{_escape_sql(table_comment)}';")
for col in columns:
json_file = f"{table}.json"
if col in ETL_META_FIELDS:
json_field = f"{json_file} - ETL元数据 - 无"
elif col == "payload":
json_field = f"{json_file} - {records_node} - $"
else:
actual = None
for r in records[:50]:
if isinstance(r, dict):
actual = find_key_in_record(r, col)
if actual:
break
field_name = actual or col
json_field = f"{json_file} - {records_node} - {field_name}"
purpose = purposes.get(col) or _infer_purpose(table, col)
purpose = _first_sentence(purpose, 140) or _infer_purpose(table, col)
if _is_poor_purpose(purpose):
purpose = COMMON_FIELD_PURPOSE.get(col) or _infer_purpose(table, col)
if col in ETL_META_FIELDS:
if col == "source_file":
ex = f"{table}.json"
elif col == "source_endpoint":
ex = f"C:/dev/LLTQ/export/test-json-doc/{table}.json"
else:
ex = "2025-11-10T00:00:00+08:00"
elif col == "payload":
ex = "{...}"
else:
ex = _format_example(examples.get(col))
func = purpose
if "用于" not in func:
func = "用于" + func.rstrip("")
# ODS source: "<table> - <column>"; ETL-added columns are marked as such
if col in ETL_META_FIELDS:
ods_src = f"{table} - {col}ETL补充"
else:
ods_src = f"{table} - {col}"
comment = (
f"【说明】{purpose}"
f" 【示例】{ex}{func})。"
f" 【ODS来源】{ods_src}"
f" 【JSON字段】{json_field}"
)
lines.append(
f"COMMENT ON COLUMN billiards_ods.{table}.{col} IS '{_escape_sql(comment)}';"
)
return "\n".join(lines)
text = SQL_PATH.read_text(encoding="utf-8")
newline = "\r\n" if "\r\n" in text else "\n"
kept = []
for raw_line in text.splitlines(True):
stripped = raw_line.lstrip()
if stripped.startswith("--"):
continue
if re.match(r"^\s*COMMENT ON\s+(TABLE|COLUMN)\s+", raw_line):
continue
kept.append(raw_line)
clean = "".join(kept)
create_re = re.compile(
r"(CREATE TABLE IF NOT EXISTS\s+billiards_ods\.(?P<table>[A-Za-z0-9_]+)\s*\([\s\S]*?\)\s*;)" ,
re.M,
)
out_parts = []
last = 0
count = 0
for m in create_re.finditer(clean):
out_parts.append(clean[last : m.end()])
table = m.group("table")
create_sql = m.group(1)
cols = parse_columns_from_ddl(create_sql)
analysis_text = (DOC_DIR / f"{table}-Analysis.md").read_text(encoding="utf-8")
data = json.loads((DOC_DIR / f"{table}.json").read_text(encoding="utf-8"))
record_list, record_node = _find_best_record_list_and_node(data, cols)
out_parts.append(newline + newline + build_comment_block(table, cols, analysis_text, (record_list, record_node)) + newline + newline)
last = m.end()
count += 1
out_parts.append(clean[last:])
result = "".join(out_parts)
result = re.sub(r"(?:\r?\n){4,}", newline * 3, result)
backup = SQL_PATH.with_suffix(SQL_PATH.suffix + ".rewrite2.bak")
backup.write_text(text, encoding="utf-8")
SQL_PATH.write_text(result, encoding="utf-8")
print(f"Rewrote comments for {count} tables. Backup: {backup}")

tmp/schema_ODS_doc copy.sql (Normal file, 1886 lines)

File diff suppressed because it is too large

tmp/schema_ODS_doc.sql (Normal file, 1907 lines)

File diff suppressed because it is too large

tmp/schema_dwd_doc.sql (Normal file, 1878 lines)

File diff suppressed because it is too large