# -*- coding: utf-8 -*- import json import re from pathlib import Path from collections import defaultdict SQL_PATH = Path(r"C:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\database\schema_ODS_doc.sql") DOC_DIR = Path(r"C:\dev\LLTQ\export\test-json-doc") TABLE_CN = { "member_profiles": "会员档案/会员账户信息", "member_balance_changes": "会员余额变更流水", "member_stored_value_cards": "会员储值/卡券账户列表", "recharge_settlements": "充值结算记录", "settlement_records": "结账/结算记录", "assistant_cancellation_records": "助教作废/取消记录", "assistant_accounts_master": "助教档案主数据", "assistant_service_records": "助教服务流水", "site_tables_master": "门店桌台主数据", "table_fee_discount_records": "台费折扣记录", "table_fee_transactions": "台费流水", "goods_stock_movements": "商品库存变动流水", "stock_goods_category_tree": "商品分类树", "goods_stock_summary": "商品库存汇总", "payment_transactions": "支付流水", "refund_transactions": "退款流水", "platform_coupon_redemption_records": "平台券核销/使用记录", "tenant_goods_master": "租户商品主数据", "group_buy_packages": "团购套餐主数据", "group_buy_redemption_records": "团购核销记录", "settlement_ticket_details": "结算小票明细", "store_goods_master": "门店商品主数据", "store_goods_sales_records": "门店商品销售流水", } COMMON_FIELD_PURPOSE = { "tenant_id": "租户/品牌 ID,用于商户维度过滤与关联。", "site_id": "门店 ID,用于门店维度过滤与关联。", "register_site_id": "会员注册门店 ID,用于归属门店维度关联。", "site_name": "门店名称快照,用于直接展示。", "id": "本表主键 ID,用于唯一标识一条记录。", "system_member_id": "系统级会员 ID(跨门店/跨卡种统一到‘人’的维度)。", "order_trade_no": "订单交易号,用于串联同一订单下的各类消费明细。", "order_settle_id": "订单结算/结账主键,用于关联结算记录与小票明细。", "order_pay_id": "关联支付流水的主键 ID,用于追溯支付明细。", "point": "积分余额,用于记录会员积分取值。", "growth_value": "成长值/成长积分,用于会员成长与等级评估。", "referrer_member_id": "推荐人会员 ID,用于记录会员推荐/拉新关系。", "create_time": "记录创建时间(业务侧产生时间)。", "status": "状态枚举,用于标识记录当前业务状态。", "user_status": "用户状态枚举,用于标识会员账户/用户可用状态。", "is_delete": "逻辑删除标记(0=否,1=是)。", "payload": "完整原始 JSON 记录快照,用于回溯与二次解析。", "source_file": "ETL 元数据:原始导出文件名,用于数据追溯。", "source_endpoint": "ETL 元数据:采集来源(接口/文件路径),用于数据追溯。", "fetched_at": "ETL 元数据:采集/入库时间戳,用于口径对齐与增量处理。", } ETL_META_FIELDS = {"source_file", "source_endpoint", "fetched_at"} def _first_sentence(text: str, max_len: int = 120) -> str: s = re.sub(r"\s+", " ", (text or "").strip()) if not s: return "" parts = re.split(r"[。;;]\s*", s) s = parts[0].strip() if parts else s if len(s) > max_len: s = s[: max_len - 1] + "…" return s def _escape_sql(s: str) -> str: return (s or "").replace("'", "''") def normalize_key(s: str) -> str: return re.sub(r"[_\-\s]", "", (s or "").lower()) def snake_to_lower_camel(s: str) -> str: parts = re.split(r"[_\-\s]+", s) if not parts: return s first = parts[0].lower() rest = "".join(p[:1].upper() + p[1:] for p in parts[1:] if p) return first + rest def snake_to_upper_camel(s: str) -> str: parts = re.split(r"[_\-\s]+", s) return "".join(p[:1].upper() + p[1:] for p in parts if p) def find_key_in_record(record: dict, token: str) -> str | None: if not isinstance(record, dict) or not token: return None if token in record: return token norm_to_key = {normalize_key(k): k for k in record.keys()} candidates = [ token, token.lower(), token.upper(), snake_to_lower_camel(token), snake_to_upper_camel(token), ] for c in candidates: nk = normalize_key(c) if nk in norm_to_key: return norm_to_key[nk] return None def _infer_purpose(_table: str, col: str) -> str: if col in COMMON_FIELD_PURPOSE: return COMMON_FIELD_PURPOSE[col] lower = col.lower() if lower.endswith("_id"): return "标识类 ID 字段,用于关联/定位相关实体。" if lower.endswith("_time") or lower.endswith("time"): return "时间字段,用于记录业务时间点/发生时间。" if any(k in lower for k in ["amount", "money", "fee", "price", "deduct", "cost"]): return "金额字段,用于计费/结算/分摊等金额计算。" if any(k in lower for k in ["count", "num", "number", "seconds", "qty"]): return "数量/时长字段,用于统计与计量。" if lower.endswith("_name") or lower.endswith("name"): return "名称字段,用于展示与辅助识别。" if lower.endswith("_code") or lower.endswith("code"): return "编码/枚举字段,用于表示类型、等级或业务枚举。" if lower.startswith("is_") or lower.startswith("able_") or lower.startswith("can_"): return "布尔/开关字段,用于表示权限、可用性或状态开关。" return "来自 JSON 导出的原始字段,用于保留业务取值。" def _format_example(value, max_len: int = 120) -> str: if value is None: return "NULL" if isinstance(value, bool): return "true" if value else "false" if isinstance(value, (int, float)): return str(value) if isinstance(value, str): s = value.strip() if len(s) > max_len: s = s[: max_len - 1] + "…" return s if isinstance(value, list): if not value: return "[]" sample = value[0] rendered = json.dumps(sample, ensure_ascii=False) if len(value) > 1: rendered = f"[{rendered}, …] (len={len(value)})" else: rendered = f"[{rendered}]" if len(rendered) > max_len: rendered = rendered[: max_len - 1] + "…" return rendered if isinstance(value, dict): keys = list(value)[:6] mini = {k: value.get(k) for k in keys} rendered = json.dumps(mini, ensure_ascii=False) if len(value) > len(keys): rendered = rendered[:-1] + ", …}" if len(rendered) > max_len: rendered = rendered[: max_len - 1] + "…" return rendered rendered = str(value) if len(rendered) > max_len: rendered = rendered[: max_len - 1] + "…" return rendered def _find_best_record_list(data, columns): cols = set(columns) best = None best_score = -1 queue = [(data, 0)] visited = 0 while queue and visited < 20000: node, depth = queue.pop(0) visited += 1 if depth > 8: continue if isinstance(node, list): if node and all(isinstance(x, dict) for x in node[:3]): scores = [] for x in node[:5]: scores.append(len(set(x.keys()) & cols)) score = sum(scores) / max(1, len(scores)) if score > best_score: best_score = score best = node for x in node[:10]: queue.append((x, depth + 1)) else: for x in node[:50]: queue.append((x, depth + 1)) elif isinstance(node, dict): for v in list(node.values())[:80]: queue.append((v, depth + 1)) return best def _find_best_record_list_and_node(data, columns): cols = set(columns) best = None best_score = -1 best_path = [] queue = [(data, 0, [])] visited = 0 while queue and visited < 25000: node, depth, path = queue.pop(0) visited += 1 if depth > 10: continue if isinstance(node, list): if node and all(isinstance(x, dict) for x in node[:3]): scores = [] for x in node[:5]: scores.append(len(set(x.keys()) & cols)) score = sum(scores) / max(1, len(scores)) if score > best_score: best_score = score best = node best_path = path for x in node[:10]: queue.append((x, depth + 1, path)) else: for x in node[:80]: queue.append((x, depth + 1, path)) elif isinstance(node, dict): for k, v in list(node.items())[:120]: queue.append((v, depth + 1, path + [str(k)])) node_str = ".".join(best_path) if best_path else "$" return best or [], node_str def _choose_examples(records, columns): examples = {} if not records: return examples for col in columns: val = None for r in records[:120]: if isinstance(r, dict) and col in r and r[col] not in (None, ""): val = r[col] break examples[col] = val return examples def _extract_header_fields(line: str, columns_set): s = line.strip() if not s: return [] # 支持 1. id / 1.1 siteProfile / 8. tenant_id m = re.match(r"^\d+(?:\.\d+)*[\.)]?\s+(.+)$", s) if m: s = m.group(1).strip() parts = re.split(r"\s*[/、,,]\s*", s) fields = [p.strip() for p in parts if p.strip() in columns_set] if not fields and s in columns_set: fields = [s] if fields and len(line) <= 120: return fields return [] def _parse_field_purpose_from_block(block_lines): lines = [l.rstrip() for l in block_lines] def pick_after_label(labels): for i, l in enumerate(lines): for lab in labels: if lab in l: after = l.split(lab, 1)[1].strip() if after: return after buf = [] j = i + 1 while j < len(lines) and not lines[j].strip(): j += 1 for k in range(j, len(lines)): if not lines[k].strip(): break if re.match(r"^[\w\u4e00-\u9fff]+[::]", lines[k].strip()): break buf.append(lines[k].strip()) if buf: return " ".join(buf) return "" # 兼容「含义(结合其它文件):」「含义(推测):」等变体 picked = pick_after_label(["含义:", "含义:"]) if not picked: for i, l in enumerate(lines): s = l.strip() m = re.match(r"^含义.*[::]\s*(.*)$", s) if m: after = m.group(1).strip() if after: picked = after else: buf = [] j = i + 1 while j < len(lines) and not lines[j].strip(): j += 1 for k in range(j, len(lines)): if not lines[k].strip(): break if re.match(r"^[\w\u4e00-\u9fff]+[::]", lines[k].strip()): break buf.append(lines[k].strip()) if buf: picked = " ".join(buf) break if not picked: picked = pick_after_label(["作用:", "作用:"]) if not picked: for i, l in enumerate(lines): s = l.strip() m = re.match(r"^作用.*[::]\s*(.*)$", s) if m: after = m.group(1).strip() if after: picked = after break if not picked: # 兜底:尽量避开“类型:/唯一值个数:”这类描述 for l in lines: s = l.strip() if not s: continue if any( s.startswith(prefix) for prefix in [ "类型:", "非空:", "唯一值", "观测", "特征", "统计", "分布", "说明:", "关联:", "结构关系", "和其它表", "重复记录", "全部为", ] ): continue picked = s break return _first_sentence(picked, 160) def _is_poor_purpose(purpose: str) -> bool: s = (purpose or "").strip() if not s: return True if s.endswith(":") or s.endswith(":"): return True if s.startswith("全部为"): return True if s.startswith("含义") and (":" in s or ":" in s) and len(s) <= 12: return True return False def parse_analysis(analysis_text: str, columns): columns_set = set(columns) blocks = defaultdict(list) current_fields = [] buf = [] for raw in analysis_text.splitlines(): fields = _extract_header_fields(raw, columns_set) if fields: if current_fields and buf: for f in current_fields: blocks[f].extend(buf) current_fields = fields buf = [] else: if current_fields: buf.append(raw) if current_fields and buf: for f in current_fields: blocks[f].extend(buf) purposes = {} for col in columns: if col in blocks and blocks[col]: p = _parse_field_purpose_from_block(blocks[col]) if p: purposes[col] = p return purposes def parse_columns_from_ddl(create_sql: str): start = create_sql.find("(") end = create_sql.rfind(")") body = create_sql[start + 1 : end] cols = [] for line in body.splitlines(): s = line.strip().rstrip(",") if not s: continue if s.startswith(")"): continue if s.upper().startswith("CONSTRAINT "): continue m = re.match(r"^([A-Za-z_][A-Za-z0-9_]*)\s+", s) if not m: continue name = m.group(1) if name.upper() in {"PRIMARY", "UNIQUE", "FOREIGN", "CHECK"}: continue cols.append(name) return cols def build_comment_block(table: str, columns, analysis_text: str, records): # records_node: 由外部确定,避免这里重复遍历 JSON records, records_node = records purposes = parse_analysis(analysis_text, columns) examples = _choose_examples(records, columns) table_cn = TABLE_CN.get(table, table) table_comment = ( f"ODS 原始明细表:{table_cn}。" f"来源:C:/dev/LLTQ/export/test-json-doc/{table}.json;分析:{table}-Analysis.md。" f"字段以导出原样为主;ETL 补充 source_file/source_endpoint/fetched_at,并保留 payload 为原始记录快照。" ) lines = [] lines.append(f"COMMENT ON TABLE billiards_ods.{table} IS '{_escape_sql(table_comment)}';") for col in columns: json_file = f"{table}.json" if col in ETL_META_FIELDS: json_field = f"{json_file} - ETL元数据 - 无" elif col == "payload": json_field = f"{json_file} - {records_node} - $" else: actual = None for r in records[:50]: if isinstance(r, dict): actual = find_key_in_record(r, col) if actual: break field_name = actual or col json_field = f"{json_file} - {records_node} - {field_name}" purpose = purposes.get(col) or _infer_purpose(table, col) purpose = _first_sentence(purpose, 140) or _infer_purpose(table, col) if _is_poor_purpose(purpose): purpose = COMMON_FIELD_PURPOSE.get(col) or _infer_purpose(table, col) if col in ETL_META_FIELDS: if col == "source_file": ex = f"{table}.json" elif col == "source_endpoint": ex = f"C:/dev/LLTQ/export/test-json-doc/{table}.json" else: ex = "2025-11-10T00:00:00+08:00" elif col == "payload": ex = "{...}" else: ex = _format_example(examples.get(col)) func = purpose if "用于" not in func: func = "用于" + func.rstrip("。") # ODS来源:表名-字段名(ODS自身字段);ETL补充字段标记 if col in ETL_META_FIELDS: ods_src = f"{table} - {col}(ETL补充)" else: ods_src = f"{table} - {col}" comment = ( f"【说明】{purpose}。" f" 【示例】{ex}({func})。" f" 【ODS来源】{ods_src}。" f" 【JSON字段】{json_field}。" ) lines.append( f"COMMENT ON COLUMN billiards_ods.{table}.{col} IS '{_escape_sql(comment)}';" ) return "\n".join(lines) text = SQL_PATH.read_text(encoding="utf-8") newline = "\r\n" if "\r\n" in text else "\n" kept = [] for raw_line in text.splitlines(True): stripped = raw_line.lstrip() if stripped.startswith("--"): continue if re.match(r"^\s*COMMENT ON\s+(TABLE|COLUMN)\s+", raw_line): continue kept.append(raw_line) clean = "".join(kept) create_re = re.compile( r"(CREATE TABLE IF NOT EXISTS\s+billiards_ods\.(?P[A-Za-z0-9_]+)\s*\([\s\S]*?\)\s*;)" , re.M, ) out_parts = [] last = 0 count = 0 for m in create_re.finditer(clean): out_parts.append(clean[last : m.end()]) table = m.group("table") create_sql = m.group(1) cols = parse_columns_from_ddl(create_sql) analysis_text = (DOC_DIR / f"{table}-Analysis.md").read_text(encoding="utf-8") data = json.loads((DOC_DIR / f"{table}.json").read_text(encoding="utf-8")) record_list, record_node = _find_best_record_list_and_node(data, cols) out_parts.append(newline + newline + build_comment_block(table, cols, analysis_text, (record_list, record_node)) + newline + newline) last = m.end() count += 1 out_parts.append(clean[last:]) result = "".join(out_parts) result = re.sub(r"(?:\r?\n){4,}", newline * 3, result) backup = SQL_PATH.with_suffix(SQL_PATH.suffix + ".rewrite2.bak") backup.write_text(text, encoding="utf-8") SQL_PATH.write_text(result, encoding="utf-8") print(f"Rewrote comments for {count} tables. Backup: {backup}")