""" 数据流结构分析 — 核心采集模块 从上游 SaaS API 采集 JSON 数据、递归展开 JSON 层级结构、 查询 PostgreSQL 表结构,输出结构化中间数据供 Kiro Agent 消费。 """ from __future__ import annotations import logging import re from collections import OrderedDict import json from dataclasses import asdict, dataclass, field from datetime import date, datetime from pathlib import Path from typing import Any @dataclass class AnalyzerConfig: """采集配置,由 CLI 参数或 Hook 构造""" date_from: date | None = None date_to: date | None = None limit: int = 200 tables: list[str] | None = None # 调用方必须显式传入(从 SYSTEM_ANALYZE_ROOT 环境变量读取) output_dir: Path = field(default_factory=lambda: Path("")) pg_dsn: str = "" api_base: str = "" api_token: str = "" store_id: str = "" @dataclass class FieldInfo: """JSON 字段信息(递归展开后)""" path: str # 完整路径,如 "data.settleList[].amount" json_type: str # "string" | "integer" | "number" | "boolean" | "object" | "array" | "null" sample: str # 样本值(截断到 60 字符) depth: int # 层级深度(0 为顶层) occurrence: int # 在所有记录中出现的次数 total_records: int # 总记录数 # 多示例值:最多保留 MAX_SAMPLES 个不同值(用于枚举检测和报告展示) samples: list[str] = field(default_factory=list) @dataclass class ColumnInfo: """数据库列信息""" name: str data_type: str is_nullable: bool column_default: str | None comment: str | None # DDL COMMENT 注释(来自 pg_description) ordinal_position: int @dataclass class TableCollectionResult: """单张表的采集结果""" table_name: str task_code: str description: str endpoint: str record_count: int json_fields: OrderedDict[str, FieldInfo] = field(default_factory=OrderedDict) ods_columns: list[ColumnInfo] = field(default_factory=list) dwd_columns: list[ColumnInfo] = field(default_factory=list) # 多张 DWD 表结构:{dwd_short_name -> [ColumnInfo]} dwd_tables: dict[str, list[ColumnInfo]] = field(default_factory=dict) raw_records_path: Path | None = None error: str | None = None # --- JSON 类型映射 --- _JSON_TYPE_MAP: dict[type, str] = { dict: "object", list: "array", str: "string", int: "integer", float: "number", bool: "boolean", } def _json_type_name(value: Any) -> str: """将 Python 值映射为 JSON 类型名称。""" if value is None: return "null" # bool 必须在 int 之前判断(bool 是 int 的子类) if isinstance(value, bool): return "boolean" return _JSON_TYPE_MAP.get(type(value), "string") # 每个字段最多保留的不同示例值数量 MAX_SAMPLES = 8 def _truncate_sample(value: Any, max_len: int = 60) -> str: """将值转为字符串并截断到 max_len 字符。""" s = str(value) return s[:max_len] if len(s) > max_len else s def _recurse_json( obj: Any, prefix: str, depth: int, field_map: dict[str, FieldInfo], total_records: int, ) -> None: """ 递归遍历 JSON 值,填充 field_map。 - dict: 遍历每个 key,路径追加 ".key" - list: 路径追加 "[]",遍历每个元素 - 标量: 记录类型、样本值、出现次数 """ if isinstance(obj, dict): for key, val in obj.items(): child_path = f"{prefix}.{key}" if prefix else key child_depth = depth + 1 if prefix else 0 # depth = 路径中 '.' 的数量(顶层为 0) # 顶层字段 prefix="" → child_depth=0 # 嵌套字段 prefix="data" → child_depth=1 _recurse_json(val, child_path, child_path.replace("[]", "").count("."), field_map, total_records) elif isinstance(obj, list): arr_path = f"{prefix}[]" if prefix else "[]" for item in obj: _recurse_json(item, arr_path, depth, field_map, total_records) else: # 标量叶子节点 — 记录到 field_map if not prefix: return actual_depth = prefix.replace("[]", "").count(".") if prefix in field_map: fi = field_map[prefix] fi.occurrence += 1 # 如果之前是 null 类型,用新的非 null 类型覆盖 if fi.json_type == "null" and obj is not None: fi.json_type = _json_type_name(obj) fi.sample = _truncate_sample(obj) # 收集多示例值(去重,限制数量) if obj is not None: s = _truncate_sample(obj) if s and len(fi.samples) < MAX_SAMPLES and s not in fi.samples: fi.samples.append(s) else: sample_str = _truncate_sample(obj) field_map[prefix] = FieldInfo( path=prefix, json_type=_json_type_name(obj), sample=sample_str, depth=actual_depth, occurrence=1, total_records=total_records, samples=[sample_str] if (obj is not None and sample_str) else [], ) def flatten_json_tree( records: list[dict], ) -> OrderedDict[str, FieldInfo]: """ 递归展开 JSON 记录的完整层级结构。 算法: 1. 对每条记录递归遍历所有嵌套层级 2. 用 '.' 分隔符拼接路径,数组用 '[]' 标记 3. 遍历所有记录拼合最全字段集 4. 统计每个字段的出现频率 返回 path -> FieldInfo 的有序字典(按首次出现顺序)。 """ total = len(records) if total == 0: return OrderedDict() # 第一遍:收集所有字段路径和样本(occurrence 按叶子节点累加) global_map: dict[str, FieldInfo] = {} for record in records: # 每条记录独立追踪出现的路径,避免同一记录内重复计数 per_record_map: dict[str, FieldInfo] = {} _recurse_json(record, "", 0, per_record_map, total) for path, fi in per_record_map.items(): if path in global_map: global_map[path].occurrence += 1 # 用非 null 类型覆盖 if global_map[path].json_type == "null" and fi.json_type != "null": global_map[path].json_type = fi.json_type global_map[path].sample = fi.sample # 合并示例值 for s in fi.samples: if s and len(global_map[path].samples) < MAX_SAMPLES and s not in global_map[path].samples: global_map[path].samples.append(s) else: fi.occurrence = 1 fi.total_records = total global_map[path] = fi # 按首次出现顺序构建 OrderedDict(dict 在 Python 3.7+ 保持插入顺序) result = OrderedDict() for path, fi in global_map.items(): fi.total_records = total result[path] = fi return result logger = logging.getLogger(__name__) def query_table_columns( conn, schema: str, table: str, ) -> list[ColumnInfo]: """ 从 information_schema.columns + pg_description 查询表结构。 返回所有列(含版本控制列如 valid_from, valid_to, is_current, fetched_at)。 连接失败或表不存在时返回空列表并记录错误。 """ sql = """ SELECT c.column_name, c.data_type, c.is_nullable, c.column_default, c.ordinal_position, pgd.description AS column_comment FROM information_schema.columns c LEFT JOIN pg_catalog.pg_statio_all_tables st ON st.schemaname = c.table_schema AND st.relname = c.table_name LEFT JOIN pg_catalog.pg_description pgd ON pgd.objoid = st.relid AND pgd.objsubid = c.ordinal_position WHERE c.table_schema = %s AND c.table_name = %s ORDER BY c.ordinal_position; """ try: with conn.cursor() as cur: cur.execute(sql, (schema, table)) rows = cur.fetchall() except Exception: logger.error("查询表结构失败: %s.%s", schema, table, exc_info=True) return [] if not rows: logger.warning("表不存在或无列: %s.%s", schema, table) return [] columns: list[ColumnInfo] = [] for row in rows: col_name, data_type, is_nullable_str, col_default, ordinal, comment = row columns.append( ColumnInfo( name=col_name, data_type=data_type, is_nullable=is_nullable_str == "YES", column_default=col_default, comment=comment, ordinal_position=ordinal, ) ) return columns def collect_all_tables( config: AnalyzerConfig, specs: list[dict] | None = None, fetch_fn=None, ) -> list[TableCollectionResult]: """ 执行完整数据采集流程编排。 参数: config: 采集配置 specs: ODS_SPECS 列表,每项包含 code/table/endpoint/description 等字段。 缺省时使用本模块的 ODS_SPECS。 fetch_fn: 可选的自定义 fetch 函数,签名 (spec, limit) -> list[dict]。 缺省时使用本模块的 fetch_records(spec, config)。 流程: 1. 根据 config.tables 过滤 specs 2. 建立数据库连接(可选) 3. 逐表:API 采集 → JSON 展开 → ODS/DWD 表结构查询 4. 单表失败不中断,记录 error 继续 5. 关闭数据库连接,返回结果列表 """ # 延迟导入 psycopg2,避免模块级强依赖 try: import psycopg2 except ImportError: psycopg2 = None # type: ignore[assignment] logger.warning("psycopg2 未安装,将跳过数据库表结构查询") # 缺省使用本模块的 ODS_SPECS if specs is None: specs = ODS_SPECS # ── 1. 过滤 specs ── if config.tables: table_set = {t.strip().lower() for t in config.tables} filtered = [s for s in specs if s["table"].lower() in table_set] else: filtered = list(specs) if not filtered: logger.warning("过滤后无可分析的表(config.tables=%s)", config.tables) return [] # ── 2. 建立数据库连接 ── conn = None if psycopg2 and config.pg_dsn: try: conn = psycopg2.connect(config.pg_dsn) except Exception: logger.error("数据库连接失败: %s", config.pg_dsn, exc_info=True) # ── 2b. 解析 TABLE_MAP(用于查询所有关联的 DWD 表) ── _table_map = parse_table_map() # ── 3. 逐表采集 ── results: list[TableCollectionResult] = [] for spec in filtered: table_name = spec["table"] task_code = spec.get("code", "") description = spec.get("description", "") endpoint = spec.get("endpoint", "") try: # 3a. API 采集 if fetch_fn is not None: records = fetch_fn(spec, config.limit) else: # 使用本模块的 fetch_records records = fetch_records(spec, config) # 3b. JSON 展开 json_fields = flatten_json_tree(records) # 3c. ODS/DWD 表结构查询 # 通过 TABLE_MAP 查询所有关联的 DWD 表(一个 ODS 可映射多张 DWD) ods_cols: list[ColumnInfo] = [] dwd_cols: list[ColumnInfo] = [] dwd_tables_dict: dict[str, list[ColumnInfo]] = {} if conn is not None: ods_cols = query_table_columns(conn, "ods", table_name) # 查询所有映射到此 ODS 表的 DWD 表 full_ods = f"ods.{table_name}" dwd_table_names = [ dwd_t for dwd_t, ods_t in _table_map.items() if ods_t == full_ods ] for dwd_full in sorted(dwd_table_names): dwd_short = dwd_full.split(".")[-1] if "." in dwd_full else dwd_full cols = query_table_columns(conn, "dwd", dwd_short) if cols: dwd_tables_dict[dwd_short] = cols dwd_cols.extend(cols) results.append( TableCollectionResult( table_name=table_name, task_code=task_code, description=description, endpoint=endpoint, record_count=len(records), json_fields=json_fields, ods_columns=ods_cols, dwd_columns=dwd_cols, dwd_tables=dwd_tables_dict, ) ) logger.info( "采集完成: %s — %d 条记录, %d 个 JSON 字段, ODS %d 列, DWD %d 列", table_name, len(records), len(json_fields), len(ods_cols), len(dwd_cols), ) except Exception as exc: # 单表失败不中断 logger.error("采集失败: %s — %s", table_name, exc, exc_info=True) results.append( TableCollectionResult( table_name=table_name, task_code=task_code, description=description, endpoint=endpoint, record_count=0, error=str(exc), ) ) # ── 4. 关闭数据库连接 ── if conn is not None: try: conn.close() except Exception: logger.warning("关闭数据库连接失败", exc_info=True) return results def dump_collection_results( results: list[TableCollectionResult], output_dir: Path, ) -> dict[str, Path]: """ 将采集结果序列化为 JSON 文件落盘。 输出结构: {output_dir}/ json_trees/{table}.json — 展开后的字段结构 db_schemas/ods_{table}.json — ODS 表结构 db_schemas/dwd_{table}.json — DWD 表结构(每张 DWD 表独立文件) field_mappings/{table}.json — 三层字段映射(JSON→ODS→DWD,含锚点) collection_manifest.json — 采集清单(表名、记录数、时间戳) 返回 {类别: 目录路径} 的字典。 """ # CHANGE 2026-02-21 | 清理旧子目录后重建,避免 Windows 文件锁导致写入失败 import shutil as _shutil, time as _time _sub_dirs = ["json_trees", "db_schemas", "field_mappings"] for _name in _sub_dirs: _d = output_dir / _name if _d.exists(): try: _shutil.rmtree(_d) except (PermissionError, OSError): # Windows 文件锁:无法删除也无法遍历,跳过(后面用备选名) pass # Windows rmtree 后句柄可能未释放,等待后再 mkdir _time.sleep(1) def _ensure_writable_dir(base: Path, name: str) -> Path: """确保目录可写,如果被锁则用带后缀的备选名""" d = base / name for _attempt in range(3): try: d.mkdir(parents=True, exist_ok=True) _test = d / ".write_test" _test.write_text("ok", encoding="utf-8") _test.unlink() return d except (FileNotFoundError, PermissionError, OSError): _time.sleep(1) # 旧目录不可用,用带后缀的新目录 d = base / f"{name}_new" d.mkdir(parents=True, exist_ok=True) print(f" [警告] {name}/ 被锁定,使用备选目录 {d.name}/") return d json_trees_dir = _ensure_writable_dir(output_dir, "json_trees") db_schemas_dir = _ensure_writable_dir(output_dir, "db_schemas") field_mappings_dir = _ensure_writable_dir(output_dir, "field_mappings") # 解析 TABLE_MAP / FACT_MAPPINGS(用于构建字段映射) table_map = parse_table_map() fact_mappings_data = parse_fact_mappings() # 收集所有 DWD 表结构(用于 build_field_mappings) all_dwd_cols: dict[str, list[ColumnInfo]] = {} for r in results: for dwd_short, cols in r.dwd_tables.items(): all_dwd_cols[dwd_short] = cols # ── 逐表落盘 ── for r in results: # json_trees/{table}.json — 展开后的字段结构 tree_data = { "table": r.table_name, "total_records": r.record_count, "fields": [ {**asdict(fi), "samples": fi.samples} for fi in r.json_fields.values() ], } _write_json(json_trees_dir / f"{r.table_name}.json", tree_data) # db_schemas/ods_{table}.json — ODS 表结构 ods_data = { "schema": "ods", "table": r.table_name, "columns": [asdict(c) for c in r.ods_columns], } _write_json(db_schemas_dir / f"ods_{r.table_name}.json", ods_data) # db_schemas/dwd_{dwd_short}.json — 每张 DWD 表独立文件 for dwd_short, cols in r.dwd_tables.items(): dwd_data = { "schema": "dwd", "table": dwd_short, "ods_source": r.table_name, "columns": [asdict(c) for c in cols], } _write_json(db_schemas_dir / f"dwd_{dwd_short}.json", dwd_data) # field_mappings/{table}.json — 三层字段映射 if r.error is None: mapping = build_field_mappings(r, table_map, fact_mappings_data, all_dwd_cols) _write_json(field_mappings_dir / f"{r.table_name}.json", mapping) # ── collection_manifest.json — 采集清单 ── manifest = { "timestamp": datetime.now().astimezone().isoformat(), "table_map": table_map, "tables": [ { "table": r.table_name, "task_code": r.task_code, "description": r.description, "record_count": r.record_count, "json_field_count": len(r.json_fields), "ods_column_count": len(r.ods_columns), "dwd_tables": list(r.dwd_tables.keys()), "dwd_column_count": sum(len(cols) for cols in r.dwd_tables.values()), "error": r.error, } for r in results ], } _write_json(output_dir / "collection_manifest.json", manifest) # ── BD_manual 业务描述 ── dump_bd_descriptions(results, output_dir) return { "json_trees": json_trees_dir, "db_schemas": db_schemas_dir, "field_mappings": field_mappings_dir, "bd_descriptions": output_dir / "bd_descriptions", "manifest": output_dir, } def _write_json(path: Path, data: Any) -> None: """UTF-8 编码写入 JSON 文件,ensure_ascii=False,indent=2。""" content = json.dumps(data, ensure_ascii=False, indent=2, default=str) try: path.write_text(content, encoding="utf-8") except PermissionError: # CHANGE 2026-02-21 | Windows 文件锁重试:先删再写 import time time.sleep(1) try: path.unlink(missing_ok=True) except PermissionError: pass path.write_text(content, encoding="utf-8") # ══════════════════════════════════════════════════════════════════ # ODS 任务规格(从 gen_full_dataflow_doc.py 迁移) # ══════════════════════════════════════════════════════════════════ # 格式: code, table, endpoint, data_path, list_key, time_fields, # requires_window, extra_params, description # 注意: ODS_STORE_GOODS 的 extra_params 包含 {"siteId": ["__STORE_ID__"]}, # 在 fetch_records 中根据 config.store_id 动态替换。 ODS_SPECS: list[dict] = [ { "code": "ODS_ASSISTANT_ACCOUNT", "table": "assistant_accounts_master", "dwd_table": "dim_assistant", "endpoint": "/PersonnelManagement/SearchAssistantInfo", "data_path": ("data",), "list_key": "assistantInfos", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "助教账号档案", }, { "code": "ODS_SETTLEMENT_RECORDS", "table": "settlement_records", "dwd_table": "dwd_settlement_head", "endpoint": "/Site/GetAllOrderSettleList", "data_path": ("data",), "list_key": "settleList", "time_fields": ("rangeStartTime", "rangeEndTime"), "requires_window": True, "extra_params": {}, "description": "结账记录", }, { "code": "ODS_TABLE_USE", "table": "table_fee_transactions", "dwd_table": "dwd_table_fee_log", "endpoint": "/Site/GetSiteTableOrderDetails", "data_path": ("data",), "list_key": "siteTableUseDetailsList", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "台费计费流水", }, { "code": "ODS_ASSISTANT_LEDGER", "table": "assistant_service_records", "dwd_table": "dwd_assistant_service_log", "endpoint": "/AssistantPerformance/GetOrderAssistantDetails", "data_path": ("data",), "list_key": "orderAssistantDetails", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "助教服务流水", }, { "code": "ODS_STORE_GOODS_SALES", "table": "store_goods_sales_records", "dwd_table": "dwd_store_goods_sale", "endpoint": "/TenantGoods/GetGoodsSalesList", "data_path": ("data",), "list_key": "orderGoodsLedgers", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "门店商品销售流水", }, { "code": "ODS_PAYMENT", "table": "payment_transactions", "dwd_table": "dwd_payment", "endpoint": "/PayLog/GetPayLogListPage", "data_path": ("data",), "list_key": None, "time_fields": ("StartPayTime", "EndPayTime"), "requires_window": False, "extra_params": {}, "description": "支付流水", }, { "code": "ODS_REFUND", "table": "refund_transactions", "dwd_table": "dwd_refund", "endpoint": "/Order/GetRefundPayLogList", "data_path": ("data",), "list_key": None, "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "退款流水", }, { "code": "ODS_PLATFORM_COUPON", "table": "platform_coupon_redemption_records", "dwd_table": "dwd_platform_coupon_redemption", "endpoint": "/Promotion/GetOfflineCouponConsumePageList", "data_path": ("data",), "list_key": None, "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "平台/团购券核销", }, { "code": "ODS_MEMBER", "table": "member_profiles", "dwd_table": "dim_member", "endpoint": "/MemberProfile/GetTenantMemberList", "data_path": ("data",), "list_key": "tenantMemberInfos", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "会员档案", }, { "code": "ODS_MEMBER_CARD", "table": "member_stored_value_cards", "dwd_table": "dim_member_card_account", "endpoint": "/MemberProfile/GetTenantMemberCardList", "data_path": ("data",), "list_key": "tenantMemberCards", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "会员储值卡", }, { "code": "ODS_MEMBER_BALANCE", "table": "member_balance_changes", "dwd_table": "dwd_member_balance_change", "endpoint": "/MemberProfile/GetMemberCardBalanceChange", "data_path": ("data",), "list_key": "tenantMemberCardLogs", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "会员余额变动", }, { "code": "ODS_RECHARGE_SETTLE", "table": "recharge_settlements", "dwd_table": "dwd_recharge_order", "endpoint": "/Site/GetRechargeSettleList", "data_path": ("data",), "list_key": "settleList", "time_fields": ("rangeStartTime", "rangeEndTime"), "requires_window": True, "extra_params": {}, "description": "充值结算", }, { "code": "ODS_GROUP_PACKAGE", "table": "group_buy_packages", "dwd_table": "dim_groupbuy_package", "endpoint": "/PackageCoupon/QueryPackageCouponList", "data_path": ("data",), "list_key": "packageCouponList", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "团购套餐定义", }, { "code": "ODS_GROUP_BUY_REDEMPTION", "table": "group_buy_redemption_records", "dwd_table": "dwd_groupbuy_redemption", "endpoint": "/Site/GetSiteTableUseDetails", "data_path": ("data",), "list_key": "siteTableUseDetailsList", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "团购套餐核销", }, { "code": "ODS_INVENTORY_STOCK", "table": "goods_stock_summary", "dwd_table": None, "endpoint": "/TenantGoods/GetGoodsStockReport", "data_path": ("data",), "list_key": None, "time_fields": None, "requires_window": False, "extra_params": {}, "description": "库存汇总", }, { "code": "ODS_INVENTORY_CHANGE", "table": "goods_stock_movements", "dwd_table": None, "endpoint": "/GoodsStockManage/QueryGoodsOutboundReceipt", "data_path": ("data",), "list_key": "queryDeliveryRecordsList", "time_fields": ("startTime", "endTime"), "requires_window": True, "extra_params": {}, "description": "库存变化记录", }, { "code": "ODS_TABLES", "table": "site_tables_master", "dwd_table": "dim_table", "endpoint": "/Table/GetSiteTables", "data_path": ("data",), "list_key": "siteTables", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "台桌维表", }, { "code": "ODS_GOODS_CATEGORY", "table": "stock_goods_category_tree", "dwd_table": "dim_goods_category", "endpoint": "/TenantGoodsCategory/QueryPrimarySecondaryCategory", "data_path": ("data",), "list_key": "goodsCategoryList", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "库存商品分类树", }, { "code": "ODS_STORE_GOODS", "table": "store_goods_master", "dwd_table": "dim_store_goods", "endpoint": "/TenantGoods/GetGoodsInventoryList", "data_path": ("data",), "list_key": "orderGoodsList", "time_fields": None, "requires_window": False, # STORE_ID 占位符,在 fetch_records 中动态替换为 config.store_id "extra_params": {"siteId": ["__STORE_ID__"]}, "description": "门店商品档案", }, { "code": "ODS_TABLE_FEE_DISCOUNT", "table": "table_fee_discount_records", "dwd_table": "dwd_table_fee_adjust", "endpoint": "/Site/GetTaiFeeAdjustList", "data_path": ("data",), "list_key": "taiFeeAdjustInfos", "time_fields": ("startTime", "endTime"), "requires_window": False, "extra_params": {}, "description": "台费折扣/调账", }, { "code": "ODS_TENANT_GOODS", "table": "tenant_goods_master", "dwd_table": "dim_tenant_goods", "endpoint": "/TenantGoods/QueryTenantGoods", "data_path": ("data",), "list_key": "tenantGoodsList", "time_fields": None, "requires_window": False, "extra_params": {}, "description": "租户商品档案", }, ] # 默认 list_key 候选(与 APIClient 一致) DEFAULT_LIST_KEYS: tuple[str, ...] = ( "list", "rows", "records", "items", "dataList", "data_list", "tenantMemberInfos", "tenantMemberCardLogs", "tenantMemberCards", "settleList", "orderAssistantDetails", "assistantInfos", "siteTables", "taiFeeAdjustInfos", "siteTableUseDetailsList", "tenantGoodsList", "packageCouponList", "queryDeliveryRecordsList", "goodsCategoryList", "orderGoodsList", "orderGoodsLedgers", ) # ══════════════════════════════════════════════════════════════════ # API 调用(从 gen_full_dataflow_doc.py 迁移,适配 AnalyzerConfig) # ══════════════════════════════════════════════════════════════════ def _build_headers(config: AnalyzerConfig) -> dict[str, str]: """根据 config 构造浏览器风格请求头。""" return { "Accept": "application/json, text/plain, */*", "Content-Type": "application/json", "Origin": "https://pc.ficoo.vip", "Referer": "https://pc.ficoo.vip/", "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36" ), "Authorization": f"Bearer {config.api_token}" if config.api_token else "", } def api_post(endpoint: str, payload: dict, config: AnalyzerConfig) -> dict: """发送 POST 请求到 API。""" import requests url = f"{config.api_base.rstrip('/')}/{endpoint.lstrip('/')}" headers = _build_headers(config) resp = requests.post(url, json=payload, headers=headers, timeout=20) resp.raise_for_status() data = resp.json() code = data.get("code") if code not in (0, "0", None): msg = data.get("msg") or data.get("message") or "" raise ValueError(f"API 错误 code={code} msg={msg} endpoint={endpoint}") return data def extract_list(payload: dict, data_path: tuple, list_key: str | None) -> list: """从 API 响应中提取记录列表。""" cur = payload for key in data_path: if isinstance(cur, dict): cur = cur.get(key) else: cur = None if cur is None: break if isinstance(cur, list): return cur if isinstance(cur, dict): if list_key and isinstance(cur.get(list_key), list): return cur[list_key] for k in DEFAULT_LIST_KEYS: if isinstance(cur.get(k), list): return cur[k] for v in cur.values(): if isinstance(v, list): return v return [] def _resolve_extra_params(extra_params: dict, config: AnalyzerConfig) -> dict: """将 extra_params 中的 __STORE_ID__ 占位符替换为 config.store_id。""" if not extra_params: return extra_params resolved = {} for k, v in extra_params.items(): if isinstance(v, list): resolved[k] = [ config.store_id if item == "__STORE_ID__" else item for item in v ] elif v == "__STORE_ID__": resolved[k] = config.store_id else: resolved[k] = v return resolved def fetch_records(spec: dict, config: AnalyzerConfig) -> list[dict]: """ 获取 API 记录。 - 有时间字段的表:从今天往回 10 天一批,不够则继续扩展,最多 10 次重试 - 无时间字段的表:单次请求 参数: spec: ODS_SPECS 中的单项配置 config: AnalyzerConfig,提供 api_base/api_token/store_id/limit """ from datetime import timedelta from zoneinfo import ZoneInfo endpoint = spec["endpoint"] data_path = spec["data_path"] list_key = spec["list_key"] time_fields = spec["time_fields"] extra_params = _resolve_extra_params(spec.get("extra_params", {}), config) target_count = config.limit tz = ZoneInfo("Asia/Shanghai") all_records: list[dict] = [] if time_fields: # 有时间窗口:从今天往回扩展 start_key, end_key = time_fields now = datetime.now(tz) end_dt = now batch_days = 10 max_retries = 10 for attempt in range(max_retries): start_dt = end_dt - timedelta(days=batch_days) params = { "siteId": config.store_id, "page": 1, "limit": target_count, start_key: start_dt.strftime("%Y-%m-%d %H:%M:%S"), end_key: end_dt.strftime("%Y-%m-%d %H:%M:%S"), **extra_params, } try: resp = api_post(endpoint, params, config) records = extract_list(resp, data_path, list_key) all_records.extend(records) except Exception as e: logger.warning( "API 请求失败 %s attempt=%d: %s", endpoint, attempt + 1, e ) if len(all_records) >= target_count: break # 继续往前扩展 end_dt = start_dt else: # 无时间窗口:单次请求 params = { "siteId": config.store_id, "page": 1, "limit": target_count, **extra_params, } try: resp = api_post(endpoint, params, config) all_records = extract_list(resp, data_path, list_key) except Exception as e: logger.warning("API 请求失败 %s: %s", endpoint, e) return all_records[:target_count] # ══════════════════════════════════════════════════════════════════ # ETL 源码解析:TABLE_MAP / FACT_MAPPINGS # ══════════════════════════════════════════════════════════════════ import re # DWD 加载任务源码的默认路径(使用绝对路径,避免 cwd 不在项目根时找不到) # CHANGE 2026-02-21 | 相对路径 → 绝对路径,与 _env_paths 同源 _PROJECT_ROOT = Path(__file__).resolve().parents[2] _DWD_TASK_PY = _PROJECT_ROOT / "apps" / "etl" / "connectors" / "feiqiu" / "tasks" / "dwd" / "dwd_load_task.py" def parse_table_map(py_path: Path | None = None) -> dict[str, str]: """ 从 dwd_load_task.py 解析 TABLE_MAP: {dwd_table -> ods_table}。 返回如 {"dwd.dim_assistant": "ods.assistant_accounts_master", ...} """ py_path = py_path or _DWD_TASK_PY if not py_path.exists(): logger.warning("TABLE_MAP 源文件不存在: %s", py_path) return {} text = py_path.read_text(encoding="utf-8") match = re.search( r"TABLE_MAP\s*(?::\s*dict\[.*?\])?\s*=\s*\{(.*?)\}", text, re.DOTALL, ) if not match: return {} body = match.group(1) result = {} for m in re.finditer(r'"([^"]+)"\s*:\s*"([^"]+)"', body): result[m.group(1)] = m.group(2) return result def parse_fact_mappings(py_path: Path | None = None) -> dict[str, list[tuple]]: """ 从 dwd_load_task.py 解析 FACT_MAPPINGS: {dwd_table -> [(dwd_col, ods_expr, cast), ...]}。 显式映射(字段重命名、JSONB 提取、CAST 转换等)。 """ py_path = py_path or _DWD_TASK_PY if not py_path.exists(): logger.warning("FACT_MAPPINGS 源文件不存在: %s", py_path) return {} text = py_path.read_text(encoding="utf-8") start = text.find("FACT_MAPPINGS") if start < 0: return {} brace_start = text.find("{", start) if brace_start < 0: return {} depth = 0 end = brace_start for i in range(brace_start, len(text)): if text[i] == "{": depth += 1 elif text[i] == "}": depth -= 1 if depth == 0: end = i + 1 break block = text[brace_start:end] result: dict[str, list[tuple]] = {} table_pattern = re.compile(r'"([^"]+)"\s*:\s*\[', re.DOTALL) for tm in table_pattern.finditer(block): table_name = tm.group(1) list_start = tm.end() bracket_depth = 1 list_end = list_start for i in range(list_start, len(block)): if block[i] == "[": bracket_depth += 1 elif block[i] == "]": bracket_depth -= 1 if bracket_depth == 0: list_end = i break list_body = block[list_start:list_end] tuples = [] tuple_pattern = re.compile( r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(?:"([^"]+)"|None)\s*\)' ) for tp in tuple_pattern.finditer(list_body): tuples.append((tp.group(1), tp.group(2), tp.group(3))) result[table_name] = tuples return result # ══════════════════════════════════════════════════════════════════ # BD_manual 文档解析:提取字段级业务描述 # ══════════════════════════════════════════════════════════════════ # BD_manual 文档根目录(使用绝对路径,与 _DWD_TASK_PY 同源) # CHANGE 2026-02-21 | 相对路径 → 绝对路径,避免 cwd 不在项目根时找不到 _BD_DOCS_ROOT = _PROJECT_ROOT / "apps" / "etl" / "connectors" / "feiqiu" / "docs" / "database" def parse_bd_manual_fields(doc_path: Path) -> dict[str, str]: """ 从 BD_manual Markdown 文档中解析字段说明表格。 返回 {字段名(小写) -> 说明文本}。 支持 ODS/main/ 和 DWD/main/ 下的 BD_manual_*.md 格式。 """ if not doc_path.exists(): return {} text = doc_path.read_text(encoding="utf-8") result: dict[str, str] = {} # 查找 "## 字段说明" 后的表格 in_table = False header_found = False desc_col_idx = -1 name_col_idx = -1 for line in text.splitlines(): stripped = line.strip() if stripped.startswith("## 字段说明"): in_table = True continue if in_table and stripped.startswith("##"): # 遇到下一个 section,停止 break if not in_table: continue if not stripped.startswith("|"): continue cols = [c.strip() for c in stripped.split("|")] # 去掉首尾空元素(因为 | 开头和结尾) cols = cols[1:-1] if len(cols) > 2 else cols if not header_found: # 查找表头行 for i, c in enumerate(cols): if "字段名" in c or "字段" == c: name_col_idx = i if "说明" in c: desc_col_idx = i if name_col_idx >= 0 and desc_col_idx >= 0: header_found = True continue # 跳过分隔行 if all(c.replace("-", "").replace(":", "").strip() == "" for c in cols): continue if name_col_idx < len(cols) and desc_col_idx < len(cols): field_name = cols[name_col_idx].strip().strip("`") desc = cols[desc_col_idx].strip() if field_name and desc: result[field_name.lower()] = desc return result def load_bd_descriptions(table_name: str) -> dict[str, dict[str, str]]: """ 加载指定 ODS 表及其关联 DWD 表的 BD_manual 业务描述。 返回: { "ods": {字段名 -> 说明}, "dwd": { "dim_assistant": {字段名 -> 说明}, ... } } """ result: dict[str, dict[str, str]] = {"ods": {}, "dwd": {}} # ODS BD_manual ods_doc = _BD_DOCS_ROOT / "ODS" / "main" / f"BD_manual_{table_name}.md" result["ods"] = parse_bd_manual_fields(ods_doc) # DWD BD_manual — 需要通过 TABLE_MAP 找到关联的 DWD 表 dwd_dir = _BD_DOCS_ROOT / "DWD" / "main" if dwd_dir.exists(): for f in sorted(dwd_dir.iterdir()): if f.name.startswith("BD_manual_") and f.suffix == ".md": dwd_short = f.stem.replace("BD_manual_", "") result["dwd"][dwd_short] = parse_bd_manual_fields(f) return result def dump_bd_descriptions( results: list[TableCollectionResult], output_dir: Path, ) -> None: """ 为每张 ODS 表解析 BD_manual 文档,输出 bd_descriptions/{table}.json。 结构: { "ods_table": "assistant_accounts_master", "ods_fields": {"id": "助教账号主键 ID...", ...}, "dwd_fields": { "dim_assistant": {"assistant_id": "助教唯一标识 ID", ...}, ... } } """ bd_dir = output_dir / "bd_descriptions" bd_dir.mkdir(parents=True, exist_ok=True) for r in results: descs = load_bd_descriptions(r.table_name) data = { "ods_table": r.table_name, "ods_fields": descs["ods"], "dwd_fields": { dwd_short: descs["dwd"].get(dwd_short, {}) for dwd_short in r.dwd_tables.keys() }, } _write_json(bd_dir / f"{r.table_name}.json", data) # ══════════════════════════════════════════════════════════════════ # 三层字段映射构建(JSON → ODS → DWD,含锚点 ID) # ══════════════════════════════════════════════════════════════════ SCD2_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"} # CHANGE 2026-03-26 | 字段名归一化工具,消除 camelCase↔snake_case 误报 def _camel_to_snake(name: str) -> str: """camelCase/PascalCase → snake_case""" s1 = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", name) s2 = re.sub(r"([a-z\d])([A-Z])", r"\1_\2", s1) return s2.lower() def _normalize_field(name: str) -> str: """去下划线纯小写(兜底匹配)""" return _camel_to_snake(name).replace("_", "") def build_field_mappings( result: TableCollectionResult, table_map: dict[str, str], fact_mappings: dict[str, list[tuple]], all_dwd_cols: dict[str, list[ColumnInfo]], ) -> dict: """ 为单张 ODS 表构建完整的三层字段映射关系。 返回结构: { "ods_table": "assistant_accounts_master", "anchors": { "api": "api-assistant-accounts-master", "ods": "ods-assistant-accounts-master", "dwd": {"dim_assistant": "dwd-dim-assistant", ...} }, "json_to_ods": [ {"json_path": "id", "ods_col": "id", "match_type": "exact", ...}, ... ], "ods_to_dwd": { "id": [ {"dwd_table": "dim_assistant", "dwd_col": "assistant_id", "cast": null, "note": "字段重命名"}, ... ], ... }, "dwd_to_ods": { "dim_assistant": [ {"dwd_col": "assistant_id", "type": "BIGINT", "ods_source": "id", "mapping_type": "直接", "note": "字段重命名"}, ... ], ... } } """ ods_table = result.table_name full_ods = f"ods.{ods_table}" # 锚点 ID 生成(与旧文档格式一致) anchor_base = ods_table.replace("_", "-") anchors = { "api": f"api-{anchor_base}", "ods": f"ods-{anchor_base}", "dwd": {}, } # 找到所有映射到此 ODS 表的 DWD 表 dwd_tables_for_ods = sorted( [dwd_t for dwd_t, ods_t in table_map.items() if ods_t == full_ods] ) for dwd_full in dwd_tables_for_ods: dwd_short = dwd_full.split(".")[-1] if "." in dwd_full else dwd_full anchors["dwd"][dwd_short] = f"dwd-{dwd_short.replace('_', '-')}" # ── JSON → ODS 映射 ── ods_col_set = {c.name.lower() for c in result.ods_columns} json_to_ods = [] for path, fi in result.json_fields.items(): # 叶子字段名(去掉嵌套前缀和 []) leaf = path.split(".")[-1].replace("[]", "") leaf_lower = leaf.lower() if leaf_lower in ods_col_set: match_type = "exact" if leaf in ods_col_set else "case_insensitive" json_to_ods.append({ "json_path": path, "ods_col": leaf_lower, "match_type": match_type, "json_type": fi.json_type, "occurrence_pct": round(fi.occurrence / fi.total_records * 100, 1) if fi.total_records > 0 else 0, }) else: json_to_ods.append({ "json_path": path, "ods_col": None, "match_type": "unmapped", "json_type": fi.json_type, "occurrence_pct": round(fi.occurrence / fi.total_records * 100, 1) if fi.total_records > 0 else 0, }) # ── ODS → DWD 映射(按 ODS 列聚合所有下游 DWD 列) ── ods_to_dwd: dict[str, list[dict]] = {} dwd_to_ods: dict[str, list[dict]] = {} for dwd_full in dwd_tables_for_ods: dwd_short = dwd_full.split(".")[-1] if "." in dwd_full else dwd_full dwd_cols = all_dwd_cols.get(dwd_short, []) if not dwd_cols: continue mappings_list = fact_mappings.get(dwd_full, []) mapping_dict = {m[0].lower(): (m[1], m[2]) for m in mappings_list} is_dim = "dim_" in dwd_short is_ex = dwd_short.endswith("_ex") table_type = "维度" if is_dim else "事实" if is_ex: table_type += "(扩展)" dwd_to_ods[dwd_short] = [] for c in dwd_cols: col_lower = c.name.lower() if col_lower in SCD2_COLS: dwd_to_ods[dwd_short].append({ "dwd_col": c.name, "type": c.data_type, "ods_source": "—", "mapping_type": "SCD2", "note": "SCD2 元数据", }) continue if col_lower in mapping_dict: ods_expr, cast = mapping_dict[col_lower] note = "" if "->>" in ods_expr: note = "JSONB 提取" elif "CASE" in ods_expr.upper(): note = "派生计算" elif ods_expr.lower() != col_lower: note = "字段重命名" dwd_to_ods[dwd_short].append({ "dwd_col": c.name, "type": c.data_type, "ods_source": ods_expr, "mapping_type": f"→ {cast}" if cast else "直接", "note": note, }) # 反向:ODS 列 → DWD 列 ods_key = ods_expr.lower() if "->>" not in ods_expr and "CASE" not in ods_expr.upper() else None if ods_key and ods_key in ods_col_set: ods_to_dwd.setdefault(ods_key, []).append({ "dwd_table": dwd_short, "dwd_col": c.name, "cast": cast, "note": note, }) elif col_lower in ods_col_set: dwd_to_ods[dwd_short].append({ "dwd_col": c.name, "type": c.data_type, "ods_source": c.name, "mapping_type": "直接", "note": "同名直传", }) ods_to_dwd.setdefault(col_lower, []).append({ "dwd_table": dwd_short, "dwd_col": c.name, "cast": None, "note": "同名直传", }) else: # CHANGE 2026-03-26 | camelCase→snake_case 归一化兜底匹配 # ODS 存储原始 API 字段名(camelCase),DWD 统一 snake_case norm_dwd = _normalize_field(col_lower) matched_ods = None for oc in ods_col_set: if _normalize_field(oc) == norm_dwd: matched_ods = oc break if matched_ods: dwd_to_ods[dwd_short].append({ "dwd_col": c.name, "type": c.data_type, "ods_source": matched_ods, "mapping_type": "直接", "note": "命名转换(camelCase→snake_case)", }) ods_to_dwd.setdefault(matched_ods, []).append({ "dwd_table": dwd_short, "dwd_col": c.name, "cast": None, "note": "命名转换", }) else: dwd_to_ods[dwd_short].append({ "dwd_col": c.name, "type": c.data_type, "ods_source": "—", "mapping_type": "—", "note": "未显式映射", }) return { "ods_table": ods_table, "anchors": anchors, "json_to_ods": json_to_ods, "ods_to_dwd": ods_to_dwd, "dwd_to_ods": dwd_to_ods, }