"""
|
||
数据流结构分析 — 核心采集模块
|
||
|
||
从上游 SaaS API 采集 JSON 数据、递归展开 JSON 层级结构、
|
||
查询 PostgreSQL 表结构,输出结构化中间数据供 Kiro Agent 消费。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from collections import OrderedDict
|
||
import json
|
||
from dataclasses import asdict, dataclass, field
|
||
from datetime import date, datetime
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
|
||
@dataclass
class AnalyzerConfig:
    """Collection configuration, constructed from CLI arguments or a Hook."""

    # Collection window bounds (not referenced in this module — TODO confirm
    # whether callers still use them).
    date_from: date | None = None
    date_to: date | None = None
    # Target number of records to fetch per table (see fetch_records).
    limit: int = 200
    # Optional table-name filter; None means collect every spec.
    tables: list[str] | None = None
    # Root directory for report output.
    output_dir: Path = field(default_factory=lambda: Path("docs/reports"))
    # PostgreSQL DSN; empty string disables schema queries.
    pg_dsn: str = ""
    # Upstream SaaS API base URL.
    api_base: str = ""
    # Bearer token sent in the Authorization header.
    api_token: str = ""
    # Store/site identifier sent to the API as "siteId".
    store_id: str = ""
|
||
|
||
|
||
@dataclass
class FieldInfo:
    """Information about one JSON field (after recursive flattening)."""

    path: str  # full path, e.g. "data.settleList[].amount"
    json_type: str  # "string" | "integer" | "number" | "boolean" | "object" | "array" | "null"
    sample: str  # sample value (truncated to 60 characters)
    depth: int  # nesting depth (0 = top level)
    occurrence: int  # number of records the field appeared in
    total_records: int  # total number of records scanned
    # Multiple sample values: keeps at most MAX_SAMPLES distinct values
    # (used for enum detection and report display).
    samples: list[str] = field(default_factory=list)
|
||
|
||
|
||
@dataclass
class ColumnInfo:
    """Database column metadata."""

    name: str
    data_type: str
    is_nullable: bool
    column_default: str | None
    comment: str | None  # DDL COMMENT text (from pg_description)
    ordinal_position: int
|
||
|
||
|
||
@dataclass
class TableCollectionResult:
    """Collection result for a single table."""

    table_name: str
    task_code: str
    description: str
    endpoint: str
    record_count: int
    # Flattened JSON field structure: path -> FieldInfo (first-seen order).
    json_fields: OrderedDict[str, FieldInfo] = field(default_factory=OrderedDict)
    ods_columns: list[ColumnInfo] = field(default_factory=list)
    dwd_columns: list[ColumnInfo] = field(default_factory=list)
    # Schemas of the related DWD tables: {dwd_short_name -> [ColumnInfo]}
    # (one ODS table may map to several DWD tables).
    dwd_tables: dict[str, list[ColumnInfo]] = field(default_factory=dict)
    raw_records_path: Path | None = None
    # Non-None when collection of this table failed; other fields are empty.
    error: str | None = None
|
||
|
||
|
||
# --- JSON 类型映射 ---
|
||
_JSON_TYPE_MAP: dict[type, str] = {
|
||
dict: "object",
|
||
list: "array",
|
||
str: "string",
|
||
int: "integer",
|
||
float: "number",
|
||
bool: "boolean",
|
||
}
|
||
|
||
|
||
def _json_type_name(value: Any) -> str:
|
||
"""将 Python 值映射为 JSON 类型名称。"""
|
||
if value is None:
|
||
return "null"
|
||
# bool 必须在 int 之前判断(bool 是 int 的子类)
|
||
if isinstance(value, bool):
|
||
return "boolean"
|
||
return _JSON_TYPE_MAP.get(type(value), "string")
|
||
|
||
|
||
# 每个字段最多保留的不同示例值数量
|
||
MAX_SAMPLES = 8
|
||
|
||
|
||
def _truncate_sample(value: Any, max_len: int = 60) -> str:
|
||
"""将值转为字符串并截断到 max_len 字符。"""
|
||
s = str(value)
|
||
return s[:max_len] if len(s) > max_len else s
|
||
|
||
|
||
def _recurse_json(
    obj: Any,
    prefix: str,
    depth: int,
    field_map: dict[str, FieldInfo],
    total_records: int,
) -> None:
    """
    Recursively walk a JSON value and populate *field_map*.

    - dict: recurse into each key, appending ".key" to the path
    - list: append "[]" to the path and recurse into each element
    - scalar: record type, sample value, and occurrence count

    Depth is always derived from the path itself — the number of '.'
    separators after stripping array markers ("[]") — so top-level fields
    have depth 0 and "data.x" has depth 1.
    """
    if isinstance(obj, dict):
        for key, val in obj.items():
            child_path = f"{prefix}.{key}" if prefix else key
            # Depth is recomputed from the path on every call instead of
            # threading an incremented counter through the recursion.
            # (Previous code also computed an unused `child_depth` local;
            # removed as dead code.)
            _recurse_json(val, child_path, child_path.replace("[]", "").count("."), field_map, total_records)
    elif isinstance(obj, list):
        arr_path = f"{prefix}[]" if prefix else "[]"
        for item in obj:
            _recurse_json(item, arr_path, depth, field_map, total_records)
    else:
        # Scalar leaf node — record it in field_map.
        if not prefix:
            # A bare top-level scalar has no path to record.
            return
        actual_depth = prefix.replace("[]", "").count(".")
        if prefix in field_map:
            fi = field_map[prefix]
            fi.occurrence += 1
            # If only null values were seen so far, upgrade the recorded
            # type (and sample) with the first non-null value.
            if fi.json_type == "null" and obj is not None:
                fi.json_type = _json_type_name(obj)
                fi.sample = _truncate_sample(obj)
            # Collect distinct sample values (deduplicated, capped).
            if obj is not None:
                s = _truncate_sample(obj)
                if s and len(fi.samples) < MAX_SAMPLES and s not in fi.samples:
                    fi.samples.append(s)
        else:
            sample_str = _truncate_sample(obj)
            field_map[prefix] = FieldInfo(
                path=prefix,
                json_type=_json_type_name(obj),
                sample=sample_str,
                depth=actual_depth,
                occurrence=1,
                total_records=total_records,
                samples=[sample_str] if (obj is not None and sample_str) else [],
            )
|
||
|
||
|
||
def flatten_json_tree(
    records: list[dict],
) -> OrderedDict[str, FieldInfo]:
    """
    Recursively flatten the full hierarchy of a list of JSON records.

    Algorithm:
        1. Recursively walk every nesting level of each record
        2. Join path segments with '.', marking arrays with '[]'
        3. Union the field sets across all records
        4. Count, per field, the number of records it appears in

    Returns an ordered dict of path -> FieldInfo (first-seen order).
    """
    total = len(records)
    if total == 0:
        return OrderedDict()

    # Pass 1: collect all field paths and samples (per-leaf occurrence).
    global_map: dict[str, FieldInfo] = {}
    for record in records:
        # Track paths per record so a field repeated within one record
        # (e.g. in array elements) counts once for that record.
        per_record_map: dict[str, FieldInfo] = {}
        _recurse_json(record, "", 0, per_record_map, total)
        for path, fi in per_record_map.items():
            if path in global_map:
                global_map[path].occurrence += 1
                # Upgrade a null type once a concrete type is observed.
                if global_map[path].json_type == "null" and fi.json_type != "null":
                    global_map[path].json_type = fi.json_type
                    global_map[path].sample = fi.sample
                # Merge sample values (deduplicated, capped at MAX_SAMPLES).
                for s in fi.samples:
                    if s and len(global_map[path].samples) < MAX_SAMPLES and s not in global_map[path].samples:
                        global_map[path].samples.append(s)
            else:
                fi.occurrence = 1
                fi.total_records = total
                global_map[path] = fi

    # Build an OrderedDict in first-seen order (plain dicts preserve
    # insertion order on Python 3.7+, so iterating global_map suffices).
    result = OrderedDict()
    for path, fi in global_map.items():
        fi.total_records = total
        result[path] = fi
    return result
|
||
|
||
|
||
# Module-level logger named after the module (standard logging pattern).
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def query_table_columns(
    conn,
    schema: str,
    table: str,
) -> list[ColumnInfo]:
    """
    Fetch a table's column definitions from information_schema.columns
    joined with pg_description (for DDL COMMENT text).

    Returns every column (including versioning columns such as valid_from,
    valid_to, is_current, fetched_at). On query failure or when the table
    does not exist, logs the problem and returns an empty list.
    """
    sql = """
        SELECT c.column_name, c.data_type, c.is_nullable,
               c.column_default, c.ordinal_position,
               pgd.description AS column_comment
        FROM information_schema.columns c
        LEFT JOIN pg_catalog.pg_statio_all_tables st
               ON st.schemaname = c.table_schema
              AND st.relname = c.table_name
        LEFT JOIN pg_catalog.pg_description pgd
               ON pgd.objoid = st.relid
              AND pgd.objsubid = c.ordinal_position
        WHERE c.table_schema = %s AND c.table_name = %s
        ORDER BY c.ordinal_position;
    """
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (schema, table))
            rows = cur.fetchall()
    except Exception:
        logger.error("查询表结构失败: %s.%s", schema, table, exc_info=True)
        return []

    if not rows:
        logger.warning("表不存在或无列: %s.%s", schema, table)
        return []

    # Row order matches the SELECT list above.
    return [
        ColumnInfo(
            name=name,
            data_type=dtype,
            is_nullable=nullable == "YES",
            column_default=default,
            comment=comment,
            ordinal_position=position,
        )
        for name, dtype, nullable, default, position, comment in rows
    ]
|
||
|
||
|
||
def collect_all_tables(
    config: AnalyzerConfig,
    specs: list[dict] | None = None,
    fetch_fn=None,
) -> list[TableCollectionResult]:
    """
    Run the complete data-collection pipeline.

    Args:
        config: collection configuration.
        specs: list of ODS_SPECS entries, each containing code/table/
            endpoint/description fields. Defaults to this module's ODS_SPECS.
        fetch_fn: optional custom fetch function with signature
            (spec, limit) -> list[dict]. Defaults to this module's
            fetch_records(spec, config).

    Pipeline:
        1. Filter specs by config.tables
        2. Open the database connection (optional)
        3. Per table: API fetch -> JSON flatten -> ODS/DWD schema query
        4. A single-table failure does not abort; the error is recorded
           and the loop continues
        5. Close the database connection and return the result list
    """
    # Import psycopg2 lazily to avoid a hard module-level dependency.
    try:
        import psycopg2
    except ImportError:
        psycopg2 = None  # type: ignore[assignment]
        logger.warning("psycopg2 未安装,将跳过数据库表结构查询")

    # Default to this module's ODS_SPECS.
    if specs is None:
        specs = ODS_SPECS

    # -- 1. Filter specs --
    if config.tables:
        table_set = {t.strip().lower() for t in config.tables}
        filtered = [s for s in specs if s["table"].lower() in table_set]
    else:
        filtered = list(specs)

    if not filtered:
        logger.warning("过滤后无可分析的表(config.tables=%s)", config.tables)
        return []

    # -- 2. Open the database connection --
    conn = None
    if psycopg2 and config.pg_dsn:
        try:
            conn = psycopg2.connect(config.pg_dsn)
        except Exception:
            # Continue without a connection; schema queries are skipped.
            logger.error("数据库连接失败: %s", config.pg_dsn, exc_info=True)

    # -- 2b. Parse TABLE_MAP (used to find all related DWD tables) --
    _table_map = parse_table_map()

    # -- 3. Collect per table --
    results: list[TableCollectionResult] = []
    for spec in filtered:
        table_name = spec["table"]
        task_code = spec.get("code", "")
        description = spec.get("description", "")
        endpoint = spec.get("endpoint", "")

        try:
            # 3a. API fetch
            if fetch_fn is not None:
                records = fetch_fn(spec, config.limit)
            else:
                # Use this module's fetch_records.
                records = fetch_records(spec, config)

            # 3b. JSON flattening
            json_fields = flatten_json_tree(records)

            # 3c. ODS/DWD schema queries
            # One ODS table may map to several DWD tables via TABLE_MAP.
            ods_cols: list[ColumnInfo] = []
            dwd_cols: list[ColumnInfo] = []
            dwd_tables_dict: dict[str, list[ColumnInfo]] = {}
            if conn is not None:
                ods_cols = query_table_columns(conn, "ods", table_name)
                # Find every DWD table mapped to this ODS table.
                full_ods = f"ods.{table_name}"
                dwd_table_names = [
                    dwd_t for dwd_t, ods_t in _table_map.items()
                    if ods_t == full_ods
                ]
                for dwd_full in sorted(dwd_table_names):
                    # Strip the schema prefix ("dwd.") if present.
                    dwd_short = dwd_full.split(".")[-1] if "." in dwd_full else dwd_full
                    cols = query_table_columns(conn, "dwd", dwd_short)
                    if cols:
                        dwd_tables_dict[dwd_short] = cols
                        dwd_cols.extend(cols)

            results.append(
                TableCollectionResult(
                    table_name=table_name,
                    task_code=task_code,
                    description=description,
                    endpoint=endpoint,
                    record_count=len(records),
                    json_fields=json_fields,
                    ods_columns=ods_cols,
                    dwd_columns=dwd_cols,
                    dwd_tables=dwd_tables_dict,
                )
            )
            logger.info(
                "采集完成: %s — %d 条记录, %d 个 JSON 字段, ODS %d 列, DWD %d 列",
                table_name, len(records), len(json_fields),
                len(ods_cols), len(dwd_cols),
            )

        except Exception as exc:
            # A single failing table must not abort the whole run.
            logger.error("采集失败: %s — %s", table_name, exc, exc_info=True)
            results.append(
                TableCollectionResult(
                    table_name=table_name,
                    task_code=task_code,
                    description=description,
                    endpoint=endpoint,
                    record_count=0,
                    error=str(exc),
                )
            )

    # -- 4. Close the database connection --
    if conn is not None:
        try:
            conn.close()
        except Exception:
            logger.warning("关闭数据库连接失败", exc_info=True)

    return results
|
||
|
||
|
||
def dump_collection_results(
    results: list[TableCollectionResult],
    output_dir: Path,
) -> dict[str, Path]:
    """
    Serialize collection results to JSON files on disk.

    Output layout:
        {output_dir}/
            json_trees/{table}.json      — flattened field structure
            db_schemas/ods_{table}.json  — ODS table schema
            db_schemas/dwd_{table}.json  — DWD table schema (one file per DWD table)
            field_mappings/{table}.json  — three-layer field mapping (JSON→ODS→DWD, with anchors)
            collection_manifest.json     — collection manifest (tables, counts, timestamp)

    Returns {category: directory path}.
    """
    json_trees_dir = output_dir / "json_trees"
    db_schemas_dir = output_dir / "db_schemas"
    field_mappings_dir = output_dir / "field_mappings"
    json_trees_dir.mkdir(parents=True, exist_ok=True)
    db_schemas_dir.mkdir(parents=True, exist_ok=True)
    field_mappings_dir.mkdir(parents=True, exist_ok=True)

    # Parse TABLE_MAP / FACT_MAPPINGS (inputs for the field mappings).
    table_map = parse_table_map()
    fact_mappings_data = parse_fact_mappings()

    # Gather every DWD table schema (input for build_field_mappings).
    all_dwd_cols: dict[str, list[ColumnInfo]] = {}
    for r in results:
        for dwd_short, cols in r.dwd_tables.items():
            all_dwd_cols[dwd_short] = cols

    # -- Per-table dump --
    for r in results:
        # json_trees/{table}.json — flattened field structure
        tree_data = {
            "table": r.table_name,
            "total_records": r.record_count,
            "fields": [
                # asdict already includes "samples"; re-set it explicitly.
                {**asdict(fi), "samples": fi.samples}
                for fi in r.json_fields.values()
            ],
        }
        _write_json(json_trees_dir / f"{r.table_name}.json", tree_data)

        # db_schemas/ods_{table}.json — ODS table schema
        ods_data = {
            "schema": "ods",
            "table": r.table_name,
            "columns": [asdict(c) for c in r.ods_columns],
        }
        _write_json(db_schemas_dir / f"ods_{r.table_name}.json", ods_data)

        # db_schemas/dwd_{dwd_short}.json — one file per DWD table
        for dwd_short, cols in r.dwd_tables.items():
            dwd_data = {
                "schema": "dwd",
                "table": dwd_short,
                "ods_source": r.table_name,
                "columns": [asdict(c) for c in cols],
            }
            _write_json(db_schemas_dir / f"dwd_{dwd_short}.json", dwd_data)

        # field_mappings/{table}.json — three-layer field mapping
        # (skipped for tables whose collection failed).
        if r.error is None:
            mapping = build_field_mappings(r, table_map, fact_mappings_data, all_dwd_cols)
            _write_json(field_mappings_dir / f"{r.table_name}.json", mapping)

    # -- collection_manifest.json — collection manifest --
    manifest = {
        "timestamp": datetime.now().astimezone().isoformat(),
        "table_map": table_map,
        "tables": [
            {
                "table": r.table_name,
                "task_code": r.task_code,
                "description": r.description,
                "record_count": r.record_count,
                "json_field_count": len(r.json_fields),
                "ods_column_count": len(r.ods_columns),
                "dwd_tables": list(r.dwd_tables.keys()),
                "dwd_column_count": sum(len(cols) for cols in r.dwd_tables.values()),
                "error": r.error,
            }
            for r in results
        ],
    }
    _write_json(output_dir / "collection_manifest.json", manifest)

    # -- BD_manual business descriptions --
    dump_bd_descriptions(results, output_dir)

    return {
        "json_trees": json_trees_dir,
        "db_schemas": db_schemas_dir,
        "field_mappings": field_mappings_dir,
        "bd_descriptions": output_dir / "bd_descriptions",
        "manifest": output_dir,
    }
|
||
|
||
|
||
def _write_json(path: Path, data: Any) -> None:
|
||
"""UTF-8 编码写入 JSON 文件,ensure_ascii=False,indent=2。"""
|
||
path.write_text(
|
||
json.dumps(data, ensure_ascii=False, indent=2, default=str),
|
||
encoding="utf-8",
|
||
)
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════
|
||
# ODS 任务规格(从 gen_full_dataflow_doc.py 迁移)
|
||
# ══════════════════════════════════════════════════════════════════
|
||
|
||
# Format: code, table, endpoint, data_path, list_key, time_fields,
#         requires_window, extra_params, description
# Note: ODS_STORE_GOODS carries extra_params {"siteId": ["__STORE_ID__"]};
#       the placeholder is substituted with config.store_id inside
#       fetch_records (via _resolve_extra_params).
ODS_SPECS: list[dict] = [
    {
        "code": "ODS_ASSISTANT_ACCOUNT",
        "table": "assistant_accounts_master",
        "dwd_table": "dim_assistant",
        "endpoint": "/PersonnelManagement/SearchAssistantInfo",
        "data_path": ("data",),
        "list_key": "assistantInfos",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "助教账号档案",
    },
    {
        "code": "ODS_SETTLEMENT_RECORDS",
        "table": "settlement_records",
        "dwd_table": "dwd_settlement_head",
        "endpoint": "/Site/GetAllOrderSettleList",
        "data_path": ("data",),
        "list_key": "settleList",
        "time_fields": ("rangeStartTime", "rangeEndTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "结账记录",
    },
    {
        "code": "ODS_TABLE_USE",
        "table": "table_fee_transactions",
        "dwd_table": "dwd_table_fee_log",
        "endpoint": "/Site/GetSiteTableOrderDetails",
        "data_path": ("data",),
        "list_key": "siteTableUseDetailsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "台费计费流水",
    },
    {
        "code": "ODS_ASSISTANT_LEDGER",
        "table": "assistant_service_records",
        "dwd_table": "dwd_assistant_service_log",
        "endpoint": "/AssistantPerformance/GetOrderAssistantDetails",
        "data_path": ("data",),
        "list_key": "orderAssistantDetails",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "助教服务流水",
    },
    {
        "code": "ODS_ASSISTANT_ABOLISH",
        "table": "assistant_cancellation_records",
        "dwd_table": "dwd_assistant_trash_event",
        "endpoint": "/AssistantPerformance/GetAbolitionAssistant",
        "data_path": ("data",),
        "list_key": "abolitionAssistants",
        "time_fields": ("startTime", "endTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "助教废除记录",
    },
    {
        "code": "ODS_STORE_GOODS_SALES",
        "table": "store_goods_sales_records",
        "dwd_table": "dwd_store_goods_sale",
        "endpoint": "/TenantGoods/GetGoodsSalesList",
        "data_path": ("data",),
        "list_key": "orderGoodsLedgers",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "门店商品销售流水",
    },
    {
        "code": "ODS_PAYMENT",
        "table": "payment_transactions",
        "dwd_table": "dwd_payment",
        "endpoint": "/PayLog/GetPayLogListPage",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("StartPayTime", "EndPayTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "支付流水",
    },
    {
        "code": "ODS_REFUND",
        "table": "refund_transactions",
        "dwd_table": "dwd_refund",
        "endpoint": "/Order/GetRefundPayLogList",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "退款流水",
    },
    {
        "code": "ODS_PLATFORM_COUPON",
        "table": "platform_coupon_redemption_records",
        "dwd_table": "dwd_platform_coupon_redemption",
        "endpoint": "/Promotion/GetOfflineCouponConsumePageList",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "平台/团购券核销",
    },
    {
        "code": "ODS_MEMBER",
        "table": "member_profiles",
        "dwd_table": "dim_member",
        "endpoint": "/MemberProfile/GetTenantMemberList",
        "data_path": ("data",),
        "list_key": "tenantMemberInfos",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "会员档案",
    },
    {
        "code": "ODS_MEMBER_CARD",
        "table": "member_stored_value_cards",
        "dwd_table": "dim_member_card_account",
        "endpoint": "/MemberProfile/GetTenantMemberCardList",
        "data_path": ("data",),
        "list_key": "tenantMemberCards",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "会员储值卡",
    },
    {
        "code": "ODS_MEMBER_BALANCE",
        "table": "member_balance_changes",
        "dwd_table": "dwd_member_balance_change",
        "endpoint": "/MemberProfile/GetMemberCardBalanceChange",
        "data_path": ("data",),
        "list_key": "tenantMemberCardLogs",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "会员余额变动",
    },
    {
        "code": "ODS_RECHARGE_SETTLE",
        "table": "recharge_settlements",
        "dwd_table": "dwd_recharge_order",
        "endpoint": "/Site/GetRechargeSettleList",
        "data_path": ("data",),
        "list_key": "settleList",
        "time_fields": ("rangeStartTime", "rangeEndTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "充值结算",
    },
    {
        "code": "ODS_GROUP_PACKAGE",
        "table": "group_buy_packages",
        "dwd_table": "dim_groupbuy_package",
        "endpoint": "/PackageCoupon/QueryPackageCouponList",
        "data_path": ("data",),
        "list_key": "packageCouponList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "团购套餐定义",
    },
    {
        "code": "ODS_GROUP_BUY_REDEMPTION",
        "table": "group_buy_redemption_records",
        "dwd_table": "dwd_groupbuy_redemption",
        "endpoint": "/Site/GetSiteTableUseDetails",
        "data_path": ("data",),
        "list_key": "siteTableUseDetailsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "团购套餐核销",
    },
    {
        "code": "ODS_INVENTORY_STOCK",
        "table": "goods_stock_summary",
        "dwd_table": None,
        "endpoint": "/TenantGoods/GetGoodsStockReport",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "库存汇总",
    },
    {
        "code": "ODS_INVENTORY_CHANGE",
        "table": "goods_stock_movements",
        "dwd_table": None,
        "endpoint": "/GoodsStockManage/QueryGoodsOutboundReceipt",
        "data_path": ("data",),
        "list_key": "queryDeliveryRecordsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "库存变化记录",
    },
    {
        "code": "ODS_TABLES",
        "table": "site_tables_master",
        "dwd_table": "dim_table",
        "endpoint": "/Table/GetSiteTables",
        "data_path": ("data",),
        "list_key": "siteTables",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "台桌维表",
    },
    {
        "code": "ODS_GOODS_CATEGORY",
        "table": "stock_goods_category_tree",
        "dwd_table": "dim_goods_category",
        "endpoint": "/TenantGoodsCategory/QueryPrimarySecondaryCategory",
        "data_path": ("data",),
        "list_key": "goodsCategoryList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "库存商品分类树",
    },
    {
        "code": "ODS_STORE_GOODS",
        "table": "store_goods_master",
        "dwd_table": "dim_store_goods",
        "endpoint": "/TenantGoods/GetGoodsInventoryList",
        "data_path": ("data",),
        "list_key": "orderGoodsList",
        "time_fields": None,
        "requires_window": False,
        # STORE_ID placeholder, dynamically replaced with config.store_id
        # in fetch_records.
        "extra_params": {"siteId": ["__STORE_ID__"]},
        "description": "门店商品档案",
    },
    {
        "code": "ODS_TABLE_FEE_DISCOUNT",
        "table": "table_fee_discount_records",
        "dwd_table": "dwd_table_fee_adjust",
        "endpoint": "/Site/GetTaiFeeAdjustList",
        "data_path": ("data",),
        "list_key": "taiFeeAdjustInfos",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "台费折扣/调账",
    },
    {
        "code": "ODS_TENANT_GOODS",
        "table": "tenant_goods_master",
        "dwd_table": "dim_tenant_goods",
        "endpoint": "/TenantGoods/QueryTenantGoods",
        "data_path": ("data",),
        "list_key": "tenantGoodsList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "租户商品档案",
    },
    {
        "code": "ODS_SETTLEMENT_TICKET",
        "table": "settlement_ticket_details",
        "dwd_table": None,
        "endpoint": "/Order/GetOrderSettleTicketNew",
        "data_path": (),
        "list_key": None,
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "结账小票详情(按 orderSettleId 逐条获取,不走常规分页)",
    },
]
|
||
|
||
# Default list_key candidates (kept in sync with APIClient).
# Order matters: extract_list returns the value of the FIRST key here
# whose value is a list.
DEFAULT_LIST_KEYS: tuple[str, ...] = (
    "list", "rows", "records", "items", "dataList", "data_list",
    "tenantMemberInfos", "tenantMemberCardLogs", "tenantMemberCards",
    "settleList", "orderAssistantDetails", "assistantInfos", "siteTables",
    "taiFeeAdjustInfos", "siteTableUseDetailsList", "tenantGoodsList",
    "packageCouponList", "queryDeliveryRecordsList", "goodsCategoryList",
    "orderGoodsList", "orderGoodsLedgers",
)
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════
|
||
# API 调用(从 gen_full_dataflow_doc.py 迁移,适配 AnalyzerConfig)
|
||
# ══════════════════════════════════════════════════════════════════
|
||
|
||
def _build_headers(config: AnalyzerConfig) -> dict[str, str]:
|
||
"""根据 config 构造浏览器风格请求头。"""
|
||
return {
|
||
"Accept": "application/json, text/plain, */*",
|
||
"Content-Type": "application/json",
|
||
"Origin": "https://pc.ficoo.vip",
|
||
"Referer": "https://pc.ficoo.vip/",
|
||
"User-Agent": (
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||
"(KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"
|
||
),
|
||
"Authorization": f"Bearer {config.api_token}" if config.api_token else "",
|
||
}
|
||
|
||
|
||
def api_post(endpoint: str, payload: dict, config: AnalyzerConfig) -> dict:
    """POST *payload* to the API and return the decoded JSON body.

    Raises for HTTP errors (raise_for_status) and for application-level
    errors, i.e. a non-zero "code" in the response envelope.
    """
    import requests

    url = f"{config.api_base.rstrip('/')}/{endpoint.lstrip('/')}"
    resp = requests.post(
        url, json=payload, headers=_build_headers(config), timeout=20
    )
    resp.raise_for_status()
    body = resp.json()
    code = body.get("code")
    if code in (0, "0", None):
        return body
    msg = body.get("msg") or body.get("message") or ""
    raise ValueError(f"API 错误 code={code} msg={msg} endpoint={endpoint}")
|
||
|
||
|
||
def extract_list(payload: dict, data_path: tuple, list_key: str | None) -> list:
    """Extract the record list from an API response envelope."""
    # Descend along data_path; stop early on None or a non-dict node.
    node = payload
    for part in data_path:
        node = node.get(part) if isinstance(node, dict) else None
        if node is None:
            break

    if isinstance(node, list):
        return node
    if not isinstance(node, dict):
        return []

    # Explicit list_key first, then the shared candidates, then any
    # list-valued entry as a last resort.
    if list_key:
        explicit = node.get(list_key)
        if isinstance(explicit, list):
            return explicit
    for candidate in DEFAULT_LIST_KEYS:
        if isinstance(node.get(candidate), list):
            return node[candidate]
    for value in node.values():
        if isinstance(value, list):
            return value
    return []
|
||
|
||
|
||
def _resolve_extra_params(extra_params: dict, config: AnalyzerConfig) -> dict:
|
||
"""将 extra_params 中的 __STORE_ID__ 占位符替换为 config.store_id。"""
|
||
if not extra_params:
|
||
return extra_params
|
||
resolved = {}
|
||
for k, v in extra_params.items():
|
||
if isinstance(v, list):
|
||
resolved[k] = [
|
||
config.store_id if item == "__STORE_ID__" else item
|
||
for item in v
|
||
]
|
||
elif v == "__STORE_ID__":
|
||
resolved[k] = config.store_id
|
||
else:
|
||
resolved[k] = v
|
||
return resolved
|
||
|
||
|
||
def fetch_records(spec: dict, config: AnalyzerConfig) -> list[dict]:
    """
    Fetch API records for one spec.

    - Tables with time fields: request 10-day windows walking backwards
      from now, extending up to 10 attempts until enough records are
      collected.
    - Tables without time fields: a single request.
    - Special table (settlement_ticket_details): skipped (fetched one by
      one elsewhere, not via the regular paging flow).

    Args:
        spec: one entry from ODS_SPECS.
        config: AnalyzerConfig providing api_base/api_token/store_id/limit.
    """
    from datetime import timedelta
    from zoneinfo import ZoneInfo

    endpoint = spec["endpoint"]
    data_path = spec["data_path"]
    list_key = spec["list_key"]
    time_fields = spec["time_fields"]
    extra_params = _resolve_extra_params(spec.get("extra_params", {}), config)
    target_count = config.limit

    # Settlement tickets are fetched per orderSettleId; skip here.
    if spec["table"] == "settlement_ticket_details":
        return []

    tz = ZoneInfo("Asia/Shanghai")
    all_records: list[dict] = []

    if time_fields:
        # Windowed fetch: extend backwards from now.
        start_key, end_key = time_fields
        now = datetime.now(tz)
        end_dt = now
        batch_days = 10
        max_retries = 10

        for attempt in range(max_retries):
            start_dt = end_dt - timedelta(days=batch_days)
            params = {
                "siteId": config.store_id,
                "page": 1,
                "limit": target_count,
                start_key: start_dt.strftime("%Y-%m-%d %H:%M:%S"),
                end_key: end_dt.strftime("%Y-%m-%d %H:%M:%S"),
                **extra_params,
            }
            try:
                resp = api_post(endpoint, params, config)
                records = extract_list(resp, data_path, list_key)
                all_records.extend(records)
            except Exception as e:
                # Best-effort: log and move on to the next window.
                logger.warning(
                    "API 请求失败 %s attempt=%d: %s", endpoint, attempt + 1, e
                )

            if len(all_records) >= target_count:
                break
            # Keep extending the window backwards in time.
            end_dt = start_dt
    else:
        # No time window: a single request.
        params = {
            "siteId": config.store_id,
            "page": 1,
            "limit": target_count,
            **extra_params,
        }
        try:
            resp = api_post(endpoint, params, config)
            all_records = extract_list(resp, data_path, list_key)
        except Exception as e:
            logger.warning("API 请求失败 %s: %s", endpoint, e)

    # Trim any overshoot from the last window.
    return all_records[:target_count]
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════
|
||
# ETL 源码解析:TABLE_MAP / FACT_MAPPINGS
|
||
# ══════════════════════════════════════════════════════════════════
|
||
|
||
import re

# Default path of the DWD load-task source file (parsed below for
# TABLE_MAP / FACT_MAPPINGS).
# NOTE(review): mid-file `import re` — stdlib convention would place it
# in the top-of-file import block.
_DWD_TASK_PY = Path("apps/etl/connectors/feiqiu/tasks/dwd/dwd_load_task.py")
|
||
|
||
|
||
def parse_table_map(py_path: Path | None = None) -> dict[str, str]:
    """
    Parse TABLE_MAP ({dwd_table -> ods_table}) out of dwd_load_task.py.

    Returns e.g. {"dwd.dim_assistant": "ods.assistant_accounts_master", ...};
    an empty dict when the source file or the literal cannot be found.
    """
    source = py_path or _DWD_TASK_PY
    if not source.exists():
        logger.warning("TABLE_MAP 源文件不存在: %s", source)
        return {}

    content = source.read_text(encoding="utf-8")
    # Match the dict literal, tolerating an optional `: dict[...]` annotation.
    literal = re.search(
        r"TABLE_MAP\s*(?::\s*dict\[.*?\])?\s*=\s*\{(.*?)\}",
        content, re.DOTALL,
    )
    if literal is None:
        return {}

    # Pull out every "key": "value" string pair inside the braces.
    pairs = re.finditer(r'"([^"]+)"\s*:\s*"([^"]+)"', literal.group(1))
    return {pair.group(1): pair.group(2) for pair in pairs}
|
||
|
||
|
||
def parse_fact_mappings(py_path: Path | None = None) -> dict[str, list[tuple]]:
    """
    Parse FACT_MAPPINGS out of dwd_load_task.py:
    {dwd_table -> [(dwd_col, ods_expr, cast), ...]}.

    These are the explicit mappings (column renames, JSONB extraction,
    CAST conversions, ...). Returns {} when the file or the literal
    is missing.
    """
    py_path = py_path or _DWD_TASK_PY
    if not py_path.exists():
        logger.warning("FACT_MAPPINGS 源文件不存在: %s", py_path)
        return {}
    text = py_path.read_text(encoding="utf-8")
    start = text.find("FACT_MAPPINGS")
    if start < 0:
        return {}
    brace_start = text.find("{", start)
    if brace_start < 0:
        return {}
    # Brace-count forward to find the matching closing brace of the
    # FACT_MAPPINGS dict literal (it may contain nested braces).
    depth = 0
    end = brace_start
    for i in range(brace_start, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                end = i + 1
                break
    block = text[brace_start:end]
    result: dict[str, list[tuple]] = {}
    table_pattern = re.compile(r'"([^"]+)"\s*:\s*\[', re.DOTALL)
    for tm in table_pattern.finditer(block):
        table_name = tm.group(1)
        list_start = tm.end()
        # Bracket-count to the end of this table's mapping list.
        bracket_depth = 1
        list_end = list_start
        for i in range(list_start, len(block)):
            if block[i] == "[":
                bracket_depth += 1
            elif block[i] == "]":
                bracket_depth -= 1
                if bracket_depth == 0:
                    list_end = i
                    break
        list_body = block[list_start:list_end]
        tuples = []
        # Each entry looks like ("dwd_col", "ods_expr", "cast") or
        # ("dwd_col", "ods_expr", None); group(3) is None for the latter.
        tuple_pattern = re.compile(
            r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(?:"([^"]+)"|None)\s*\)'
        )
        for tp in tuple_pattern.finditer(list_body):
            tuples.append((tp.group(1), tp.group(2), tp.group(3)))
        result[table_name] = tuples
    return result
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════
|
||
# BD_manual 文档解析:提取字段级业务描述
|
||
# ══════════════════════════════════════════════════════════════════
|
||
|
||
# BD_manual 文档根目录
|
||
_BD_DOCS_ROOT = Path("apps/etl/connectors/feiqiu/docs/database")
|
||
|
||
|
||
def parse_bd_manual_fields(doc_path: Path) -> dict[str, str]:
    """Parse the field-description table out of a BD_manual Markdown doc.

    Returns ``{field_name (lowercased) -> description text}``. Supports the
    ``BD_manual_*.md`` layout used under ODS/main/ and DWD/main/: a
    "## 字段说明" section containing a pipe table whose header row carries a
    field-name column and a description column.

    Args:
        doc_path: path of the Markdown document; a missing file yields ``{}``.
    """
    # EAFP: reading directly avoids the exists()/read_text race of the
    # original check-then-read pattern.
    try:
        text = doc_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}

    result: dict[str, str] = {}

    # Scan state for the "## 字段说明" table.
    in_table = False
    header_found = False
    desc_col_idx = -1
    name_col_idx = -1

    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("## 字段说明"):
            in_table = True
            continue
        if in_table and stripped.startswith("##"):
            # Next section reached — stop scanning.
            break
        if not in_table:
            continue
        if not stripped.startswith("|"):
            continue

        cols = [c.strip() for c in stripped.split("|")]
        # Drop the empty leading/trailing cells produced by "|...|" rows.
        cols = cols[1:-1] if len(cols) > 2 else cols

        if not header_found:
            # Header row: locate the field-name and description columns.
            for i, c in enumerate(cols):
                if "字段名" in c or "字段" == c:
                    name_col_idx = i
                if "说明" in c:
                    desc_col_idx = i
            if name_col_idx >= 0 and desc_col_idx >= 0:
                header_found = True
            continue

        # Skip the Markdown separator row (---|:---|...).
        if all(c.replace("-", "").replace(":", "").strip() == "" for c in cols):
            continue

        if name_col_idx < len(cols) and desc_col_idx < len(cols):
            # Field names may be wrapped in backticks in the docs.
            field_name = cols[name_col_idx].strip().strip("`")
            desc = cols[desc_col_idx].strip()
            if field_name and desc:
                result[field_name.lower()] = desc

    return result
|
||
|
||
|
||
def load_bd_descriptions(table_name: str) -> dict[str, dict[str, str]]:
    """Load BD_manual business descriptions for an ODS table plus all DWD docs.

    Returns::

        {
            "ods": {field_name -> description},
            "dwd": {
                "dim_assistant": {field_name -> description},
                ...
            },
        }

    Every ``BD_manual_*.md`` under DWD/main/ is loaded; callers filter to the
    DWD tables they actually need.
    """
    result: dict[str, dict[str, str]] = {"ods": {}, "dwd": {}}

    # ODS BD_manual for this specific table.
    ods_doc = _BD_DOCS_ROOT / "ODS" / "main" / f"BD_manual_{table_name}.md"
    result["ods"] = parse_bd_manual_fields(ods_doc)

    # DWD BD_manual documents. glob replaces the manual startswith/suffix
    # filtering; removeprefix strips exactly one leading "BD_manual_"
    # (str.replace would remove every occurrence anywhere in the stem).
    dwd_dir = _BD_DOCS_ROOT / "DWD" / "main"
    if dwd_dir.exists():
        for doc in sorted(dwd_dir.glob("BD_manual_*.md")):
            dwd_short = doc.stem.removeprefix("BD_manual_")
            result["dwd"][dwd_short] = parse_bd_manual_fields(doc)

    return result
|
||
|
||
|
||
def dump_bd_descriptions(
    results: list[TableCollectionResult],
    output_dir: Path,
) -> None:
    """Write one ``bd_descriptions/{table}.json`` per collected ODS table.

    Each file has the shape::

        {
            "ods_table": "assistant_accounts_master",
            "ods_fields": {"id": "...", ...},
            "dwd_fields": {
                "dim_assistant": {"assistant_id": "...", ...},
                ...
            }
        }
    """
    target_dir = output_dir / "bd_descriptions"
    target_dir.mkdir(parents=True, exist_ok=True)

    for collected in results:
        table = collected.table_name
        descriptions = load_bd_descriptions(table)

        # Keep only the DWD tables associated with this collection result;
        # tables without a parsed BD_manual doc get an empty mapping.
        dwd_fields: dict[str, dict[str, str]] = {}
        for short_name in collected.dwd_tables:
            dwd_fields[short_name] = descriptions["dwd"].get(short_name, {})

        payload = {
            "ods_table": table,
            "ods_fields": descriptions["ods"],
            "dwd_fields": dwd_fields,
        }
        _write_json(target_dir / f"{table}.json", payload)
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════
# Three-layer field-mapping build (JSON → ODS → DWD, with anchor IDs)
# ══════════════════════════════════════════════════════════════════

# SCD2 bookkeeping columns — reported as metadata, never mapped from ODS.
SCD2_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
|
||
|
||
|
||
def build_field_mappings(
    result: TableCollectionResult,
    table_map: dict[str, str],
    fact_mappings: dict[str, list[tuple]],
    all_dwd_cols: dict[str, list[ColumnInfo]],
) -> dict:
    """Build the complete three-layer field mapping (JSON → ODS → DWD) for one ODS table.

    Args:
        result: collection result for the ODS table (JSON fields + ODS columns).
        table_map: ``{dwd_full_name -> "ods.<table>"}`` downstream table map.
        fact_mappings: ``{dwd_full_name -> [(dwd_col, ods_expr, cast), ...]}``.
        all_dwd_cols: ``{dwd_short_name -> [ColumnInfo]}`` DWD column metadata.

    Returns a dict with keys:
        ``ods_table``: the ODS table name.
        ``anchors``: ``{"api": ..., "ods": ..., "dwd": {short -> anchor}}`` —
            anchor IDs kept compatible with the legacy document format.
        ``json_to_ods``: list of ``{json_path, ods_col, match_type, json_type,
            occurrence_pct}`` entries; ``ods_col`` is None and ``match_type``
            is "unmapped" when the leaf name has no matching ODS column.
        ``ods_to_dwd``: ``{ods_col -> [{dwd_table, dwd_col, cast, note}, ...]}``.
        ``dwd_to_ods``: ``{dwd_short -> [{dwd_col, type, ods_source,
            mapping_type, note}, ...]}``.
    """
    ods_table = result.table_name
    full_ods = f"ods.{ods_table}"

    # Anchor IDs (compatible with the legacy document format).
    anchor_base = ods_table.replace("_", "-")
    anchors = {
        "api": f"api-{anchor_base}",
        "ods": f"ods-{anchor_base}",
        "dwd": {},
    }

    def _short(dwd_full: str) -> str:
        # "dwd.dim_assistant" -> "dim_assistant"; already-short names pass through.
        return dwd_full.split(".")[-1] if "." in dwd_full else dwd_full

    # All DWD tables that load from this ODS table (sorted for stable output).
    dwd_tables_for_ods = sorted(
        dwd_t for dwd_t, ods_t in table_map.items() if ods_t == full_ods
    )
    for dwd_full in dwd_tables_for_ods:
        dwd_short = _short(dwd_full)
        anchors["dwd"][dwd_short] = f"dwd-{dwd_short.replace('_', '-')}"

    # ── JSON → ODS mapping ──
    ods_col_set = {c.name.lower() for c in result.ods_columns}
    json_to_ods = []
    for path, fi in result.json_fields.items():
        # Leaf field name: strip nesting prefix and array markers.
        leaf = path.split(".")[-1].replace("[]", "")
        leaf_lower = leaf.lower()
        pct = round(fi.occurrence / fi.total_records * 100, 1) if fi.total_records > 0 else 0

        if leaf_lower in ods_col_set:
            # ods_col_set holds lowercased names, so a hit on the raw leaf
            # means the JSON name already matches the column casing exactly.
            match_type = "exact" if leaf in ods_col_set else "case_insensitive"
            ods_col = leaf_lower
        else:
            match_type = "unmapped"
            ods_col = None
        json_to_ods.append({
            "json_path": path,
            "ods_col": ods_col,
            "match_type": match_type,
            "json_type": fi.json_type,
            "occurrence_pct": pct,
        })

    # ── ODS → DWD mapping (plus the per-DWD-table reverse view) ──
    ods_to_dwd: dict[str, list[dict]] = {}
    dwd_to_ods: dict[str, list[dict]] = {}

    for dwd_full in dwd_tables_for_ods:
        dwd_short = _short(dwd_full)
        dwd_cols = all_dwd_cols.get(dwd_short, [])
        if not dwd_cols:
            continue

        # Explicit mappings for this DWD table, keyed by lowercase DWD column.
        mapping_dict = {
            m[0].lower(): (m[1], m[2]) for m in fact_mappings.get(dwd_full, [])
        }

        rows: list[dict] = []
        dwd_to_ods[dwd_short] = rows

        for c in dwd_cols:
            col_lower = c.name.lower()

            if col_lower in SCD2_COLS:
                # SCD2 bookkeeping column: metadata only, no ODS source.
                rows.append({
                    "dwd_col": c.name, "type": c.data_type,
                    "ods_source": "—", "mapping_type": "SCD2",
                    "note": "SCD2 元数据",
                })
                continue

            if col_lower in mapping_dict:
                ods_expr, cast = mapping_dict[col_lower]
                if "->>" in ods_expr:
                    note = "JSONB 提取"
                elif "CASE" in ods_expr.upper():
                    note = "派生计算"
                elif ods_expr.lower() != col_lower:
                    note = "字段重命名"
                else:
                    note = ""

                rows.append({
                    "dwd_col": c.name, "type": c.data_type,
                    "ods_source": ods_expr, "mapping_type": f"→ {cast}" if cast else "直接",
                    "note": note,
                })
                # Reverse direction: only plain column references (no JSONB
                # extraction / CASE expression) can be keyed by an ODS column.
                if "->>" not in ods_expr and "CASE" not in ods_expr.upper():
                    ods_key = ods_expr.lower()
                    if ods_key in ods_col_set:
                        ods_to_dwd.setdefault(ods_key, []).append({
                            "dwd_table": dwd_short, "dwd_col": c.name,
                            "cast": cast, "note": note,
                        })
            elif col_lower in ods_col_set:
                # Same-name passthrough column.
                rows.append({
                    "dwd_col": c.name, "type": c.data_type,
                    "ods_source": c.name, "mapping_type": "直接",
                    "note": "同名直传",
                })
                ods_to_dwd.setdefault(col_lower, []).append({
                    "dwd_table": dwd_short, "dwd_col": c.name,
                    "cast": None, "note": "同名直传",
                })
            else:
                # No explicit mapping and no same-name ODS column.
                rows.append({
                    "dwd_col": c.name, "type": c.data_type,
                    "ods_source": "—", "mapping_type": "—",
                    "note": "未显式映射",
                })

    return {
        "ods_table": ods_table,
        "anchors": anchors,
        "json_to_ods": json_to_ods,
        "ods_to_dwd": ods_to_dwd,
        "dwd_to_ods": dwd_to_ods,
    }
|