Files
Neo-ZQYY/scripts/ops/analyze_dataflow.py

207 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据流结构分析 — CLI 入口
用法:
python scripts/ops/analyze_dataflow.py
python scripts/ops/analyze_dataflow.py --date-from 2025-01-01 --date-to 2025-01-15
python scripts/ops/analyze_dataflow.py --limit 100 --tables settlement_records,payment_transactions
"""
from __future__ import annotations
import argparse
import os
from datetime import datetime
from pathlib import Path
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Options:
        --date-from  collection start date (YYYY-MM-DD); default: 30 days ago
        --date-to    collection end date (YYYY-MM-DD); default: today
        --limit      max records per endpoint (default: 200)
        --tables     comma-separated table names to analyze (default: all)
    """
    parser = argparse.ArgumentParser(
        description="数据流结构分析 — 采集 API JSON 和 DB 表结构",
    )
    # Table-driven option registration: (flag, kwargs) pairs.
    option_specs = (
        ("--date-from", dict(type=str, default=None,
                             help="数据获取起始日期 (YYYY-MM-DD),默认 30 天前")),
        ("--date-to", dict(type=str, default=None,
                           help="数据获取截止日期 (YYYY-MM-DD),默认今天")),
        ("--limit", dict(type=int, default=200,
                         help="每端点最大记录数 (默认 200)")),
        ("--tables", dict(type=str, default=None,
                          help="要分析的表名列表 (逗号分隔,缺省=全部)")),
    )
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
def resolve_output_dir() -> Path:
    """Resolve the report output directory.

    Reads SYSTEM_ANALYZE_ROOT via the project-local ``_env_paths`` helper,
    which loads the root .env at import time and ensures the directory
    exists (creating it if necessary).
    """
    from _env_paths import get_output_path  # project-local; loads root .env on import
    target_dir = get_output_path("SYSTEM_ANALYZE_ROOT")
    return target_dir
def generate_output_filename(dt: "datetime") -> str:
    """Return the report file name: ``dataflow_YYYY-MM-DD_HHMMSS.md``."""
    stamp = dt.strftime("%Y-%m-%d_%H%M%S")
    return f"dataflow_{stamp}.md"
def main() -> None:
    """Run the collection pipeline end to end.

    Steps:
        1. Parse CLI arguments.
        2. Load environment variables (layered .env files via _env_paths).
        3. Build the base AnalyzerConfig parameters.
        4. Collect via collect_all_tables(), with adaptive date-range
           widening for tables that fall short of the record limit.
        5. Persist results via dump_collection_results() and record the
           date range actually used in the manifest.
        6. Print a collection summary to stdout.

    Raises:
        RuntimeError: if no database DSN or required API credentials are
            configured in the root .env.
    """
    from datetime import date as _date, datetime as _datetime, timedelta as _timedelta

    # -- 1. Parse CLI arguments --
    parser = build_parser()
    args = parser.parse_args()

    # -- 2. Load environment variables --
    # _env_paths loads the root .env at import time via an absolute path
    # (Path(__file__).parents[2] / ".env"), so no relative load_dotenv is
    # needed here (a relative path would break when cwd != project root).
    output_dir = resolve_output_dir()  # importing _env_paths loads the root .env

    # -- 3. Build base parameters --
    date_to = _date.fromisoformat(args.date_to) if args.date_to else _date.today()
    user_date_from = _date.fromisoformat(args.date_from) if args.date_from else None
    target_limit = args.limit
    tables_filter = [t.strip() for t in args.tables.split(",")] if args.tables else None

    # CHANGE 2026-02-21 | per testing-env.md: prefer the test DB (TEST_DB_DSN)
    pg_dsn = os.environ.get("TEST_DB_DSN") or os.environ.get("PG_DSN", "")
    if not pg_dsn:
        raise RuntimeError("TEST_DB_DSN 和 PG_DSN 均未定义,请检查根 .env 配置")

    from dataflow_analyzer import AnalyzerConfig, ODS_SPECS, collect_all_tables, dump_collection_results

    # CHANGE 2026-02-21 | fail fast when API credentials are missing instead
    # of silently producing an empty report
    api_base = os.environ.get("API_BASE", "")
    api_token = os.environ.get("API_TOKEN", "")
    store_id = os.environ.get("STORE_ID", "")
    missing = [k for k, v in [("API_BASE", api_base), ("API_TOKEN", api_token), ("STORE_ID", store_id)] if not v]
    if missing:
        # BUG FIX: the two message fragments were concatenated with no
        # separator ("...missing请在..."); join with a newline for readability.
        raise RuntimeError(
            f"API 凭证缺失:{', '.join(missing)}\n"
            "请在根 .env 中配置,参考 .env.template"
        )

    base_kwargs = dict(
        date_to=date_to,
        limit=target_limit,
        output_dir=output_dir,
        pg_dsn=pg_dsn,
        api_base=api_base,
        api_token=api_token,
        store_id=store_id,
    )

    # -- 4. Per-table adaptive date-range collection --
    # CHANGE 2026-02-21 | strategy: 10 days -> 30 days -> 90 days (3 tiers)
    expand_days = [10, 30, 90]
    if user_date_from:
        # User pinned date_from explicitly; skip adaptive widening.
        expand_days = []
        initial_date_from = user_date_from
    else:
        initial_date_from = date_to - _timedelta(days=expand_days[0])

    # First collection pass.
    config = AnalyzerConfig(date_from=initial_date_from, tables=tables_filter, **base_kwargs)
    results = collect_all_tables(config, specs=ODS_SPECS)
    actual_date_from = initial_date_from

    # Adaptive widening: progressively enlarge the date range for tables
    # that did not reach target_limit records.
    # CHANGE 2026-02-21 | dimension tables (time_fields=None) are excluded:
    # their API does not accept a date range.
    _dim_tables = {s["table"] for s in ODS_SPECS if s.get("time_fields") is None}
    if not user_date_from:
        for days in expand_days[1:]:
            short_tables = [r.table_name for r in results
                            if r.error is None
                            and r.record_count < target_limit
                            and r.table_name not in _dim_tables]
            if not short_tables:
                break  # every table already satisfied
            wider_from = date_to - _timedelta(days=days)
            print(f" [自适应扩展] {len(short_tables)} 张表不足 {target_limit} 条,扩展至 {wider_from} ~ {date_to}")
            wider_config = AnalyzerConfig(
                date_from=wider_from, tables=short_tables, **base_kwargs)
            wider_results = collect_all_tables(wider_config, specs=ODS_SPECS)
            # Replace a short table's result only when the wider pass
            # actually yielded more records.
            wider_map = {r.table_name: r for r in wider_results}
            for idx, r in enumerate(results):
                if r.table_name in wider_map:
                    new_r = wider_map[r.table_name]
                    if new_r.record_count > r.record_count:
                        results[idx] = new_r
            actual_date_from = wider_from

    # -- 5. Persist --
    paths = dump_collection_results(results, output_dir)

    # -- 5.1 Record the date range actually used in the manifest --
    import json as _json
    manifest_path = output_dir / "collection_manifest.json"
    if manifest_path.exists():
        with open(manifest_path, "r", encoding="utf-8") as _f:
            manifest_data = _json.load(_f)
        manifest_data["date_from"] = str(actual_date_from)
        manifest_data["date_to"] = str(date_to)
        with open(manifest_path, "w", encoding="utf-8") as _f:
            _json.dump(manifest_data, _f, ensure_ascii=False, indent=2)

    # -- 6. Print collection summary --
    now = _datetime.now()
    filename = generate_output_filename(now)
    ok = sum(1 for r in results if r.error is None)
    fail = len(results) - ok
    total_records = sum(r.record_count for r in results)
    print(f"\n{'='*60}")
    print("数据流结构分析完成")
    print(f"{'='*60}")
    print(f" 输出目录: {output_dir}")
    # BUG FIX: filename was computed but never printed — the summary showed
    # the literal "(unknown)" while `filename` sat unused.
    print(f" 报告文件名: {filename}")
    print(f" 分析表数: {len(results)} ({ok} 成功, {fail} 失败)")
    print(f" 总记录数: {total_records}")
    print(f" 落盘路径:")
    for category, p in paths.items():
        print(f" {category}: {p}")
    print(f"{'='*60}")
# Script entry point: run the collection pipeline when executed directly.
if __name__ == "__main__":
    main()