255 lines
8.2 KiB
Python
255 lines
8.2 KiB
Python
# -*- coding: utf-8 -*-
"""
ETL unified analysis — orchestration entry point.

Merges the "dataflow structure analysis" and the "ETL data consistency
check" into one unified pipeline.

Three modes are supported:
    --mode structure    structure analysis only (analyze_dataflow + gen_dataflow_report)
    --mode consistency  consistency check only (etl_consistency_check)
    --mode full         run everything (default)

Data source:
    By default the script actively calls the API to collect the most recent
    60 days of data.
    --source etl-log    switch to reading the ETL on-disk JSON dumps
                        (consistency-check mode)

Usage:
    cd C:\\NeoZQYY
    uv run python scripts/ops/etl_unified_analysis.py
    uv run python scripts/ops/etl_unified_analysis.py --mode consistency --source etl-log
    uv run python scripts/ops/etl_unified_analysis.py --mode structure --date-from 2026-01-01
"""
from __future__ import annotations

import argparse
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

# Make sure scripts/ops is on sys.path so that _env_paths can be imported
# regardless of the directory the script is launched from.
_SCRIPT_DIR = Path(__file__).resolve().parent
if str(_SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(_SCRIPT_DIR))

from _env_paths import get_output_path, ensure_repo_root

ensure_repo_root()

# All timestamps in this script are China Standard Time.
TZ = ZoneInfo("Asia/Shanghai")
NOW = datetime.now(TZ)  # captured once so every artifact shares the same timestamp
TS = NOW.strftime("%Y%m%d_%H%M%S")  # filesystem-safe timestamp for output filenames
|
||
|
||
|
||
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the unified ETL analysis script."""
    parser = argparse.ArgumentParser(
        description="ETL 统一分析 — 结构分析 + 数据一致性检查",
    )
    # (flag, kwargs) table — keeps every option definition in one uniform place.
    option_table: list[tuple[str, dict]] = [
        ("--mode", {
            "choices": ["structure", "consistency", "full"],
            "default": "full",
            "help": "执行模式:structure=仅结构分析, consistency=仅一致性检查, full=全部(默认)",
        }),
        ("--source", {
            "choices": ["api", "etl-log"],
            "default": "api",
            "help": "数据源:api=主动调 API 采集(默认), etl-log=读 ETL 落盘 JSON",
        }),
        ("--date-from", {
            "type": str,
            "default": None,
            "help": "数据获取起始日期 (YYYY-MM-DD),默认 60 天前",
        }),
        ("--date-to", {
            "type": str,
            "default": None,
            "help": "数据获取截止日期 (YYYY-MM-DD),默认今天",
        }),
        ("--limit", {
            "type": int,
            "default": 200,
            "help": "每端点最大记录数(默认 200)",
        }),
        ("--tables", {
            "type": str,
            "default": None,
            "help": "要分析的表名列表(逗号分隔,缺省=全部)",
        }),
    ]
    for flag, kwargs in option_table:
        parser.add_argument(flag, **kwargs)
    return parser
|
||
|
||
|
||
def run_subprocess(cmd: list[str], label: str) -> int:
    """Run *cmd* as a child process in the project root and return its exit code.

    stdout/stderr are inherited from this process, so the child's output
    streams to the console in real time. A banner is printed before the run
    and a status line after it.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"[{label}] 开始执行...")
    print(f" 命令: {' '.join(cmd)}")
    print(f"{rule}\n")

    # Two directories above scripts/ops/ — i.e. the project root.
    repo_root = Path(__file__).resolve().parents[2]
    returncode = subprocess.run(cmd, cwd=str(repo_root)).returncode

    if returncode == 0:
        print(f"\n✅ [{label}] 执行完成")
    else:
        print(f"\n❌ [{label}] 执行失败 (exit code: {returncode})")
    return returncode
|
||
|
||
|
||
def run_structure_analysis(args: argparse.Namespace) -> int:
    """Run the dataflow structure analysis: data collection, then report generation.

    Returns the collection stage's exit code when it fails, otherwise the
    report-generation stage's exit code.
    """
    from datetime import date, timedelta

    # Stage 1: data collection.
    collect_cmd = [sys.executable, "scripts/ops/analyze_dataflow.py"]
    # Fall back to 60 days before today when no explicit start date was given.
    date_from = args.date_from or (date.today() - timedelta(days=60)).isoformat()
    collect_cmd += ["--date-from", date_from]
    if args.date_to:
        collect_cmd += ["--date-to", args.date_to]
    if args.limit:
        collect_cmd += ["--limit", str(args.limit)]
    if args.tables:
        collect_cmd += ["--tables", args.tables]

    collect_rc = run_subprocess(collect_cmd, "数据流采集")
    if collect_rc != 0:
        return collect_rc

    # Stage 2: generate the report from the freshly collected data.
    report_cmd = [sys.executable, "scripts/ops/gen_dataflow_report.py"]
    return run_subprocess(report_cmd, "数据流报告生成")
|
||
|
||
|
||
def run_consistency_check(args: argparse.Namespace) -> int:
    """Run the ETL data consistency check as a subprocess; return its exit code.

    NOTE(review): *args* (including --source) is accepted but not forwarded to
    the child script, even though the module docstring implies --source etl-log
    should affect this check — confirm whether etl_consistency_check.py takes
    a corresponding flag.
    """
    check_cmd = [sys.executable, "scripts/ops/etl_consistency_check.py"]
    return run_subprocess(check_cmd, "ETL 数据一致性检查")
|
||
|
||
|
||
def _latest_report(root: Path, pattern: str) -> Path | None:
    """Return the newest file under *root* matching *pattern*, or None.

    Report filenames embed a sortable timestamp, so lexicographically last
    is chronologically newest.
    """
    files = sorted(root.glob(pattern), reverse=True)
    return files[0] if files else None


def _append_body(lines: list[str], report: Path, title_keyword: str) -> None:
    """Append *report*'s text to *lines*, dropping its own top-level title.

    Any "# " heading containing *title_keyword* is skipped so the merged
    document does not repeat the section's title.
    """
    content = report.read_text(encoding="utf-8")
    for line in content.splitlines():
        if line.startswith("# ") and title_keyword in line:
            continue
        lines.append(line)


def merge_reports(structure_ok: bool, consistency_ok: bool) -> Path | None:
    """Merge the latest structure and consistency reports into one document.

    Only reports whose producing stage succeeded are considered. Returns the
    merged file's path, or None when neither source report exists.
    """
    report_root = get_output_path("ETL_REPORT_ROOT")
    analyze_root = get_output_path("SYSTEM_ANALYZE_ROOT")

    dataflow_report = _latest_report(analyze_root, "dataflow_*.md") if structure_ok else None
    consistency_report = _latest_report(report_root, "consistency_check_*.md") if consistency_ok else None

    if not dataflow_report and not consistency_report:
        print("⚠️ 没有找到任何报告文件,跳过合并")
        return None

    # Unified header.
    lines: list[str] = [
        "# ETL 统一分析报告",
        "",
        f"生成时间: {NOW.strftime('%Y-%m-%d %H:%M:%S')} CST",
        f"模式: 结构分析{'✅' if structure_ok else '❌'} + 一致性检查{'✅' if consistency_ok else '❌'}",
        "",
        "---",
        "",
    ]

    if dataflow_report:
        lines += ["# 第一部分:数据流结构分析", ""]
        _append_body(lines, dataflow_report, "数据流")
        lines += ["", "---", ""]

    if consistency_report:
        lines += ["# 第二部分:ETL 数据一致性检查", ""]
        _append_body(lines, consistency_report, "一致性")
        lines.append("")

    out_file = report_root / f"etl_unified_analysis_{TS}.md"
    out_file.write_text("\n".join(lines), encoding="utf-8")
    return out_file
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: run the selected analyses and print a final summary."""
    args = build_parser().parse_args()

    rule = "=" * 60
    print(rule)
    print("ETL 统一分析")
    print(rule)
    print(f" 模式: {args.mode}")
    print(f" 数据源: {args.source}")
    print(f" 日期范围: {args.date_from or '默认60天前'} ~ {args.date_to or '今天'}")
    print(rule)

    structure_ok = consistency_ok = False
    any_failure = False

    # Stage: structure analysis.
    if args.mode in ("structure", "full"):
        rc = run_structure_analysis(args)
        structure_ok = rc == 0
        if rc != 0:
            any_failure = True
            # In single-stage mode the stage's exit code is the script's.
            if args.mode == "structure":
                sys.exit(rc)

    # Stage: consistency check.
    if args.mode in ("consistency", "full"):
        rc = run_consistency_check(args)
        consistency_ok = rc == 0
        if rc != 0:
            any_failure = True
            if args.mode == "consistency":
                sys.exit(rc)

    # Merge the reports — full mode only, and only if at least one stage succeeded.
    if args.mode == "full" and (structure_ok or consistency_ok):
        print(f"\n{rule}")
        print("[报告合并] 开始合并...")
        print(rule)
        merged = merge_reports(structure_ok, consistency_ok)
        if merged:
            print(f"\n✅ 统一报告已生成: {merged}")
        else:
            print("\n⚠️ 报告合并失败")

    # Final summary.
    print(f"\n{rule}")
    print("ETL 统一分析完成")
    print(rule)
    if args.mode in ("structure", "full"):
        print(f" 结构分析: {'✅ 成功' if structure_ok else '❌ 失败'}")
    if args.mode in ("consistency", "full"):
        print(f" 一致性检查: {'✅ 成功' if consistency_ok else '❌ 失败'}")
    print(rule)

    if any_failure:
        sys.exit(1)
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|