Files
Neo-ZQYY/scripts/ops/etl_unified_analysis.py

255 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
ETL 统一分析 — 编排入口
合并「数据流结构分析」和「ETL 数据一致性检查」为一个统一流程。
支持三种模式:
--mode structure 仅结构分析analyze_dataflow + gen_dataflow_report
--mode consistency 仅一致性检查etl_consistency_check
--mode full 全部执行(默认)
数据源:
默认主动调 API 采集最近 60 天数据。
--source etl-log 切换为读 ETL 落盘 JSON一致性检查模式
用法:
cd C:\\NeoZQYY
uv run python scripts/ops/etl_unified_analysis.py
uv run python scripts/ops/etl_unified_analysis.py --mode consistency --source etl-log
uv run python scripts/ops/etl_unified_analysis.py --mode structure --date-from 2026-01-01
"""
from __future__ import annotations
import argparse
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
# Make sure scripts/ops is on sys.path so that the sibling module _env_paths
# can be imported regardless of the directory the script is launched from.
_SCRIPT_DIR = Path(__file__).resolve().parent
if str(_SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(_SCRIPT_DIR))
# Must come after the sys.path adjustment above.
from _env_paths import get_output_path, ensure_repo_root
# NOTE(review): presumably validates/switches to the repository root — confirm in _env_paths.
ensure_repo_root()
# All timestamps produced by this script are taken in the Asia/Shanghai zone.
TZ = ZoneInfo("Asia/Shanghai")
# Captured once at import time; reused for the report header.
NOW = datetime.now(TZ)
# Compact form of NOW (YYYYMMDD_HHMMSS) used in the merged report's file name.
TS = NOW.strftime("%Y%m%d_%H%M%S")
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the unified ETL analysis script.

    Returns:
        A parser exposing ``--mode``, ``--source``, ``--date-from``,
        ``--date-to``, ``--limit`` and ``--tables`` (registered in that order).
    """
    # Each entry is (flag, keyword arguments for add_argument); the order here
    # is the order in which the options show up in ``--help``.
    option_specs = (
        ("--mode", {
            "choices": ["structure", "consistency", "full"],
            "default": "full",
            "help": "执行模式structure=仅结构分析, consistency=仅一致性检查, full=全部(默认)",
        }),
        ("--source", {
            "choices": ["api", "etl-log"],
            "default": "api",
            "help": "数据源api=主动调 API 采集(默认), etl-log=读 ETL 落盘 JSON",
        }),
        ("--date-from", {
            "type": str,
            "default": None,
            "help": "数据获取起始日期 (YYYY-MM-DD),默认 60 天前",
        }),
        ("--date-to", {
            "type": str,
            "default": None,
            "help": "数据获取截止日期 (YYYY-MM-DD),默认今天",
        }),
        ("--limit", {
            "type": int,
            "default": 200,
            "help": "每端点最大记录数(默认 200",
        }),
        ("--tables", {
            "type": str,
            "default": None,
            "help": "要分析的表名列表(逗号分隔,缺省=全部)",
        }),
    )

    cli = argparse.ArgumentParser(
        description="ETL 统一分析 — 结构分析 + 数据一致性检查",
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
def run_subprocess(cmd: list[str], label: str) -> int:
    """Run *cmd* as a child process and return its exit code.

    The child's stdout/stderr are not captured, so its output appears live on
    the parent's streams. A banner with the label and the command line is
    printed before the run and a success/failure line after it.

    Args:
        cmd: Command and arguments, passed to ``subprocess.run`` as a list.
        label: Human-readable step name used in the printed messages.

    Returns:
        The child process's return code.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"[{label}] 开始执行...")
    print(f" 命令: {' '.join(cmd)}")
    print(f"{banner}\n")

    # Two levels above this script's directory is used as the working
    # directory (the project root), so relative script paths in cmd resolve.
    project_root = Path(__file__).resolve().parents[2]
    exit_code = subprocess.run(cmd, cwd=str(project_root)).returncode

    if exit_code == 0:
        print(f"\n✅ [{label}] 执行完成")
    else:
        print(f"\n❌ [{label}] 执行失败 (exit code: {exit_code})")
    return exit_code
def run_structure_analysis(args: argparse.Namespace) -> int:
    """Run the dataflow structure analysis: data collection, then report generation.

    Args:
        args: Parsed CLI namespace. ``date_from``, ``date_to``, ``limit`` and
            ``tables`` are read and forwarded to the collection script.

    Returns:
        The collection step's exit code if it failed (the report step is then
        skipped), otherwise the report-generation step's exit code.
    """
    from datetime import timedelta

    # Stage 1: data collection
    collect_cmd = [sys.executable, "scripts/ops/analyze_dataflow.py"]

    # Default window starts 60 days back. "Today" is taken from NOW, i.e. the
    # Asia/Shanghai clock this module uses for all its timestamps, rather than
    # from the host's local date, so the window does not shift with the
    # machine's time zone.
    date_from = args.date_from or (NOW.date() - timedelta(days=60)).isoformat()
    collect_cmd += ["--date-from", date_from]

    if args.date_to:
        collect_cmd += ["--date-to", args.date_to]
    # Compare against None (not truthiness) so an explicit ``--limit 0`` is
    # forwarded instead of being silently dropped.
    if args.limit is not None:
        collect_cmd += ["--limit", str(args.limit)]
    if args.tables:
        collect_cmd += ["--tables", args.tables]

    rc = run_subprocess(collect_cmd, "数据流采集")
    if rc != 0:
        return rc

    # Stage 2: report generation
    report_cmd = [sys.executable, "scripts/ops/gen_dataflow_report.py"]
    return run_subprocess(report_cmd, "数据流报告生成")
def run_consistency_check(args: argparse.Namespace) -> int:
    """Run the ETL data consistency check script and return its exit code.

    NOTE(review): ``args`` is accepted but never read here, so no CLI option
    (including ``--source``) is forwarded to the child script — confirm
    whether that is intended.

    Args:
        args: Parsed CLI namespace (currently unused).

    Returns:
        Exit code of ``scripts/ops/etl_consistency_check.py``.
    """
    return run_subprocess(
        [sys.executable, "scripts/ops/etl_consistency_check.py"],
        "ETL 数据一致性检查",
    )
def _latest_report(directory: Path, pattern: str) -> Path | None:
    """Return the file matching *pattern* in *directory* whose path sorts last, or None."""
    return max(directory.glob(pattern), default=None)


def _lines_without_title(report: Path, keyword: str) -> list[str]:
    """Return the report's lines, dropping every level-1 heading that contains *keyword*."""
    return [
        line
        for line in report.read_text(encoding="utf-8").splitlines()
        if not (line.startswith("# ") and keyword in line)
    ]


def merge_reports(structure_ok: bool, consistency_ok: bool) -> Path | None:
    """Merge the two step reports into one unified Markdown report.

    For each step that succeeded, the report file whose name sorts last is
    picked (presumably the newest, assuming timestamped file names — verify
    against the generating scripts). The source reports' own level-1 titles
    are dropped so the merged file does not repeat them.

    Args:
        structure_ok: Whether the structure analysis succeeded; its report is
            only looked up when this is true.
        consistency_ok: Whether the consistency check succeeded; its report is
            only looked up when this is true.

    Returns:
        Path of the merged report, or ``None`` when no source report was found.
    """
    report_root = get_output_path("ETL_REPORT_ROOT")
    analyze_root = get_output_path("SYSTEM_ANALYZE_ROOT")

    dataflow_report = (
        _latest_report(analyze_root, "dataflow_*.md") if structure_ok else None
    )
    consistency_report = (
        _latest_report(report_root, "consistency_check_*.md") if consistency_ok else None
    )

    if dataflow_report is None and consistency_report is None:
        print("⚠️ 没有找到任何报告文件,跳过合并")
        return None

    # Status markers for the header; same symbols as the final CLI summary.
    structure_mark = "✅" if structure_ok else "❌"
    consistency_mark = "✅" if consistency_ok else "❌"

    lines: list[str] = [
        "# ETL 统一分析报告",
        "",
        f"生成时间: {NOW.strftime('%Y-%m-%d %H:%M:%S')} CST",
        f"模式: 结构分析{structure_mark} + 一致性检查{consistency_mark}",
        "",
        "---",
        "",
    ]

    if dataflow_report is not None:
        lines += ["# 第一部分:数据流结构分析", ""]
        lines += _lines_without_title(dataflow_report, "数据流")
        lines += ["", "---", ""]

    if consistency_report is not None:
        lines += ["# 第二部分:ETL 数据一致性检查", ""]
        lines += _lines_without_title(consistency_report, "一致性")
        lines.append("")

    out_file = report_root / f"etl_unified_analysis_{TS}.md"
    # The report root is only globbed above, never created here; make sure it
    # exists so the write cannot fail on a missing directory.
    out_file.parent.mkdir(parents=True, exist_ok=True)
    out_file.write_text("\n".join(lines), encoding="utf-8")
    return out_file
def main() -> None:
    """Parse the CLI, run the requested analysis steps and print a summary.

    In single-step modes (``structure`` / ``consistency``) a failing step
    terminates the process immediately with that step's exit code. In ``full``
    mode both steps are attempted, the reports are merged when at least one
    step succeeded, and the process exits with status 1 if any step failed.

    NOTE(review): ``args.source`` is only printed in this function — confirm
    whether it should influence the steps that are run.
    """
    args = build_parser().parse_args()
    rule = "=" * 60

    print(rule)
    print("ETL 统一分析")
    print(rule)
    print(f" 模式: {args.mode}")
    print(f" 数据源: {args.source}")
    print(f" 日期范围: {args.date_from or '默认60天前'} ~ {args.date_to or '今天'}")
    print(rule)

    wants_structure = args.mode in ("structure", "full")
    wants_consistency = args.mode in ("consistency", "full")
    structure_ok = False
    consistency_ok = False

    # Structure analysis
    if wants_structure:
        rc = run_structure_analysis(args)
        structure_ok = rc == 0
        if not structure_ok and args.mode == "structure":
            sys.exit(rc)

    # Consistency check
    if wants_consistency:
        rc = run_consistency_check(args)
        consistency_ok = rc == 0
        if not consistency_ok and args.mode == "consistency":
            sys.exit(rc)

    # Merge reports (full mode only, and only if at least one step succeeded)
    if args.mode == "full" and (structure_ok or consistency_ok):
        print(f"\n{rule}")
        print("[报告合并] 开始合并...")
        print(rule)
        merged = merge_reports(structure_ok, consistency_ok)
        if merged:
            print(f"\n✅ 统一报告已生成: {merged}")
        else:
            print("\n⚠️ 报告合并失败")

    # Final summary
    print(f"\n{rule}")
    print("ETL 统一分析完成")
    print(rule)
    if wants_structure:
        print(f" 结构分析: {'✅ 成功' if structure_ok else '❌ 失败'}")
    if wants_consistency:
        print(f" 一致性检查: {'✅ 成功' if consistency_ok else '❌ 失败'}")
    print(rule)

    # A requested step that did not succeed makes the whole run fail.
    structure_failed = wants_structure and not structure_ok
    consistency_failed = wants_consistency and not consistency_ok
    if structure_failed or consistency_failed:
        sys.exit(1)
# Run the orchestration only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()