ETL complete
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""Recompute billiards_dws.dws_order_summary from DWD fact tables."""
"""Recompute billiards_dws.dws_order_summary from DWD tables (dwd_*)."""
from __future__ import annotations

import argparse
@@ -15,119 +15,90 @@ from database.connection import DatabaseConnection  # noqa: E402

SQL_BUILD_SUMMARY = r"""
|
||||
WITH table_fee AS (
|
||||
WITH base AS (
|
||||
SELECT
|
||||
sh.site_id,
|
||||
sh.order_settle_id,
|
||||
sh.order_trade_no,
|
||||
COALESCE(sh.pay_time, sh.create_time)::date AS order_date,
|
||||
sh.tenant_id,
|
||||
sh.member_id,
|
||||
COALESCE(sh.is_bind_member, FALSE) AS member_flag,
|
||||
(COALESCE(sh.consume_money, 0) = 0 AND COALESCE(sh.pay_amount, 0) > 0) AS recharge_order_flag,
|
||||
COALESCE(sh.member_discount_amount, 0) AS member_discount_amount,
|
||||
COALESCE(sh.adjust_amount, 0) AS manual_discount_amount,
|
||||
COALESCE(sh.pay_amount, 0) AS total_paid_amount,
|
||||
COALESCE(sh.balance_amount, 0) + COALESCE(sh.recharge_card_amount, 0) + COALESCE(sh.gift_card_amount, 0) AS stored_card_deduct,
|
||||
COALESCE(sh.coupon_amount, 0) AS total_coupon_deduction,
|
||||
COALESCE(sh.table_charge_money, 0) AS settle_table_fee_amount,
|
||||
COALESCE(sh.assistant_pd_money, 0) + COALESCE(sh.assistant_cx_money, 0) AS settle_assistant_service_amount,
|
||||
COALESCE(sh.real_goods_money, 0) AS settle_goods_amount
|
||||
FROM billiards_dwd.dwd_settlement_head sh
|
||||
WHERE (%(site_id)s IS NULL OR sh.site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR COALESCE(sh.pay_time, sh.create_time)::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR COALESCE(sh.pay_time, sh.create_time)::date <= %(end_date)s)
|
||||
),
|
||||
table_fee AS (
|
||||
SELECT
|
||||
site_id,
|
||||
order_settle_id,
|
||||
order_trade_no,
|
||||
MIN(member_id) AS member_id,
|
||||
SUM(COALESCE(final_table_fee, 0)) AS table_fee_amount,
|
||||
SUM(COALESCE(member_discount_amount, 0)) AS member_discount_amount,
|
||||
SUM(COALESCE(manual_discount_amount, 0)) AS manual_discount_amount,
|
||||
SUM(COALESCE(original_table_fee, 0)) AS original_table_fee,
|
||||
MIN(start_time) AS first_time
|
||||
FROM billiards_dwd.fact_table_usage
|
||||
WHERE (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR start_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR start_time::date <= %(end_date)s)
|
||||
AND COALESCE(is_canceled, FALSE) = FALSE
|
||||
GROUP BY site_id, order_settle_id, order_trade_no
|
||||
SUM(COALESCE(real_table_charge_money, 0)) AS table_fee_amount
|
||||
FROM billiards_dwd.dwd_table_fee_log
|
||||
WHERE COALESCE(is_delete, 0) = 0
|
||||
AND (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR start_use_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR start_use_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id
|
||||
),
|
||||
assistant_fee AS (
|
||||
SELECT
|
||||
site_id,
|
||||
order_settle_id,
|
||||
order_trade_no,
|
||||
MIN(member_id) AS member_id,
|
||||
SUM(COALESCE(final_fee, 0)) AS assistant_service_amount,
|
||||
SUM(COALESCE(member_discount_amount, 0)) AS member_discount_amount,
|
||||
SUM(COALESCE(manual_discount_amount, 0)) AS manual_discount_amount,
|
||||
SUM(COALESCE(original_fee, 0)) AS original_fee,
|
||||
MIN(start_time) AS first_time
|
||||
FROM billiards_dwd.fact_assistant_service
|
||||
WHERE (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR start_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR start_time::date <= %(end_date)s)
|
||||
AND COALESCE(is_canceled, FALSE) = FALSE
|
||||
GROUP BY site_id, order_settle_id, order_trade_no
|
||||
SUM(COALESCE(ledger_amount, 0)) AS assistant_service_amount
|
||||
FROM billiards_dwd.dwd_assistant_service_log
|
||||
WHERE COALESCE(is_delete, 0) = 0
|
||||
AND (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR start_use_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR start_use_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id
|
||||
),
|
||||
goods_fee AS (
|
||||
SELECT
|
||||
site_id,
|
||||
order_settle_id,
|
||||
order_trade_no,
|
||||
MIN(member_id) AS member_id,
|
||||
SUM(COALESCE(final_amount, 0)) FILTER (WHERE COALESCE(is_gift, FALSE) = FALSE) AS goods_amount,
|
||||
SUM(COALESCE(discount_amount, 0)) FILTER (WHERE COALESCE(is_gift, FALSE) = FALSE) AS goods_discount_amount,
|
||||
SUM(COALESCE(original_amount, 0)) FILTER (WHERE COALESCE(is_gift, FALSE) = FALSE) AS goods_original_amount,
|
||||
COUNT(*) FILTER (WHERE COALESCE(is_gift, FALSE) = FALSE) AS item_count,
|
||||
SUM(COALESCE(quantity, 0)) FILTER (WHERE COALESCE(is_gift, FALSE) = FALSE) AS total_item_quantity,
|
||||
MIN(sale_time) AS first_time
|
||||
FROM billiards_dwd.fact_sale_item
|
||||
WHERE (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR sale_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR sale_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id, order_trade_no
|
||||
COUNT(*) AS item_count,
|
||||
SUM(COALESCE(ledger_count, 0)) AS total_item_quantity,
|
||||
SUM(COALESCE(real_goods_money, 0)) AS goods_amount
|
||||
FROM billiards_dwd.dwd_store_goods_sale
|
||||
WHERE COALESCE(is_delete, 0) = 0
|
||||
AND (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR create_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR create_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id
|
||||
),
|
||||
coupon_usage AS (
|
||||
group_fee AS (
|
||||
SELECT
|
||||
site_id,
|
||||
order_settle_id,
|
||||
order_trade_no,
|
||||
MIN(member_id) AS member_id,
|
||||
SUM(COALESCE(deduct_amount, 0)) AS coupon_deduction,
|
||||
SUM(COALESCE(settle_price, 0)) AS settle_price,
|
||||
MIN(used_time) AS first_time
|
||||
FROM billiards_dwd.fact_coupon_usage
|
||||
WHERE (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR used_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR used_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id, order_trade_no
|
||||
),
|
||||
payments AS (
|
||||
SELECT
|
||||
fp.site_id,
|
||||
fp.order_settle_id,
|
||||
fp.order_trade_no,
|
||||
MIN(fp.member_id) AS member_id,
|
||||
SUM(COALESCE(fp.pay_amount, 0)) AS total_paid_amount,
|
||||
SUM(COALESCE(fp.pay_amount, 0)) FILTER (WHERE COALESCE(pm.is_stored_value, FALSE)) AS stored_card_deduct,
|
||||
SUM(COALESCE(fp.pay_amount, 0)) FILTER (WHERE NOT COALESCE(pm.is_stored_value, FALSE)) AS external_paid_amount,
|
||||
MIN(fp.pay_time) AS first_time
|
||||
FROM billiards_dwd.fact_payment fp
|
||||
LEFT JOIN billiards_dwd.dim_pay_method pm ON fp.pay_method_code = pm.pay_method_code
|
||||
WHERE (%(site_id)s IS NULL OR fp.site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR fp.pay_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR fp.pay_time::date <= %(end_date)s)
|
||||
GROUP BY fp.site_id, fp.order_settle_id, fp.order_trade_no
|
||||
SUM(COALESCE(ledger_amount, 0)) AS group_amount
|
||||
FROM billiards_dwd.dwd_groupbuy_redemption
|
||||
WHERE COALESCE(is_delete, 0) = 0
|
||||
AND (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR create_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR create_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id
|
||||
),
|
||||
refunds AS (
|
||||
SELECT
|
||||
site_id,
|
||||
order_settle_id,
|
||||
order_trade_no,
|
||||
SUM(COALESCE(refund_amount, 0)) AS refund_amount
|
||||
FROM billiards_dwd.fact_refund
|
||||
WHERE (%(site_id)s IS NULL OR site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR refund_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR refund_time::date <= %(end_date)s)
|
||||
GROUP BY site_id, order_settle_id, order_trade_no
|
||||
),
|
||||
combined_ids AS (
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM table_fee
|
||||
UNION
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM assistant_fee
|
||||
UNION
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM goods_fee
|
||||
UNION
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM coupon_usage
|
||||
UNION
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM payments
|
||||
UNION
|
||||
SELECT site_id, order_settle_id, order_trade_no FROM refunds
|
||||
),
|
||||
site_dim AS (
|
||||
SELECT site_id, tenant_id FROM billiards_dwd.dim_site
|
||||
r.site_id,
|
||||
r.relate_id AS order_settle_id,
|
||||
SUM(COALESCE(rx.refund_amount, 0)) AS refund_amount
|
||||
FROM billiards_dwd.dwd_refund r
|
||||
LEFT JOIN billiards_dwd.dwd_refund_ex rx ON r.refund_id = rx.refund_id
|
||||
WHERE (%(site_id)s IS NULL OR r.site_id = %(site_id)s)
|
||||
AND (%(start_date)s IS NULL OR r.pay_time::date >= %(start_date)s)
|
||||
AND (%(end_date)s IS NULL OR r.pay_time::date <= %(end_date)s)
|
||||
GROUP BY r.site_id, r.relate_id
|
||||
)
|
||||
INSERT INTO billiards_dws.dws_order_summary (
|
||||
site_id,
|
||||
@@ -166,58 +137,50 @@ INSERT INTO billiards_dws.dws_order_summary (
|
||||
updated_at
|
||||
)
|
||||
SELECT
|
||||
c.site_id,
|
||||
c.order_settle_id,
|
||||
c.order_trade_no,
|
||||
COALESCE(tf.first_time, af.first_time, gf.first_time, pay.first_time, cu.first_time)::date AS order_date,
|
||||
sd.tenant_id,
|
||||
COALESCE(tf.member_id, af.member_id, gf.member_id, cu.member_id, pay.member_id) AS member_id,
|
||||
COALESCE(tf.member_id, af.member_id, gf.member_id, cu.member_id, pay.member_id) IS NOT NULL AS member_flag,
|
||||
-- recharge flag: no consumption side but has payments
|
||||
(COALESCE(tf.table_fee_amount, 0) + COALESCE(af.assistant_service_amount, 0) + COALESCE(gf.goods_amount, 0) + COALESCE(cu.settle_price, 0) = 0)
|
||||
AND COALESCE(pay.total_paid_amount, 0) > 0 AS recharge_order_flag,
|
||||
b.site_id,
|
||||
b.order_settle_id,
|
||||
b.order_trade_no::text AS order_trade_no,
|
||||
b.order_date,
|
||||
b.tenant_id,
|
||||
b.member_id,
|
||||
b.member_flag,
|
||||
b.recharge_order_flag,
|
||||
COALESCE(gf.item_count, 0) AS item_count,
|
||||
COALESCE(gf.total_item_quantity, 0) AS total_item_quantity,
|
||||
COALESCE(tf.table_fee_amount, 0) AS table_fee_amount,
|
||||
COALESCE(af.assistant_service_amount, 0) AS assistant_service_amount,
|
||||
COALESCE(gf.goods_amount, 0) AS goods_amount,
|
||||
COALESCE(cu.settle_price, 0) AS group_amount,
|
||||
COALESCE(cu.coupon_deduction, 0) AS total_coupon_deduction,
|
||||
COALESCE(tf.member_discount_amount, 0) + COALESCE(af.member_discount_amount, 0) + COALESCE(gf.goods_discount_amount, 0) AS member_discount_amount,
|
||||
COALESCE(tf.manual_discount_amount, 0) + COALESCE(af.manual_discount_amount, 0) AS manual_discount_amount,
|
||||
COALESCE(tf.original_table_fee, 0) + COALESCE(af.original_fee, 0) + COALESCE(gf.goods_original_amount, 0) AS order_original_amount,
|
||||
COALESCE(tf.table_fee_amount, 0) + COALESCE(af.assistant_service_amount, 0) + COALESCE(gf.goods_amount, 0) + COALESCE(cu.settle_price, 0) - COALESCE(cu.coupon_deduction, 0) AS order_final_amount,
|
||||
COALESCE(pay.stored_card_deduct, 0) AS stored_card_deduct,
|
||||
COALESCE(pay.external_paid_amount, 0) AS external_paid_amount,
|
||||
COALESCE(pay.total_paid_amount, 0) AS total_paid_amount,
|
||||
COALESCE(tf.table_fee_amount, 0) AS book_table_flow,
|
||||
COALESCE(af.assistant_service_amount, 0) AS book_assistant_flow,
|
||||
COALESCE(gf.goods_amount, 0) AS book_goods_flow,
|
||||
COALESCE(cu.settle_price, 0) AS book_group_flow,
|
||||
COALESCE(tf.table_fee_amount, 0) + COALESCE(af.assistant_service_amount, 0) + COALESCE(gf.goods_amount, 0) + COALESCE(cu.settle_price, 0) AS book_order_flow,
|
||||
CASE
|
||||
WHEN (COALESCE(tf.table_fee_amount, 0) + COALESCE(af.assistant_service_amount, 0) + COALESCE(gf.goods_amount, 0) + COALESCE(cu.settle_price, 0) = 0)
|
||||
THEN 0
|
||||
ELSE COALESCE(pay.external_paid_amount, 0)
|
||||
END AS order_effective_consume_cash,
|
||||
CASE
|
||||
WHEN (COALESCE(tf.table_fee_amount, 0) + COALESCE(af.assistant_service_amount, 0) + COALESCE(gf.goods_amount, 0) + COALESCE(cu.settle_price, 0) = 0)
|
||||
THEN COALESCE(pay.external_paid_amount, 0)
|
||||
ELSE 0
|
||||
END AS order_effective_recharge_cash,
|
||||
COALESCE(pay.external_paid_amount, 0) + COALESCE(cu.settle_price, 0) AS order_effective_flow,
|
||||
COALESCE(tf.table_fee_amount, b.settle_table_fee_amount) AS table_fee_amount,
|
||||
COALESCE(af.assistant_service_amount, b.settle_assistant_service_amount) AS assistant_service_amount,
|
||||
COALESCE(gf.goods_amount, b.settle_goods_amount) AS goods_amount,
|
||||
COALESCE(gr.group_amount, 0) AS group_amount,
|
||||
b.total_coupon_deduction AS total_coupon_deduction,
|
||||
b.member_discount_amount AS member_discount_amount,
|
||||
b.manual_discount_amount AS manual_discount_amount,
|
||||
-- approximate original amount: final + discounts/coupon
|
||||
(b.total_paid_amount + b.total_coupon_deduction + b.member_discount_amount + b.manual_discount_amount) AS order_original_amount,
|
||||
b.total_paid_amount AS order_final_amount,
|
||||
b.stored_card_deduct AS stored_card_deduct,
|
||||
GREATEST(b.total_paid_amount - b.stored_card_deduct, 0) AS external_paid_amount,
|
||||
b.total_paid_amount AS total_paid_amount,
|
||||
COALESCE(tf.table_fee_amount, b.settle_table_fee_amount) AS book_table_flow,
|
||||
COALESCE(af.assistant_service_amount, b.settle_assistant_service_amount) AS book_assistant_flow,
|
||||
COALESCE(gf.goods_amount, b.settle_goods_amount) AS book_goods_flow,
|
||||
COALESCE(gr.group_amount, 0) AS book_group_flow,
|
||||
COALESCE(tf.table_fee_amount, b.settle_table_fee_amount)
|
||||
+ COALESCE(af.assistant_service_amount, b.settle_assistant_service_amount)
|
||||
+ COALESCE(gf.goods_amount, b.settle_goods_amount)
|
||||
+ COALESCE(gr.group_amount, 0) AS book_order_flow,
|
||||
GREATEST(b.total_paid_amount - b.stored_card_deduct, 0) AS order_effective_consume_cash,
|
||||
0 AS order_effective_recharge_cash,
|
||||
b.total_paid_amount AS order_effective_flow,
|
||||
COALESCE(rf.refund_amount, 0) AS refund_amount,
|
||||
(COALESCE(pay.external_paid_amount, 0) + COALESCE(cu.settle_price, 0)) - COALESCE(rf.refund_amount, 0) AS net_income,
|
||||
b.total_paid_amount - COALESCE(rf.refund_amount, 0) AS net_income,
|
||||
now() AS created_at,
|
||||
now() AS updated_at
|
||||
FROM combined_ids c
|
||||
LEFT JOIN table_fee tf ON c.site_id = tf.site_id AND c.order_settle_id = tf.order_settle_id
|
||||
LEFT JOIN assistant_fee af ON c.site_id = af.site_id AND c.order_settle_id = af.order_settle_id
|
||||
LEFT JOIN goods_fee gf ON c.site_id = gf.site_id AND c.order_settle_id = gf.order_settle_id
|
||||
LEFT JOIN coupon_usage cu ON c.site_id = cu.site_id AND c.order_settle_id = cu.order_settle_id
|
||||
LEFT JOIN payments pay ON c.site_id = pay.site_id AND c.order_settle_id = pay.order_settle_id
|
||||
LEFT JOIN refunds rf ON c.site_id = rf.site_id AND c.order_settle_id = rf.order_settle_id
|
||||
LEFT JOIN site_dim sd ON c.site_id = sd.site_id
|
||||
FROM base b
|
||||
LEFT JOIN table_fee tf ON b.site_id = tf.site_id AND b.order_settle_id = tf.order_settle_id
|
||||
LEFT JOIN assistant_fee af ON b.site_id = af.site_id AND b.order_settle_id = af.order_settle_id
|
||||
LEFT JOIN goods_fee gf ON b.site_id = gf.site_id AND b.order_settle_id = gf.order_settle_id
|
||||
LEFT JOIN group_fee gr ON b.site_id = gr.site_id AND b.order_settle_id = gr.order_settle_id
|
||||
LEFT JOIN refunds rf ON b.site_id = rf.site_id AND b.order_settle_id = rf.order_settle_id
|
||||
ON CONFLICT (site_id, order_settle_id) DO UPDATE SET
|
||||
order_trade_no = EXCLUDED.order_trade_no,
|
||||
order_date = EXCLUDED.order_date,
|
||||
783  etl_billiards/scripts/check_ods_gaps.py  Normal file
@@ -0,0 +1,783 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Check missing ODS records by comparing API primary keys vs ODS table primary keys.
|
||||
|
||||
Default range:
|
||||
start = 2025-07-01 00:00:00
|
||||
end = now
|
||||
|
||||
For update runs, use --from-cutoff to derive the start time from ODS max(fetched_at),
|
||||
then backtrack by --cutoff-overlap-hours.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import time as time_mod
|
||||
import sys
|
||||
from datetime import datetime, time, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Sequence
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from dateutil import parser as dtparser
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from api.client import APIClient
|
||||
from config.settings import AppConfig
|
||||
from database.connection import DatabaseConnection
|
||||
from models.parsers import TypeParser
|
||||
from tasks.ods_tasks import ENABLED_ODS_CODES, ODS_TASK_SPECS
|
||||
from utils.logging_utils import build_log_path, configure_logging
|
||||
|
||||
DEFAULT_START = "2025-07-01"
|
||||
MIN_COMPLETENESS_WINDOW_DAYS = 30
|
||||
|
||||
|
||||
def _reconfigure_stdout_utf8() -> None:
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _parse_dt(value: str, tz: ZoneInfo, *, is_end: bool) -> datetime:
|
||||
raw = (value or "").strip()
|
||||
if not raw:
|
||||
raise ValueError("empty datetime")
|
||||
has_time = any(ch in raw for ch in (":", "T"))
|
||||
dt = dtparser.parse(raw)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=tz)
|
||||
else:
|
||||
dt = dt.astimezone(tz)
|
||||
if not has_time:
|
||||
dt = dt.replace(hour=23 if is_end else 0, minute=59 if is_end else 0, second=59 if is_end else 0, microsecond=0)
|
||||
return dt
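# Behaviour sketch for _parse_dt (derived from the logic above): a date-only value is expanded
# to the start or end of that day in the given timezone, e.g.
#   _parse_dt("2025-07-01", tz, is_end=False) -> 2025-07-01 00:00:00+tz
#   _parse_dt("2025-07-01", tz, is_end=True)  -> 2025-07-01 23:59:59+tz
# whereas a value that already carries a time (":" or "T") keeps its parsed time of day.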
|
||||
|
||||
|
||||
def _iter_windows(start: datetime, end: datetime, window_size: timedelta) -> Iterable[tuple[datetime, datetime]]:
|
||||
if window_size.total_seconds() <= 0:
|
||||
raise ValueError("window_size must be > 0")
|
||||
cur = start
|
||||
while cur < end:
|
||||
nxt = min(cur + window_size, end)
|
||||
yield cur, nxt
|
||||
cur = nxt
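# Example: _iter_windows(2025-07-01 00:00, 2025-07-03 12:00, timedelta(days=1)) yields
# [07-01 00:00, 07-02 00:00), [07-02 00:00, 07-03 00:00), [07-03 00:00, 07-03 12:00);
# the final slice is clamped to `end`.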
|
||||
|
||||
|
||||
def _merge_record_layers(record: dict) -> dict:
|
||||
merged = record
|
||||
data_part = merged.get("data")
|
||||
while isinstance(data_part, dict):
|
||||
merged = {**data_part, **merged}
|
||||
data_part = data_part.get("data")
|
||||
settle_inner = merged.get("settleList")
|
||||
if isinstance(settle_inner, dict):
|
||||
merged = {**settle_inner, **merged}
|
||||
return merged
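# Illustrative flattening (hypothetical field names): for
#   {"id": 1, "data": {"tableId": 7, "data": {"memberId": 3}}, "settleList": {"settleId": 9}}
# the merged dict exposes id, tableId, memberId and settleId at the top level, with the
# original top-level keys winning on any conflict.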
|
||||
|
||||
|
||||
def _get_value_case_insensitive(record: dict | None, col: str | None):
|
||||
if record is None or col is None:
|
||||
return None
|
||||
if col in record:
|
||||
return record.get(col)
|
||||
col_lower = col.lower()
|
||||
for k, v in record.items():
|
||||
if isinstance(k, str) and k.lower() == col_lower:
|
||||
return v
|
||||
return None
|
||||
|
||||
|
||||
def _normalize_pk_value(value):
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, str) and value.isdigit():
|
||||
try:
|
||||
return int(value)
|
||||
except Exception:
|
||||
return value
|
||||
return value
|
||||
|
||||
|
||||
def _chunked(seq: Sequence, size: int) -> Iterable[Sequence]:
|
||||
if size <= 0:
|
||||
size = 500
|
||||
for i in range(0, len(seq), size):
|
||||
yield seq[i : i + size]
|
||||
|
||||
|
||||
def _get_table_pk_columns(conn, table: str) -> list[str]:
|
||||
if "." in table:
|
||||
schema, name = table.split(".", 1)
|
||||
else:
|
||||
schema, name = "public", table
|
||||
sql = """
|
||||
SELECT kcu.column_name
|
||||
FROM information_schema.table_constraints tc
|
||||
JOIN information_schema.key_column_usage kcu
|
||||
ON tc.constraint_name = kcu.constraint_name
|
||||
AND tc.table_schema = kcu.table_schema
|
||||
WHERE tc.constraint_type = 'PRIMARY KEY'
|
||||
AND tc.table_schema = %s
|
||||
AND tc.table_name = %s
|
||||
ORDER BY kcu.ordinal_position
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(sql, (schema, name))
|
||||
return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def _fetch_existing_pk_set(conn, table: str, pk_cols: Sequence[str], pk_values: list[tuple], chunk_size: int) -> set[tuple]:
|
||||
if not pk_values:
|
||||
return set()
|
||||
select_cols = ", ".join(f't."{c}"' for c in pk_cols)
|
||||
value_cols = ", ".join(f'"{c}"' for c in pk_cols)
|
||||
join_cond = " AND ".join(f't."{c}" = v."{c}"' for c in pk_cols)
|
||||
sql = (
|
||||
f"SELECT {select_cols} FROM {table} t "
|
||||
f"JOIN (VALUES %s) AS v({value_cols}) ON {join_cond}"
|
||||
)
|
||||
existing: set[tuple] = set()
|
||||
with conn.cursor() as cur:
|
||||
for chunk in _chunked(pk_values, chunk_size):
|
||||
execute_values(cur, sql, chunk, page_size=len(chunk))
|
||||
for row in cur.fetchall():
|
||||
existing.add(tuple(row))
|
||||
return existing
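# Shape of the probe query built above, for a hypothetical table with PK (site_id, order_settle_id):
#   SELECT t."site_id", t."order_settle_id"
#   FROM billiards_ods.some_table t
#   JOIN (VALUES %s) AS v("site_id", "order_settle_id")
#     ON t."site_id" = v."site_id" AND t."order_settle_id" = v."order_settle_id"
# execute_values expands %s into one chunk of PK tuples, so only the API-side keys are probed.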
|
||||
|
||||
|
||||
def _merge_common_params(cfg: AppConfig, task_code: str, base: dict) -> dict:
|
||||
merged: dict = {}
|
||||
common = cfg.get("api.params", {}) or {}
|
||||
if isinstance(common, dict):
|
||||
merged.update(common)
|
||||
scoped = cfg.get(f"api.params.{task_code.lower()}", {}) or {}
|
||||
if isinstance(scoped, dict):
|
||||
merged.update(scoped)
|
||||
merged.update(base)
|
||||
return merged
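# Precedence (later wins): api.params < api.params.<task_code> < the per-call `base` dict,
# e.g. a siteId passed in `base` overrides a siteId set in either config scope.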
|
||||
|
||||
|
||||
def _build_params(cfg: AppConfig, spec, store_id: int, window_start: datetime | None, window_end: datetime | None) -> dict:
|
||||
base: dict = {}
|
||||
if spec.include_site_id:
|
||||
if spec.endpoint == "/TenantGoods/GetGoodsInventoryList":
|
||||
base["siteId"] = [store_id]
|
||||
else:
|
||||
base["siteId"] = store_id
|
||||
if spec.requires_window and spec.time_fields and window_start and window_end:
|
||||
start_key, end_key = spec.time_fields
|
||||
base[start_key] = TypeParser.format_timestamp(window_start, ZoneInfo(cfg.get("app.timezone", "Asia/Taipei")))
|
||||
base[end_key] = TypeParser.format_timestamp(window_end, ZoneInfo(cfg.get("app.timezone", "Asia/Taipei")))
|
||||
base.update(spec.extra_params or {})
|
||||
return _merge_common_params(cfg, spec.code, base)
|
||||
|
||||
|
||||
def _pk_tuple_from_record(record: dict, pk_cols: Sequence[str]) -> tuple | None:
|
||||
merged = _merge_record_layers(record)
|
||||
values = []
|
||||
for col in pk_cols:
|
||||
val = _normalize_pk_value(_get_value_case_insensitive(merged, col))
|
||||
if val is None or val == "":
|
||||
return None
|
||||
values.append(val)
|
||||
return tuple(values)
|
||||
|
||||
|
||||
def _pk_tuple_from_ticket_candidate(value) -> tuple | None:
|
||||
val = _normalize_pk_value(value)
|
||||
if val is None or val == "":
|
||||
return None
|
||||
return (val,)
|
||||
|
||||
|
||||
def _format_missing_sample(pk_cols: Sequence[str], pk_tuple: tuple) -> dict:
|
||||
return {col: pk_tuple[idx] for idx, col in enumerate(pk_cols)}
|
||||
|
||||
|
||||
def _check_spec(
|
||||
*,
|
||||
client: APIClient,
|
||||
db_conn,
|
||||
cfg: AppConfig,
|
||||
tz: ZoneInfo,
|
||||
logger: logging.Logger,
|
||||
spec,
|
||||
store_id: int,
|
||||
start: datetime | None,
|
||||
end: datetime | None,
|
||||
window_days: int,
|
||||
window_hours: int,
|
||||
page_size: int,
|
||||
chunk_size: int,
|
||||
sample_limit: int,
|
||||
sleep_per_window: float,
|
||||
sleep_per_page: float,
|
||||
) -> dict:
|
||||
result = {
|
||||
"task_code": spec.code,
|
||||
"table": spec.table_name,
|
||||
"endpoint": spec.endpoint,
|
||||
"pk_columns": [],
|
||||
"records": 0,
|
||||
"records_with_pk": 0,
|
||||
"missing": 0,
|
||||
"missing_samples": [],
|
||||
"pages": 0,
|
||||
"skipped_missing_pk": 0,
|
||||
"errors": 0,
|
||||
"error_detail": None,
|
||||
}
|
||||
|
||||
pk_cols = _get_table_pk_columns(db_conn, spec.table_name)
|
||||
result["pk_columns"] = pk_cols
|
||||
if not pk_cols:
|
||||
result["errors"] = 1
|
||||
result["error_detail"] = "no primary key columns found"
|
||||
return result
|
||||
|
||||
if spec.requires_window and spec.time_fields:
|
||||
if not start or not end:
|
||||
result["errors"] = 1
|
||||
result["error_detail"] = "missing start/end for windowed endpoint"
|
||||
return result
|
||||
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
|
||||
windows = list(_iter_windows(start, end, window_size))
|
||||
else:
|
||||
windows = [(None, None)]
|
||||
|
||||
logger.info(
|
||||
"CHECK_START task=%s table=%s windows=%s start=%s end=%s",
|
||||
spec.code,
|
||||
spec.table_name,
|
||||
len(windows),
|
||||
start.isoformat() if start else None,
|
||||
end.isoformat() if end else None,
|
||||
)
|
||||
missing_seen: set[tuple] = set()
|
||||
|
||||
for window_idx, (window_start, window_end) in enumerate(windows, start=1):
|
||||
window_label = (
|
||||
f"{window_start.isoformat()}~{window_end.isoformat()}"
|
||||
if window_start and window_end
|
||||
else "FULL"
|
||||
)
|
||||
logger.info(
|
||||
"WINDOW_START task=%s idx=%s window=%s",
|
||||
spec.code,
|
||||
window_idx,
|
||||
window_label,
|
||||
)
|
||||
window_pages = 0
|
||||
window_records = 0
|
||||
window_missing = 0
|
||||
window_skipped = 0
|
||||
params = _build_params(cfg, spec, store_id, window_start, window_end)
|
||||
try:
|
||||
for page_no, records, _, _ in client.iter_paginated(
|
||||
endpoint=spec.endpoint,
|
||||
params=params,
|
||||
page_size=page_size,
|
||||
data_path=spec.data_path,
|
||||
list_key=spec.list_key,
|
||||
):
|
||||
window_pages += 1
|
||||
window_records += len(records)
|
||||
result["pages"] += 1
|
||||
result["records"] += len(records)
|
||||
pk_tuples: list[tuple] = []
|
||||
for rec in records:
|
||||
if not isinstance(rec, dict):
|
||||
result["skipped_missing_pk"] += 1
|
||||
window_skipped += 1
|
||||
continue
|
||||
pk_tuple = _pk_tuple_from_record(rec, pk_cols)
|
||||
if not pk_tuple:
|
||||
result["skipped_missing_pk"] += 1
|
||||
window_skipped += 1
|
||||
continue
|
||||
pk_tuples.append(pk_tuple)
|
||||
|
||||
if not pk_tuples:
|
||||
continue
|
||||
|
||||
result["records_with_pk"] += len(pk_tuples)
|
||||
pk_unique = list(dict.fromkeys(pk_tuples))
|
||||
existing = _fetch_existing_pk_set(db_conn, spec.table_name, pk_cols, pk_unique, chunk_size)
|
||||
for pk_tuple in pk_unique:
|
||||
if pk_tuple in existing:
|
||||
continue
|
||||
if pk_tuple in missing_seen:
|
||||
continue
|
||||
missing_seen.add(pk_tuple)
|
||||
result["missing"] += 1
|
||||
window_missing += 1
|
||||
if len(result["missing_samples"]) < sample_limit:
|
||||
result["missing_samples"].append(_format_missing_sample(pk_cols, pk_tuple))
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
"PAGE task=%s idx=%s page=%s records=%s missing=%s skipped=%s",
|
||||
spec.code,
|
||||
window_idx,
|
||||
page_no,
|
||||
len(records),
|
||||
window_missing,
|
||||
window_skipped,
|
||||
)
|
||||
if sleep_per_page > 0:
|
||||
time_mod.sleep(sleep_per_page)
|
||||
except Exception as exc:
|
||||
result["errors"] += 1
|
||||
result["error_detail"] = f"{type(exc).__name__}: {exc}"
|
||||
logger.exception(
|
||||
"WINDOW_ERROR task=%s idx=%s window=%s error=%s",
|
||||
spec.code,
|
||||
window_idx,
|
||||
window_label,
|
||||
result["error_detail"],
|
||||
)
|
||||
break
|
||||
logger.info(
|
||||
"WINDOW_DONE task=%s idx=%s window=%s pages=%s records=%s missing=%s skipped=%s",
|
||||
spec.code,
|
||||
window_idx,
|
||||
window_label,
|
||||
window_pages,
|
||||
window_records,
|
||||
window_missing,
|
||||
window_skipped,
|
||||
)
|
||||
if sleep_per_window > 0:
|
||||
logger.debug(
|
||||
"SLEEP_WINDOW task=%s idx=%s seconds=%.2f",
|
||||
spec.code,
|
||||
window_idx,
|
||||
sleep_per_window,
|
||||
)
|
||||
time_mod.sleep(sleep_per_window)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _check_settlement_tickets(
|
||||
*,
|
||||
client: APIClient,
|
||||
db_conn,
|
||||
cfg: AppConfig,
|
||||
tz: ZoneInfo,
|
||||
logger: logging.Logger,
|
||||
store_id: int,
|
||||
start: datetime | None,
|
||||
end: datetime | None,
|
||||
window_days: int,
|
||||
window_hours: int,
|
||||
page_size: int,
|
||||
chunk_size: int,
|
||||
sample_limit: int,
|
||||
sleep_per_window: float,
|
||||
sleep_per_page: float,
|
||||
) -> dict:
|
||||
table_name = "billiards_ods.settlement_ticket_details"
|
||||
pk_cols = _get_table_pk_columns(db_conn, table_name)
|
||||
result = {
|
||||
"task_code": "ODS_SETTLEMENT_TICKET",
|
||||
"table": table_name,
|
||||
"endpoint": "/Order/GetOrderSettleTicketNew",
|
||||
"pk_columns": pk_cols,
|
||||
"records": 0,
|
||||
"records_with_pk": 0,
|
||||
"missing": 0,
|
||||
"missing_samples": [],
|
||||
"pages": 0,
|
||||
"skipped_missing_pk": 0,
|
||||
"errors": 0,
|
||||
"error_detail": None,
|
||||
"source_endpoint": "/PayLog/GetPayLogListPage",
|
||||
}
|
||||
|
||||
if not pk_cols:
|
||||
result["errors"] = 1
|
||||
result["error_detail"] = "no primary key columns found"
|
||||
return result
|
||||
if not start or not end:
|
||||
result["errors"] = 1
|
||||
result["error_detail"] = "missing start/end for ticket check"
|
||||
return result
|
||||
|
||||
missing_seen: set[tuple] = set()
|
||||
pay_endpoint = "/PayLog/GetPayLogListPage"
|
||||
|
||||
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
|
||||
windows = list(_iter_windows(start, end, window_size))
|
||||
logger.info(
|
||||
"CHECK_START task=%s table=%s windows=%s start=%s end=%s",
|
||||
result["task_code"],
|
||||
table_name,
|
||||
len(windows),
|
||||
start.isoformat() if start else None,
|
||||
end.isoformat() if end else None,
|
||||
)
|
||||
|
||||
for window_idx, (window_start, window_end) in enumerate(windows, start=1):
|
||||
window_label = f"{window_start.isoformat()}~{window_end.isoformat()}"
|
||||
logger.info(
|
||||
"WINDOW_START task=%s idx=%s window=%s",
|
||||
result["task_code"],
|
||||
window_idx,
|
||||
window_label,
|
||||
)
|
||||
window_pages = 0
|
||||
window_records = 0
|
||||
window_missing = 0
|
||||
window_skipped = 0
|
||||
base = {
|
||||
"siteId": store_id,
|
||||
"StartPayTime": TypeParser.format_timestamp(window_start, tz),
|
||||
"EndPayTime": TypeParser.format_timestamp(window_end, tz),
|
||||
}
|
||||
params = _merge_common_params(cfg, "ODS_PAYMENT", base)
|
||||
try:
|
||||
for page_no, records, _, _ in client.iter_paginated(
|
||||
endpoint=pay_endpoint,
|
||||
params=params,
|
||||
page_size=page_size,
|
||||
data_path=("data",),
|
||||
list_key=None,
|
||||
):
|
||||
window_pages += 1
|
||||
window_records += len(records)
|
||||
result["pages"] += 1
|
||||
result["records"] += len(records)
|
||||
pk_tuples: list[tuple] = []
|
||||
for rec in records:
|
||||
if not isinstance(rec, dict):
|
||||
result["skipped_missing_pk"] += 1
|
||||
window_skipped += 1
|
||||
continue
|
||||
relate_id = TypeParser.parse_int(
|
||||
(rec or {}).get("relateId")
|
||||
or (rec or {}).get("orderSettleId")
|
||||
or (rec or {}).get("order_settle_id")
|
||||
)
|
||||
pk_tuple = _pk_tuple_from_ticket_candidate(relate_id)
|
||||
if not pk_tuple:
|
||||
result["skipped_missing_pk"] += 1
|
||||
window_skipped += 1
|
||||
continue
|
||||
pk_tuples.append(pk_tuple)
|
||||
|
||||
if not pk_tuples:
|
||||
continue
|
||||
|
||||
result["records_with_pk"] += len(pk_tuples)
|
||||
pk_unique = list(dict.fromkeys(pk_tuples))
|
||||
existing = _fetch_existing_pk_set(db_conn, table_name, pk_cols, pk_unique, chunk_size)
|
||||
for pk_tuple in pk_unique:
|
||||
if pk_tuple in existing:
|
||||
continue
|
||||
if pk_tuple in missing_seen:
|
||||
continue
|
||||
missing_seen.add(pk_tuple)
|
||||
result["missing"] += 1
|
||||
window_missing += 1
|
||||
if len(result["missing_samples"]) < sample_limit:
|
||||
result["missing_samples"].append(_format_missing_sample(pk_cols, pk_tuple))
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
"PAGE task=%s idx=%s page=%s records=%s missing=%s skipped=%s",
|
||||
result["task_code"],
|
||||
window_idx,
|
||||
page_no,
|
||||
len(records),
|
||||
window_missing,
|
||||
window_skipped,
|
||||
)
|
||||
if sleep_per_page > 0:
|
||||
time_mod.sleep(sleep_per_page)
|
||||
except Exception as exc:
|
||||
result["errors"] += 1
|
||||
result["error_detail"] = f"{type(exc).__name__}: {exc}"
|
||||
logger.exception(
|
||||
"WINDOW_ERROR task=%s idx=%s window=%s error=%s",
|
||||
result["task_code"],
|
||||
window_idx,
|
||||
window_label,
|
||||
result["error_detail"],
|
||||
)
|
||||
break
|
||||
logger.info(
|
||||
"WINDOW_DONE task=%s idx=%s window=%s pages=%s records=%s missing=%s skipped=%s",
|
||||
result["task_code"],
|
||||
window_idx,
|
||||
window_label,
|
||||
window_pages,
|
||||
window_records,
|
||||
window_missing,
|
||||
window_skipped,
|
||||
)
|
||||
if sleep_per_window > 0:
|
||||
logger.debug(
|
||||
"SLEEP_WINDOW task=%s idx=%s seconds=%.2f",
|
||||
result["task_code"],
|
||||
window_idx,
|
||||
sleep_per_window,
|
||||
)
|
||||
time_mod.sleep(sleep_per_window)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _compute_ods_cutoff(conn, ods_tables: Sequence[str]) -> datetime | None:
|
||||
values: list[datetime] = []
|
||||
with conn.cursor() as cur:
|
||||
for table in ods_tables:
|
||||
try:
|
||||
cur.execute(f"SELECT MAX(fetched_at) FROM {table}")
|
||||
row = cur.fetchone()
|
||||
if row and row[0]:
|
||||
values.append(row[0])
|
||||
except Exception:
|
||||
continue
|
||||
if not values:
|
||||
return None
|
||||
return min(values)
|
||||
|
||||
|
||||
def _resolve_window_from_cutoff(
|
||||
*,
|
||||
conn,
|
||||
ods_tables: Sequence[str],
|
||||
tz: ZoneInfo,
|
||||
overlap_hours: int,
|
||||
) -> tuple[datetime, datetime, datetime | None]:
|
||||
cutoff = _compute_ods_cutoff(conn, ods_tables)
|
||||
now = datetime.now(tz)
|
||||
if cutoff is None:
|
||||
start = now - timedelta(hours=max(1, overlap_hours))
|
||||
return start, now, None
|
||||
if cutoff.tzinfo is None:
|
||||
cutoff = cutoff.replace(tzinfo=tz)
|
||||
else:
|
||||
cutoff = cutoff.astimezone(tz)
|
||||
start = cutoff - timedelta(hours=max(0, overlap_hours))
|
||||
return start, now, cutoff
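# Example (overlap_hours=24): if the smallest per-table max(fetched_at) is 2025-08-10 06:00,
# the check window becomes [2025-08-09 06:00, now]; if no cutoff can be computed at all,
# it falls back to [now - 24h, now].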
|
||||
|
||||
|
||||
def main() -> int:
|
||||
_reconfigure_stdout_utf8()
|
||||
ap = argparse.ArgumentParser(description="Check missing ODS records by comparing API vs ODS PKs.")
|
||||
ap.add_argument("--start", default=DEFAULT_START, help="start datetime (default: 2025-07-01)")
|
||||
ap.add_argument("--end", default="", help="end datetime (default: now)")
|
||||
ap.add_argument("--window-days", type=int, default=1, help="days per API window (default: 1)")
|
||||
ap.add_argument("--window-hours", type=int, default=0, help="hours per API window (default: 0)")
|
||||
ap.add_argument("--page-size", type=int, default=200, help="API page size (default: 200)")
|
||||
ap.add_argument("--chunk-size", type=int, default=500, help="DB query chunk size (default: 500)")
|
||||
ap.add_argument("--sample-limit", type=int, default=50, help="max missing PK samples per table")
|
||||
ap.add_argument("--sleep-per-window-seconds", type=float, default=0, help="sleep seconds after each window")
|
||||
ap.add_argument("--sleep-per-page-seconds", type=float, default=0, help="sleep seconds after each page")
|
||||
ap.add_argument("--task-codes", default="", help="comma-separated task codes to check (optional)")
|
||||
ap.add_argument("--out", default="", help="output JSON path (optional)")
|
||||
ap.add_argument("--tag", default="", help="tag suffix for output filename")
|
||||
ap.add_argument("--from-cutoff", action="store_true", help="derive start from ODS cutoff")
|
||||
ap.add_argument(
|
||||
"--cutoff-overlap-hours",
|
||||
type=int,
|
||||
default=24,
|
||||
help="overlap hours when using --from-cutoff (default: 24)",
|
||||
)
|
||||
ap.add_argument("--log-file", default="", help="log file path (default: logs/check_ods_gaps_YYYYMMDD_HHMMSS.log)")
|
||||
ap.add_argument("--log-dir", default="", help="log directory (default: logs)")
|
||||
ap.add_argument("--log-level", default="INFO", help="log level (default: INFO)")
|
||||
ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
|
||||
args = ap.parse_args()
|
||||
|
||||
log_dir = Path(args.log_dir) if args.log_dir else (PROJECT_ROOT / "logs")
|
||||
log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "check_ods_gaps", args.tag)
|
||||
log_console = not args.no_log_console
|
||||
|
||||
with configure_logging(
|
||||
"ods_gap_check",
|
||||
log_file,
|
||||
level=args.log_level,
|
||||
console=log_console,
|
||||
tee_std=True,
|
||||
) as logger:
|
||||
cfg = AppConfig.load({})
|
||||
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
|
||||
store_id = int(cfg.get("app.store_id"))
|
||||
|
||||
if not cfg.get("api.token"):
|
||||
logger.error("missing api.token; please set API_TOKEN in .env")
|
||||
raise SystemExit("missing api.token; please set API_TOKEN in .env")
|
||||
|
||||
window_days = int(args.window_days)
|
||||
window_hours = int(args.window_hours)
|
||||
if not args.from_cutoff:
|
||||
min_hours = MIN_COMPLETENESS_WINDOW_DAYS * 24
|
||||
if window_hours > 0:
|
||||
if window_hours < min_hours:
|
||||
logger.warning(
|
||||
"window_hours=%s too small for completeness check; adjust to %s",
|
||||
window_hours,
|
||||
min_hours,
|
||||
)
|
||||
window_hours = min_hours
|
||||
elif window_days < MIN_COMPLETENESS_WINDOW_DAYS:
|
||||
logger.warning(
|
||||
"window_days=%s too small for completeness check; adjust to %s",
|
||||
window_days,
|
||||
MIN_COMPLETENESS_WINDOW_DAYS,
|
||||
)
|
||||
window_days = MIN_COMPLETENESS_WINDOW_DAYS
|
||||
|
||||
end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
|
||||
if args.from_cutoff:
|
||||
db_tmp = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
|
||||
ods_tables = [s.table_name for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
|
||||
start, end, cutoff = _resolve_window_from_cutoff(
|
||||
conn=db_tmp.conn,
|
||||
ods_tables=ods_tables,
|
||||
tz=tz,
|
||||
overlap_hours=args.cutoff_overlap_hours,
|
||||
)
|
||||
db_tmp.close()
|
||||
else:
|
||||
start = _parse_dt(args.start, tz, is_end=False)
|
||||
cutoff = None
|
||||
|
||||
logger.info(
|
||||
"START range=%s~%s window_days=%s window_hours=%s page_size=%s chunk_size=%s",
|
||||
start.isoformat() if start else None,
|
||||
end.isoformat() if end else None,
|
||||
window_days,
|
||||
window_hours,
|
||||
args.page_size,
|
||||
args.chunk_size,
|
||||
)
|
||||
if cutoff:
|
||||
logger.info("CUTOFF=%s overlap_hours=%s", cutoff.isoformat(), args.cutoff_overlap_hours)
|
||||
|
||||
client = APIClient(
|
||||
base_url=cfg["api"]["base_url"],
|
||||
token=cfg["api"]["token"],
|
||||
timeout=int(cfg["api"].get("timeout_sec") or 20),
|
||||
retry_max=int(cfg["api"].get("retries", {}).get("max_attempts") or 3),
|
||||
headers_extra=cfg["api"].get("headers_extra") or {},
|
||||
)
|
||||
|
||||
db_conn = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
|
||||
try:
|
||||
db_conn.conn.rollback()
|
||||
except Exception:
|
||||
pass
|
||||
db_conn.conn.autocommit = True
|
||||
try:
|
||||
task_filter = {t.strip().upper() for t in args.task_codes.split(",") if t.strip()}
|
||||
specs = [s for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
|
||||
if task_filter:
|
||||
specs = [s for s in specs if s.code in task_filter]
|
||||
|
||||
results: list[dict] = []
|
||||
for spec in specs:
|
||||
if spec.code == "ODS_SETTLEMENT_TICKET":
|
||||
continue
|
||||
result = _check_spec(
|
||||
client=client,
|
||||
db_conn=db_conn.conn,
|
||||
cfg=cfg,
|
||||
tz=tz,
|
||||
logger=logger,
|
||||
spec=spec,
|
||||
store_id=store_id,
|
||||
start=start,
|
||||
end=end,
|
||||
window_days=window_days,
|
||||
window_hours=window_hours,
|
||||
page_size=args.page_size,
|
||||
chunk_size=args.chunk_size,
|
||||
sample_limit=args.sample_limit,
|
||||
sleep_per_window=args.sleep_per_window_seconds,
|
||||
sleep_per_page=args.sleep_per_page_seconds,
|
||||
)
|
||||
results.append(result)
|
||||
logger.info(
|
||||
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
|
||||
result.get("task_code"),
|
||||
result.get("missing"),
|
||||
result.get("records"),
|
||||
result.get("errors"),
|
||||
)
|
||||
|
||||
if (not task_filter) or ("ODS_SETTLEMENT_TICKET" in task_filter):
|
||||
ticket_result = _check_settlement_tickets(
|
||||
client=client,
|
||||
db_conn=db_conn.conn,
|
||||
cfg=cfg,
|
||||
tz=tz,
|
||||
logger=logger,
|
||||
store_id=store_id,
|
||||
start=start,
|
||||
end=end,
|
||||
window_days=window_days,
|
||||
window_hours=window_hours,
|
||||
page_size=args.page_size,
|
||||
chunk_size=args.chunk_size,
|
||||
sample_limit=args.sample_limit,
|
||||
sleep_per_window=args.sleep_per_window_seconds,
|
||||
sleep_per_page=args.sleep_per_page_seconds,
|
||||
)
|
||||
results.append(ticket_result)
|
||||
logger.info(
|
||||
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
|
||||
ticket_result.get("task_code"),
|
||||
ticket_result.get("missing"),
|
||||
ticket_result.get("records"),
|
||||
ticket_result.get("errors"),
|
||||
)
|
||||
|
||||
total_missing = sum(int(r.get("missing") or 0) for r in results)
|
||||
total_errors = sum(int(r.get("errors") or 0) for r in results)
|
||||
|
||||
if args.out:
|
||||
out_path = Path(args.out)
|
||||
else:
|
||||
tag = f"_{args.tag}" if args.tag else ""
|
||||
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
|
||||
out_path = PROJECT_ROOT / "reports" / f"ods_gap_check{tag}_{stamp}.json"
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
payload = {
|
||||
"start": start.isoformat(),
|
||||
"end": end.isoformat(),
|
||||
"cutoff": cutoff.isoformat() if cutoff else None,
|
||||
"window_days": window_days,
|
||||
"window_hours": window_hours,
|
||||
"page_size": args.page_size,
|
||||
"chunk_size": args.chunk_size,
|
||||
"sample_limit": args.sample_limit,
|
||||
"store_id": store_id,
|
||||
"base_url": cfg.get("api.base_url"),
|
||||
"results": results,
|
||||
"total_missing": total_missing,
|
||||
"total_errors": total_errors,
|
||||
"generated_at": datetime.now(tz).isoformat(),
|
||||
}
|
||||
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
||||
logger.info("REPORT_WRITTEN path=%s", out_path)
|
||||
logger.info("SUMMARY missing=%s errors=%s", total_missing, total_errors)
|
||||
finally:
|
||||
db_conn.close()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
404  etl_billiards/scripts/rebuild_db_and_run_ods_to_dwd.py  Normal file
@@ -0,0 +1,404 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
一键重建 ETL 相关 Schema,并执行 ODS → DWD。
|
||||
|
||||
本脚本面向“离线示例 JSON 回放”的开发/运维场景,使用当前项目内的任务实现:
|
||||
1) (可选)DROP 并重建 schema:`etl_admin` / `billiards_ods` / `billiards_dwd`
|
||||
2) 执行 `INIT_ODS_SCHEMA`:创建 `etl_admin` 元数据表 + 执行 `schema_ODS_doc.sql`(内部会做轻量清洗)
|
||||
3) 执行 `INIT_DWD_SCHEMA`:执行 `schema_dwd_doc.sql`
|
||||
4) 执行 `MANUAL_INGEST`:从本地 JSON 目录灌入 ODS
|
||||
5) 执行 `DWD_LOAD_FROM_ODS`:从 ODS 装载到 DWD
|
||||
|
||||
用法(推荐):
|
||||
python -m etl_billiards.scripts.rebuild_db_and_run_ods_to_dwd ^
|
||||
--dsn "postgresql://user:pwd@host:5432/db" ^
|
||||
--store-id 1 ^
|
||||
--json-dir "C:\\dev\\LLTQ\\export\\test-json-doc" ^
|
||||
--drop-schemas
|
||||
|
||||
环境变量(可选):
|
||||
PG_DSN、STORE_ID、INGEST_SOURCE_DIR
|
||||
|
||||
日志:
|
||||
默认同时输出到控制台与文件;文件路径为 `io.log_root/rebuild_db_<时间戳>.log`。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
|
||||
from etl_billiards.config.settings import AppConfig
|
||||
from etl_billiards.database.connection import DatabaseConnection
|
||||
from etl_billiards.database.operations import DatabaseOperations
|
||||
from etl_billiards.tasks.dwd_load_task import DwdLoadTask
|
||||
from etl_billiards.tasks.init_dwd_schema_task import InitDwdSchemaTask
|
||||
from etl_billiards.tasks.init_schema_task import InitOdsSchemaTask
|
||||
from etl_billiards.tasks.manual_ingest_task import ManualIngestTask
|
||||
|
||||
|
||||
DEFAULT_JSON_DIR = r"C:\dev\LLTQ\export\test-json-doc"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RunArgs:
|
||||
"""脚本参数对象(用于减少散落的参数传递)。"""
|
||||
|
||||
dsn: str
|
||||
store_id: int
|
||||
json_dir: str
|
||||
drop_schemas: bool
|
||||
terminate_own_sessions: bool
|
||||
demo: bool
|
||||
only_files: list[str]
|
||||
only_dwd_tables: list[str]
|
||||
stop_after: str | None
|
||||
|
||||
|
||||
def _attach_file_logger(log_root: str | Path, filename: str, logger: logging.Logger) -> logging.Handler | None:
|
||||
"""
|
||||
给 root logger 附加文件日志处理器(UTF-8)。
|
||||
|
||||
说明:
|
||||
- 使用 root logger 是为了覆盖项目中不同命名的 logger(包含第三方/子模块)。
|
||||
- 若创建失败仅记录 warning,不中断主流程。
|
||||
|
||||
返回值:
|
||||
创建成功返回 handler(调用方负责 removeHandler/close),失败返回 None。
|
||||
"""
|
||||
log_dir = Path(log_root)
|
||||
try:
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning("创建日志目录失败:%s(%s)", log_dir, exc)
|
||||
return None
|
||||
|
||||
log_path = log_dir / filename
|
||||
try:
|
||||
handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8")
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning("创建文件日志失败:%s(%s)", log_path, exc)
|
||||
return None
|
||||
|
||||
handler.setLevel(logging.INFO)
|
||||
handler.setFormatter(
|
||||
logging.Formatter(
|
||||
fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
)
|
||||
logging.getLogger().addHandler(handler)
|
||||
logger.info("文件日志已启用:%s", log_path)
|
||||
return handler
|
||||
|
||||
|
||||
def _parse_args() -> RunArgs:
|
||||
"""解析命令行/环境变量参数。"""
|
||||
parser = argparse.ArgumentParser(description="Rebuild schemas and run ODS→DWD (offline JSON replay)")
parser.add_argument("--dsn", default=os.environ.get("PG_DSN"), help="PostgreSQL DSN (defaults to PG_DSN)")
parser.add_argument(
    "--store-id",
    type=int,
    default=int(os.environ.get("STORE_ID") or 1),
    help="store/tenant store_id (defaults to STORE_ID, else 1)",
)
parser.add_argument(
    "--json-dir",
    default=os.environ.get("INGEST_SOURCE_DIR") or DEFAULT_JSON_DIR,
    help=f"sample JSON directory (default {DEFAULT_JSON_DIR}; INGEST_SOURCE_DIR is also read)",
)
parser.add_argument(
    "--drop-schemas",
    action=argparse.BooleanOptionalAction,
    default=True,
    help="whether to DROP and recreate etl_admin/billiards_ods/billiards_dwd first (default: yes)",
)
parser.add_argument(
    "--terminate-own-sessions",
    action=argparse.BooleanOptionalAction,
    default=True,
    help="whether to terminate the current user's idle-in-transaction sessions before DROP (default: yes)",
)
parser.add_argument(
    "--demo",
    action=argparse.BooleanOptionalAction,
    default=False,
    help="run the minimal demo (ingest member_profiles only and build dim_member/dim_member_ex)",
)
parser.add_argument(
    "--only-files",
    default="",
    help="only process the given JSON files (comma-separated, without .json, e.g. member_profiles,settlement_records)",
)
parser.add_argument(
    "--only-dwd-tables",
    default="",
    help="only process the given DWD tables (comma-separated, full or bare table names, e.g. billiards_dwd.dim_member,dim_member_ex)",
)
parser.add_argument(
    "--stop-after",
    default="",
    help="stop after the given stage (one of: DROP_SCHEMAS/INIT_ODS_SCHEMA/INIT_DWD_SCHEMA/MANUAL_INGEST/DWD_LOAD_FROM_ODS/BASIC_VALIDATE)",
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.dsn:
|
||||
raise SystemExit("缺少 DSN:请传入 --dsn 或设置环境变量 PG_DSN")
|
||||
only_files = [x.strip().lower() for x in str(args.only_files or "").split(",") if x.strip()]
|
||||
only_dwd_tables = [x.strip().lower() for x in str(args.only_dwd_tables or "").split(",") if x.strip()]
|
||||
stop_after = str(args.stop_after or "").strip().upper() or None
|
||||
return RunArgs(
|
||||
dsn=args.dsn,
|
||||
store_id=args.store_id,
|
||||
json_dir=str(args.json_dir),
|
||||
drop_schemas=bool(args.drop_schemas),
|
||||
terminate_own_sessions=bool(args.terminate_own_sessions),
|
||||
demo=bool(args.demo),
|
||||
only_files=only_files,
|
||||
only_dwd_tables=only_dwd_tables,
|
||||
stop_after=stop_after,
|
||||
)
|
||||
|
||||
|
||||
def _build_config(args: RunArgs) -> AppConfig:
|
||||
"""构建本次执行所需的最小配置覆盖。"""
|
||||
manual_cfg: dict[str, Any] = {}
|
||||
dwd_cfg: dict[str, Any] = {}
|
||||
if args.demo:
|
||||
manual_cfg["include_files"] = ["member_profiles"]
|
||||
dwd_cfg["only_tables"] = ["billiards_dwd.dim_member", "billiards_dwd.dim_member_ex"]
|
||||
if args.only_files:
|
||||
manual_cfg["include_files"] = args.only_files
|
||||
if args.only_dwd_tables:
|
||||
dwd_cfg["only_tables"] = args.only_dwd_tables
|
||||
|
||||
overrides: dict[str, Any] = {
|
||||
"app": {"store_id": args.store_id},
|
||||
"pipeline": {"flow": "INGEST_ONLY", "ingest_source_dir": args.json_dir},
|
||||
"manual": manual_cfg,
|
||||
"dwd": dwd_cfg,
|
||||
# Offline replay / warehouse rebuilds can run long; disable statement_timeout so the default 30s limit does not abort them.
# Also disable lock_timeout so DROP/DDL does not fail outright just because a lock takes a little longer.
|
||||
"db": {"dsn": args.dsn, "session": {"statement_timeout_ms": 0, "lock_timeout_ms": 0}},
|
||||
}
|
||||
return AppConfig.load(overrides)
|
||||
|
||||
|
||||
def _drop_schemas(db: DatabaseOperations, logger: logging.Logger) -> None:
|
||||
"""删除并重建 ETL 相关 schema(具备破坏性,请谨慎)。"""
|
||||
with db.conn.cursor() as cur:
|
||||
# Avoid waiting forever on locks held by other sessions; if the schemas really are in use, ask the user to release/terminate the blocking sessions first.
|
||||
cur.execute("SET lock_timeout TO '5s'")
|
||||
for schema in ("billiards_dwd", "billiards_ods", "etl_admin"):
|
||||
logger.info("DROP SCHEMA IF EXISTS %s CASCADE ...", schema)
|
||||
cur.execute(f'DROP SCHEMA IF EXISTS "{schema}" CASCADE;')
|
||||
|
||||
|
||||
def _terminate_own_idle_in_tx(db: DatabaseOperations, logger: logging.Logger) -> int:
|
||||
"""终止当前用户在本库中处于 idle-in-transaction 的会话,避免阻塞 DROP/DDL。"""
|
||||
with db.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT pid
|
||||
FROM pg_stat_activity
|
||||
WHERE datname = current_database()
|
||||
AND usename = current_user
|
||||
AND pid <> pg_backend_pid()
|
||||
AND state = 'idle in transaction'
|
||||
"""
|
||||
)
|
||||
pids = [r[0] for r in cur.fetchall()]
|
||||
killed = 0
|
||||
for pid in pids:
|
||||
cur.execute("SELECT pg_terminate_backend(%s)", (pid,))
|
||||
ok = bool(cur.fetchone()[0])
|
||||
logger.info("终止会话 pid=%s ok=%s", pid, ok)
|
||||
killed += 1 if ok else 0
|
||||
return killed
|
||||
|
||||
|
||||
def _run_task(task, logger: logging.Logger) -> dict:
|
||||
"""统一运行任务并打印关键结果。"""
|
||||
result = task.execute(None)
|
||||
logger.info("%s: status=%s counts=%s", task.get_task_code(), result.get("status"), result.get("counts"))
|
||||
return result
|
||||
|
||||
|
||||
def _basic_validate(db: DatabaseOperations, logger: logging.Logger) -> None:
|
||||
"""做最基础的可用性校验:schema 存在、关键表行数可查询。"""
|
||||
checks = [
|
||||
("billiards_ods", "member_profiles"),
|
||||
("billiards_ods", "settlement_records"),
|
||||
("billiards_dwd", "dim_member"),
|
||||
("billiards_dwd", "dwd_settlement_head"),
|
||||
]
|
||||
for schema, table in checks:
|
||||
try:
|
||||
rows = db.query(f'SELECT COUNT(1) AS cnt FROM "{schema}"."{table}"')
|
||||
logger.info("校验行数:%s.%s = %s", schema, table, (rows[0] or {}).get("cnt") if rows else None)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning("校验失败:%s.%s(%s)", schema, table, exc)
|
||||
|
||||
|
||||
def _connect_db_with_retry(cfg: AppConfig, logger: logging.Logger) -> DatabaseConnection:
|
||||
"""创建数据库连接(带重试),避免短暂网络抖动导致脚本直接失败。"""
|
||||
dsn = cfg["db"]["dsn"]
|
||||
session = cfg["db"].get("session")
|
||||
connect_timeout = cfg["db"].get("connect_timeout_sec")
|
||||
|
||||
backoffs = [1, 2, 4, 8, 16]
|
||||
last_exc: Exception | None = None
|
||||
for attempt, wait_sec in enumerate([0] + backoffs, start=1):
|
||||
if wait_sec:
|
||||
time.sleep(wait_sec)
|
||||
try:
|
||||
return DatabaseConnection(dsn=dsn, session=session, connect_timeout=connect_timeout)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
last_exc = exc
|
||||
logger.warning("数据库连接失败(第 %s 次):%s", attempt, exc)
|
||||
raise last_exc or RuntimeError("数据库连接失败")
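# With the defaults above this makes at most 6 connection attempts, sleeping 0/1/2/4/8/16 s
# before each, and re-raises the last exception if every attempt fails.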
|
||||
|
||||
|
||||
def _is_connection_error(exc: Exception) -> bool:
|
||||
"""判断是否为连接断开/服务端异常导致的可重试错误。"""
|
||||
return isinstance(exc, (psycopg2.OperationalError, psycopg2.InterfaceError))
|
||||
|
||||
|
||||
def _run_stage_with_reconnect(
|
||||
cfg: AppConfig,
|
||||
logger: logging.Logger,
|
||||
stage_name: str,
|
||||
fn,
|
||||
max_attempts: int = 3,
|
||||
) -> dict | None:
|
||||
"""
|
||||
运行单个阶段:失败(尤其是连接断开)时自动重连并重试。
|
||||
|
||||
fn: (db_ops) -> dict | None
|
||||
"""
|
||||
last_exc: Exception | None = None
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
db_conn = _connect_db_with_retry(cfg, logger)
|
||||
db_ops = DatabaseOperations(db_conn)
|
||||
try:
|
||||
logger.info("阶段开始:%s(第 %s/%s 次)", stage_name, attempt, max_attempts)
|
||||
result = fn(db_ops)
|
||||
logger.info("阶段完成:%s", stage_name)
|
||||
return result
|
||||
except Exception as exc: # noqa: BLE001
|
||||
last_exc = exc
|
||||
logger.exception("阶段失败:%s(第 %s/%s 次):%s", stage_name, attempt, max_attempts, exc)
|
||||
# Connection-type errors may be retried; other errors are re-raised immediately so logic bugs are not masked.
|
||||
if not _is_connection_error(exc):
|
||||
raise
|
||||
time.sleep(min(2**attempt, 10))
|
||||
finally:
|
||||
try:
|
||||
db_ops.close() # type: ignore[attr-defined]
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
db_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
raise last_exc or RuntimeError(f"阶段失败:{stage_name}")
|
||||
|
||||
|
||||
def main() -> int:
|
||||
"""脚本主入口:按顺序重建并跑通 ODS→DWD。"""
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger("etl_billiards.rebuild_db")
|
||||
|
||||
args = _parse_args()
|
||||
cfg = _build_config(args)
|
||||
|
||||
# Enable file logging by default for after-the-fact tracing (flush to disk early even if the run fails).
|
||||
file_handler = _attach_file_logger(
|
||||
log_root=cfg["io"]["log_root"],
|
||||
filename=time.strftime("rebuild_db_%Y%m%d-%H%M%S.log"),
|
||||
logger=logger,
|
||||
)
|
||||
|
||||
try:
|
||||
json_dir = Path(args.json_dir)
|
||||
if not json_dir.exists():
|
||||
logger.error("示例 JSON 目录不存在:%s", json_dir)
|
||||
return 2
|
||||
|
||||
def stage_drop(db_ops: DatabaseOperations):
|
||||
if not args.drop_schemas:
|
||||
return None
|
||||
if args.terminate_own_sessions:
|
||||
killed = _terminate_own_idle_in_tx(db_ops, logger)
|
||||
if killed:
|
||||
db_ops.commit()
|
||||
_drop_schemas(db_ops, logger)
|
||||
db_ops.commit()
|
||||
return None
|
||||
|
||||
def stage_init_ods(db_ops: DatabaseOperations):
|
||||
return _run_task(InitOdsSchemaTask(cfg, db_ops, None, logger), logger)
|
||||
|
||||
def stage_init_dwd(db_ops: DatabaseOperations):
|
||||
return _run_task(InitDwdSchemaTask(cfg, db_ops, None, logger), logger)
|
||||
|
||||
def stage_manual_ingest(db_ops: DatabaseOperations):
|
||||
logger.info("开始执行:MANUAL_INGEST(json_dir=%s)", json_dir)
|
||||
return _run_task(ManualIngestTask(cfg, db_ops, None, logger), logger)
|
||||
|
||||
def stage_dwd_load(db_ops: DatabaseOperations):
|
||||
logger.info("开始执行:DWD_LOAD_FROM_ODS")
|
||||
return _run_task(DwdLoadTask(cfg, db_ops, None, logger), logger)
|
||||
|
||||
_run_stage_with_reconnect(cfg, logger, "DROP_SCHEMAS", stage_drop, max_attempts=3)
|
||||
if args.stop_after == "DROP_SCHEMAS":
|
||||
return 0
|
||||
_run_stage_with_reconnect(cfg, logger, "INIT_ODS_SCHEMA", stage_init_ods, max_attempts=3)
|
||||
if args.stop_after == "INIT_ODS_SCHEMA":
|
||||
return 0
|
||||
_run_stage_with_reconnect(cfg, logger, "INIT_DWD_SCHEMA", stage_init_dwd, max_attempts=3)
|
||||
if args.stop_after == "INIT_DWD_SCHEMA":
|
||||
return 0
|
||||
_run_stage_with_reconnect(cfg, logger, "MANUAL_INGEST", stage_manual_ingest, max_attempts=5)
|
||||
if args.stop_after == "MANUAL_INGEST":
|
||||
return 0
|
||||
_run_stage_with_reconnect(cfg, logger, "DWD_LOAD_FROM_ODS", stage_dwd_load, max_attempts=5)
|
||||
if args.stop_after == "DWD_LOAD_FROM_ODS":
|
||||
return 0
|
||||
|
||||
# The validation stage only needs one fresh connection.
|
||||
_run_stage_with_reconnect(
|
||||
cfg,
|
||||
logger,
|
||||
"BASIC_VALIDATE",
|
||||
lambda db_ops: _basic_validate(db_ops, logger),
|
||||
max_attempts=3,
|
||||
)
|
||||
if args.stop_after == "BASIC_VALIDATE":
|
||||
return 0
|
||||
return 0
|
||||
finally:
|
||||
if file_handler is not None:
|
||||
try:
|
||||
logging.getLogger().removeHandler(file_handler)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
file_handler.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
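For reference, a hedged invocation sketch of the rebuild driver above. The flag spellings and the script path are assumptions inferred from the `args.*` attributes and the logger name (`_parse_args` is not part of this hunk), mirroring how the project already shells out to its own CLI elsewhere.

# Usage sketch (assumptions: script lives at scripts/rebuild_db.py and
# _parse_args exposes --json-dir / --drop-schemas / --stop-after).
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "scripts/rebuild_db.py",               # assumed script path
        "--json-dir", "sample_json",           # must exist, otherwise main() returns 2
        "--drop-schemas",                      # optional: drop and recreate schemas first
        "--stop-after", "DWD_LOAD_FROM_ODS",   # stop before BASIC_VALIDATE
    ],
    check=True,
)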
182  etl_billiards/scripts/reload_ods_windowed.py  Normal file
@@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-
"""
Reload ODS tasks by fixed time windows with optional sleep between windows.
"""
from __future__ import annotations

import argparse
import logging
import subprocess
import sys
import time as time_mod
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

from dateutil import parser as dtparser

PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from config.settings import AppConfig
from utils.logging_utils import build_log_path, configure_logging

MIN_RELOAD_WINDOW_DAYS = 30


def _parse_dt(value: str, tz: ZoneInfo, *, is_end: bool) -> datetime:
    raw = (value or "").strip()
    if not raw:
        raise ValueError("empty datetime")
    has_time = any(ch in raw for ch in (":", "T"))
    dt = dtparser.parse(raw)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=tz)
    else:
        dt = dt.astimezone(tz)
    if not has_time:
        dt = dt.replace(hour=23 if is_end else 0, minute=59 if is_end else 0, second=59 if is_end else 0, microsecond=0)
    return dt


def _iter_windows(start: datetime, end: datetime, window_size: timedelta):
    if window_size.total_seconds() <= 0:
        raise ValueError("window_size must be > 0")
    cur = start
    while cur < end:
        nxt = min(cur + window_size, end)
        yield cur, nxt
        cur = nxt


def _run_task_window(
    task_code: str,
    window_start: datetime,
    window_end: datetime,
    api_page_size: int,
    api_timeout: int,
    logger: logging.Logger,
) -> None:
    cmd = [
        sys.executable,
        "-m",
        "cli.main",
        "--pipeline-flow",
        "FULL",
        "--tasks",
        task_code,
        "--window-start",
        window_start.strftime("%Y-%m-%d %H:%M:%S"),
        "--window-end",
        window_end.strftime("%Y-%m-%d %H:%M:%S"),
        "--force-window-override",
    ]
    if api_page_size > 0:
        cmd += ["--api-page-size", str(api_page_size)]
    if api_timeout > 0:
        cmd += ["--api-timeout", str(api_timeout)]
    logger.info(
        "RUN_TASK task=%s window_start=%s window_end=%s",
        task_code,
        window_start.isoformat(),
        window_end.isoformat(),
    )
    logger.debug("CMD %s", " ".join(cmd))
    subprocess.run(cmd, check=True, cwd=str(PROJECT_ROOT))


def main() -> int:
    ap = argparse.ArgumentParser(description="Reload ODS tasks by window slices.")
    ap.add_argument("--tasks", required=True, help="comma-separated ODS task codes")
    ap.add_argument("--start", required=True, help="start datetime, e.g. 2025-07-01")
    ap.add_argument("--end", default="", help="end datetime (default: now)")
    ap.add_argument("--window-days", type=int, default=1, help="days per window (default: 1)")
    ap.add_argument("--window-hours", type=int, default=0, help="hours per window (default: 0)")
    ap.add_argument("--sleep-seconds", type=float, default=0, help="sleep seconds after each window")
    ap.add_argument("--api-page-size", type=int, default=200, help="API page size override")
    ap.add_argument("--api-timeout", type=int, default=20, help="API timeout seconds override")
    ap.add_argument("--log-file", default="", help="log file path (default: logs/reload_ods_windowed_YYYYMMDD_HHMMSS.log)")
    ap.add_argument("--log-dir", default="", help="log directory (default: logs)")
    ap.add_argument("--log-level", default="INFO", help="log level (default: INFO)")
    ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
    args = ap.parse_args()

    log_dir = Path(args.log_dir) if args.log_dir else (PROJECT_ROOT / "logs")
    log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "reload_ods_windowed")
    log_console = not args.no_log_console

    with configure_logging(
        "reload_ods_windowed",
        log_file,
        level=args.log_level,
        console=log_console,
        tee_std=True,
    ) as logger:
        cfg = AppConfig.load({})
        tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))

        start = _parse_dt(args.start, tz, is_end=False)
        end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
        window_days = int(args.window_days)
        window_hours = int(args.window_hours)
        min_hours = MIN_RELOAD_WINDOW_DAYS * 24
        if window_hours > 0:
            if window_hours < min_hours:
                logger.warning(
                    "window_hours=%s too small; adjust to %s",
                    window_hours,
                    min_hours,
                )
                window_hours = min_hours
        elif window_days < MIN_RELOAD_WINDOW_DAYS:
            logger.warning(
                "window_days=%s too small; adjust to %s",
                window_days,
                MIN_RELOAD_WINDOW_DAYS,
            )
            window_days = MIN_RELOAD_WINDOW_DAYS
        window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)

        task_codes = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
        if not task_codes:
            raise SystemExit("no tasks specified")

        logger.info(
            "START range=%s~%s window_days=%s window_hours=%s sleep=%.2f",
            start.isoformat(),
            end.isoformat(),
            window_days,
            window_hours,
            args.sleep_seconds,
        )

        for task_code in task_codes:
            logger.info("TASK_START task=%s", task_code)
            for window_start, window_end in _iter_windows(start, end, window_size):
                start_ts = time_mod.monotonic()
                _run_task_window(
                    task_code=task_code,
                    window_start=window_start,
                    window_end=window_end,
                    api_page_size=args.api_page_size,
                    api_timeout=args.api_timeout,
                    logger=logger,
                )
                elapsed = time_mod.monotonic() - start_ts
                logger.info(
                    "WINDOW_DONE task=%s window_start=%s window_end=%s elapsed=%.2fs",
                    task_code,
                    window_start.isoformat(),
                    window_end.isoformat(),
                    elapsed,
                )
                if args.sleep_seconds > 0:
                    logger.debug("SLEEP seconds=%.2f", args.sleep_seconds)
                    time_mod.sleep(args.sleep_seconds)
            logger.info("TASK_DONE task=%s", task_code)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
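A small worked illustration of the slicing logic above: `_iter_windows` yields consecutive (start, end) pairs of at most `window_size`, truncating the last window at `end`. The dates below are invented for the example, and the import path assumes the script's directory is on sys.path.

# Illustration of _iter_windows: 70 days sliced into 30-day windows,
# with the final window truncated at `end`.
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

from reload_ods_windowed import _iter_windows  # assumes scripts/ is on sys.path

tz = ZoneInfo("Asia/Taipei")
start = datetime(2025, 7, 1, tzinfo=tz)
end = datetime(2025, 9, 9, tzinfo=tz)   # 70 days after start
window = timedelta(days=30)             # the enforced minimum window size

for w_start, w_end in _iter_windows(start, end, window):
    print(w_start.date(), "->", w_end.date())
# 2025-07-01 -> 2025-07-31
# 2025-07-31 -> 2025-08-30
# 2025-08-30 -> 2025-09-09  (truncated)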
267  etl_billiards/scripts/test_db_performance.py  Normal file
@@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
"""PostgreSQL connection performance test (ASCII-only output)."""
from __future__ import annotations

import argparse
import math
import os
import statistics
import sys
import time
from typing import Dict, Iterable, List

from psycopg2.extensions import make_dsn, parse_dsn

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from database.connection import DatabaseConnection


def _load_env() -> Dict[str, str]:
    env: Dict[str, str] = {}
    try:
        from config.env_parser import _load_dotenv_values
    except Exception:
        _load_dotenv_values = None
    if _load_dotenv_values:
        try:
            env.update(_load_dotenv_values())
        except Exception:
            pass
    env.update(os.environ)
    return env


def _apply_dsn_overrides(dsn: str, host: str | None, port: int | None) -> str:
    overrides = {}
    if host:
        overrides["host"] = host
    if port:
        overrides["port"] = str(port)
    if not overrides:
        return dsn
    return make_dsn(dsn, **overrides)


def _build_dsn_from_env(
    host: str,
    port: int,
    user: str | None,
    password: str | None,
    dbname: str | None,
) -> str | None:
    if not user or not dbname:
        return None
    params = {
        "host": host,
        "port": str(port),
        "user": user,
        "dbname": dbname,
    }
    if password:
        params["password"] = password
    return make_dsn("", **params)


def _safe_dsn_summary(dsn: str, host: str | None, port: int | None) -> str:
    try:
        info = parse_dsn(dsn)
    except Exception:
        info = {}
    if host:
        info["host"] = host
    if port:
        info["port"] = str(port)
    info.pop("password", None)
    if not info:
        return "dsn=(hidden)"
    items = " ".join(f"{k}={info[k]}" for k in sorted(info.keys()))
    return items


def _percentile(values: List[float], pct: float) -> float:
    if not values:
        return 0.0
    ordered = sorted(values)
    if len(ordered) == 1:
        return ordered[0]
    rank = (len(ordered) - 1) * (pct / 100.0)
    low = int(math.floor(rank))
    high = int(math.ceil(rank))
    if low == high:
        return ordered[low]
    return ordered[low] + (ordered[high] - ordered[low]) * (rank - low)


def _format_stats(label: str, values: Iterable[float]) -> str:
    data = list(values)
    if not data:
        return f"{label}: no samples"
    avg = statistics.mean(data)
    stdev = statistics.stdev(data) if len(data) > 1 else 0.0
    return (
        f"{label}: count={len(data)} "
        f"min={min(data):.2f}ms avg={avg:.2f}ms "
        f"p50={_percentile(data, 50):.2f}ms "
        f"p95={_percentile(data, 95):.2f}ms "
        f"max={max(data):.2f}ms stdev={stdev:.2f}ms"
    )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="PostgreSQL connection performance test")
    parser.add_argument("--dsn", help="Override PG_DSN/TEST_DB_DSN/.env value")
    parser.add_argument(
        "--host",
        default="100.64.0.4",
        help="Override host in DSN (default: 100.64.0.4)",
    )
    parser.add_argument("--port", type=int, help="Override port in DSN")
    parser.add_argument("--user", help="User when building DSN from PG_* env")
    parser.add_argument("--password", help="Password when building DSN from PG_* env")
    parser.add_argument("--dbname", help="Database name when building DSN from PG_* env")
    parser.add_argument("--rounds", type=int, default=20, help="Measured connection rounds")
    parser.add_argument("--warmup", type=int, default=2, help="Warmup rounds (not recorded)")
    parser.add_argument("--query", default="SELECT 1", help="SQL to run after connect")
    parser.add_argument(
        "--query-repeat",
        type=int,
        default=1,
        help="Query repetitions per connection (0 to skip)",
    )
    parser.add_argument(
        "--connect-timeout",
        type=int,
        default=10,
        help="connect_timeout seconds (capped at 20, default: 10)",
    )
    parser.add_argument(
        "--statement-timeout-ms",
        type=int,
        help="Optional statement_timeout applied per connection",
    )
    parser.add_argument(
        "--sleep-ms",
        type=int,
        default=0,
        help="Sleep between rounds in milliseconds",
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        help="Continue even if a round fails",
    )
    parser.add_argument("--verbose", action="store_true", help="Print per-round timings")
    return parser.parse_args()


def _run_round(
    dsn: str,
    timeout: int,
    query: str,
    query_repeat: int,
    session: Dict[str, int] | None,
) -> tuple[float, List[float]]:
    start = time.perf_counter()
    conn = DatabaseConnection(dsn, connect_timeout=timeout, session=session)
    connect_ms = (time.perf_counter() - start) * 1000.0
    query_times: List[float] = []
    try:
        for _ in range(query_repeat):
            q_start = time.perf_counter()
            conn.query(query)
            query_times.append((time.perf_counter() - q_start) * 1000.0)
        return connect_ms, query_times
    finally:
        try:
            conn.rollback()
        except Exception:
            pass
        conn.close()


def main() -> int:
    args = parse_args()
    if args.rounds < 0 or args.warmup < 0 or args.query_repeat < 0:
        print("rounds/warmup/query-repeat must be >= 0", file=sys.stderr)
        return 2
    env = _load_env()

    dsn = args.dsn or env.get("PG_DSN") or env.get("TEST_DB_DSN")
    host = args.host
    port = args.port

    if not dsn:
        user = args.user or env.get("PG_USER")
        password = args.password if args.password is not None else env.get("PG_PASSWORD")
        dbname = args.dbname or env.get("PG_NAME")
        try:
            resolved_port = port or int(env.get("PG_PORT", "5432"))
        except ValueError:
            resolved_port = port or 5432
        dsn = _build_dsn_from_env(host, resolved_port, user, password, dbname)
    if not dsn:
        print(
            "Missing DSN. Provide --dsn or set PG_DSN/TEST_DB_DSN, or PG_USER + PG_NAME.",
            file=sys.stderr,
        )
        return 2
    dsn = _apply_dsn_overrides(dsn, host, port)

    timeout = max(1, min(int(args.connect_timeout), 20))
    session = None
    if args.statement_timeout_ms is not None:
        session = {"statement_timeout_ms": int(args.statement_timeout_ms)}

    print("Target:", _safe_dsn_summary(dsn, host, port))
    print(
        f"Rounds: {args.rounds} (warmup {args.warmup}), "
        f"query_repeat={args.query_repeat}, timeout={timeout}s"
    )
    if args.query_repeat > 0:
        print("Query:", args.query)

    connect_times: List[float] = []
    query_times: List[float] = []
    failures: List[str] = []

    total = args.warmup + args.rounds
    for idx in range(total):
        is_warmup = idx < args.warmup
        try:
            c_ms, q_times = _run_round(
                dsn, timeout, args.query, args.query_repeat, session
            )
            if not is_warmup:
                connect_times.append(c_ms)
                query_times.extend(q_times)
            if args.verbose:
                tag = "warmup" if is_warmup else "sample"
                q_msg = ""
                if args.query_repeat > 0:
                    q_avg = statistics.mean(q_times) if q_times else 0.0
                    q_msg = f", query_avg={q_avg:.2f}ms"
                print(f"[{tag} {idx + 1}/{total}] connect={c_ms:.2f}ms{q_msg}")
        except Exception as exc:
            msg = f"round {idx + 1}: {exc}"
            failures.append(msg)
            print("Failure:", msg, file=sys.stderr)
            if not args.continue_on_error:
                break
        if args.sleep_ms > 0:
            time.sleep(args.sleep_ms / 1000.0)

    if connect_times:
        print(_format_stats("Connect", connect_times))
        if args.query_repeat > 0:
            print(_format_stats("Query", query_times))
    if failures:
        print(f"Failures: {len(failures)}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
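A quick worked check of the linear-interpolation percentile used above; the sample values are invented, and the import path assumes the script's directory is on sys.path.

# With five ordered samples [10, 11, 12, 13, 30]:
#   p95 rank = (5 - 1) * 0.95 = 3.8 -> 13.0 + 0.8 * (30.0 - 13.0) = 26.6
#   p50 rank = (5 - 1) * 0.50 = 2.0 -> ordered[2] = 12.0
from test_db_performance import _percentile  # assumes scripts/ is on sys.path

samples = [10.0, 12.0, 11.0, 30.0, 13.0]
print(_percentile(samples, 95))  # 26.6
print(_percentile(samples, 50))  # 12.0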