# -*- coding: utf-8 -*- """ Property 7: Core schema 最小字段集 Validates: Requirements 7.5 对于任意 core schema 中的表,其字段数量应严格少于对应 dwd schema 中同名(或对应)表的字段数量。 使用 hypothesis 从 core 表列表中随机选取,验证 core 表字段数 < 对应 dwd 表字段数。 """ import re import os from pathlib import Path import pytest from hypothesis import given, settings, assume from hypothesis.strategies import sampled_from # --------------------------------------------------------------------------- # SQL 解析工具 # --------------------------------------------------------------------------- def _parse_tables(sql_text: str) -> dict[str, int]: """从 SQL 文本中提取每个 CREATE TABLE 的表名和字段数量。 只统计显式声明的列(不含 CONSTRAINT / PRIMARY KEY / CHECK 等行)。 """ tables: dict[str, int] = {} # 匹配 CREATE TABLE ... ( ... ); 允许 IF NOT EXISTS pattern = re.compile( r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)\s*\((.*?)\);", re.DOTALL | re.IGNORECASE, ) for match in pattern.finditer(sql_text): table_name = match.group(1).lower() body = match.group(2) # 按逗号拆分,过滤掉约束行 col_count = 0 for line in body.split(","): line = line.strip() if not line: continue # 跳过约束 / 索引 / 空行 upper = line.upper().lstrip() if upper.startswith(("PRIMARY KEY", "UNIQUE", "CHECK", "CONSTRAINT", "EXCLUDE", "FOREIGN KEY", "INDEX")): continue # 剩余视为列定义 col_count += 1 tables[table_name] = col_count return tables # --------------------------------------------------------------------------- # 加载 SQL 文件并建立映射 # --------------------------------------------------------------------------- _CORE_SQL = Path(r"C:\NeoZQYY\db\etl_feiqiu\schemas\core.sql") _DWD_SQL = Path(r"C:\NeoZQYY\db\etl_feiqiu\schemas\dwd.sql") _core_tables = _parse_tables(_CORE_SQL.read_text(encoding="utf-8")) _dwd_tables = _parse_tables(_DWD_SQL.read_text(encoding="utf-8")) # core → dwd 映射(手动定义,因为命名规则不完全一致) # 维度表:core 与 dwd 同名 # 事实表:core.fact_settlement → dwd.dwd_settlement_head # core.fact_payment → dwd.dwd_payment _CORE_TO_DWD_MAP: dict[str, str] = { "dim_site": "dim_site", "dim_member": "dim_member", "dim_assistant": "dim_assistant", "dim_table": "dim_table", "dim_goods_category": "dim_goods_category", "fact_settlement": "dwd_settlement_head", "fact_payment": "dwd_payment", } # 预检:确保映射中的表在两侧 SQL 中都存在 _valid_pairs: list[tuple[str, str, int, int]] = [] for core_name, dwd_name in _CORE_TO_DWD_MAP.items(): if core_name in _core_tables and dwd_name in _dwd_tables: _valid_pairs.append( (core_name, dwd_name, _core_tables[core_name], _dwd_tables[dwd_name]) ) # 确保至少有可测试的映射对 assert len(_valid_pairs) > 0, ( f"未找到有效的 core→dwd 映射对。" f" core 表: {list(_core_tables.keys())}," f" dwd 表: {list(_dwd_tables.keys())}" ) # --------------------------------------------------------------------------- # 属性测试 # --------------------------------------------------------------------------- @settings(max_examples=100) @given(pair=sampled_from(_valid_pairs)) def test_core_table_has_fewer_fields_than_dwd(pair): """**Validates: Requirements 7.5** 对于任意 core schema 中的表,其字段数量应严格少于 对应 dwd schema 中同名(或对应)表的字段数量。 """ core_name, dwd_name, core_count, dwd_count = pair assert core_count < dwd_count, ( f"core.{core_name} 有 {core_count} 个字段," f"但 dwd.{dwd_name} 只有 {dwd_count} 个字段。" f" 期望 core 字段数严格少于 dwd。" )