# feiqiu-ETL/hebing.py

import sys
from pathlib import Path


# Separator lines used in the merged output; their lengths are kept identical
# to the reference example. The trailing comments show the rendered strings.
# HEADER_LINE_SHORT frames the file name at the top of a section,
# HEADER_LINE_LONG frames the file name at the bottom of a section, and
# MIDDLE_LINE is written just before the sample-data (JSON) part.
HEADER_LINE_SHORT = "=" * 22        # ======================
HEADER_LINE_LONG = "=" * 26         # ==========================
MIDDLE_LINE = "-" * 31              # -------------------------------


def _format_section(name: str, md_content: str, json_content: str) -> str:
    """Render one merged section as a single string.

    Layout (each item ends with a newline):

        ======================
        <name>
        ======================
        <Markdown content>
        <blank line>
        -------------------------------
        示例数据：
        <JSON content>
        <blank line>
        ==========================
        <name>
        ==========================

    Trailing whitespace of both contents is stripped so that each body ends
    with exactly one newline regardless of how the source file ended.
    """
    return "".join(
        (
            # Top header (short "=" line).
            f"{HEADER_LINE_SHORT}\n",
            f"{name}\n",
            f"{HEADER_LINE_SHORT}\n",
            md_content.rstrip() + "\n",
            "\n",
            f"{MIDDLE_LINE}\n",
            "示例数据：\n",
            json_content.rstrip() + "\n",
            # Bottom footer (long "=" line) repeating the same file name.
            "\n",
            f"{HEADER_LINE_LONG}\n",
            f"{name}\n",
            f"{HEADER_LINE_LONG}\n",
        )
    )


def merge_md_and_json(directory: str, output_file: str = "merged_output.txt") -> None:
    """Merge matching ``.md`` and ``.json`` files of a directory into one file.

    Rules:
    - every ``.md`` file directly inside ``directory`` is considered, in
      file-name order;
    - the key of an ``.md`` file is the part of its stem (file name without
      extension) that precedes the first space, or the whole stem when it
      contains no space;
    - the key must exactly equal the stem of a ``.json`` file in the same
      directory;
    - ``.md`` files without a matching ``.json`` file are dropped.

    The output file itself is never treated as an input: if ``output_file``
    ends in ``.md`` or ``.json`` and already exists (e.g. from an earlier
    run), it is excluded, because it is truncated when opened for writing and
    would otherwise contribute an empty section.

    Args:
        directory: Directory to scan (not recursive).
        output_file: Name of the output file, joined onto ``directory``.

    Raises:
        NotADirectoryError: If ``directory`` is not an existing directory.
    """
    base_dir = Path(directory).resolve()
    if not base_dir.is_dir():
        raise NotADirectoryError(f"指定路径不是目录: {base_dir}")

    output_path = base_dir / output_file
    resolved_output = output_path.resolve()

    def is_input(path: Path) -> bool:
        # Anything that resolves to the output file must not be read back in.
        return path.resolve() != resolved_output

    # Map each JSON file's stem to its path for exact key lookup.
    json_map = {p.stem: p for p in base_dir.glob("*.json") if is_input(p)}

    # Sort by file name so the output order is stable across runs.
    md_files = sorted(
        (p for p in base_dir.glob("*.md") if is_input(p)),
        key=lambda p: p.name,
    )

    with output_path.open("w", encoding="utf-8") as out_f:
        wrote_section = False

        for md_path in md_files:
            # Key = stem up to (not including) the first space.
            key = md_path.stem.split(" ", 1)[0]

            json_path = json_map.get(key)
            if json_path is None:
                # No matching JSON file: drop this Markdown file.
                continue

            md_content = md_path.read_text(encoding="utf-8")
            json_content = json_path.read_text(encoding="utf-8")

            # Separate consecutive sections with one blank line.
            if wrote_section:
                out_f.write("\n")
            wrote_section = True

            out_f.write(_format_section(md_path.name, md_content, json_content))

    print(f"合并完成，输出文件：{output_path}")


if __name__ == "__main__":
    # Usage examples:
    #     python merge_md_json.py /path/to/dir
    #     python merge_md_json.py /path/to/dir result.txt
    cli_args = sys.argv[1:]
    if not cli_args:
        # The directory argument is mandatory; show usage and exit non-zero.
        print("用法: python merge_md_json.py <目录路径> [输出文件名]")
        sys.exit(1)

    input_dir = cli_args[0]
    # The output file name is optional and falls back to the default name.
    output_name = cli_args[1] if len(cli_args) > 1 else "merged_output.txt"

    merge_md_and_json(input_dir, output_name)