# HomeTrans/three_reference_tables/extract_table.py-代码预览-HomeTrans:基于 Claude Code 技能的 Android 转 HarmonyOS 应用转换项目 - AtomGit

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract tables from markdown files and save as JSON
"""
import os
import re
import json
import glob

# A delimiter row such as `|---|:---:|` or `| --- | ---: |`: every cell holds
# one or more dashes, optionally flanked by alignment colons and whitespace.
_SEPARATOR_ROW_RE = re.compile(r'^\|(?:\s*:?-+:?\s*\|)+$')


def _split_table_row(line):
    """Split one stripped markdown table row into a list of trimmed cells."""
    cells = line.split('|')
    # A leading/trailing pipe produces an empty first/last piece that is not
    # a real cell; drop only those, so rows without edge pipes keep all cells.
    if cells and cells[0] == '':
        cells = cells[1:]
    if cells and cells[-1] == '':
        cells = cells[:-1]
    return [cell.strip() for cell in cells]


def extract_tables_from_markdown(md_content):
    """
    Extract all pipe tables from markdown content.

    A table is recognised as a header line containing '|' (and not starting
    with '#' or a code-fence marker) that is immediately followed by a
    delimiter row such as `|---|---|`. The delimiter row must start and end
    with '|'; an ordinary pipe row is never accepted in its place, so a data
    row can no longer be silently consumed as the "separator".

    Data rows are the following lines that start and end with '|'. Further
    delimiter rows inside the table body are skipped. A blank line or any
    other non-row line ends the table. Tables with a header but no data rows
    are discarded. Cells are split on every '|', so escaped pipes inside a
    cell are not recognised.

    Args:
        md_content: The markdown text to scan.

    Returns:
        A list of tables; each table is a list of rows (header first), and
        each row is a list of stripped cell strings.
    """
    tables = []
    lines = md_content.split('\n')

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        is_header_candidate = '|' in line and not line.startswith(('#', '```'))
        has_separator = (
            is_header_candidate
            and i + 1 < len(lines)
            and _SEPARATOR_ROW_RE.match(lines[i + 1].strip()) is not None
        )
        if not has_separator:
            i += 1
            continue

        table = [_split_table_row(line)]

        # Data rows start right after the delimiter row.
        j = i + 2
        while j < len(lines):
            data_line = lines[j].strip()
            if data_line == '':
                # A blank line ends the table; step past it.
                j += 1
                break
            if not (data_line.startswith('|') and data_line.endswith('|')):
                # Not a table row: leave j here so the outer loop re-examines
                # this line as a possible header of the next table.
                break
            if not _SEPARATOR_ROW_RE.match(data_line):
                data_row = _split_table_row(data_line)
                if data_row:  # a lone '|' yields no cells
                    table.append(data_row)
            j += 1

        if len(table) > 1:  # header plus at least one data row
            tables.append(table)

        i = j

    return tables

def table_to_dict(table):
    """
    Turn a table (header row followed by data rows) into a list of dicts.

    Each data row becomes one dict keyed by the header cells. Cells missing
    from a short row are filled with '', cells beyond the header length are
    ignored, and empty rows are skipped. A table with fewer than two rows
    yields an empty list.
    """
    if len(table) < 2:
        return []

    column_names, *body_rows = table

    records = []
    for cells in body_rows:
        if not cells:
            continue
        width = len(cells)
        records.append({
            name: (cells[pos] if pos < width else '')
            for pos, name in enumerate(column_names)
        })
    return records

def process_markdown_file(md_path, output_dir=None):
    """
    Extract every table from one markdown file and write them to a JSON file.

    The JSON file is named after the markdown file with a '.json' extension.
    It is written into output_dir (created if missing) when that is given,
    otherwise next to the markdown file. When the file contains no tables a
    message is printed and nothing is written.
    """
    with open(md_path, 'r', encoding='utf-8') as source:
        md_content = source.read()

    tables = extract_tables_from_markdown(md_content)
    if not tables:
        print(f"No tables found in: {md_path}")
        return

    # Resolve where the JSON goes.
    if output_dir:
        base_name = os.path.splitext(os.path.basename(md_path))[0]
        os.makedirs(output_dir, exist_ok=True)
        json_path = os.path.join(output_dir, base_name + '.json')
    else:
        json_path = os.path.splitext(md_path)[0] + '.json'

    # One entry per table: 1-based index, dict rows, and the raw cell grid.
    json_data = [
        {
            'table_index': number,
            'rows': table_to_dict(table),
            'raw_table': table,
        }
        for number, table in enumerate(tables, start=1)
    ]

    with open(json_path, 'w', encoding='utf-8') as sink:
        json.dump(json_data, sink, ensure_ascii=False, indent=2)

    print(f"Extracted {len(tables)} tables from {os.path.basename(md_path)} -> {os.path.basename(json_path)}")
    for number, table in enumerate(tables, start=1):
        print(f"  Table {number}: {len(table)} rows, {len(table[0])} columns")

def main():
    """Convert every .md file located next to this script to JSON."""
    script_dir = os.path.dirname(__file__)
    md_pattern = os.path.join(script_dir, '*.md')
    md_files = glob.glob(md_pattern)

    print(f"Found {len(md_files)} markdown files")

    for path in md_files:
        process_markdown_file(path)

    print("\nDone!")

# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()