"""
Extract tables from markdown files and save as JSON
"""
import os
import re
import json
import glob
# Header delimiter row with one or more columns, e.g. ``|---|``, ``| :-- | --: |``.
# Every column must contain at least one dash and the row must carry outer pipes.
_SEPARATOR_ROW = re.compile(r'^\|\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|$')


def _is_pipe_row(line):
    """Return True if the stripped *line* starts and ends with a pipe."""
    return line.startswith('|') and line.endswith('|')


def _is_separator_row(line):
    """Return True if the stripped *line* is a header delimiter row."""
    return _SEPARATOR_ROW.match(line) is not None


def _split_row(line):
    """Split a stripped table line into stripped cell strings.

    Only the empty fragments produced by an actual leading/trailing pipe are
    discarded, so a line written without outer pipes keeps all of its cells.
    """
    cells = line.split('|')
    if line.startswith('|'):
        cells = cells[1:]
    if line.endswith('|'):
        cells = cells[:-1]
    return [cell.strip() for cell in cells]


def extract_tables_from_markdown(md_content):
    """
    Extract all pipe tables from markdown content.

    A table starts at a line that contains ``|`` (and is not a heading or a
    code-fence line) when either

    * the next line is a delimiter row such as ``|---|:--:|``, or
    * both that line and the next one are pipe-delimited rows (a table written
      without a delimiter row).

    The delimiter row is skipped; a second line that is ordinary data is kept
    as the first data row instead of being thrown away. Body rows must start
    and end with ``|``; delimiter rows inside the body are skipped. A blank
    line or any other line ends the table. Tables with no data rows are not
    returned.

    Args:
        md_content: Markdown text as a single string.

    Returns:
        A list of tables. Each table is a list of rows, the first row being
        the header, and each row is a list of stripped cell strings.
    """
    tables = []
    lines = md_content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # A header needs a pipe, must not be a heading/fence, and needs a following line.
        if '|' not in line or line.startswith(('#', '```')) or i + 1 >= len(lines):
            i += 1
            continue

        next_line = lines[i + 1].strip()
        has_separator = _is_separator_row(next_line)
        # Without a real delimiter row, only accept two consecutive pipe rows;
        # this keeps a prose line that merely contains '|' from hijacking the
        # header of the table that follows it.
        if not has_separator and not (_is_pipe_row(line) and _is_pipe_row(next_line)):
            i += 1
            continue

        table = [_split_row(line)]
        # Skip the delimiter row if there is one; otherwise line i + 1 is data.
        j = i + 2 if has_separator else i + 1
        while j < len(lines):
            data_line = lines[j].strip()
            if data_line == '':
                # The blank line terminates the table and is consumed with it.
                j += 1
                break
            if not _is_pipe_row(data_line):
                break
            if not _is_separator_row(data_line):
                data_row = _split_row(data_line)
                if data_row:
                    table.append(data_row)
            j += 1

        if len(table) > 1:
            tables.append(table)
        # Resume after the lines already examined (j is always > i).
        i = j
    return tables
def table_to_dict(table):
    """
    Turn a table (header row followed by data rows) into a list of dicts.

    Each data row becomes one dict keyed by the header cells. Missing trailing
    cells are filled with '' and cells beyond the header width are ignored.
    Empty rows are skipped; a table with fewer than two rows yields [].
    """
    if len(table) < 2:
        return []

    column_names = table[0]
    records = []
    for cells in table[1:]:
        if len(cells) == 0:
            continue
        width = len(cells)
        records.append({
            name: (cells[pos] if pos < width else '')
            for pos, name in enumerate(column_names)
        })
    return records
def process_markdown_file(md_path, output_dir=None):
    """
    Read one markdown file, extract its tables and write them to a JSON file.

    The JSON file is named after the markdown file. It is written into
    output_dir (created if needed) when that is given, otherwise next to the
    markdown file. Nothing is written when the file contains no tables.
    Progress is reported on stdout.
    """
    with open(md_path, 'r', encoding='utf-8') as source:
        markdown_text = source.read()

    found = extract_tables_from_markdown(markdown_text)
    if not found:
        print(f"No tables found in: {md_path}")
        return

    if output_dir:
        stem = os.path.splitext(os.path.basename(md_path))[0]
        os.makedirs(output_dir, exist_ok=True)
        json_path = os.path.join(output_dir, stem + '.json')
    else:
        json_path = os.path.splitext(md_path)[0] + '.json'

    # One entry per table: 1-based index, header-keyed rows, and the raw cells.
    payload = [
        {
            'table_index': number,
            'rows': table_to_dict(cells),
            'raw_table': cells,
        }
        for number, cells in enumerate(found, start=1)
    ]

    with open(json_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)

    print(f"Extracted {len(found)} tables from {os.path.basename(md_path)} -> {os.path.basename(json_path)}")
    for number, cells in enumerate(found, start=1):
        print(f" Table {number}: {len(cells)} rows, {len(cells[0])} columns")
def main():
    """Process every ``*.md`` file located in the same directory as this script."""
    script_dir = os.path.dirname(__file__)
    pattern = os.path.join(script_dir, '*.md')
    markdown_paths = glob.glob(pattern)
    print(f"Found {len(markdown_paths)} markdown files")

    for markdown_path in markdown_paths:
        process_markdown_file(markdown_path)

    print("\nDone!")
# Run the extraction only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()