import datetime
import json
import ssl
import urllib.request
from rdflib import Graph
# SSL context built with Python's default settings; passed to urlopen when the
# dump is downloaded further down in this script.
ctx = ssl.create_default_context()
# Single in-memory RDF graph: the downloaded dump is parsed into it and it is
# then queried once per output language by process_data.
graph = Graph()
# Captured once at start-up; process_data copies its year/month/day into every
# result it returns, so all generated files carry the same date.
date = datetime.datetime.now()
def preprocess_rdf_data(raw_data):
    """Replace known-invalid placeholder timestamps in raw RDF text.

    Every occurrence of ``3190000-01-01T00:00:00Z``, with or without a leading
    minus sign, is rewritten to ``0001-01-01T00:00:00Z``. All other text is
    returned untouched.

    Args:
        raw_data: The serialized RDF document as a single string.

    Returns:
        The same text with the invalid timestamps substituted.
    """
    replacement = "0001-01-01T00:00:00Z"
    # Order matters: the positive form is a substring of the negative one. If
    # it were replaced first, the negative form would never match and a stray
    # "-" would be left in front of the substituted timestamp.
    invalid_date_patterns = (
        "-3190000-01-01T00:00:00Z",
        "3190000-01-01T00:00:00Z",
    )
    for pattern in invalid_date_patterns:
        raw_data = raw_data.replace(pattern, replacement)
    return raw_data
def process_data(graph, lang, specials):
    """Collect name prefixes and suffixes for one language from the graph.

    Runs a SPARQL query selecting ``?translation`` / ``?suffix`` pairs whose
    translation carries the language tag ``lang`` and splits each translation
    into a prefix and, for ordinary names, a one-character suffix.

    NOTE(review): the query uses the ``wdt:``, ``wd:``, ``p:`` and ``pq:``
    prefixes without declaring them, so they presumably have to be bound on
    ``graph`` already (e.g. by the parsed dump) -- confirm.

    Args:
        graph: Object exposing ``query(sparql)`` that yields rows with
            ``translation`` and ``suffix`` attributes, each having ``.value``.
        lang: Language tag interpolated into the query's FILTER clause.
        specials: Mapping from a full translation to an explicit
            ``(prefix, suffix)`` pair, overriding the automatic split.

    Returns:
        A dict with the keys ``language``, ``date`` (year/month/day taken from
        the module-level ``date``), ``prefixes``, ``suffixes``,
        ``apprentices`` and ``kits``.
    """
    sparql = f'''SELECT ?translation ?suffix WHERE {{
?cat wdt:P3 wd:Q622 ;
p:P84 ?name .
?name pq:P85 ?translation ;
pq:P111 ?suffix .
FILTER (lang(?translation) = "{lang}")
}}'''

    prefixes, suffixes = [], []
    apprentices = kits = 0

    for row in graph.query(sparql):
        name = row.translation.value

        # Hand-curated splits take precedence over every automatic rule.
        if name in specials:
            special_prefix, special_suffix = specials[name]
            prefixes.append(special_prefix)
            suffixes.append(special_suffix)
            continue

        rank = row.suffix.value

        if rank == 'paw':
            # Only the prefix is kept; the row is tallied as an apprentice.
            prefixes.append(name[:-1])
            apprentices += 1
            continue

        if rank == 'kit':
            # The marker character sits at the end when it is '崽'; otherwise
            # the first character is dropped instead.
            prefixes.append(name[:-1] if name[-1] == '崽' else name[1:])
            kits += 1
            continue

        # Ordinary name: the last character is the suffix, the rest the prefix.
        prefixes.append(name[:-1])
        suffixes.append(name[-1])

    stamp = {'year': date.year, 'month': date.month, 'day': date.day}
    return {
        'language': lang,
        'date': stamp,
        'prefixes': prefixes,
        'suffixes': suffixes,
        'apprentices': apprentices,
        'kits': kits,
    }
import os

# Download the Turtle dump over HTTPS and decode it as UTF-8 text.
url = 'https://raw.gitcode.com/ryj/dummp/raw/master/wbdump.ttl'
with urllib.request.urlopen(url, context=ctx) as dump:
    raw_data = dump.read().decode('utf-8')

# Substitute the known-invalid timestamps before the text reaches the parser.
preprocessed_data = preprocess_rdf_data(raw_data)
graph.parse(data=preprocessed_data, format='turtle')

# Per-language overrides for names that do not follow the automatic split.
specials_CN = {'桦树皮': ('桦', '树皮')}
specials_TW = {'樺樹皮': ('樺', '樹皮')}

# All three languages are extracted before anything is written to disk.
data_en = process_data(graph, 'en', {})
data_CN = process_data(graph, 'zh-cn', specials_CN)
data_TW = process_data(graph, 'zh-tw', specials_TW)

output_dir = 'data/'
os.makedirs(output_dir, exist_ok=True)

# Each result is written as a JavaScript file assigning the JSON to `data`.
outputs = (
    ('en.js', data_en),
    ('zh-cn.js', data_CN),
    ('zh-tw.js', data_TW),
)
for file_name, payload in outputs:
    with open(f'{output_dir}{file_name}', 'w', encoding='utf-8') as fw:
        fw.write('const data = ' + json.dumps(payload, ensure_ascii=False))

print("数据已成功处理并保存!")