import xml.sax
from xml.sax.handler import *
import gzip
import struct, sys, string, codecs,os
def text(nodes):
    """Return the concatenated character data of the text nodes in *nodes*.

    Only items whose ``nodeName`` equals ``"#text"`` contribute; their
    ``data`` attributes are joined in iteration order.  Other items are
    skipped without touching their ``data`` attribute, so they need not
    have one.

    Args:
        nodes: iterable of objects exposing ``nodeName`` (and ``data`` for
            text nodes).

    Returns:
        The joined text, or ``""`` when there are no text nodes.
    """
    # A single join avoids the quadratic cost of repeated ``+=`` on strings.
    return "".join(node.data for node in nodes if node.nodeName == "#text")
def strcasecmp(a, b):
    """Compare two sequences by their lower-cased first elements.

    Args:
        a: sequence whose first element is a string (e.g. a ``(key, value)``
            pair).
        b: sequence of the same shape as *a*.

    Returns:
        ``-1``, ``0`` or ``1`` when ``a[0].lower()`` is respectively less
        than, equal to, or greater than ``b[0].lower()``.
    """
    left = a[0].lower()
    right = b[0].lower()
    # Equivalent to the removed Python 2 builtin ``cmp(left, right)``, but
    # also valid on Python 3.
    return (left > right) - (left < right)
def merge_dup(list):
    """Collapse runs of adjacent items that share the same key.

    Each item is a ``(key, value)`` pair.  When an item's key equals the key
    of the item emitted just before it, its value is appended to that item's
    value, separated by a newline.  Only *adjacent* duplicates are merged and
    keys are compared exactly (case-sensitively), so the input is expected to
    be ordered with equal keys next to each other.

    Args:
        list: sequence of ``(key, value)`` pairs with string values.  (The
            name shadows the builtin; it is kept so existing callers are not
            affected.)

    Returns:
        A new list; the input sequence itself is not modified.  Merged
        entries are tuples.
    """
    merged = []
    for item in list:
        # Compare against the last emitted item rather than a "" sentinel:
        # the sentinel made a leading empty-string key index an empty list.
        if merged and merged[-1][0] == item[0]:
            previous_key, previous_value = merged[-1][0], merged[-1][1]
            merged[-1] = (previous_key, previous_value + "\n" + item[1])
        else:
            merged.append(item)
    return merged
class JMDictHandler(ContentHandler):
    """SAX content handler that collects text from ``<entry>`` elements.

    For every ``<entry>`` the handler gathers three lists of strings:

    * ``kanji`` -- text of each ``<keb>`` element,
    * ``chars`` -- text of each ``<reb>`` element,
    * ``gloss`` -- text of each attribute-less, non-empty ``<gloss>`` element
      and of each ``<xref>`` element.

    When an entry closes, the tuple ``(kanji, chars, gloss)`` is appended to
    ``mapping``.
    """

    # Elements whose character data is accumulated into ``buffer``.
    _TEXT_ELEMENTS = ("keb", "reb", "gloss", "xref")

    def __init__(self):
        # Explicit base call (rather than super()) also works where the base
        # is an old-style class.
        ContentHandler.__init__(self)
        self.mapping = []
        # Initialised here as well as on every <entry> start, so that a
        # captured element appearing outside an entry cannot raise
        # AttributeError.
        self.kanji = []
        self.chars = []
        self.gloss = []
        self.state = ""
        self.buffer = ""

    def startElement(self, name, attrs):
        """Start a new entry or mark which element's text is being captured."""
        if name == "entry":
            # Fresh lists per entry: the previous ones are referenced by the
            # tuple already stored in ``mapping``.
            self.kanji = []
            self.chars = []
            self.gloss = []
            self.state = ""
            self.buffer = ""
        elif name in ("keb", "reb", "xref"):
            self.state = name
        elif name == "gloss" and not attrs:
            # <gloss> elements that carry any attribute are not captured.
            self.state = "gloss"

    def endElement(self, name):
        """Store the buffered text for *name*, then reset the capture state."""
        if name == "entry":
            self.mapping.append((self.kanji, self.chars, self.gloss))
        elif name == "keb":
            self.kanji.append(self.buffer)
        elif name == "reb":
            self.chars.append(self.buffer)
        elif name == "gloss" and self.buffer:
            self.gloss.append(self.buffer)
        elif name == "xref":
            # Unlike <gloss>, an <xref> is stored even when its text is empty.
            self.gloss.append(self.buffer)
        # Buffer and state are cleared after every closing tag, whatever the
        # element was.
        self.buffer = ""
        self.state = ""

    def characters(self, ch):
        """Accumulate character data while inside a captured element."""
        if self.state in self._TEXT_ELEMENTS:
            self.buffer = self.buffer + ch
def map_to_file(dictmap, filename):
    """Write *dictmap* to ``<filename>.dict`` and ``<filename>.idx``.

    The ``.idx`` file receives a textual header (which embeds *filename* as
    the ``bookname``), the line ``BEGIN:``, the number of entries as a
    big-endian unsigned 32-bit integer, and then for every entry: the UTF-8
    key, a NUL byte, the byte offset of the value within the ``.dict`` file
    and the value's byte length (both big-endian unsigned 32-bit).  The
    ``.dict`` file is the concatenation of the UTF-8 encoded values.

    Args:
        dictmap: sized sequence of ``(key, value)`` string pairs, written in
            the order given.
        filename: path prefix for both output files; also used verbatim as
            the ``bookname`` header field.
    """

    def to_bytes(value):
        # Encode once at the I/O boundary.  Already-encoded byte strings pass
        # through unchanged, so both text and bytes inputs are accepted.
        return value if isinstance(value, bytes) else value.encode("utf-8")

    # ``with`` guarantees both files are closed even if a write fails.
    with open(filename + ".dict", "wb") as dict_file, \
            open(filename + ".idx", "wb") as idx_file:
        idx_file.write(to_bytes("StarDict's idx file\nversion=2.1.0\n"))
        idx_file.write(to_bytes("bookname=" + filename + "\nauthor=Jim Breen\nemail=j.breen@csse.monash.edu.au\nwebsite=http://www.csse.monash.edu.au/~jwb/j_jmdict.html\ndescription=Convert to stardict by Alastair Tse <liquidx@gentoo.org>, http://www-lce.eng.cam.ac.uk/~acnt2/code/\ndate=2003.07.01\n"))
        idx_file.write(to_bytes("sametypesequence=m\n"))
        idx_file.write(to_bytes("BEGIN:\n"))
        idx_file.write(struct.pack("!I", len(dictmap)))

        offset = 0  # running byte position inside the .dict file
        for key, value in dictmap:
            key_bytes = to_bytes(key)
            value_bytes = to_bytes(value)
            idx_file.write(key_bytes + b"\0")
            idx_file.write(struct.pack("!I", offset))
            idx_file.write(struct.pack("!I", len(value_bytes)))
            offset += len(value_bytes)
            dict_file.write(value_bytes)
def main():
    """Convert ``JMdict.gz`` into two dictionary file pairs.

    Parses the gzip-compressed XML in the current directory, builds a
    Japanese-to-English and an English-to-Japanese list of ``(key, value)``
    pairs, sorts each case-insensitively by key, merges adjacent duplicate
    keys, and writes them out under the prefixes ``jmdict-ja-en`` and
    ``jmdict-en-ja``.
    """
    print("opening xml dict ..")
    # ``with`` closes the compressed stream even if parsing fails.
    with gzip.open("JMdict.gz") as f:
        print("parsing xml file ..")
        parser = xml.sax.make_parser()
        handler = JMDictHandler()
        parser.setContentHandler(handler)
        parser.parse(f)

    print("creating dictionary ..")
    jap_to_eng = []
    eng_to_jap = []
    for kanji, chars, gloss in handler.mapping:
        # Each kanji form maps to readings + glosses; each reading maps to
        # kanji forms + glosses.  The joined values are the same for every
        # key of the entry, so they are built once.
        kanji_value = "\n".join(chars + gloss)
        for k in kanji:
            jap_to_eng.append((k, kanji_value))
        reading_value = "\n".join(kanji + gloss)
        for c in chars:
            jap_to_eng.append((c, reading_value))
    for kanji, chars, gloss in handler.mapping:
        # Each gloss maps back to the entry's kanji forms + readings.
        gloss_value = "\n".join(kanji + chars)
        for g in gloss:
            eng_to_jap.append((g, gloss_value))

    print("sorting dictionary ..")

    def lowered_key(pair):
        # Same ordering as the cmp-style ``strcasecmp`` helper, expressed as
        # a key function so it works with Python 3's ``list.sort``.
        return pair[0].lower()

    jap_to_eng.sort(key=lowered_key)
    eng_to_jap.sort(key=lowered_key)

    print("merging and pruning dups..")
    jap_to_eng = merge_dup(jap_to_eng)
    eng_to_jap = merge_dup(eng_to_jap)

    print("writing to files..")
    map_to_file(jap_to_eng, "jmdict-ja-en")
    map_to_file(eng_to_jap, "jmdict-en-ja")


if __name__ == "__main__":
    main()