# Version string of this decoder; usage() prints it in the help banner.
VERSION = "0.4"
import getopt, sys
def usage():
    """Print the command-line help text to standard output.

    The text consists of the program banner (including ``VERSION``), the
    invocation syntax, the list of supported options and a reminder that the
    generated ``.ifo`` file has to be edited for HTML output in StarDict.
    """
    print("Lingea Dictionary Decoder")
    print("-------------------------")
    print("Version: %s" % VERSION)
    print("Copyright (C) 2007 - Klokan Petr Pridal, Petr Dlouhy")
    print()
    print("Usage: python lingea-trd-decoder.py DICTIONARY.trd > DICTIONARY.tab")
    print("Result conversion by stardict-tools: /usr/lib/stardict-tools/tabfile")
    print()
    print(" -o <num> --out-style : Output style")
    print(" 0 no tags")
    print(" 1 \\n tags")
    print(" 2 html tags")
    print(" -h --help : Print this message")
    # The four debug options were misspelled "Degub" in the help text.
    print(" -d --debug : Debug")
    print(" -r --debug-header : Debug - print headers")
    print(" -a --debug-all : Debug - print all records")
    print(" -l --debug-limit : Debug limit")
    print()
    print("For HTML support in StarDict dictionary .ifo has to contain:")
    print("sametypesequence=g")
    print("!!! Change the .ifo file after generation by tabfile !!!")
    print()
# Option specification for getopt: a trailing ':' (short form) or '='
# (long form) marks an option that takes a value.
_SHORT_OPTIONS = "hdo:ral:"
_LONG_OPTIONS = [
    "help",
    "debug",
    "out-style=",
    "debug-header",
    "debug-all",
    "debug-limit=",
]

try:
    opts, args = getopt.getopt(sys.argv[1:], _SHORT_OPTIONS, _LONG_OPTIONS)
except getopt.GetoptError:
    # Unknown option or missing option value: show the help and give up.
    usage()
    print("ERROR: Bad option")
    sys.exit(2)
import locale
# Defaults for the settings controlled by the command-line options.
DEBUG = False        # -d: print the byte-by-byte decoding trace
OUTSTYLE = 2         # -o: 0 = no tags, 1 = \n tags, 2 = html tags
DEBUGHEADER = False  # -r: print every decoded record in debug mode
DEBUGALL = False     # -a: keep the trace enabled for all records
DEBUGLIMIT = 1       # -l: how many failing records are traced in detail

for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit(0)
    elif o in ("-d", "--debug"):
        # getopt reports long options with two dashes, so the former
        # "-debug" spelling could never match.
        DEBUG = True
    elif o in ("-o", "--out-style"):
        try:
            OUTSTYLE = locale.atoi(a)
        except ValueError:
            OUTSTYLE = -1  # not a number: rejected by the range check below
        if not 0 <= OUTSTYLE <= 2:
            # Only styles 0..2 have a tag table; continuing would fail later
            # with an undefined ``tag``, so stop here.
            usage()
            print("ERROR: Output style not specified")
            sys.exit(2)
    elif o in ("-r", "--debug-header"):
        DEBUGHEADER = True
    elif o in ("-a", "--debug-all"):
        DEBUGALL = True
    elif o in ("-l", "--debug-limit"):
        try:
            DEBUGLIMIT = locale.atoi(a)
        except ValueError:
            usage()
            print("ERROR: Debug limit has to be a number")
            sys.exit(2)

# Exactly one positional argument is expected: the .trd file to decode.
if len(args) != 1:
    usage()
    print("ERROR: You have to specify .trd file to decode")
    sys.exit(2)
FILENAME = args[0]
from struct import *
import re
# Basic 6-bit alphabet: the list index is the code value (0..63).
# Codes without a known meaning get a generated '#AL<code>#' placeholder;
# the '#NAME#' entries are prefix markers interpreted by
# decode_alpha_postprocessing().
_ALPHA_NAMED = {
    0: '\x00',
    32: ' ', 33: '.', 34: '<', 35: '>', 36: ',', 37: ';', 38: '-',
    40: '#GRAVE#', 41: '#ACUTE#', 42: '#CIRC#', 43: '#TILDE#', 44: '#UML#',
    47: '#CARON#', 49: '#CEDIL#', 52: '#GREEK#', 59: '#SYMBOL#',
    61: '#UPCASE#', 62: '#SPECIAL#', 63: '#UNICODE#',
}
# Codes 1..26 are the lower-case letters a..z.
_ALPHA_NAMED.update(enumerate('abcdefghijklmnopqrstuvwxyz', 1))
alpha = [_ALPHA_NAMED.get(code, '#AL%d#' % code) for code in range(64)]
# Table selected by the '#UPCASE#' prefix: the list index is the code that
# follows the prefix.  Codes 32..57 are the capital letters A..Z; every other
# code keeps a generated '#UP<code>#' placeholder.
upcase = ['#UP%d#' % code for code in range(64)]
upcase[32:58] = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Pronunciation counterpart of ``upcase``: pronunciation_encode() replaces
# upcase[i] by upcase_pron[i].  Only the codes listed here have an IPA
# symbol; the rest get a generated '#pr<code>#' placeholder.
_PRON_GLYPHS = {
    32: 'ɑ', 34: 'ʧ', 35: 'ð', 36: 'ə', 37: 'ɜ', 39: 'æ',
    40: 'ɪ', 41: 'ɭ', 43: 'ŋ', 45: 'ɳ', 46: 'ɔ', 48: 'ɒ', 49: 'ɽ',
    50: 'ʃ', 51: 'θ', 52: 'ʊ', 53: 'ʌ', 57: 'ʒ',
}
upcase_pron = [_PRON_GLYPHS.get(code, '#pr%d#' % code) for code in range(64)]
# Table selected by the '#SYMBOL#' prefix: the list index is the code that
# follows the prefix.  Codes without a known glyph get a generated
# '#SY<code>#' placeholder.
_SYMBOL_GLYPHS = {
    4: '§', 15: '™', 18: '¢', 19: '£', 24: '©', 28: '®', 29: '°',
    31: '²', 32: '³', 36: '¹', 40: '½', 43: '×', 44: '÷',
}
symbol = [_SYMBOL_GLYPHS.get(code, '#SY%d#' % code) for code in range(64)]
# Table selected by the '#SPECIAL#' prefix: the list index is the code that
# follows the prefix.  Two runs of punctuation/digits are mapped (codes 1..11
# and 15..42); every other code gets a generated '#SP<code>#' placeholder.
_SPECIAL_GLYPHS = dict(enumerate('!"#$%&\'()*+', 1))
_SPECIAL_GLYPHS.update(enumerate('/0123456789:;<=>?@[\\]^_`{|}~', 15))
special = [_SPECIAL_GLYPHS.get(code, '#SP%d#' % code) for code in range(64)]
# Word-class labels indexed by the word-class byte of a record (0..31).
# Values without a known label get a generated '#<value>#' placeholder.
_WORDCLASS_LABELS = {
    1: 'n:', 2: 'adj:', 3: 'pron:', 5: 'v:', 6: 'adv:', 7: 'prep:',
    10: 'intr:', 11: 'phr:',
}
wordclass = tuple(_WORDCLASS_LABELS.get(code, '#%d#' % code) for code in range(32))
# Output markup per output style.  Each entry maps a two-letter key to an
# (opening, closing) pair that decode() puts around one decoded field.
_TAG_STYLES = {
    # Style 0: no tags.
    0: {
        'db': ('', ''),        # start of every data item
        'rn': ('', '\t'),      # header record name
        'va': ('', ' '),       # header variant
        'wc': ('(', ')'),      # word class
        'pa': ('', ' '),       # header parts
        'fo': ('(', ') '),     # header forms
        'on': ('(', ')'),      # header origin note
        'pr': ('[', ']'),      # pronunciation
        'dv': ('{', '} '),     # header data variant
        'sa': ('`', '`'),      # data sample
        'sw': ('', ''),        # data sample word class
        'do': ('`', '`'),      # data origin note
        'df': ('', ' '),       # data definition
        'ps': ('"', '" '),     # data phrase short form
        'pg': ('"', ' = '),    # phrase text (second phrase list)
        'pc': ('`', '`'),      # data phrase comment
        'p1': ('"', ' = '),    # data phrase 1
        'p2': ('', '" '),      # data phrase 2
        'sp': ('"', ' = '),    # data simple phrase
        'b1': ('"', ' = '),    # phrase block, phrase 1
        'b2': ('" ', ''),
    },
    # Style 1: literal "\n" line-break tags.
    1: {
        'db': ('•', ''),
        'rn': ('', '\t'),
        'va': ('', ' '),
        'wc': ('', '\\n'),
        'pa': ('', ':\\n'),
        'fo': ('(', ') '),
        'on': ('(', ')\\n'),
        'pr': ('[', ']\\n'),
        'dv': ('{', '} '),
        'sa': (' ', '\\n'),
        'sw': ('', ''),
        'do': (' ', ' '),
        'df': (' ', '\\n'),
        'ps': (' ', '\\n'),
        'pg': (' ', ' '),
        'pc': (' ', ' '),
        'p1': (' ', ' '),
        'p2': (' ', '\\n'),
        'sp': ('', '\\n'),
        'b1': ('"', ' = '),
        'b2': ('" ', ''),
    },
    # Style 2: html (<span>) tags.
    2: {
        'db': ('•', ''),
        'rn': ('', '\t'),
        'va': ('', ' '),
        'wc': ('<span size="larger" color="darkred" weight="bold">', '</span>\\n'),
        'pa': ('<span size="larger" color="darkred" weight="bold">', ':</span>\\n'),
        'fo': ('(', ') '),
        'on': ('<span color="blue">(', ')</span>\\n'),
        'pr': ('[', ']\\n'),
        'dv': ('{', '} '),
        'sa': (' <span color="darkred" weight="bold">', '</span>\\n'),
        'sw': ('', ''),
        'do': (' <span color="darkred" weight="bold">', '</span> '),
        'df': (' <span weight="bold">', '</span>\\n'),
        'ps': (' <span color="dimgray" weight="bold">', '</span>\\n'),
        'pg': (' <span color="darkgreen" style="italic">', '</span> '),
        'pc': (' <span color="darkgreen" style="italic">', '</span> '),
        'p1': (' <span color="dimgray" style="italic">', '</span> '),
        'p2': (' ', '\\n'),
        'sp': ('<span color="cyan">', '</span>\\n'),
        'b1': ('"', ' = '),
        'b2': ('" ', ''),
    },
}

# ``tag`` is only defined for the three known output styles.
if OUTSTYLE in _TAG_STYLES:
    tag = _TAG_STYLES[OUTSTYLE]
def _ansi(code):
    """Return a function that wraps its argument in the ANSI escape ``code``.

    The returned function prefixes the text with ``ESC[<code>m`` and appends
    the reset sequence ``ESC[0m``.
    """
    start = '\x1b[' + code + 'm'

    def paint(c):
        return start + c + '\x1b[0m'

    return paint


# Terminal highlighting helpers (each takes the text as ``c``).
purple = _ansi('1;35')
blue = _ansi('1;34')
cyan = _ansi('36')
gray = _ansi('1')  # despite the name this only switches on bold
def getRec(n):
    """Return the raw bytes of record number ``n``.

    The record occupies the file range ``index[n]`` .. ``index[n + 1]`` of the
    dictionary file ``f``.  For ``n`` outside ``0 .. entryCount - 1`` an empty
    bytes object is returned.
    """
    if 0 <= n < entryCount:
        f.seek(index[n])
        return f.read(index[n + 1] - index[n])
    # The file is read in binary mode and the result is handed to
    # struct.unpack(), so the "no data" value must be bytes, not str.
    return b''
def decode_alpha(stream, nullstop=True):
    """Decode a 6-bit encoded data stream from its beginning.

    Every three bytes of ``stream`` (a sequence of integers) carry four
    6-bit codes.  Decoding stops at the end of the stream or, when
    ``nullstop`` is true, at the first code equal to zero (which is not
    included in the output).

    Returns a tuple ``(text, last)``: ``text`` is the code list converted by
    decode_alpha_postprocessing() and ``last`` is the number of bytes
    consumed minus one (out() adds the one back when it advances).
    """
    codes = []
    consumed = 0   # number of bytes taken from the stream so far
    phase = 0      # number of codes emitted; phase % 4 selects the bit layout
    size = len(stream)
    while consumed < size:
        slot = phase % 4
        if slot == 0:
            # top six bits of a fresh byte
            code = stream[consumed] >> 2
            consumed += 1
        elif slot == 1:
            # low two bits of the previous byte + top four of the next
            code = ((stream[consumed - 1] & 0x03) << 4) | (stream[consumed] >> 4)
            consumed += 1
        elif slot == 2:
            # low four bits of the previous byte + top two of the next
            code = ((stream[consumed - 1] & 0x0F) << 2) | ((stream[consumed] & 0xC0) >> 6)
            consumed += 1
        else:
            # low six bits of the previous byte; no new byte is needed
            code = stream[consumed - 1] & 0x3F
        if nullstop and code == 0:
            break
        phase += 1
        codes.append(code)
    return decode_alpha_postprocessing(codes), consumed - 1
# Characters produced by a diacritic prefix code, keyed by the prefix marker
# and then by the ``alpha`` entry of the code that follows it.  Some capital
# letters are keyed by whatever ``alpha`` entry their code happens to have.
_DIACRITICS = {
    '#GRAVE#': {'a': 'à'},
    '#UML#': {
        'o': 'ö', 'u': 'ü', 'a': 'ä',
        ' ': 'Ä', '#AL46#': 'Ö', '#GREEK#': 'Ü',
    },
    '#ACUTE#': {
        'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'y': 'ý',
        ' ': 'Á', '#GRAVE#': 'Í',
    },
    '#CARON#': {
        'r': 'ř', 'c': 'č', 's': 'š', 'z': 'ž', 'e': 'ě', 'd': 'ď',
        't': 'ť', 'a': 'å', 'u': 'ů', 'n': 'ň',
        '<': 'Č', '#CEDIL#': 'Ř', '#AL50#': 'Š', '#AL57#': 'Ž',
    },
    '#CIRC#': {'a': 'â'},
}


def decode_alpha_postprocessing(input):
    """Turn a sequence of 6-bit codes into text.

    Codes below 40 map directly to ``alpha``.  A code of 40 or more is a
    prefix that is combined with the following code into one character
    (accented letter, capital letter, symbol, special character, ...).
    Combinations without a known character are rendered as ``#...#``
    placeholders so that they stay visible in the output.

    ``input`` is not modified.  The sequence is padded with NUL codes before
    decoding, so the returned string ends with NUL characters; callers cut
    the text at the first NUL.
    """
    # Work on a padded copy: a trailing prefix code always has a successor
    # and the caller's list is left untouched.
    codes = list(input) + [0x00] * 5
    pieces = []
    skip = False
    for i in range(len(codes) - 1):
        if skip:
            # this code was already consumed as the second half of a pair
            skip = False
            continue
        bc = codes[i]
        c = alpha[bc]
        if bc < 40:
            pieces.append(c)
            continue

        bc1 = codes[i + 1]
        c1 = alpha[bc1]
        if c in _DIACRITICS:
            # Unknown combination -> '#<PREFIX><next>#'.  The circumflex
            # fallback used to be mislabelled as '#CARON...#'.
            pieces.append(_DIACRITICS[c].get(c1, '%s%s#' % (c[:-1], c1)))
        elif c == "#UPCASE#":
            pieces.append(upcase[bc1])
        elif c == "#SYMBOL#":
            pieces.append(symbol[bc1])
        elif c == "#SPECIAL#":
            pieces.append(special[bc1])
        elif c == "#AL51#" and c1 == 's':
            pieces.append('ß')
        elif c == "#AL48#":
            pieces.append("#AL48#%s" % c1)
        elif c == "#UNICODE#":
            pieces.append('#UNICODE%s#' % bc1)
        else:
            # Generic placeholder.  '#AL51#' followed by anything but 's'
            # also ends up here; it used to drop both codes silently.
            pieces.append('%sX%s#' % (c[:-1], bc1))
        skip = True
    return ''.join(pieces)
def pronunciation_encode(s):
    """Replace every ``upcase`` entry found in ``s`` by its IPA counterpart.

    The 64 entries of ``upcase`` are substituted one after another, in table
    order, by the entry of ``upcase_pron`` at the same index.
    """
    encoded = s
    for plain, ipa in zip(upcase[:64], upcase_pron[:64]):
        encoded = encoded.replace(plain, ipa)
    return encoded
# Matchers for the inline markup <d...>, <w...>, <y...> and <c...>; group 1
# captures (non-greedily) the text between the tag letter and the closing '>'.
re_d, re_w, re_y, re_c = (
    re.compile('<%s(.*?)>' % letter) for letter in 'dwyc'
)
def decode_tag_postprocessing(input):
    """Rewrite the inline <d..>, <w..>, <y..> and <c..> tags of ``input``.

    For output styles 0 and 1 the content of each tag is put in parentheses.
    For style 2 it is wrapped in a small blue <span> (italic for w/y/c, in
    parentheses for d).  For any other style the text is returned unchanged.
    """
    if OUTSTYLE in (0, 1):
        plain = r'(\1)'
        rules = ((re_d, plain), (re_w, plain), (re_y, plain), (re_c, plain))
    elif OUTSTYLE == 2:
        italic = r'<span size="small" color="blue" style="italic">\1</span>'
        rules = (
            (re_d, r'<span size="small" color="blue">(\1)</span>'),
            (re_w, italic),
            (re_y, italic),
            (re_c, italic),
        )
    else:
        rules = ()

    text = input
    for pattern, replacement in rules:
        text = pattern.sub(replacement, text)
    return text
def toBin(b):
    """Format ``b`` for debug output as ``0x<hex>(<binary>)<decimal>``.

    The hexadecimal part is padded to two digits, the binary part to eight
    and the decimal part to three.  For values that are not positive the
    binary part is all zeros.
    """
    # The binary digits are carried as a decimal integer so that '%08d'
    # can zero-pad them.
    bits = int(format(b, 'b')) if b > 0 else 0
    return "0x%02X(%08d)%03d" % (b, bits, b)
def out(comment="", skip=False):
    """Read the next byte or string of the current record and advance ``pos``.

    With ``skip`` false the byte at ``bs[pos]`` is returned and ``pos`` moves
    on by one.  With ``skip`` true the 6-bit encoded string starting at
    ``pos`` is decoded, ``pos`` moves past it, and the text is returned with
    all backtick characters removed.

    When ``DEBUG`` is set a trace line is printed; a ``%`` placeholder in
    ``comment`` is filled with the value that was read.
    """
    global bs, pos
    if not DEBUG and not skip:
        # Fast path: a plain byte read needs no string decoding.  Decoding
        # the rest of the record for every single byte is only useful for
        # the debug trace.
        value = bs[pos]
        pos += 1
        return value

    s, triple = decode_alpha(bs[pos:])
    s = s.split('\x00')[0]  # cut at the NUL padding added by the decoder
    if DEBUG:
        if '%' in comment:
            comment = comment % (s if skip else bs[pos])
        print("%03d %s %s | %s | %03d" % (pos, toBin(bs[pos]), comment, s, (triple + pos)))
    if skip:
        pos += triple + 1
        return s.replace('`', '')
    pos += 1
    return bs[pos - 1]
def outInt(c):
    """Read one byte of the current record; ``c`` is the debug label."""
    return out(c)


def outStr(c):
    """Read one encoded string of the current record; ``c`` is the debug label."""
    return out(c, True)
# Known 8-byte signatures of a phrase block, mapped to the number of bytes
# that separate its two phrases.
_PHRASE_BLOCK_GAPS = {
    (0x80, 0x80, 0xF9, 0xDF, 0x9D, 0x00, 0x0B, 0x01): 4,
    (0x80, 0x80, 0xF9, 0xDF, 0x9D, 0x00, 0x23, 0x01): 5,
}


def _tagged(key, text):
    """Return ``text`` wrapped in the opening/closing markup of ``tag[key]``."""
    opening, closing = tag[key]
    return opening + text + closing


def _wordclass_markup(code, context):
    """Return the markup for word-class ``code``; ``context`` is for the error."""
    if code >= 32:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError("Header wordclass out of range in: %s" % context)
    return _tagged('wc', wordclass[code])


def decode(stream):
    """Decode the byte stream of one record into a formatted string.

    The record is walked with outInt()/outStr(), which share the module
    level cursor ``bs``/``pos``.  Flag bytes select which fields follow; each
    field is wrapped in the markup of the active ``tag`` table.

    The result ends with a real newline only if every byte left over after
    the known structure is zero; callers use a missing trailing newline as
    the sign that the record was not fully understood.

    Raises ValueError if a word-class byte is 32 or larger.
    """
    global bs, pos, lastWordClass
    result = ""
    bs = unpack("<%sB" % len(stream), stream)
    pos = 0
    itemCount = outInt("ItemCount: %s")
    mainFlag = outInt("MainFlag: %s")

    # --- header ---------------------------------------------------------
    if mainFlag & 0x01:
        headerFlag = outInt("HeaderFlag: %s")
        if headerFlag & 0x01:
            # underscores are removed from the record name
            result += _tagged('rn', outStr("Header record name: %s").replace('_', ''))
        if headerFlag & 0x02:
            result += _tagged('va', outStr("Header variant: %s"))
        if headerFlag & 0x04:
            s = outInt("Header wordclass: %s")
            result += _wordclass_markup(s, result)
        if headerFlag & 0x08:
            result += _tagged('pa', outStr("Header parts: %s"))
        if headerFlag & 0x10:
            result += _tagged('fo', outStr("Header forms: %s"))
        if headerFlag & 0x20:
            result += _tagged('on', outStr("Header origin note: %s"))
        if headerFlag & 0x80:
            result += _tagged('pr', pronunciation_encode(outStr("Header pronunciation: %s")))

    # --- header data block ------------------------------------------------
    if mainFlag & 0x02:
        headerFlag = outInt("Header dataFlag: %s")
        if headerFlag & 0x02:
            result += _tagged('dv', outStr("Header dataVariant: %s"))

    # --- sound reference: five bytes that are read and ignored -------------
    if mainFlag & 0x80:
        for number in range(1, 6):
            outInt("Sound reference byte #%d: %%s" % number)

    # --- data items ---------------------------------------------------------
    li = 0             # running number for numbered items
    lastWordClass = 0  # a word class is only emitted when it changes
    for _ in range(itemCount):
        item = _tagged('db', '')
        ol = False     # True when this item is emitted as a numbered line
        dataFlag = outInt("DataFlag: %s -----------------------------")

        if dataFlag & 0x01:
            sampleFlag = outInt("Data sampleFlag: %s")
            if sampleFlag & 0x01:
                result += _tagged('sa', outStr("Data sample: %s"))
            if sampleFlag & 0x04:
                s = outInt("Data wordclass: %s")
                if s != lastWordClass:
                    result += _wordclass_markup(s, result)
                lastWordClass = s
            if sampleFlag & 0x08:
                result += _tagged('sw', outStr("Data sample wordclass: %s"))
            if sampleFlag & 0x10:
                # three bytes that are read and ignored
                for _unused in range(3):
                    outInt("Data sample Int: %s")
            if sampleFlag & 0x20:
                item += _tagged('do', outStr("Data origin note: %s"))
            if sampleFlag & 0x80:
                item += " "
                result += _tagged('pr', pronunciation_encode(outStr("Data sample pronunciation: %s")))

        if dataFlag & 0x02:
            item += " "
            subFlag = outInt("Data subFlag: %s")
            if subFlag == 0x80:
                # the prefix text is read but a generated number is used
                outStr("Data sub prefix: %s")
                li += 1
                ol = True

        # dataFlag bits 0x04, 0x10 and 0x40 do not consume any bytes here.

        if dataFlag & 0x08:
            item += _tagged('df', outStr("Data definition: %s"))

        if dataFlag & 0x20:
            phraseFlag1 = outInt("Data phraseFlag1: %s")
            if phraseFlag1 & 0x01:
                item += _tagged('ps', outStr("Data phrase short form: %s"))
            if phraseFlag1 & 0x02:
                phraseCount = outInt("Data phraseCount: %s")
                for _unused in range(phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    if phraseComment & 0x04:
                        item += _tagged('pc', outStr("Data phrase comment: %s"))
                    item += _tagged('p1', outStr("Data phrase 1: %s"))
                    item += _tagged('p2', outStr("Data phrase 2: %s"))
            if phraseFlag1 & 0x04:
                phraseCount = outInt("Data phraseCount: %s")
                for _unused in range(phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    # NOTE(review): the two debug labels below look swapped
                    # relative to the tags they feed; kept as found.
                    if phraseComment & 0x04:
                        item += _tagged('pc', outStr("Data phrase 1: %s"))
                    item += _tagged('pg', outStr("Data phrase comment: %s"))
                    item += _tagged('p2', outStr("Data phrase 2: %s"))
            if phraseFlag1 & 0x08:
                phraseCount = outInt("Data simple phraseCount: %s")
                for _unused in range(phraseCount):
                    item += " "
                    item += _tagged('sp', outStr("Data simple phrase: %s"))
            if phraseFlag1 & 0x40:
                item += _tagged('ps', outStr("Data phrase short form: %s"))

        if dataFlag & 0x80:
            # Phrase block: eight signature bytes, then two phrases separated
            # by a signature-dependent number of bytes.
            flags = tuple(out("Data phrase block: %s") for _unused in range(8))
            gap = _PHRASE_BLOCK_GAPS.get(flags)
            if gap is not None:
                result += "\\nphr: "
                li = 1
                ol = True
                item += _tagged('b1', outStr("Data phrase 1: %s"))
                for _unused in range(gap):
                    out("Data phrase block: %s")
                # The second phrase used the key 'ds', which no tag table
                # defines; 'b2' is the matching entry next to 'b1'.
                item += _tagged('b2', outStr("Data phrase 2: %s"))

        if ol:
            result += "\\n%d. %s" % (li, item)
        else:
            result += item

    # --- trailing bytes: all must be zero for a fully decoded record --------
    ok = True
    while pos < len(stream):
        ok = (out() == 0x00) and ok
    if ok:
        result += '\n'
    return decode_tag_postprocessing(result)
# Open the dictionary; the handle stays open because getRec() reads from it.
f = open(FILENAME, 'rb')

# File header: a 64-byte text field followed by sixteen little-endian
# unsigned 32-bit values.
copyright = unpack("<64s", f.read(64))[0]
a = unpack("<16L", f.read(64))
entryCount, indexBaseCount, indexOffsetCount = a[4], a[6], a[7]
pos1, indexPos, bodyPos = a[8], a[9], a[10]
smallIndex = a[3] == 2052

# Index: a table of 32-bit base values, then 64 16-bit offsets per base.
f.seek(indexPos)
bases = unpack("<%sL" % indexBaseCount, f.read(indexBaseCount * 4))
if smallIndex:
    # each base value is used for four consecutive groups of offsets
    bases = [base for base in bases for _ in range(4)]

# Record start addresses; at most indexOffsetCount entries are kept, but the
# offsets of every base are still read from the file.
index = []
for b in bases:
    offsets = unpack("<64H", f.read(64 * 2))
    room = indexOffsetCount - len(index)
    if room > 0:
        index.extend(bodyPos + b + o * 4 for o in offsets[:room])
if DEBUG:
    # Debugging mode: decode every record and trace in detail the first
    # DEBUGLIMIT records that could not be decoded completely.
    for i in range(1, entryCount):
        if not DEBUGALL:
            DEBUG = False  # first pass without the byte trace
        s = decode(getRec(i))
        if DEBUGHEADER:
            print(s)
        if DEBUGLIMIT > 0 and not s.endswith('\n'):
            # decode() did not end the record with a newline: decode it
            # again with the trace switched on
            DEBUG = True
            print("-" * 80)
            print("%s) at address %s" % (i, toBin(index[i])))
            print()
            s = decode(getRec(i))
            print(s)
            DEBUGLIMIT -= 1
        DEBUG = True
else:
    # Normal mode: write one line per record and stop at the first record
    # that could not be decoded completely.
    for i in range(1, entryCount):
        s = decode(getRec(i))
        if s.endswith('\n'):
            # The record already ends with its newline.  end=' ' (left over
            # from the Python 2 "print s," form) would prefix the next
            # record's headword with a space.
            print(s, end='')
        else:
            print(s)
            print("!!! RECORD STRUCTURE DECODING ERROR !!!")
            print("Please run this script in DEBUG mode and repair DATA BLOCK(S) section in function decode()")
            print("If you succeed with whole dictionary send report (name of the dictionary and source code of script) to slovniky@googlegroups.com")
            break