# Adapted to produce DICT-compatible files by Petr Rockai in 2012
# Based on code from wiktiondict by Greg Hewgill

import re
import sys
import codecs
import os
import textwrap
import time
import xml.sax

class Text:
    def __init__(self, s):
        self.s = s
    def process(self):
        return self.s

class TemplateCall:
    def __init__(self):
        pass
    def process(self):
        pass

class Template:
    def __init__(self):
        self.parts = []
    def append(self, part):
        self.parts.append(part)
    def process(self):
        return ''.join(x.process() for x in self.parts)

class Whitespace:
    def __init__(self, s):
        self.s = s

class OpenDouble: pass
class OpenTriple: pass
class CloseDouble: pass
class CloseTriple: pass

class Equals:
    def __str__(self):
        return "="

class Delimiter:
    def __init__(self, c):
        self.c = c
    def __str__(self):
        return self.c

def Tokenise(s):
    s = unicode(s)
    stack = []
    last = 0
    i = 0
    while i < len(s):
        if s[i] == '{' and i+1 < len(s) and s[i+1] == '{':
            if i > last:
                yield s[last:i]
            if i+2 < len(s) and s[i+2] == '{':
                yield OpenTriple()
                stack.append(3)
                i += 3
            else:
                yield OpenDouble()
                stack.append(2)
                i += 2
            last = i
        elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}':
            if i > last:
                yield s[last:i]
            if len(stack) == 0:
                yield "}}"
                i += 2
            elif stack[-1] == 2:
                yield CloseDouble()
                i += 2
                stack.pop()
            elif i+2 < len(s) and s[i+2] == '}':
                yield CloseTriple()
                i += 3
                stack.pop()
            else:
                raise SyntaxError()
            last = i
        elif s[i] == ':' or s[i] == '|':
            if i > last:
                yield s[last:i]
            yield Delimiter(s[i])
            i += 1
            last = i
        elif s[i] == '=':
            if i > last:
                yield s[last:i]
            yield Equals()
            i += 1
            last = i
        #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
        #    if i > last:
        #        yield s[last:i]
        #        last = i
        #    m = re.match(r"\s+", s[i:])
        #    assert m
        #    yield Whitespace(m.group(0))
        #    i += len(m.group(0))
        #    last = i
        else:
            i += 1
    if i > last:
        yield s[last:i]

def processSub(templates, tokens, args):
    t = tokens.next()
    if not isinstance(t, unicode):
        raise SyntaxError
    name = t
    t = tokens.next()
    default = None
    if isinstance(t, Delimiter) and t.c == '|':
        default = ""
        while True:
            t = tokens.next()
            if isinstance(t, unicode):
                default += t
            elif isinstance(t, OpenDouble):
                default += processTemplateCall(templates, tokens, args)
            elif isinstance(t, OpenTriple):
                default += processSub(templates, tokens, args)
            elif isinstance(t, CloseTriple):
                break
            else:
                print "Unexpected:", t
                raise SyntaxError()
    if name in args:
        return args[name]
    if default is not None:
        return default
    if name == "lang":
        return "en"
    return "{{{%s}}}" % name
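# Illustrative sketch, never called by the conversion code: shows the kind of
# token stream Tokenise yields for a small template call. The sample wikitext
# string and the function name are invented for this example only.
def _demo_tokenise():
    # Yields, roughly: u"foo ", OpenDouble, u"t", Delimiter('|'), u"abc",
    # Delimiter('|'), u"1", Equals, u"def", CloseDouble, u" bar"
    for tok in Tokenise(u"foo {{t|abc|1=def}} bar"):
        print repr(tok)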
def processTemplateCall(templates, tokens, args):
    template = tokens.next().strip().lower()
    args = {}
    a = 1
    t = tokens.next()
    while True:
        if isinstance(t, Delimiter):
            name = unicode(a)
            arg = ""
            while True:
                t = tokens.next()
                if isinstance(t, unicode):
                    arg += t
                elif isinstance(t, OpenDouble):
                    arg += processTemplateCall(templates, tokens, args)
                elif isinstance(t, OpenTriple):
                    arg += processSub(templates, tokens, args)
                elif isinstance(t, Delimiter) and t.c != '|':
                    arg += str(t)
                else:
                    break
            if isinstance(t, Equals):
                name = arg.strip()
                arg = ""
                while True:
                    t = tokens.next()
                    if isinstance(t, (unicode, Equals)):
                        arg += unicode(t)
                    elif isinstance(t, OpenDouble):
                        arg += processTemplateCall(templates, tokens, args)
                    elif isinstance(t, OpenTriple):
                        arg += processSub(templates, tokens, args)
                    elif isinstance(t, Delimiter) and t.c != '|':
                        arg += str(t)
                    else:
                        break
                arg = arg.strip()
            else:
                a += 1
            args[name] = arg
        elif isinstance(t, CloseDouble):
            break
        else:
            print "Unexpected:", t
            raise SyntaxError
    #print template, args
    if template[0] == '#':
        if template == "#if":
            if args['1'].strip():
                return args['2']
            elif '3' in args:
                return args['3']
            else:
                return ""
        elif template == "#ifeq":
            if args['1'].strip() == args['2'].strip():
                return args['3']
            elif '4' in args:
                return args['4']
            else:
                return ""
        elif template == "#ifexist":
            return ""
        elif template == "#switch":
            sw = args['1'].strip()
            if sw in args:
                return args[sw]
            else:
                return ""
        else:
            print "Unknown ParserFunction:", template
            sys.exit(1)
    if template not in templates:
        return "{{%s}}" % template
    return process(templates, templates[template], args)

def process(templates, s, args = {}):
    s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s)
    s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s)
    assert "<noinclude>" not in s
    #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
    s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s)
    r = ""
    #print list(Tokenise(s))
    tokens = Tokenise(s)
    try:
        while True:
            t = tokens.next()
            if isinstance(t, OpenDouble):
                r += processTemplateCall(templates, tokens, args)
            elif isinstance(t, OpenTriple):
                r += processSub(templates, tokens, args)
            else:
                r += unicode(t)
    except StopIteration:
        pass
    return r
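# Illustrative sketch, never called by the conversion code: a minimal use of
# process() with a hand-rolled template table. The 'hello' template and its
# text are invented; real templates are collected from the dump by
# TemplateHandler further below.
def _demo_process():
    demo_templates = {
        'hello': "Hello, {{{1|world}}}!",
    }
    print process(demo_templates, "{{hello|Wiktionary}}")   # Hello, Wiktionary!
    print process(demo_templates, "{{hello}}")              # Hello, world!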
def test():
    templates = {
        'lb': "{{",
        'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
        't': "start-{{{1|pqr}}}-end",
        't0': "start-{{{1}}}-end",
        't1': "start{{{1}}}endmoo",
        't2a1': "{{t2demo|a|{{{1}}}}}",
        't2a2': "{{t2demo|a|2={{{1}}}}}",
        't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
        't5': "{{t2demo|{{{a}}}=b}}",
        't6': "t2demo|a",
    }
    def t(text, expected):
        print "text:", text
        s = process(templates, text)
        if s != expected:
            print "got:", s
            print "expected:", expected
            sys.exit(1)
    t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
    t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
    t("{{t0|a}}", "start-a-end")
    t("{{t0| }}", "start- -end")
    t("{{t0|}}", "start--end")
    t("{{t0}}", "start-{{{1}}}-end")
    t("{{t0| }}", "start- -end")
    t("{{t0|\n}}", "start-\n-end")
    t("{{t0|1= }}", "start--end")
    t("{{t0|1=\n}}", "start--end")
    t("{{T}}", "start-pqr-end")
    t("{{T|}}", "start--end")
    t("{{T|abc}}", "start-abc-end")
    t("{{T|abc|def}}", "start-abc-end")
    t("{{T|1=abc|1=def}}", "start-def-end")
    t("{{T|abc|1=def}}", "start-def-end")
    t("{{T|1=abc|def}}", "start-def-end")
    t("{{T|{{T}}}}", "start-start-pqr-end-end")
    t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
    t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
    t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
    t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
    t("{{T|a=b}}", "start-pqr-end")
    t("{{T|1=a=b}}", "start-a=b-end")
    #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
    #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
    #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
    #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
    #t("{{ {{t6}} }}", "{{ t2demo|a }}")
    t("{{t|[[a|b]]}}", "start-b-end")
    t("{{t|[[a|b]] }}", "start-b -end")

Parts = {
    # Standard POS headers
    'noun': "n.", 'Noun': "n.", 'Noun 1': "n.", 'Noun 2': "n.",
    'Verb': "v.", 'Adjective': "adj.", 'Adverb': "adv.", 'Pronoun': "pron.",
    'Conjunction': "conj.", 'Interjection': "interj.", 'Preposition': "prep.",
    'Proper noun': "n.p.", 'Proper Noun': "n.p.", 'Article': "art.",
    # Standard non-POS level 3 headers
    '{{acronym}}': "acr.", 'Acronym': "acr.",
    '{{abbreviation}}': "abbr.", '[[Abbreviation]]': "abbr.", 'Abbreviation': "abbr.",
    '[[initialism]]': "init.", '{{initialism}}': "init.", 'Initialism': "init.",
    'Contraction': "cont.", 'Prefix': "prefix", 'Suffix': "suffix",
    'Symbol': "sym.", 'Letter': "letter", 'Idiom': "idiom", 'Idioms': "idiom",
    'Phrase': "phrase",
    # Debated POS level 3 headers
    'Number': "num.", 'Numeral': "num.", 'Cardinal number': "num.",
    'Ordinal number': "num.", 'Cardinal numeral': "num.", 'Ordinal numeral': "num.",
    # Other headers in use
    'Personal pronoun': "pers.pron.", 'Adjective/Adverb': "adj./adv.",
    'Proper adjective': "prop.adj.", 'Determiner': "det.",
    'Demonstrative determiner': "dem.det.", 'Clitic': "clitic",
    'Infix': "infix", 'Counter': "counter",
    'Kanji': None, 'Kanji reading': None, 'Hiragana letter': None,
    'Katakana letter': None, 'Pinyin': None, 'Han character': None,
    'Hanzi': None, 'Hanja': None,
    'Proverb': "prov.", 'Expression': None, 'Adjectival noun': None,
    'Quasi-adjective': None, 'Particle': "part.", 'Infinitive particle': "part.",
    'Possessive adjective': "poss.adj.", 'Verbal prefix': "v.p.",
    'Postposition': "post.", 'Prepositional article': "prep.art.",
    'Phrasal verb': "phr.v.", 'Participle': "participle",
    'Interrogative auxiliary verb': "int.aux.v.", 'Pronominal adverb': "pron.adv.",
    'Adnominal': "adn.", 'Abstract pronoun': "abs.pron.",
    'Conjunction particle': None, 'Root': "root",
    # Non-standard, deprecated headers
    'Noun form': "n.", 'Verb form': "v.", 'Adjective form': "adj.form.",
    'Nominal phrase': "nom.phr.", 'Noun phrase': "n. phrase",
    'Verb phrase': "v. phrase", 'Transitive verb': "v.t.",
    'Intransitive verb': "v.i.", 'Reflexive verb': "v.r.",
    'Cmavo': None, 'Romaji': "rom.", 'Hiragana': None, 'Furigana': None,
    'Compounds': None,
    # Other headers seen
    'Alternative forms': None, 'Alternative spellings': None, 'Anagrams': None,
    'Antonym': None, 'Antonyms': None, 'Conjugation': None, 'Declension': None,
    'Declension and pronunciations': None,
    'Definite Article': "def.art.", 'Definite article': "def.art.",
    'Demonstrative pronoun': "dem.pron.", 'Derivation': None,
    'Derived expression': None, 'Derived expressions': None,
    'Derived forms': None, 'Derived phrases': None, 'Derived terms': None,
    'Derived, Related terms': None, 'Descendants': None,
    #'Etymology': None,
    #'Etymology 1': None,
    #'Etymology 2': None,
    #'Etymology 3': None,
    #'Etymology 4': None,
    #'Etymology 5': None,
    'Examples': None, 'External links': None, '[[Gismu]]': None, 'Gismu': None,
    'Homonyms': None, 'Homophones': None, 'Hyphenation': None,
    'Indefinite article': "art.", 'Indefinite pronoun': "ind.pron.",
    'Indefinite Pronoun': "ind.pron.", 'Indetermined pronoun': "ind.pron.",
    'Interrogative conjunction': "int.conj.", 'Interrogative determiner': "int.det.",
    'Interrogative particle': "int.part.", 'Interrogative pronoun': "int.pron.",
    'Legal expression': "legal", 'Mass noun': "n.", 'Miscellaneous': None,
    'Mutations': None, 'Noun and verb': "n/v.", 'Other language': None,
    'Pinyin syllable': None, 'Possessive determiner': "poss.det.",
    'Possessive pronoun': "poss.pron.", 'Prepositional phrase': "prep.phr.",
    'Prepositional Pronoun': "prep.pron.", 'Pronunciation': None,
    'Pronunciation 1': None, 'Pronunciation 2': None, 'Quotations': None,
    'References': None, 'Reflexive pronoun': "refl.pron.",
    'Related expressions': None, 'Related terms': None, 'Related words': None,
    'Relative pronoun': "rel.pron.", 'Saying': "saying", 'See also': None,
    'Shorthand': None, '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
    'Sister projects': None, 'Spelling note': None, 'Synonyms': None,
    'Translation': None, 'Translations': None, 'Translations to be checked': None,
    'Transliteration': None, 'Trivia': None, 'Usage': None,
    'Usage in English': None, 'Usage notes': None, 'Verbal noun': "v.n.",
}

PartsUsed = {}
for p in Parts.keys():
    PartsUsed[p] = 0
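# Illustrative sketch, never called by the conversion code: how the Parts table
# is consulted for a level-3 heading. A value of None means the heading is
# recognised but carries no part-of-speech abbreviation; a missing key is what
# formatNormal() below reports as an "Unknown part". The function name is
# invented for this example.
def _demo_parts_lookup(heading):
    if heading not in Parts:
        return "unknown heading: %s" % heading
    abbrev = Parts[heading]
    if abbrev is None:
        return "recognised, no abbreviation"
    return abbrev   # e.g. _demo_parts_lookup('Noun') -> "n."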
def encode(s):
    r = e(s)
    assert r[1] == len(s)
    return r[0]

def dowikilink(m):
    a = m.group(1).split("|")
    if len(a) > 1:
        link = a[1]
    else:
        link = a[0]
    if ':' in link:
        link = ""
    return link

seentemplates = {}
def dotemplate(m):
    aa = m.group(1).split("|")
    args = {}
    n = 0
    for a in aa:
        am = re.match(r"(.*?)(=(.*))?", a)
        if am:
            args[am.group(1)] = am.group(3)
        else:
            n += 1
            args[n] = am.group(1)
    #if aa[0] in seentemplates:
    #    seentemplates[aa[0]] += 1
    #else:
    #    seentemplates[aa[0]] = 1
    #    print len(seentemplates), aa[0]
    #print aa[0]
    #if aa[0] not in Templates:
    #    return "(unknown template %s)" % aa[0]
    #body = Templates[aa[0]]
    #body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
    #assert "<noinclude>" not in body
    ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
    #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
    #def dotemplatearg(m):
    #    ta = m.group(1).split("|")
    #    if ta[0] in args:
    #        return args[ta[0]]
    #    elif len(ta) > 1:
    #        return ta[1]
    #    else:
    #        return "{{{%s}}}" % ta[0]
    #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
    #return dewiki(body)

def doparserfunction(m):
    a = m.group(2).split("|")
    if m.group(1) == "ifeq":
        if a[0] == a[1]:
            return a[2]
        elif len(a) >= 4:
            return a[3]
    return ""

def dewiki(body, indent = 0):
    # process in this order:
    #   {{{ }}}
    #   <> <>
    #   [[ ]]
    #   {{ }}
    #   ''' '''
    #   '' ''
    #body = wikimediatemplate.process(Templates, body)
    body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
    #body = re.sub(r"{{(.*?)}}", dotemplate, body)
    #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
    body = re.sub(r"'''(.*?)'''", r"\1", body)
    body = re.sub(r"''(.*?)''", r"\1", body)
    lines = body.split("\n")
    n = 0
    i = 0
    while i < len(lines):
        if len(lines[i]) > 0 and lines[i][0] == "#":
            if len(lines[i]) > 1 and lines[i][1] == '*':
                wlines = textwrap.wrap(lines[i][2:].strip(), initial_indent = " * ", subsequent_indent = " ")
            elif len(lines[i]) > 1 and lines[i][1] == ':':
                wlines = textwrap.wrap(lines[i][2:].strip(), initial_indent = " ", subsequent_indent = " ")
            else:
                n += 1
                wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(), subsequent_indent = " ")
        elif len(lines[i]) > 0 and lines[i][0] == "*":
            n = 0
            wlines = textwrap.wrap(lines[i][1:].strip(), initial_indent = "* ", subsequent_indent = " ")
        else:
            n = 0
            wlines = textwrap.wrap(lines[i].strip())
        if len(wlines) == 0:
            wlines = ['']
        lines[i:i+1] = wlines
        i += len(wlines)
    return ''.join(" "*(indent-1)+x+"\n" for x in lines)
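# Illustrative sketch, never called by the conversion code: dewiki() turns one
# '#' definition line of wikitext into a numbered, wrapped plain-text line.
# The sample wikitext is invented; with indent=1 the result is roughly
# "1. A simple definition.\n".
def _demo_dewiki():
    print dewiki("# A [[sample|simple]] '''definition'''.", indent = 1)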
class WikiSection:
    def __init__(self, heading, body):
        self.heading = heading
        self.body = body
        #self.lines = re.split("\n+", body.strip())
        #if len(self.lines) == 1 and len(self.lines[0]) == 0:
        #    self.lines = []
        self.children = []
    def __str__(self):
        return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children]))
    def add(self, section):
        self.children.append(section)

def parse(word, text):
    headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
    #print [x.group(1) for x in headings]
    doc = WikiSection(word, "")
    stack = [doc]
    for i, m in enumerate(headings):
        depth = len(m.group(1))
        if depth < len(stack):
            stack = stack[:depth]
        else:
            while depth > len(stack):
                s = WikiSection(None, "")
                stack[-1].add(s)
                stack.append(s)
        if i+1 < len(headings):
            s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip())
        else:
            s = WikiSection(m.group(2), text[m.end(0):].strip())
        assert len(stack) == depth
        stack[-1].add(s)
        stack.append(s)
    #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
    #    doc = doc.children[0]
    return doc

def formatFull(word, doc):
    def f(depth, section):
        if section.heading:
            r = " "*(depth-1) + section.heading + "\n\n"
        else:
            r = ""
        if section.body:
            r += dewiki(section.body, depth+1)+"\n"
            #r += "".join(" "*depth + x + "\n" for x in dewiki(section.body))
        #if len(section.lines) > 0:
        #    r += "\n"
        for c in section.children:
            r += f(depth+1, c)
        return r
    s = f(0, doc)
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
    return s

def formatNormal(word, doc):
    def f(depth, posdepth, section):
        r = ""
        if depth == posdepth:
            if not section.heading or section.heading.startswith("Etymology"):
                posdepth += 1
            elif section.heading in Parts:
                #p = Parts[section.heading]
                #if p:
                #    r += " "*(depth-1) + word + " (" + p + ")\n\n"
                r += " "*(depth-1) + section.heading + "\n\n"
            else:
                print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
                return ""
        elif depth > posdepth:
            return ""
        elif section.heading:
            r += " "*(depth-1) + section.heading + "\n\n"
        if section.body:
            r += dewiki(section.body, depth+1)+"\n"
            #r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines))
        #if len(section.lines) > 0:
        #    r += "\n"
        for c in section.children:
            r += f(depth+1, posdepth, c)
        return r
    s = f(0, 3, doc)
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
    return s

def formatBrief(word, doc):
    def f(depth, posdepth, section):
        if depth == posdepth:
            h = section.heading
            if not section.heading or section.heading.startswith("Etymology"):
                posdepth += 1
            elif section.heading in Parts:
                #h = Parts[section.heading]
                #if h:
                #    h = "%s (%s)" % (word, h)
                pass
            stack.append([h, False])
        elif depth > 0:
            stack.append([section.heading, False])
        else:
            stack.append(["%h " + section.heading, False])
        r = ""
        #if section.heading:
        #    r += " "*(depth-1) + section.heading + "\n"
        body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#')
        if len(body) > 0:
            for i in range(len(stack)):
                if not stack[i][1]:
                    if stack[i][0]:
                        r += " "*(i-1) + stack[i][0] + "\n"
                    stack[i][1] = True
            r += dewiki(body, depth+1)
        for c in section.children:
            r += f(depth+1, posdepth, c)
        stack.pop()
        return r
    stack = []
    s = f(0, 3, doc)
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
    return s
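# Illustrative sketch, never called by the conversion code: parse() builds a
# WikiSection tree from the '==' heading structure of a page, and formatBrief()
# keeps only the '#' definition lines under it. The page text below is a
# made-up miniature entry and the function name is invented.
def _demo_parse_and_format():
    text = "==English==\n\n===Noun===\n\n# A small example definition.\n"
    doc = parse("example", text)
    # Prints a "%h example" headword line, the English and Noun headings, the
    # numbered definition, and a trailing Ref: line.
    print formatBrief("example", doc)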
class WikiHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.element = None
        self.page = None
        self.text = ""
        self.long = {}
    def startElement(self, name, attrs):
        #print "start", name, attrs
        self.element = name
    def endElement(self, name):
        #print "end", name
        if self.element == "text":
            if self.page:
                if self.page in self.long:
                    print self.page, len(self.text)
                    print
                self.doPage(self.page, self.text)
                self.page = None
            self.text = ""
        self.element = None
    def characters(self, content):
        #print "characters", content
        if self.element == "title":
            if self.checkPage(content):
                self.page = content
        elif self.element == "text":
            if self.page:
                self.text += content
                if len(self.text) > 100000 and self.page not in self.long:
                    self.long[self.page] = 1
    def checkPage(self, page):
        return False
    def doPage(self, page, text):
        pass

class TemplateHandler(WikiHandler):
    def checkPage(self, page):
        return page.startswith("Template:")
    def doPage(self, page, text):
        Templates[page[page.find(':')+1:].lower()] = text

class WordHandler(WikiHandler):
    def checkPage(self, page):
        return ':' not in page
    def doPage(self, page, text):
        m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
        if m:
            out.write(" See <%s>" % page)
            return
        doc = parse(page, text)
        out.write(formatBrief(page, doc))
        #print formatBrief(page, doc)

fn = sys.argv[1]

info = """
This file was converted from the original database on:
%s

The original data is available from:
http://en.wiktionary.org

The version from which this file was generated was:
%s

Wiktionary is available under the GNU Free Documentation License.
""" % (time.ctime(), os.path.basename(fn))

errors = codecs.open("mkdict.err", "w", "utf_8")
e = codecs.getencoder("utf_8")

Templates = {}
f = os.popen("bunzip2 -c %s" % fn, "r")
xml.sax.parse(f, TemplateHandler())
f.close()

f = os.popen("bunzip2 -c %s" % fn, "r")
out = codecs.getwriter("utf_8")(
    os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))

out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))

xml.sax.parse(f, WordHandler())
f.close()
out.close()
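# Typical invocation, as a sketch (the script and dump file names below are
# only examples, not taken from this file):
#
#   python mkdict.py enwiktionary-latest-pages-articles.xml.bz2
#
# The bzip2-compressed dump is read twice: once to collect Template: pages,
# once to emit the word entries, which are piped into dictfmt to build the
# DICT-format dictionary. Unrecognised section headings are logged to
# mkdict.err.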