#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
# Tag name of placeholder elements; the conversion loops in the main script
# skip any element whose tag equals this value.
TODO = "todo"
#############################
# OUTPUT #
#############################
class LMF:
    """Top-level container: a language code plus the lexical entries added to it."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        # Bookkeeping sets; this class only ever writes to _lexical_entries_set.
        self._lexical_entries_set = set()
        self._le_senses = set()

    def add_lexical_entry(self, lexical_entry):
        """Append *lexical_entry* and record its "<_pos>.<_wf>" key."""
        self.lexical_entries.append(lexical_entry)
        entry_key = ".".join([lexical_entry._pos, lexical_entry._wf])
        self._lexical_entries_set.add(entry_key)

    def __unicode__(self):
        """Join the fixed header lines, every rendered entry and the footer lines."""
        # NOTE(review): the literals below look as if their markup was lost.
        # As written, ' ' % self.lang has no conversion specifier and raises
        # TypeError for a string lang -- restore the intended header text.
        header = [
            '',
            '',
            '',
            '',
            ' ',
            '',
            '',
        ]
        language_line = ' ' % self.lang
        entries_block = "\n".join(unicode(entry) for entry in self.lexical_entries)
        footer = ['', '']
        return "\n".join(header + [language_line, entries_block] + footer)
class LexicalEntry:
    """A lexical entry holding senses and features, with an optional id attribute."""

    def __init__(self):
        self.features = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""

    def add_sense(self, sense):
        """Append *sense* to this entry's senses."""
        self.senses.append(sense)

    def add_feature(self, feature):
        """Append *feature* to this entry's features."""
        self.features.append(feature)

    def __unicode__(self):
        """Join the opening line, the rendered senses, then the rendered features."""
        # NOTE(review): the literals here look as if their markup was lost.
        # '' % (self.idattr) has no conversion specifier, so it raises
        # TypeError whenever idattr is a non-empty string.
        opening = '' % (self.idattr) if self.idattr else ''
        senses_block = '\n'.join(unicode(sense) for sense in self.senses)
        features_block = '\n'.join(unicode(feat) for feat in self.features)
        return "\n".join([opening, '', senses_block, features_block, ''])
class Feature:
    """An attribute/value pair attached to an entry or a sense."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        """Format the attribute name together with the escaped value."""
        # NOTE(review): the format string is empty, so the % operation below
        # raises TypeError as written; the literal looks as if its markup was
        # lost and needs restoring.
        escaped_value = escape(self.val)
        return '' % (self.att, escaped_value)
class Sense:
    """A sense identifier plus the item lists that are rendered beneath it."""

    def __init__(self, sense):
        self.sense = sense
        self.relations = []
        self.sense_examples = []
        self.konst_examples = []
        self.konst_definitions = []
        self.features = []
        self.int_const_elems = []
        self.ext_const_elems = []

    def add_int_const_elem(self, const_elem):
        """Append an internal construction element."""
        self.int_const_elems.append(const_elem)

    def add_ext_const_elem(self, const_elem):
        """Append an external construction element."""
        self.ext_const_elems.append(const_elem)

    def add_sense_relation(self, sense_relation):
        """Append a sense relation."""
        self.relations.append(sense_relation)

    def add_sense_example(self, sense_example):
        """Append a sense example."""
        self.sense_examples.append(sense_example)

    def add_konst_example(self, konst_example):
        """Append a KonstExample-style example."""
        self.konst_examples.append(konst_example)

    def add_konst_definition(self, konst_definition):
        """Append a KonstDefinition-style definition."""
        self.konst_definitions.append(konst_definition)

    def add_feature(self, feature):
        """Append a feature."""
        self.features.append(feature)

    def __unicode__(self):
        """Join the opening line, each item group in fixed order, and the closing line."""
        # NOTE(review): the literals here look as if their markup was lost.
        # '' % (self.sense) has no conversion specifier, so it raises
        # TypeError for a non-empty string sense id.
        opening = '' % (self.sense)
        # Rendering order: relations, sense examples, konst examples,
        # definitions, internal elements, external elements, features.
        groups = (
            self.relations,
            self.sense_examples,
            self.konst_examples,
            self.konst_definitions,
            self.int_const_elems,
            self.ext_const_elems,
            self.features,
        )
        rendered = ["\n".join([unicode(item) for item in group]) for group in groups]
        return "\n".join([opening] + rendered + [''])
class KonstExample:
    """Renders one example given as a list of part tuples.

    Each part is ("/freetext", text), ("/leaf", attrs, text) or
    ("/branch", attrs, nested_parts) -- the shapes built by handle_example.
    """

    def __init__(self, example_parts):
        self.example_parts = example_parts

    def __unicode__(self):
        return '%s' % (self.unicodeRec())

    def unicodeRec(self):
        """Concatenate the rendering of every part; branches recurse."""
        # NOTE(review): every format string below has a single %s but is given
        # two or three arguments, so as written each branch raises TypeError.
        # The literals look as if their markup was lost -- restore them.
        out = []
        # Running position of the part within this (sub)list.
        c = 0
        for p in self.example_parts:
            if p[0] == "/freetext":
                out.append('%s' % (c, escapeContent(p[1])))
            elif p[0] == "/leaf":
                out.append('%s' % (p[1].get("name", ""), c, escapeContent(p[2])))
            elif p[0] == "/branch":
                # Nested parts get their own numbering starting from 0 again.
                out.append('%s' % (p[1].get("name", ""), c, KonstExample(p[2]).unicodeRec()))
            # NOTE(review): assumed to advance once per part regardless of its
            # kind -- confirm against the intended output.
            c += 1
        return "".join(out)
class KonstDefinition:
    """Renders one definition given as a list of part tuples.

    Each part is ("/freetext", text), ("/leaf", attrs, text) or
    ("/branch", attrs, nested_parts); the main script only builds the first
    two kinds for definitions.
    """

    def __init__(self, definition_parts):
        self.definition_parts = definition_parts

    def __unicode__(self):
        return '%s' % (self.unicodeRec())

    def unicodeRec(self):
        """Concatenate the rendering of every part; branches recurse."""
        # NOTE(review): every format string below has a single %s but is given
        # two or three arguments, so as written each branch raises TypeError.
        # The literals look as if their markup was lost -- restore them.
        out = []
        # Running position of the part within this (sub)list.
        c = 0
        for p in self.definition_parts:
            if p[0] == "/freetext":
                out.append('%s' % (c, escapeContent(p[1])))
            elif p[0] == "/leaf":
                out.append('%s' % (p[1].get("name", ""), c, escapeContent(p[2])))
            elif p[0] == "/branch":
                # Nested parts get their own numbering starting from 0 again.
                out.append('%s' % (p[1].get("name", ""), c, KonstDefinition(p[2]).unicodeRec()))
            # NOTE(review): assumed to advance once per part regardless of its
            # kind -- confirm against the intended output.
            c += 1
        return "".join(out)
class KonstInternalConstructionElement:
    """An internal construction element: an element type plus its attributes."""

    def __init__(self, elemtype, attributes_dict):
        self.elemtype = elemtype
        self.attributes = attributes_dict

    def __unicode__(self):
        """Format the element type with its attributes as name="value" pairs."""
        pairs = []
        for name in self.attributes:
            pairs.append('%s="%s"' % (name, escape(self.attributes[name])))
        attributes_text = " ".join(pairs)
        # NOTE(review): the format string is empty, so the % operation below
        # raises TypeError as written; the literal looks as if its markup was
        # lost and needs restoring.
        return '' % (self.elemtype, attributes_text)
class KonstExternalConstructionElement:
    """An external construction element: an element type plus its attributes."""

    def __init__(self, elemtype, attributes_dict):
        self.elemtype = elemtype
        self.attributes = attributes_dict

    def __unicode__(self):
        """Format the element type with its attributes as name="value" pairs."""
        pairs = []
        for name in self.attributes:
            pairs.append('%s="%s"' % (name, escape(self.attributes[name])))
        attributes_text = " ".join(pairs)
        # NOTE(review): the format string is empty, so the % operation below
        # raises TypeError as written; the literal looks as if its markup was
        # lost and needs restoring.
        return '' % (self.elemtype, attributes_text)
def escape(s):
    """Return *s* made safe for use inside a quoted XML attribute value.

    The five characters & ' < > " are replaced by their predefined XML
    entities, and the konstruktikon markers [NI] / [DNI] are rewritten with
    parentheses.
    """
    # '&' must be handled first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped again.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('[NI]', '(NI)')  # special for konstruktikon
    s = s.replace('[DNI]', '(DNI)')  # special for konstruktikon
    return s.replace('"', '&quot;')
def escapeContent(s):
    """Return *s* with & < > replaced by their predefined XML entities.

    Unlike escape(), quote characters are left untouched. The konstruktikon
    markers [NI] / [DNI] are rewritten with parentheses.
    """
    # '&' must be handled first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped again.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('[NI]', '(NI)')  # special for konstruktikon
    s = s.replace('[DNI]', '(DNI)')  # special for konstruktikon
    return s
#############################
# INPUT #
#############################
def handle_example(example):
    """Flatten an example element into a list of part tuples.

    Parts are emitted in document order:

    * ("/freetext", text)            -- non-empty text between child elements
    * ("/leaf", attrib, text)        -- a child with no direct <e> children
    * ("/branch", attrib, sub_parts) -- a child with direct <e> children,
                                        converted recursively

    All text is whitespace-stripped; whitespace-only free text is dropped.
    """
    example_parts = []

    first_text = (example.text or "").strip()
    if first_text:
        example_parts.append(("/freetext", first_text))

    for part in example:
        if part.findall("e"):
            example_parts.append(("/branch", part.attrib, handle_example(part)))
        else:
            # An empty element such as <e name="x"/> has text None; treat it
            # as an empty leaf instead of failing on None.strip().
            example_parts.append(("/leaf", part.attrib, (part.text or "").strip()))

        tail_text = (part.tail or "").strip()
        if tail_text:
            example_parts.append(("/freetext", tail_text))

    return example_parts
def take(entry, tagname, resort):
    """Return the stripped text of the first *tagname* child of *entry*.

    *resort* is returned instead when there is no such child or when the
    child has no text at all.
    """
    child = entry.find(tagname)
    if child is None:
        return resort
    text = child.text
    return resort if text is None else text.strip()
def list_to_dict(alist):
    """Build a dict from a sequence of items, mapping item[0] to item[1].

    When a key occurs more than once the last occurrence wins.
    """
    return {pair[0]: pair[1] for pair in alist}
if __name__ == '__main__':
    # Optional command line: [mode [language]].  Both values are only stored;
    # nothing in this block reads them back.
    args = sys.argv[1:]
    mode = 1 if args and args[0] == "simplified" else 0
    # language: 0 = english, 1 = anything else.  A missing second argument
    # falls back to english (the no-argument default), so a lone mode argument
    # does not raise IndexError.
    language = 1 if len(args) > 1 and args[1] != "english" else 0

    tree = ElementTree()
    tree.parse("../konstruktikon-data/constructicon.xml")
    lmf = LMF('swe')

    for entry in tree.findall("entry"):
        if entry.tag == TODO:
            continue

        lexical_entry = LexicalEntry()
        the_id = entry.get("{http://www.w3.org/XML/1998/namespace}id")
        sense = Sense("konstruktikon--" + the_id)
        lexical_entry.add_sense(sense)

        # Plain text children that map one-to-one onto sense features.
        for tag in ("type", "cat", "inheritance", "evokes"):
            value = take(entry, tag, "")
            if value != "":
                sense.add_feature(Feature(tag, value))

        # Definition: free text interleaved with annotated child elements.
        definition = entry.find("definition")
        def_parts = []
        if definition is not None:
            first_text = (definition.text or "").strip()
            if first_text != "":
                def_parts.append(("/freetext", first_text))
            for atom in definition:
                if atom.tag == TODO:
                    continue
                # Children without text contribute no leaf, but their tail
                # text is still kept.
                if atom.text is not None:
                    def_parts.append(("/leaf", list_to_dict(atom.items()), atom.text.strip()))
                tail_text = (atom.tail or "").strip()
                if tail_text != "":
                    def_parts.append(("/freetext", tail_text))
        if def_parts:
            sense.add_konst_definition(KonstDefinition(def_parts))

        structure = take(entry, "structure", "")
        if structure != "":
            sense.add_feature(Feature("structure", structure))

        saldo_link_set = set()

        # str.split() on an empty string yields nothing, so no separate
        # emptiness check is needed.
        for cp in take(entry, "cee", "").split():
            sense.add_feature(Feature("cee", cp))
            # Also add a saldo-link (if it is a saldo sense id)
            if cp.count(".") == 2:
                saldo_link_set.add(cp)

        coll = take(entry, "coll", "")
        if coll != "":
            # first take all of this kind: {apa: bpa}
            coll_parts = re.findall(r'\{.*?\}', coll)
            # then delete them and keep the remaining whitespace-separated
            # tokens (split() never returns empty or unstripped tokens)
            coll_parts.extend(re.sub(r'\{.*?\}', "", coll).split())
            for cp in coll_parts:
                sense.add_feature(Feature("coll", cp))
                saldo_link_set.update(re.findall(r'\w+\.\.\d+', cp, re.U))

        # Sorted so the generated output does not depend on set iteration order.
        for sl in sorted(saldo_link_set):
            lexical_entry.add_feature(Feature("saldoLink", sl))

        c_e = entry.find("construction_elements")
        if c_e is not None:
            element_groups = (
                ("internal", KonstInternalConstructionElement, sense.add_int_const_elem),
                ("external", KonstExternalConstructionElement, sense.add_ext_const_elem),
            )
            for group_tag, element_class, add_element in element_groups:
                group = c_e.find(group_tag)
                if group is None:
                    continue
                for el in group:
                    # Only <role> and <cat> children are converted.  TODO
                    # placeholders and any other tag are skipped, so that no
                    # None element ends up in the sense.
                    if el.tag in ("role", "cat"):
                        add_element(element_class(el.tag, list_to_dict(el.items())))

        # An entry without an <examples> child simply has no examples.
        examples = entry.find("examples")
        if examples is not None:
            for example in examples:
                if example.tag == TODO:
                    continue
                sense.add_konst_example(KonstExample(handle_example(example)))

        for tag in ("comment", "reference"):
            value = take(entry, tag, "")
            if value != "":
                sense.add_feature(Feature(tag, value))

        lmf.add_lexical_entry(lexical_entry)

    # Same bytes as the former print statement (text plus a newline), written
    # in a form that also parses under Python 3 syntax.
    sys.stdout.write(unicode(lmf).encode("utf-8") + "\n")