#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert the Lexin Swedish dictionary (swe_swe.xml) into LMF XML.

Reads SALDO's morphological lexicon (saldom.xml) to attach saldoLink
features to Lexin entries, then prints the resulting LMF lexicon on
stdout, UTF-8 encoded.

NOTE(review): this file reached review mangled — collapsed onto a few
physical lines, with every XML template string stripped of its markup
(``escape()`` replaced ``'&'`` with ``'&'``, a no-op) and ``str``
search-replaced into ``unicode`` inside identifiers (``.strip()`` had
become ``.unicodeip()``).  The XML templates below are reconstructions
following the standard LMF layout (LexicalResource / GlobalInformation
/ Lexicon / feat elements) — confirm against a known-good LMF export.
"""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring


#############################
#          OUTPUT           #
#############################

def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    Returns a dict mapping (gf, pos) -> [saldo sense id, ...].
    """
    import xml.etree.cElementTree as cet
    # "start" events are needed so we can grab a reference to the root
    # element up front and clear it periodically to bound memory use.
    context = cet.iterparse(xml, events=("start", "end"))
    context = iter(context)
    event, root = next(context)

    possibilities = {}  # { (gf, pos): [saldo..1, saldo..2, ...] }
    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                pos = elem.findtext("pos")
                gf = elem.findtext("gf")
                if not (gf, pos) in possibilities:
                    possibilities[(gf, pos)] = []
                for sitem in elem.findall("saldo"):
                    possibilities[(gf, pos)].append(sitem.text)
            # Done parsing section. Clear tree to save memory.
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()
    return possibilities


class LMF:
    """Top-level container for the LMF lexicon being built."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()
        self.semantic_predicates = []

    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # Quick-lookup hack to speed things up a bit (replace with
        # something better in the future).  [translated from Swedish]
        self._lexical_entries_set.add(
            ".".join([lexical_entry._pos, lexical_entry._wf]))

    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)

    def __unicode__(self):
        # NOTE(review): header/footer template reconstructed.
        return "\n".join([
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<LexicalResource dtdVersion="16">',
            '<GlobalInformation>',
            '<feat att="languageCoding" val="ISO 639-3" />',
            '</GlobalInformation>',
            '<Lexicon>',
            '<feat att="language" val="%s" />' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            "\n".join([unicode(s) for s in self.semantic_predicates]),
            '</Lexicon>',
            '</LexicalResource>'])


class LexicalEntry:
    """One LMF LexicalEntry: features, a lemma, wordforms and senses."""

    def __init__(self):
        self.features = []
        self.lemma = None
        self.wordforms = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""

    def add_sense(self, sense):
        self.senses.append(sense)

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip exact (att, val) duplicates.
        for existing_feature in self.features:
            if (existing_feature.att == feature.att
                    and existing_feature.val == feature.val):
                return
        self.add_feature(feature)

    def add_wordform(self, wordform):
        self.wordforms.append(wordform)

    def __unicode__(self):
        le_string = '<LexicalEntry>'
        if self.idattr:
            le_string = '<LexicalEntry id="%s">' % (self.idattr)
        return "\n".join([
            le_string,
            '\n'.join([unicode(f) for f in self.features]),
            unicode(self.lemma),
            '\n'.join([unicode(w) for w in self.wordforms]),
            '\n'.join([unicode(s) for s in self.senses]),
            '</LexicalEntry>'])


class Lemma:
    """LMF Lemma, serialized through its FormRepresentations."""

    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip exact (att, val) duplicates.
        for existing_feature in self.features:
            if (existing_feature.att == feature.att
                    and existing_feature.val == feature.val):
                return
        self.add_feature(feature)

    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)

    def __unicode__(self):
        # Only FormRepresentations are emitted; the features list is kept
        # for API compatibility but not serialized (as in the original).
        if self.features or self.form_representations:
            return "\n".join(
                ['<Lemma>',
                 '\n'.join(unicode(fr) for fr in self.form_representations),
                 '</Lemma>'])
        else:
            return ''


class WordForm:
    """LMF WordForm: a bag of feat elements."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        return "\n".join(
            ['<WordForm>',
             '\n'.join([unicode(f) for f in self.features]),
             '</WordForm>'])


class FormRepresentation:
    """LMF FormRepresentation: a bag of feat elements."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip exact (att, val) duplicates.
        for existing_feature in self.features:
            if (existing_feature.att == feature.att
                    and existing_feature.val == feature.val):
                return
        self.add_feature(feature)

    def __unicode__(self):
        if self.features:
            return "\n".join(
                ['<FormRepresentation>',
                 '\n'.join([unicode(f) for f in self.features]),
                 '</FormRepresentation>'])
        else:
            return ''


class Feature:
    """An LMF feat element: an attribute/value pair."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        # Only the value is escaped; att names are generated internally.
        return '<feat att="%s" val="%s" />' % (self.att, escape(self.val))


class Sense:
    """LMF Sense with optional relations, examples and features."""

    def __init__(self, sense):
        self.sense = sense
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def __unicode__(self):
        # Self-closing element when the sense carries nothing but its id.
        if (not self.relations and not self.predicative_representations
                and not self.sense_examples and not self.features):
            return '<Sense id="%s" />' % (self.sense)
        else:
            return "\n".join(
                ['<Sense id="%s">' % (self.sense),
                 "\n".join([unicode(pre)
                            for pre in self.predicative_representations]),
                 "\n".join([unicode(rel) for rel in self.relations]),
                 "\n".join([unicode(ex) for ex in self.sense_examples]),
                 "\n".join([unicode(f) for f in self.features]),
                 '</Sense>'])


class SenseRelation:
    """LMF SenseRelation pointing at a target sense."""

    def __init__(self, target, relation_types):
        self.target = target
        self.relation_types = relation_types
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): per-relation-type template reconstructed as a
        # "label" feat — confirm against a known-good LMF export.
        return "\n".join(
            ['<SenseRelation targets="%s">' % (self.target),
             '\n'.join(['<feat att="label" val="%s" />' % t
                        for t in self.relation_types]),
             '\n'.join([unicode(f) for f in self.features]),
             '</SenseRelation>'])


class SenseExample:
    """LMF SenseExample: example text plus type features."""

    def __init__(self, example):
        self.example = example
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): example-text template reconstructed as a "text"
        # feat — confirm against a known-good LMF export.
        return "\n".join(
            ['<SenseExample>',
             '<feat att="text" val="%s" />' % (escape(self.example)),
             "\n".join([unicode(f) for f in self.features]),
             '</SenseExample>'])


class SemanticPredicate:
    """LMF SemanticPredicate with semantic types, arguments, features."""

    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        # A non-empty domain is stored as an ordinary feature.
        if domain != None and domain != "":
            self.add_feature(Feature("domain", domain))

    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)

    def add_feature(self, feature):
        self.features.append(feature)

    def generateFeatures(self, att, vals):
        """Add one feature per value, whitespace-stripped."""
        for val in vals:
            self.add_feature(Feature(att, val.strip()))

    def __unicode__(self):
        # (A dead "extras" accumulator that was built but never emitted
        # has been removed here.)
        return "\n".join(
            ['<SemanticPredicate id="%s">' % (self.id),
             "\n".join(['<feat att="semanticType" val="%s" />' % (st)
                        for st in self.semantic_types]),
             "\n".join([unicode(fe) for fe in self.features]),
             "\n".join([unicode(sa) for sa in self.semantic_arguments]),
             '</SemanticPredicate>'])


class SemanticArgument:
    """LMF SemanticArgument: a role/core-type pair."""

    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type

    def __unicode__(self):
        return ('<SemanticArgument semanticRole="%s" coreType="%s" />'
                % (self.semantic_role, self.core_type))


class PredicativeRepresentation:
    """LMF PredicativeRepresentation referring to a predicate id."""

    def __init__(self, idref):
        self.idref = idref

    def __unicode__(self):
        # The original formatted idref twice — kept as predicate + uri.
        return ('<PredicativeRepresentation predicate="%s" uri="%s" />'
                % (self.idref, self.idref))


def escape(s):
    """Escape a string for use inside an XML attribute value."""
    s = s.replace('&', '&amp;')  # must run first, before entities are added
    s = s.replace("'", '&#39;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


def escapeContent(s):
    """Escape a string for use as XML text content."""
    s = s.replace('&', '&amp;')  # must run first
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s


#############################
#           INPUT           #
#############################

def take(entry, tagname, resort):
    """Return the stripped text of entry's first `tagname` child.

    Falls back to `resort` when the child is missing or has no text.
    """
    e2 = entry.find(tagname)
    if e2 != None:
        t = e2.text
        if t == None:
            return resort
        else:
            return t.strip()
    else:
        return resort


def list_to_dict(alist):
    """Turn a list of (key, value) pairs into a dict."""
    result = {}
    for item in alist:
        result[item[0]] = item[1]
    return result


# Lexin part-of-speech label -> SALDO pos tag (single-word entries).
posHash = {
    u"subst.": u"nn",
    u"prep.": u"pp",
    u"pron.": u"pn",
    u"förk.": u"nna",
    u"subst. plural": u"nn",
    u"verb": u"vb",
    u"subst., ingen böjning": u"nn",
    u"adj.": u"av",
    u"adv.": u"ab",
    u"interj.": u"in",
    u"räkn.": u"nl",
    u"förled": u"",
    u"namn": u"pm",
    u"adj., ingen böjning": u"av",
    u"subst. bestämd form singular": u"nn",
    u"konj.": u"kn"
}

# Lexin part-of-speech label -> SALDO pos tag (multi-word entries).
multiPosHash = {
    u"subst.": u"nnm",
    u"prep.": u"ppm",
    u"pron.": u"pnm",
    u"förk.": u"nna",
    u"subst. plural": u"nnm",
    u"verb": u"vbm",
    u"subst., ingen böjning": u"nnm",
    u"adj.": u"avm",
    u"adv.": u"abm",
    u"interj.": u"inm",
    u"räkn.": u"nlm",
    u"förled": u"",
    u"namn": u"pmm",
    u"adj., ingen böjning": u"avm",
    u"subst. bestämd form singular": u"nnm",
    u"konj.": u"knm"
}

# Lexin inflection form label -> SALDO-style msd string.
lexinInflectionToMsd = {
    u"best.f.sing.": u"sg def nom",
    u"obest.f.pl.": u"pl indef nom",
    u"best.f.pl.": u"pl def nom",
    u"tform": u"pos indef sg n nom",   # ?
    u"aform": u"pos indef pl nom",     # ?
    u"infinitiv": u"inf aktiv",
    u"imperfekt": u"pret ind aktiv",
    u"supinum": u"sup aktiv",
    u"perf.part.": u"",
    u"imperativ": u"imper"
}


def lexinPosToSaldoPos(inpos, inword):
    """Map a Lexin pos label to (saldo_pos, extra).

    Multi-word entries (a space in `inword`) use the multi-word tag
    set. `extra` carries inflectional notes (plural, indeclinable,
    definite singular, prefix) or None; `saldo_pos` is None when the
    label is unknown.
    """
    if " " in inword:
        pos = multiPosHash.get(inpos, None)
    else:
        pos = posHash.get(inpos, None)
    if inpos in [u"subst. plural"]:
        extra = u"plural"
    elif inpos in [u"subst., ingen böjning", u"adj., ingen böjning"]:
        extra = u"oböjl"
    elif inpos in [u"subst. bestämd form singular"]:
        extra = u"sing best"
    elif inpos in [u"förled"]:
        extra = u"förled"
    else:
        extra = None
    return (pos, extra)


if __name__ == '__main__':
    saldom = read_saldom()

    tree = ElementTree()
    tree.parse("orig_lexin/svenska4/swe_swe.xml")

    lmf = LMF('swe')
    usedSensesCount = {}  # baseform -> number of senses emitted so far

    for entry in tree.findall("Article"):
        e = LexicalEntry()
        lexin_lemma = entry.find("Lemma")

        lemma = Lemma()
        e.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("rawForm", lexin_lemma.get("Value")))

        lexin_pos = lexin_lemma.get("Type")
        pos = None
        if lexin_pos != None:
            fr.add_feature(Feature("lexinPartOfSpeech", lexin_pos))
            pos = lexinPosToSaldoPos(lexin_pos, lexin_lemma.get("Value"))[0]
            if pos != None:
                fr.add_feature(Feature("partOfSpeech", pos))

        # Collect inflected wordforms; remember the infinitive, if any.
        infinitiv = None
        for lexin_inflection in lexin_lemma.findall("Inflection"):
            wordform = WordForm()
            lexin_text = lexin_inflection.text
            wordform.add_feature(Feature("writtenForm", lexin_text))
            lexin_form = lexin_inflection.get("Form")
            if lexin_form != None:
                if lexin_form == "infinitiv":
                    infinitiv = lexin_text
                msd = lexinInflectionToMsd.get(lexin_form, "")
                wordform.add_feature(Feature("msd", msd))
                wordform.add_feature(Feature("lexinForm", lexin_form))
            if lexin_inflection.get("Spec") != None:
                wordform.add_feature(
                    Feature("lexinSpec", lexin_inflection.get("Spec")))
            e.add_wordform(wordform)

        writtenForm = None
        if (pos == "vb" or pos == "vbm") and infinitiv != None:
            # We have to find the infinitive since Lexin is based on
            # present tense.
            writtenForm = infinitiv
        else:
            writtenForm = lexin_lemma.get("Value").replace("|", "")
        fr.add_feature(Feature("writtenForm", writtenForm))

        # Link to every SALDO sense matching (writtenForm, pos).
        if pos != None and writtenForm != None:
            possible_saldo_senses = saldom.get((writtenForm, pos), [])
            for pss in possible_saldo_senses:
                e.add_feature(Feature("saldoLink", pss))

        lexin_phonetic = lexin_lemma.find("Phonetic")
        if lexin_phonetic != None:
            fr.add_feature(Feature("phoneticForm", lexin_phonetic.text))
        if lexin_lemma.get("Hyphenate", "") != "":
            fr.add_feature(
                Feature("hyphenatedForm", lexin_lemma.get("Hyphenate")))
        if lexin_lemma.get("Rank", "") != "":
            fr.add_feature(Feature("rank", lexin_lemma.get("Rank")))
        if lexin_lemma.get("ID", "") != "":
            fr.add_feature(Feature("lexinID", lexin_lemma.get("ID")))
        if lexin_lemma.get("Variant", "") != "":
            fr.add_feature(Feature("lexinVariant", lexin_lemma.get("Variant")))

        for lexin_reference in lexin_lemma.findall("Reference"):
            reftype = lexin_reference.get("Type")
            if reftype == "see":
                e.add_feature(Feature("see", lexin_reference.get("Value")))
            elif reftype == "compare":
                e.add_feature(
                    Feature("compareWith", lexin_reference.get("Value")))

        for lexin_lexeme in lexin_lemma.findall("Lexeme"):
            baseform = lexin_lemma.get("Value").replace("|", "")
            # Sense ids are numbered per baseform across the whole run.
            index = usedSensesCount.get(baseform, 0) + 1
            sense = Sense("lexin--" + baseform + ".." + str(index))
            lexin_lexemeno = lexin_lexeme.get("Lexemeno")
            lexin_variantID = lexin_lemma.get("VariantID", "")
            sense.add_feature(Feature("lexinVariantID", lexin_variantID))
            if not lexin_lexemeno.isdigit():
                lexin_lexemeno = "1"
            sense.add_feature(Feature("lexinLexemeNumber", lexin_lexemeno))
            if baseform in usedSensesCount:
                usedSensesCount[baseform] = usedSensesCount[baseform] + 1
            else:
                usedSensesCount[baseform] = 1
            if lexin_lexeme.get("ID", None) != None:
                sense.add_feature(Feature("lexinID", lexin_lexeme.get("ID")))
            if lexin_lexeme.find("Definition") != None:
                sense.add_feature(
                    Feature("definition",
                            lexin_lexeme.find("Definition").text))
            for lexin_example in lexin_lexeme.findall("Example"):
                sense_example = SenseExample(lexin_example.text)
                sense_example.add_feature(Feature("type", "example"))
                sense.add_sense_example(sense_example)
            for lexin_idiom in lexin_lexeme.findall("Idiom"):
                sense_example = SenseExample(lexin_idiom.text)
                sense_example.add_feature(Feature("type", "idiom"))
                sense.add_sense_example(sense_example)
            for lexin_compound in lexin_lexeme.findall("Compound"):
                sense_example = SenseExample(lexin_compound.text)
                sense_example.add_feature(Feature("type", "compound"))
                sense.add_sense_example(sense_example)
            for lexin_comment in lexin_lexeme.findall("Comment"):
                comment_type = lexin_comment.get("Type")
                if comment_type == "style":
                    sense.add_feature(Feature("usg", lexin_comment.text))
                elif comment_type == "def":
                    sense.add_feature(Feature("desc", lexin_comment.text))
            for lexin_reference in lexin_lexeme.findall("Reference"):
                reftype = lexin_reference.get("Type")
                if reftype == "see":
                    sense.add_feature(
                        Feature("see", lexin_reference.get("Value")))
                elif reftype == "compare":
                    sense.add_feature(
                        Feature("compareWith", lexin_reference.get("Value")))
                elif reftype == "antonym":
                    sense.add_feature(
                        Feature("antonym", lexin_reference.get("Value")))
            for lexin_gramcom in lexin_lexeme.findall("Gramcom"):
                sense.add_feature(Feature("gram", lexin_gramcom.text))
            for lexin_graminfo in lexin_lexeme.findall("Graminfo"):
                sense.add_feature(Feature("gram", lexin_graminfo.text))
            for lexin_theme in lexin_lexeme.findall("Theme"):
                for theme_string in lexin_theme.get("Tema").split(","):
                    sense.add_feature(
                        Feature("lexinTheme",
                                theme_string.split(":")[0].strip()))
            e.add_sense(sense)

        lmf.add_lexical_entry(e)

    print(unicode(lmf).encode("utf-8"))