#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert the Lexin dictionary XML files into an LMF lexicon.

When run as a script this module reads

* ``lexikon/saldom/saldom.xml`` (SALDO morphology, see ``read_saldom``),
* ``orig_lexin/<language>/*.xml`` (translations, see ``read_languages``),
* ``orig_lexin/svenska4/swe_swe.xml`` (the Swedish Lexin articles),

and writes the resulting LMF document to stdout encoded as UTF-8.

NOTE(review): in the revision this rewrite is based on, every XML markup
literal inside the ``__unicode__`` methods and the entity strings inside
``escape``/``escapeContent`` had been reduced to empty strings (several of
them were still applied to ``%`` formatting, which raises ``TypeError``).
They were reconstructed from the class and attribute names using the usual
LMF ``<feat att=... val=... />`` convention.  Verify element and attribute
names against the target DTD before publishing output.
"""

import sys
import re
import codecs
import os
import copy
import glob
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

try:  # C accelerated parser where it still exists
    from xml.etree import cElementTree as _iter_etree
except ImportError:  # cElementTree was removed in Python 3.9
    from xml.etree import ElementTree as _iter_etree

try:
    unicode
except NameError:  # Python 3: the text type is simply ``str``
    unicode = str


# (language name used in feature names, folder holding that language's XML)
LEXIN_LANGUAGES = [
    ("russian", "orig_lexin/ryska"),
    ("albanian", "orig_lexin/albanska"),
    ("arabic", "orig_lexin/arabic"),
    ("bosnic", "orig_lexin/bosniska"),
    ("english", "orig_lexin/engelska"),
    ("finnish", "orig_lexin/finska"),
    ("greek", "orig_lexin/grekiska"),
    ("croatian", "orig_lexin/kroatiska"),
    ("northKurdish", "orig_lexin/nordkurdiska"),
    ("farsi", "orig_lexin/persiska"),
    ("serbian", "orig_lexin/serbiska"),
    ("serbianCyrillic", "orig_lexin/serbiska_kyrillisk"),
    ("somali", "orig_lexin/somaliska"),
    ("spanish", "orig_lexin/spanska"),
    ("southKurdish", "orig_lexin/sydkurdiska"),
    ("turkish", "orig_lexin/turkiska"),
]  # TODO: add the others as well

SWEDISH_LEXIN_PATH = "orig_lexin/svenska4/swe_swe.xml"

# NOTE(review): the original script built one dict per <Idiom> but never
# appended it to the list that is later turned into idiom entries, so no
# idiom entries were ever emitted.  That behaviour is kept by default; set
# this to True to emit them.
EMIT_IDIOM_ENTRIES = False


def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    The file is parsed incrementally and the tree is cleared after each
    ``LexicalEntry`` so memory use stays bounded.

    :param xml: path of the XML file (default
        ``lexikon/saldom/saldom.xml``).
    :returns: dict mapping ``(gf, pos)`` to the list of texts of all
        ``saldo`` children, in file order, e.g.
        ``{(gf, pos): [saldo..1, saldo..2, ...]}``.
    """
    # The "start" event is only needed to get hold of the root element.
    context = iter(_iter_etree.iterparse(xml, events=("start", "end")))
    event, root = next(context)
    possibilities = {}
    for event, elem in context:
        if event != "end":
            continue
        if elem.tag == 'LexicalEntry':
            key = (elem.findtext("gf"), elem.findtext("pos"))
            senses = possibilities.setdefault(key, [])
            senses.extend(sitem.text for sitem in elem.findall("saldo"))
        # Done parsing section. Clear tree to save memory
        if elem.tag in ('LexicalEntry', 'frame', 'resFrame'):
            root.clear()
    return possibilities


def _paired_texts(base_lang, target_lang, tag):
    """Pair each target ``tag`` element with the first base one sharing its ID.

    Returns a list of ``(base_text, target_text)`` tuples in target order;
    target elements without a matching base element are skipped.
    """
    if base_lang is None:
        return []
    first_by_id = {}
    for base in base_lang.findall(tag):
        first_by_id.setdefault(base.get("ID"), base)
    pairs = []
    for target in target_lang.findall(tag):
        base = first_by_id.get(target.get("ID"))
        if base is not None:
            pairs.append((base.text, target.text))
    return pairs


def read_languages(languages=None):
    """Collect the translations found in the per-language Lexin files.

    :param languages: iterable of ``(language_name, folder_path)`` pairs;
        every ``*.xml`` file in each folder is read.  Defaults to
        ``LEXIN_LANGUAGES``.
    :returns: dict mapping a ``Word``'s ``VariantID`` attribute to a list of
        ``(language_name, item)`` tuples where ``item`` has the keys
        ``trans`` (text or None), ``comment`` (``()`` or
        ``(swedish, translated)``), ``examples``, ``compounds``, ``idioms``
        (lists of ``(swedish, translated)``) and ``synonyms`` (list of
        text).

    ``Word`` elements without ``VariantID`` are skipped silently; those
    without ``TargetLang`` are skipped with a message on stderr.
    """
    if languages is None:
        languages = LEXIN_LANGUAGES
    output = {}
    for lang_name, folder_path in languages:
        # sorted() only makes the result independent of directory order
        for infile in sorted(glob.glob(os.path.join(folder_path, '*.xml'))):
            tree = ElementTree()
            tree.parse(infile)
            for word in tree.findall("Word"):
                key = word.get("VariantID")
                if key is None:
                    continue
                target_lang = word.find("TargetLang")
                if target_lang is None:
                    # stdout carries the generated XML, so diagnostics must
                    # not be written there.
                    sys.stderr.write("missing TargetLang\n")
                    continue
                base_lang = word.find("BaseLang")

                item = {"comment": (), "trans": None}
                translation = target_lang.find("Translation")
                if translation is not None:
                    item["trans"] = translation.text
                comment = target_lang.find("Comment")
                if comment is not None:
                    swe_comment = None
                    if base_lang is not None:
                        swe_comment = base_lang.findtext("Comment")
                    item["comment"] = (swe_comment, comment.text)
                item["examples"] = _paired_texts(
                    base_lang, target_lang, "Example")
                item["compounds"] = _paired_texts(
                    base_lang, target_lang, "Compound")
                item["idioms"] = _paired_texts(
                    base_lang, target_lang, "Idiom")
                item["synonyms"] = [
                    synonym.text
                    for synonym in target_lang.findall("Synonym")]

                output.setdefault(key, []).append((lang_name, item))
    return output


#############################
#           OUTPUT          #
#############################

class _Node(object):
    """Base of the LMF model classes: routes ``str()`` to ``__unicode__``."""

    def __str__(self):
        text = self.__unicode__()
        if str is bytes:  # Python 2: str() must return bytes
            return text.encode("utf-8")
        return text


class _FeatureHolder(_Node):
    """Shared behaviour of every element that owns a ``features`` list."""

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        """Add ``feature`` unless an equal (att, val) pair is present."""
        if any(existing.att == feature.att and existing.val == feature.val
               for existing in self.features):
            return
        self.add_feature(feature)

    def _features_xml(self):
        return "\n".join(unicode(f) for f in self.features)


class LMF(_Node):
    """Top-level LMF document for one language."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()
        self.semantic_predicates = []

    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # An ugly hack to speed things up a bit (replace with something
        # better in the future)
        self._lexical_entries_set.add(
            ".".join([lexical_entry._pos, lexical_entry._wf]))

    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)

    def __unicode__(self):
        return "\n".join([
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE LexicalResource SYSTEM "DTD_LMF_REV_16.dtd">',
            '<LexicalResource dtdVersion="16">',
            '<GlobalInformation>',
            ' <feat att="languageCoding" val="ISO 639-3" />',
            '</GlobalInformation>',
            '<Lexicon>',
            ' <feat att="language" val="%s" />' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            "\n".join([unicode(s) for s in self.semantic_predicates]),
            '</Lexicon>',
            '</LexicalResource>'])


class LexicalEntry(_FeatureHolder):
    """One entry: features, a lemma, word forms, senses and SALDO links."""

    def __init__(self):
        self.features = []
        self.lemma = None
        self.wordforms = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""
        self.saldoLinks = []

    def add_sense(self, sense):
        self.senses.append(sense)

    def add_wordform(self, wordform):
        self.wordforms.append(wordform)

    def add_saldoLink(self, sense):
        self.saldoLinks.append(sense)

    def __unicode__(self):
        if self.idattr:
            opening = '<LexicalEntry id="%s">' % (self.idattr)
        else:
            opening = '<LexicalEntry>'
        return "\n".join([
            opening,
            self._features_xml(),
            unicode(self.lemma),
            '\n'.join([unicode(w) for w in self.wordforms]),
            '\n'.join([unicode(s) for s in self.senses]),
            '\n'.join(['<saldoLink ref="%s" />' % escape(se)
                       for se in self.saldoLinks]),
            '</LexicalEntry>'])


class Lemma(_FeatureHolder):
    """Lemma; only its form representations are serialised."""

    def __init__(self):
        self.form_representations = []
        # now including writtenForm and partOfSpeech!
        self.features = []

    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)

    def __unicode__(self):
        if not (self.features or self.form_representations):
            return ''
        return "\n".join([
            '<Lemma>',
            '\n'.join(unicode(fr) for fr in self.form_representations),
            '</Lemma>'])


class WordForm(_FeatureHolder):
    """An inflected form described by a list of features."""

    def __init__(self):
        self.features = []

    def __unicode__(self):
        return "\n".join(['<WordForm>', self._features_xml(), '</WordForm>'])


class FormRepresentation(_FeatureHolder):
    """Feature container attached to a ``Lemma``."""

    def __init__(self):
        self.features = []

    def __unicode__(self):
        if not self.features:
            return ''
        return "\n".join([
            '<FormRepresentation>',
            self._features_xml(),
            '</FormRepresentation>'])


class Feature(_Node):
    """Attribute/value pair; the value is XML-escaped on output."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        return '<feat att="%s" val="%s" />' % (self.att, escape(self.val))


class Sense(_FeatureHolder):
    """A sense with relations, predicates, examples and features."""

    def __init__(self, sense):
        self.sense = sense
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def __unicode__(self):
        has_content = (self.relations or self.predicative_representations
                       or self.sense_examples or self.features)
        if not has_content:
            return '<Sense id="%s" />' % (self.sense)
        return "\n".join([
            '<Sense id="%s">' % (self.sense),
            "\n".join([unicode(pre)
                       for pre in self.predicative_representations]),
            "\n".join([unicode(rel) for rel in self.relations]),
            "\n".join([unicode(ex) for ex in self.sense_examples]),
            self._features_xml(),
            '</Sense>'])


class SenseRelation(_FeatureHolder):
    """Relation from a sense to ``target`` labelled by ``relation_types``."""

    def __init__(self, target, relation_types):
        self.target = target
        self.relation_types = relation_types
        self.features = []

    def __unicode__(self):
        return "\n".join([
            '<SenseRelation targets="%s">' % (self.target),
            '\n'.join(['<feat att="label" val="%s" />' % t
                       for t in self.relation_types]),
            self._features_xml(),
            '</SenseRelation>'])


class SenseExample(_FeatureHolder):
    """Example text plus features (type, translations, ...)."""

    def __init__(self, example):
        self.example = example
        self.features = []

    def __unicode__(self):
        return "\n".join([
            '<SenseExample>',
            '<feat att="text" val="%s" />' % (escape(self.example)),
            self._features_xml(),
            '</SenseExample>'])


class SemanticPredicate(_FeatureHolder):
    """Predicate with semantic types, features and arguments.

    A non-empty ``domain`` is stored as a ``domain`` feature.
    """

    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        if domain is not None and domain != "":
            self.add_feature(Feature("domain", domain))

    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)

    def generateFeatures(self, att, vals):
        """Add one ``att`` feature per value, stripped of whitespace."""
        for val in vals:
            self.add_feature(Feature(att, val.strip()))

    def __unicode__(self):
        return "\n".join([
            '<SemanticPredicate id="%s">' % (self.id),
            "\n".join(['<feat att="semanticType" val="%s" />\n' % (st)
                       for st in self.semantic_types]),
            "\n".join([unicode(fe) for fe in self.features]),
            "\n".join([unicode(sa) for sa in self.semantic_arguments]),
            '</SemanticPredicate>'])


class SemanticArgument(_Node):
    """Argument of a semantic predicate (role and core type)."""

    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type

    def __unicode__(self):
        return ('<SemanticArgument semanticRole="%s" coreType="%s" />'
                % (self.semantic_role, self.core_type))


class PredicativeRepresentation(_Node):
    """Reference from a sense to a semantic predicate id."""

    def __init__(self, idref):
        self.idref = idref

    def __unicode__(self):
        return ('<PredicativeRepresentation predicate="%s" '
                'correspondences="%s" />' % (self.idref, self.idref))


def escape(s):
    """Escape ``s`` for use inside an XML attribute value.

    ``None`` (an element without text or a missing attribute) is treated
    as the empty string instead of raising.
    """
    if s is None:
        return ''
    # '&' has to be replaced first so later entities are not re-escaped.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


def escapeContent(s):
    """Escape ``s`` for use as XML element content (quotes are kept)."""
    if s is None:
        return ''
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s


#############################
#           INPUT           #
#############################

def take(entry, tagname, resort):
    """Return the stripped text of child ``tagname``, or ``resort``.

    ``resort`` is returned when the child is missing or has no text.
    """
    child = entry.find(tagname)
    if child is None or child.text is None:
        return resort
    return child.text.strip()


def list_to_dict(alist):
    """Build a dict from the first two members of every item in ``alist``."""
    return dict((item[0], item[1]) for item in alist)


posHash = {
    u"subst.": u"nn",
    u"prep.": u"pp",
    u"pron.": u"pn",
    u"förk.": u"nna",
    u"subst. plural": u"nn",
    u"verb": u"vb",
    u"subst., ingen böjning": u"nn",
    u"adj.": u"av",
    u"adv.": u"ab",
    u"interj.": u"in",
    u"räkn.": u"nl",
    u"förled": u"",
    u"namn": u"pm",
    u"adj., ingen böjning": u"av",
    u"subst. bestämd form singular": u"nn",
    u"konj.": u"kn",
    u"obestämd artikel": u"al",
    u"bestämd artikel": u"al",
}

# Used instead of posHash when the word contains a space.
multiPosHash = {
    u"subst.": u"nnm",
    u"prep.": u"ppm",
    u"pron.": u"pnm",
    u"förk.": u"nna",
    u"subst. plural": u"nnm",
    u"verb": u"vbm",
    u"subst., ingen böjning": u"nnm",
    u"adj.": u"avm",
    u"adv.": u"abm",
    u"interj.": u"inm",
    u"räkn.": u"nlm",
    u"förled": u"",
    u"namn": u"pmm",
    u"adj., ingen böjning": u"avm",
    u"subst. bestämd form singular": u"nnm",
    u"konj.": u"knm",
}

lexinInflectionToMsd = {
    u"best.f.sing.": u"sg def nom",
    u"obest.f.pl.": u"pl indef nom",
    u"best.f.pl.": u"pl def nom",
    u"tform": u"pos indef sg n nom",  # ?
    u"aform": u"pos indef pl nom",  # ?
    u"infinitiv": u"inf aktiv",
    u"imperfekt": u"pret ind aktiv",
    u"supinum": u"sup aktiv",
    u"perf.part.": u"",
    u"imperativ": u"imper",
}

# Extra information carried by some Lexin part-of-speech labels.
_POS_EXTRA = {
    u"subst. plural": u"plural",
    u"subst., ingen böjning": u"oböjl",
    u"adj., ingen böjning": u"oböjl",
    u"subst. bestämd form singular": u"sing best",
    u"förled": u"förled",
}


def lexinPosToSaldoPos(inpos, inword):
    """Map a Lexin part-of-speech label to ``(saldo_pos, extra)``.

    Words containing a space are looked up in ``multiPosHash``, all others
    in ``posHash``; ``saldo_pos`` is None for unknown labels.  ``extra`` is
    the entry of ``_POS_EXTRA`` for the label, or None.
    """
    table = multiPosHash if (inword and " " in inword) else posHash
    return (table.get(inpos, None), _POS_EXTRA.get(inpos))


_REFERENCE_FEATURES = {
    "see": "see",
    "compare": "compareWith",
    "antonym": "antonym",
}


def _add_references(holder, parent, allowed):
    """Turn ``Reference`` children of ``parent`` into features on ``holder``.

    Only reference types listed in ``allowed`` are converted.
    """
    for reference in parent.findall("Reference"):
        reftype = reference.get("Type")
        if reftype in allowed:
            holder.add_feature(Feature(_REFERENCE_FEATURES[reftype],
                                       reference.get("Value")))


def _build_form_entry(lexin_lemma, pos, saldom):
    """Create a LexicalEntry holding the form data of ``lexin_lemma``.

    Returns ``(entry, written_form)``.
    """
    raw_form = lexin_lemma.get("Value")
    lexin_pos = lexin_lemma.get("Type")

    entry = LexicalEntry()
    lemma = Lemma()
    entry.lemma = lemma
    fr = FormRepresentation()
    lemma.add_form_representation(fr)
    fr.add_feature(Feature("rawForm", raw_form))
    if lexin_pos is not None:
        fr.add_feature(Feature("lexinPartOfSpeech", lexin_pos))
    if pos is not None:
        fr.add_feature(Feature("partOfSpeech", pos))

    infinitive = None
    for inflection in lexin_lemma.findall("Inflection"):
        wordform = WordForm()
        text = inflection.text
        wordform.add_feature(Feature("writtenForm", text))
        form = inflection.get("Form")
        if form is not None:
            if form == "infinitiv":
                infinitive = text
            wordform.add_feature(
                Feature("msd", lexinInflectionToMsd.get(form, "")))
            wordform.add_feature(Feature("lexinForm", form))
        spec = inflection.get("Spec")
        if spec is not None:
            wordform.add_feature(Feature("lexinSpec", spec))
        entry.add_wordform(wordform)

    if pos in ("vb", "vbm") and infinitive is not None:
        # we have to find the infinitive since Lexin is based on present
        # tense
        written_form = infinitive
    else:
        written_form = (raw_form or "").replace("|", "")
    fr.add_feature(Feature("writtenForm", written_form))

    if pos is not None and written_form is not None:
        for saldo_sense in saldom.get((written_form, pos), []):
            entry.add_saldoLink(saldo_sense)  # fix new format!

    phonetic = lexin_lemma.find("Phonetic")
    if phonetic is not None:
        fr.add_feature(Feature("phoneticForm", phonetic.text))
    for att, attribute in (("hyphenatedForm", "Hyphenate"),
                           ("rank", "Rank"),
                           ("lexinID", "ID"),
                           ("lexinVariant", "Variant")):
        value = lexin_lemma.get(attribute)
        if value:  # skips both a missing and an empty attribute
            fr.add_feature(Feature(att, value))

    _add_references(entry, lexin_lemma, ("see", "compare"))
    return entry, written_form


def _sense_example(element, example_type, langdata, item_key,
                   strip_pipes=False):
    """Build a SenseExample for ``element`` with its translations.

    For every language the first ``item[item_key]`` pair whose Swedish text
    equals the element text (both stripped) contributes a
    ``<language>Text`` feature.
    """
    example = SenseExample(element.text)
    example.add_feature(Feature("type", example_type))
    wanted = (element.text or "").strip()
    if strip_pipes:
        wanted = wanted.replace("|", "")
    for lang, item in langdata:
        for swe_text, translated in item[item_key]:
            if translated is None or swe_text is None:
                continue
            if swe_text.strip() == wanted:
                example.add_feature(Feature(lang + "Text", translated))
                break
    return example


def _collect_idiom(lexin_idiom, langdata, main_word, lexin_id):
    """Describe one ``Idiom`` element as a plain dict."""
    idiom = {"gf": lexin_idiom.text}
    definition = lexin_idiom.find("Definition")
    if definition is not None:
        idiom["definition"] = definition.text
    wanted = (lexin_idiom.text or "").strip()
    for lang, item in langdata:
        for swe_idiom, tr_idiom in item["idioms"]:
            if tr_idiom is None or swe_idiom is None:
                continue
            # the Swedish side may carry a parenthesised remark
            if swe_idiom.split(" (")[0].strip() == wanted:
                idiom.setdefault("translations", []).append((lang, tr_idiom))
    idiom["mainword"] = main_word
    idiom["lexinID"] = lexin_id
    return idiom


def _idiom_entry(idiom):
    """Build the LexicalEntry for a dict made by ``_collect_idiom``."""
    entry = LexicalEntry()
    entry.add_feature(Feature("entryType", "idiom"))
    lemma = Lemma()
    entry.lemma = lemma
    fr = FormRepresentation()
    lemma.add_form_representation(fr)
    fr.add_feature(Feature("writtenForm", idiom["gf"]))
    # check to see if there needs to be more replacements later
    sense = Sense((idiom["gf"] or "").replace(" ", "_"))
    entry.add_sense(sense)
    fr.add_feature(Feature("lexinID", idiom["lexinID"]))
    fr.add_feature(Feature("mainWord", idiom["mainword"]))
    if "definition" in idiom:
        sense.add_feature(Feature("definition", idiom["definition"]))
    return entry


def _build_sense(sense_id, lexin_lexeme, langdata):
    """Create the Sense for one ``Lexeme`` element."""
    sense = Sense(sense_id)
    sense.add_feature(
        Feature("lexinVariantID", lexin_lexeme.get("VariantID", "")))
    lexeme_number = lexin_lexeme.get("Lexemeno")
    if not (lexeme_number and lexeme_number.isdigit()):
        lexeme_number = "1"
    sense.add_feature(Feature("lexinLexemeNumber", lexeme_number))
    if lexin_lexeme.get("ID") is not None:
        sense.add_feature(Feature("lexinID", lexin_lexeme.get("ID")))
    definition = lexin_lexeme.find("Definition")
    if definition is not None:
        sense.add_feature(Feature("definition", definition.text))

    for lexin_example in lexin_lexeme.findall("Example"):
        sense.add_sense_example(
            _sense_example(lexin_example, "example", langdata, "examples"))
    for lexin_compound in lexin_lexeme.findall("Compound"):
        sense.add_sense_example(
            _sense_example(lexin_compound, "compound", langdata,
                           "compounds", strip_pipes=True))

    for lexin_comment in lexin_lexeme.findall("Comment"):
        comment_type = lexin_comment.get("Type")
        if comment_type == "style":
            sense.add_feature(Feature("usg", lexin_comment.text))
        elif comment_type == "def":
            sense.add_feature(Feature("desc", lexin_comment.text))

    _add_references(sense, lexin_lexeme, ("see", "compare", "antonym"))

    for gramcom in lexin_lexeme.findall("Gramcom"):
        sense.add_feature(Feature("gram", gramcom.text))
    for graminfo in lexin_lexeme.findall("Graminfo"):
        sense.add_feature(Feature("gram", graminfo.text))
    for lexin_theme in lexin_lexeme.findall("Theme"):
        themes = lexin_theme.get("Tema")
        if themes is None:
            continue
        for theme_string in themes.split(","):
            sense.add_feature(
                Feature("lexinTheme", theme_string.split(":")[0].strip()))

    for lang, item in langdata:
        if item["trans"] is not None:
            sense.add_feature(Feature(lang + "Translation", item["trans"]))
        for syn in item["synonyms"] or ():
            sense.add_feature(Feature(lang + "Synonyme", syn))
    return sense


def _copy_cycle_details(cycle, sense):
    """Add the comments, compounds, examples and graminfo of ``cycle``."""
    for com in cycle.findall("Comment"):
        sense.add_feature(Feature("comment", com.text))
    for tag, example_type in (("Compound", "compound"),
                              ("Example", "example")):
        for element in cycle.findall(tag):
            example = SenseExample(element.text)
            example.add_feature(Feature("type", example_type))
            sense.add_sense_example(example)
    for g in cycle.findall("Graminfo"):
        sense.add_feature(Feature("gram", g.text))


def main():
    """Build the Swedish Lexin LMF lexicon and write it to stdout."""
    saldom = read_saldom()
    extra_languages = read_languages()
    tree = ElementTree()
    tree.parse(SWEDISH_LEXIN_PATH)
    lmf = LMF('swe')
    used_senses = {}  # base form -> number of sense ids handed out so far
    idioms = []

    for article in tree.findall("Article"):
        lexin_lemma = article.find("Lemma")
        if lexin_lemma is None:
            continue
        raw_form = lexin_lemma.get("Value")
        lexin_pos = lexin_lemma.get("Type")
        pos = None
        if lexin_pos is not None:
            pos = lexinPosToSaldoPos(lexin_pos, raw_form)[0]
        baseform = (raw_form or "").replace("|", "")
        baseform = baseform.replace(" ", "_").replace("/", "__")

        for lexin_lexeme in lexin_lemma.findall("Lexeme"):
            entry, written_form = _build_form_entry(lexin_lemma, pos, saldom)
            # save the form data for later (entries created from cycles)
            template = copy.deepcopy(entry)

            used_senses[baseform] = used_senses.get(baseform, 0) + 1
            sense_id = "lexin--" + baseform + ".." + str(used_senses[baseform])
            langdata = extra_languages.get(
                lexin_lexeme.get("VariantID", ""), [])
            sense = _build_sense(sense_id, lexin_lexeme, langdata)

            if EMIT_IDIOM_ENTRIES:
                for lexin_idiom in lexin_lexeme.findall("Idiom"):
                    idioms.append(_collect_idiom(
                        lexin_idiom, langdata, written_form,
                        lexin_lemma.get("ID")))

            entry.add_sense(sense)

            # add the cycles if there are any
            # if the cycle has a definition we "guess" that it's a new sense
            # and make a new LE; if it's not we simply flatten the features
            # into the current sense/LE
            cycle_entries = []
            for cycle in lexin_lexeme.findall("Cycle"):
                definition = cycle.find("Definition")
                if definition is None:
                    _copy_cycle_details(cycle, sense)
                    continue
                used_senses[baseform] += 1
                cycle_entry = copy.deepcopy(template)
                cycle_sense = Sense("lexin--" + baseform + ".."
                                    + str(used_senses[baseform]))
                cycle_entry.add_sense(cycle_sense)
                cycle_entry.add_feature(Feature("entryType", "cycle"))
                cycle_sense.add_feature(
                    Feature("definition", definition.text))
                _copy_cycle_details(cycle, cycle_sense)
                cycle_entries.append(cycle_entry)

            entry.add_feature(Feature("entryType", "lexinEntry"))
            lmf.add_lexical_entry(entry)
            for cycle_entry in cycle_entries:
                lmf.add_lexical_entry(cycle_entry)

    for idiom in idioms:
        lmf.add_lexical_entry(_idiom_entry(idiom))

    # Write bytes so the encoding does not depend on the terminal settings.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(unicode(lmf).encode("utf-8") + b"\n")


if __name__ == '__main__':
    main()