#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert the tab-separated ``swefn-db.csv`` frame table into LMF-style XML.

Run as a script, the module reads ``swefn-db.csv`` from the current
directory, builds one ``LexicalEntry`` holding one ``Sense`` per data row and
writes the resulting document to standard output encoded as UTF-8.

NOTE(review): in the reviewed copy of this file every XML tag inside a string
literal was empty and the entity names in ``escape``/``escapeContent`` were
missing, apparently lost in a markup-stripping step.  The entity names are
restored from the characters they replace.  The tag names are reconstructed
from LMF ``feat``/``att``/``val`` conventions and from the ``e``/``text``/
``name``/``n`` keys used by the functions below -- confirm every tag against
the DTD that consumers of this output expect.
"""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
from pprint import PrettyPrinter

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3: str is already the unicode text type

TODO = "todo"


def _to_text(obj):
    """Return the text form of *obj*, preferring its ``__unicode__`` method.

    The model classes below only define ``__unicode__``, which Python 3 never
    calls implicitly, so rendering goes through this helper on both versions.
    """
    render = getattr(obj, "__unicode__", None)
    if render is not None:
        return render()
    return text_type(obj)


#############################
#          OUTPUT           #
#############################

def recursivelyConvertIndexedStructure(obj):
    """Merge the ``e`` and ``text`` children of *obj* into one ordered list.

    *obj* is a dict that may hold an ``"e"`` and/or a ``"text"`` entry, each
    either a single child dict or a list of child dicts carrying an ``"n"``
    index.  The children are wrapped as ``{"t": "e"|"text", "item": child}``
    and sorted by ``n``.  Every non-empty child is converted recursively and,
    when that yields anything, the result is stored on the child itself under
    ``"children"`` (the input structure is mutated).

    Returns the ordered list of wrappers.
    """
    def as_list(value):
        return value if isinstance(value, list) else [value]

    elements = as_list(obj["e"]) if "e" in obj else []
    texts = as_list(obj["text"]) if "text" in obj else []

    res = [{"t": "e", "item": child} for child in elements]
    res += [{"t": "text", "item": child} for child in texts]
    # A key function works on Python 2 and 3 alike; the positional comparison
    # function used before is rejected by Python 3's list.sort().
    res.sort(key=lambda entry: entry["item"]["n"])

    for entry in res:
        if entry["item"]:
            children = recursivelyConvertIndexedStructure(entry["item"])
            if children:
                entry["item"]["children"] = children
    return res


def indexedStructureSort(first, second):
    """Old-style comparison function ordering two wrappers by their ``n``."""
    return first["item"]["n"] - second["item"]["n"]


def branchToXML(structure):
    """Serialize a list produced by ``recursivelyConvertIndexedStructure``.

    Text wrappers are emitted with their surrounding whitespace stripped and
    skipped when nothing is left.  Element wrappers are emitted with their
    converted children when they have a ``"children"`` entry, otherwise with
    their ``"#text"`` when present, otherwise not at all.  The running
    position in *structure* is written as ``n``.

    NOTE(review): the tag literals were empty in the reviewed copy while the
    position counter and the element name were computed but unused; the
    ``text``/``e`` tags with ``n``/``name`` attributes are a reconstruction.
    Text content is inserted as-is (the script escapes it beforehand).
    """
    out = []
    for position, obj in enumerate(structure):
        item = obj["item"]
        if obj["t"] == "text":
            text = item["#text"].strip()
            if text != "":
                out.append('<text n="%d">%s</text>' % (position, text))
        else:  # e
            label = escape(item["name"])
            if "children" in item:
                content = branchToXML(item["children"])
            elif "#text" in item:
                content = item["#text"]
            else:
                continue
            out.append('<e n="%d" name="%s">%s</e>'
                       % (position, label, content))
    return "".join(out)


def recReconstruct(arr):
    """Split a list of AST nodes into indexed text and element records.

    Each node in *arr* is a dict with a ``"type"`` entry; ``"text"`` nodes
    become ``{"#text": ..., "n": index}``, all other nodes become
    ``{"name": ..., "n": index}`` where ``index`` is the position in *arr*.
    An element whose only child is a single text node gets that text as
    ``"#text"``; otherwise its children are attached under ``"text"`` and
    ``"e"`` (each only when non-empty).

    Returns ``[texts, elements]``.
    """
    texts = []
    elements = []
    for index, node in enumerate(arr):
        if node["type"] == "text":
            texts.append({"#text": node["text"], "n": index})
            continue
        record = {"name": node["name"], "n": index}
        elements.append(record)
        child_texts, child_elements = recReconstruct(node["children"])
        if len(child_texts) == 1 and not child_elements:
            record["#text"] = child_texts[0]["#text"]
        else:
            if child_texts:
                record["text"] = child_texts
            if child_elements:
                record["e"] = child_elements
    return [texts, elements]


def markupStringToBranch(instr):
    """Parse ``[content]Name`` markup into an indexed ``text``/``e`` dict.

    ``[`` opens an element and ``]`` closes the innermost open one; the
    characters directly after ``]`` up to the next space or bracket are the
    element's name.  Everything else is text belonging to the innermost open
    element, or to the top level when none is open.

    A ``]`` without a name closes the element with an empty name (previously
    such an element was never closed, so it and all text after it were
    dropped).  A ``]`` with nothing open is ignored together with its name.
    Elements still open at the end of the input are dropped.

    Returns a dict with a ``"text"`` and/or an ``"e"`` entry as produced by
    ``recReconstruct``; either key is omitted when its list is empty.
    """
    output = []
    open_elements = []
    length = len(instr)
    pos = 0
    # Index-based scan: one pass, no O(n) pops from the front of a list.
    while pos < length:
        char = instr[pos]
        if char == "[":
            open_elements.append({"type": None, "name": "", "children": []})
            pos += 1
            continue

        end = pos + 1
        if char == "]":
            while end < length and instr[end] not in " []":
                end += 1
            if open_elements:
                node = open_elements.pop()
                node["name"] = instr[pos + 1:end]
            else:
                node = None
        else:
            while end < length and instr[end] not in "[]":
                end += 1
            node = {"type": "text", "name": "", "text": instr[pos:end]}
        pos = end

        if node is not None:
            if open_elements:
                open_elements[-1]["children"].append(node)
            else:
                output.append(node)

    texts, elements = recReconstruct(output)
    outObj = {}
    if texts:
        outObj["text"] = texts
    if elements:
        outObj["e"] = elements
    return outObj


def print_as_new(tree, indent=0):
    """Render *tree* as text, one line per nesting level.

    Text items of ``tree["items"]`` are concatenated; nested dict items are
    rendered recursively one level deeper.  The result is prefixed with a
    newline and *indent* tab characters.
    """
    output = ""
    for item in tree["items"] or ():
        if isinstance(item, text_type):
            output += item
        elif isinstance(item, dict):
            output += print_as_new(item, indent + 1)
    return "\n" + "\t" * indent + output


def print_as_markup(tree, indent=0):
    """Render *tree* as bracket markup with role labels moved to the end.

    Nested dict items are rendered recursively inside ``[`` and ``]``.  Text
    at the top level (``indent == 0``) is copied unchanged.  Inside a bracket
    a text item whose first word is upper-case is taken to start with a role
    label: when more words follow, they are emitted followed by `` ~LABEL``;
    when the label stands alone it is remembered and appended as `` ~LABEL``
    after all other items of this level.  Whitespace-only text inside a
    bracket is dropped.
    """
    output = ""
    if not tree["items"]:
        return output

    last = ""
    for item in tree["items"]:
        if isinstance(item, dict):
            output += "[" + print_as_markup(item, indent + 1) + "]"
            continue
        if not isinstance(item, text_type):
            continue
        if indent == 0:
            output += item
            continue
        words = item.split()
        if not words:
            continue
        if not words[0].isupper():
            output += item
        elif len(words) <= 1:
            last = item
        else:
            output += " ".join(words[1:]) + " ~" + words[0]
    if last != "":
        output += " ~" + last.strip()
    return output


class LMF:
    """Top-level document: a language code plus a list of lexical entries."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()

    def add_lexical_entry(self, lexical_entry):
        """Append *lexical_entry* and record its ``_pos._wf`` key."""
        self.lexical_entries.append(lexical_entry)
        self._lexical_entries_set.add(
            ".".join([lexical_entry._pos, lexical_entry._wf]))

    def __unicode__(self):
        # NOTE(review): all literals below were empty (or a single space) in
        # the reviewed copy, and the ``% self.lang`` line had no placeholder.
        # The header is reconstructed as a conventional LMF document; confirm
        # the DOCTYPE and element names before relying on them.
        return "\n".join([
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE LexicalResource SYSTEM "DTD_LMF_REV_16.dtd">',
            '<LexicalResource dtdVersion="16">',
            '<GlobalInformation>',
            ' <feat att="languageCoding" val="ISO 639-3"/>',
            '</GlobalInformation>',
            '<Lexicon>',
            ' <feat att="language" val="%s"/>' % self.lang,
            "\n".join([_to_text(e) for e in self.lexical_entries]),
            '</Lexicon>',
            '</LexicalResource>'])


class LexicalEntry:
    """One lexical entry: its senses followed by its own features."""

    def __init__(self):
        self.features = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""

    def add_sense(self, sense):
        self.senses.append(sense)

    def __unicode__(self):
        # NOTE(review): tag literals reconstructed (see module docstring);
        # in particular the second line was a single empty literal whose
        # original content is unknown -- ``<Lemma/>`` is a guess.
        le_string = '<LexicalEntry>'
        if self.idattr:
            le_string = '<LexicalEntry id="%s">' % escape(self.idattr)
        return "\n".join([
            le_string,
            '<Lemma/>',
            '\n'.join([_to_text(s) for s in self.senses]),
            '\n'.join([_to_text(f) for f in self.features]),
            '</LexicalEntry>'])

    def add_feature(self, feature):
        self.features.append(feature)


class Feature:
    """An attribute/value pair; the value is escaped when rendered."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        # NOTE(review): reconstructed; the reviewed literal had no
        # placeholders for its two arguments and raised TypeError.
        return '<feat att="%s" val="%s"/>' % (self.att, escape(self.val))


class Sense:
    """A sense with relations, examples and features, rendered in that order."""

    def __init__(self, sense):
        self.sense = sense
        self.relations = []
        self.sense_examples = []
        self.konst_examples = []
        self.features = []

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def add_konst_example(self, konst_example):
        self.konst_examples.append(konst_example)

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): tag literals reconstructed (see module docstring).
        return "\n".join([
            '<Sense id="%s">' % escape(self.sense),
            "\n".join([_to_text(rel) for rel in self.relations]),
            "\n".join([_to_text(ex) for ex in self.sense_examples]),
            "\n".join([_to_text(ke) for ke in self.konst_examples]),
            '\n'.join([_to_text(f) for f in self.features]),
            '</Sense>'])


class KonstExample:
    """An annotated example; ``example_parts`` is its ready-made inner XML."""

    def __init__(self, example_parts):
        self.example_parts = example_parts

    def __unicode__(self):
        # NOTE(review): the wrapper tag was lost in the reviewed copy (only
        # the ``%s`` survived); ``example`` is a placeholder name to confirm.
        return '<example>%s</example>' % self.example_parts


def escape(s):
    """Escape *s* for use in XML text or in a quoted attribute value.

    ``&`` is replaced first so that the entities produced by the later
    replacements are not escaped a second time.
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


def escapeContent(s):
    """Escape *s* for use as XML element content (quotes are left alone)."""
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s


#############################
#           INPUT           #
#############################

def read_csv_file(filepath, num_of_fields, tolerates=-1):
    """Yield the data rows of a UTF-8, tab-separated file as lists of fields.

    The first line is skipped.  A row with exactly *num_of_fields* fields is
    yielded as is; a row with at least *tolerates* but fewer than
    *num_of_fields* fields is padded with empty strings.  All other rows are
    skipped.  ``tolerates=-1`` (the default) means no short rows are accepted.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    with codecs.open(filepath, encoding='utf-8') as handle:
        header_pending = True
        for line in handle:
            if header_pending:
                header_pending = False
                continue
            # Strip only the line terminator: slicing off the last character
            # unconditionally ate real data on a final line without newline.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) == num_of_fields:
                yield fields
            elif tolerates <= len(fields) < num_of_fields:
                yield fields + [""] * (num_of_fields - len(fields))


def list_to_dict(alist):
    """Build a dict from ``(key, value, ...)`` sequences; later keys win."""
    return dict((item[0], item[1]) for item in alist)


def _add_frame_elements(sense, fe_list, feature_name, fe_table):
    """Add one *feature_name* feature per comma-separated frame element.

    An entry of the form ``Name (Abbr)`` is added under ``Name`` and the
    mapping ``Abbr -> Name`` is recorded in *fe_table*.  An entry without a
    parenthesised abbreviation is added under its full text; it used to be
    appended as a bare string, which ended up unescaped in the output.
    """
    for entry in fe_list.split(","):
        entry = entry.strip()
        if entry == "":
            continue
        words = entry.split()
        if len(words) == 2:
            name = words[0]
            # Drops the first and last character of the second word (the
            # surrounding parentheses in the ``Name (Abbr)`` form).
            fe_table[words[1][1:-1]] = name
        elif "(" in entry:
            pieces = entry.split("(")
            name = pieces[0].strip()
            fe_table[pieces[1].strip()[:-1]] = name
        else:
            name = entry
        sense.add_feature(Feature(feature_name, name))


def _split_brackets(example):
    """Split *example* into ``[``, ``]`` and the text runs between them.

    Returns ``None`` when the brackets do not nest properly, including a
    ``]`` that appears before its ``[``.
    """
    tokens = []
    level = 0
    start = 0
    for pos, char in enumerate(example):
        if char not in "[]":
            continue
        if pos > start:
            tokens.append(example[start:pos])
        start = pos + 1
        tokens.append(char)
        if char == "[":
            level += 1
        else:
            level -= 1
            if level < 0:
                return None
    if start < len(example):
        tokens.append(example[start:])
    return tokens if level == 0 else None


def _build_tree(tokens):
    """Nest properly bracketed *tokens* into ``{"parent", "items"}`` dicts."""
    root = {"parent": None, "items": []}
    node = root
    for token in tokens:
        if token == '[':
            child = {"parent": node, "items": []}
            node["items"].append(child)
            node = child
        elif token == ']':
            node = node["parent"]
        else:
            node["items"].append(token)
    return root


def _example_to_xml(example, fe_table):
    """Convert one bracket-annotated example into its inner XML.

    Role labels are replaced by their *fe_table* entry when there is one.
    Returns ``None`` when the brackets in *example* do not nest properly.
    """
    example = escape(example.strip())
    # Separate punctuation from a closing bracket so that it is not read as
    # the start of the element name.
    example = re.sub(r'](\.|\?|\!|,)', r'] \1', example)
    tokens = _split_brackets(example)
    if tokens is None:
        return None
    markup = print_as_markup(_build_tree(tokens))

    def expand_role(match):
        return "]" + fe_table.get(match.group(1), match.group(1))

    new_markup = re.sub(r'\s\~(\S*)]', expand_role, markup)
    return branchToXML(recursivelyConvertIndexedStructure(
        markupStringToBranch(new_markup)))


def main():
    """Read ``swefn-db.csv`` and write the LMF document to standard output."""
    lmf = LMF('swe')
    for (fr, st, d, fr_example, fe_core_list, fe_noncore_list, fe_cmp_list,
         fe_cmp_example, lus, lus_new, notes, created_by, createdate,
         modifdate) in read_csv_file("swefn-db.csv", num_of_fields=14):
        le = LexicalEntry()
        s = Sense(u"swefn--" + fr)
        le.add_sense(s)
        s.add_feature(Feature(u"BFNID", fr))
        if st.strip() != "":
            s.add_feature(Feature(u"semanticType", st))
        if d.strip() != "":
            s.add_feature(Feature(u"domain", d))

        # Abbreviation -> frame element name, used to expand example labels.
        local_fe_table = {"LU": "LU"}
        _add_frame_elements(s, fe_core_list, u"coreElement", local_fe_table)
        _add_frame_elements(s, fe_noncore_list, u"peripheralElement",
                            local_fe_table)

        if fe_cmp_list.strip() != "":
            for compound in fe_cmp_list.split(","):
                s.add_feature(Feature(u"compound", compound.strip()))
        if fe_cmp_example.strip() != "":
            for compound_example in fe_cmp_example.split(";;"):
                s.add_feature(Feature(u"compoundExample",
                                      compound_example.strip()))

        if lus.strip() != "":
            for lu_category in lus.split(";;"):
                if ":" in lu_category:
                    lu_category = lu_category.split(":")[1]
                for lu in lu_category.split(","):
                    lu = lu.strip()
                    if lu != "":
                        s.add_feature(Feature(u"LU", lu))
                        le.add_feature(Feature(u"saldoLink", lu))

        if fr_example.strip() != "":
            for example in fr_example.split(";;"):
                example_xml = _example_to_xml(example, local_fe_table)
                # Examples with mismatched brackets are skipped.
                if example_xml is not None:
                    s.add_konst_example(KonstExample(example_xml))

        if lus_new.strip() != "":
            for new_lu_category in lus_new.split(";;"):
                if ":" in new_lu_category:
                    new_lu_category = new_lu_category.split(":")[1]
                for new_lu in new_lu_category.split(","):
                    new_lu = new_lu.strip()
                    if new_lu != "":
                        s.add_feature(Feature(u"suggestionForLU", new_lu))

        if notes.strip() != "":
            # TODO: Some notes are split with ;; -- how should that be
            # handled, analogously to konstruktikon?
            s.add_feature(Feature(u"comment", notes))
        s.add_feature(Feature(u"createdBy", created_by))
        s.add_feature(Feature(u"createdDate", createdate))
        s.add_feature(Feature(u"modifDate", modifdate))
        lmf.add_lexical_entry(le)

    data = lmf.__unicode__().encode("utf-8") + b"\n"
    # Python 3 needs the binary layer of stdout for bytes; Python 2 has none.
    getattr(sys.stdout, "buffer", sys.stdout).write(data)


def structify(input):
    """Recursively split *input* at its outermost bracket groups.

    Returns ``[before, inside, after]`` for the first group, with every part
    that still contains ``[`` split the same way.  When *input* holds no
    complete group it is returned unchanged (this used to raise TypeError).
    """
    parts = join_parts(input)
    if parts is None:
        return input
    for i, part in enumerate(parts):
        if "[" in part:
            parts[i] = structify(part)
    return parts


def join_parts(input):
    """Split *input* around its first complete top-level bracket group.

    Returns ``[before, inside, after]`` without the two brackets themselves,
    or ``None`` when no group opens and closes.
    """
    depth = 0
    starting = None
    for index, element in enumerate(input):
        if element == '[':
            if depth == 0:
                starting = index
            depth += 1
        elif element == ']':
            depth -= 1
            if depth == 0 and starting is not None:
                return [input[:starting],
                        input[starting + 1:index],
                        input[index + 1:]]
    return None


def conv(input):
    """Move a leading one-letter label out of the first bracket group.

    ``"x [A foo bar] y"`` becomes ``"x [foo bar]A y"``.  A group that does
    not start with a one-character word is left as it is (its content used
    to be dropped, and an empty group raised IndexError).  Returns ``None``
    when *input* holds no complete group.
    """
    parts = join_parts(input)
    if parts is None:
        return None
    before, inside, after = parts
    words = inside.split()
    if words and len(words[0]) == 1:
        label = words[0]
        inside = " ".join(words[1:])
    else:
        label = ""
    return before + "[" + inside + "]" + label + after


if __name__ == '__main__':
    main()