#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Render the SweFN lexicon in ``swefn.xml`` as a single HTML page.

Usage::

    script.py [MODE [LANGUAGE]]

``MODE`` equal to ``simplified`` selects mode 1 (anything else: mode 0) and
``LANGUAGE`` equal to ``english`` selects language 0 (anything else: 1).
The page is written to standard output encoded as UTF-8.  ``saldo.txt``
(tab separated, id in the first column, part of speech in the sixth)
supplies the part of speech used to group lexical units.

The code deliberately avoids Python-3-only syntax so that it keeps running
on the Python 2.7 interpreter the script was written for.
"""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

TODO = "todo"
KARP = "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}"
URL = "swefn.xml"

# SALDO sense id -> part of speech; filled by main().
saldo_dict = {}

# Run-time switches set by main().  They get defaults here so the classes
# can also be used when this module is imported instead of executed.
mode = 0
language = 0  # 0 = english, 1 = swedish

# Text type of the running interpreter (``unicode`` on 2.x, ``str`` on 3.x).
_TEXT = type(u"")
_STRING_TYPES = (type(u""), type(""))

# "word..N" sense identifiers.  cond() additionally accepts hyphens in the
# word part; linked_cond() and ConstructionElement do not.
_SENSE_RE = re.compile(r'\w*\.\.\d+', re.UNICODE)
_SENSE_HYPHEN_RE = re.compile(r'[\w-]*\.\.\d+', re.UNICODE)

# ---------------------------------------------------------------------------
# HTML templates
#
# NOTE(review): in the copy of this file that was reviewed, the markup inside
# every template literal had been stripped -- only the %s placeholders and
# the visible text were left, so several templates had fewer placeholders
# than arguments and raised TypeError.  The tags below are a minimal
# reconstruction that matches each call site's argument count.  The original
# tag names, attributes, link targets and stylesheet hooks are unknown:
# confirm against the deployed page before relying on the exact markup.
# ---------------------------------------------------------------------------
_PAGE_HEAD = [
    '<!DOCTYPE html>',
    '<html>',
    '<head>',
    '<title>Utvecklingsversion av SweFN</title>',
    '<meta charset="utf-8"/>',
    '</head>',
    '<body>',
]
_PAGE_TAIL = ['</body>', '</html>']
_HEADING = '<h2><a href="#%s">%s</a></h2>'                      # id, name
_HEADING_BFN = '<h2><a href="#%s" data-bfnid="%s">%s</a></h2>'  # id, bid, name
_TABLE_OPEN = '<table id="%s">'                                 # id
_TABLE_CLOSE = '</table>'
_ROW = '<tr><th>%s</th><td>%s</td></tr>'                        # label, value
_LINK_ROW = '<tr><th>%s</th><td><a href="#%s">%s</a></td></tr>'
_EXAMPLES_ROW = '<tr><th>exempel</th><td><ul>%s</ul></td></tr>'
_EXAMPLE_ITEM = '<li>%s</li>'
_POS_LABEL = '<b>%s</b>'
_LINE_BREAK = '<br/>'
_SENSE = '%s<sup>%s</sup>'                                      # word, number
_ELEMENT_NAME = '<sub>%s</sub>'

# <feat att="..."> values that accumulate into list attributes of an entry.
_LIST_FEATS = {
    "semanticType": "cemtypes",
    "domain": "domains",
    "coreElement": "c_elements",
    "peripheralElement": "p_elements",
    "compound": "comps",
    "compoundExample": "comp_examples",
    "LU": "lus",
    "suggestionForLU": "lu_suggestions",
}

# <feat att="..."> values stored as a single string attribute of an entry.
# "internal_comment" is intentionally absent: it is read but not rendered.
_SCALAR_FEATS = {
    "inheritance": "inheritance",
    "comment": "comment",
    "createdBy": "created_by",
    "createdDate": "createdDate",
    "modifDate": "modifDate",
    "entry_status": "status",
    "BFNID": "bid",
}


def _as_text(value):
    """Return *value* unchanged if it is a string, else its text form."""
    if isinstance(value, _STRING_TYPES):
        return value
    return _TEXT(value)


def _element_name(attrib):
    """Return the markup for the ``name`` attribute of an example element."""
    return _ELEMENT_NAME % escape(attrib.get("name", ""))


def _bracketed(text, attrib):
    """Return ``[text]`` followed by the element-name markup."""
    return '[' + text + ']' + _element_name(attrib)


class HTML:
    """Collects entries and renders them as one HTML document."""

    def __init__(self):
        self.entries = []

    def __str__(self):
        # Entries are emitted in id order; "or ''" keeps the sort usable on
        # Python 3 if an entry was given no id at all.
        ordered = sorted(self.entries, key=lambda entry: entry.id or "")
        body = "\n".join([u"%s" % (entry,) for entry in ordered])
        return "\n".join(_PAGE_HEAD + [body] + _PAGE_TAIL)

    def add_entry(self, entry):
        """Append *entry* (any object with an ``id`` and a text form)."""
        self.entries.append(entry)


class SweFNEntry:
    """One frame of the lexicon and its HTML rendering."""

    def __init__(self):
        self.inheritance = ""
        self.lus = []
        self.lu_suggestions = []
        self.comps = []
        self.comp_examples = []
        self.domains = []
        self.cemtypes = []
        self.c_elements = []
        self.p_elements = []
        self.examples = []
        self.created_by = ""
        self.comment = None
        self.status = ""
        self.id = ""
        self.bid = ""
        self.modifications = []  # "YYYY-MM-DD" strings
        self.createdDate = ""
        self.modifDate = ""

    def add_example(self, contents):
        """Wrap the parsed example parts in an Example and store it."""
        self.examples.append(Example(contents))

    def set_comment(self, contents):
        """Store the parsed comment parts as an Example."""
        self.comment = Example(contents)

    def _created_and_modified(self):
        """Return the (created, modified) date strings to display."""
        first_modif = ""
        latest_modif = ""
        if self.modifications:
            # min()/max() return the first of equal candidates, the same
            # choice the strict </> comparisons of a manual scan would make.
            first_modif = min(self.modifications, key=date_string_to_int)
            latest_modif = max(self.modifications, key=date_string_to_int)
        created = self.createdDate or first_modif
        modified = latest_modif or self.modifDate or self.createdDate
        return created, modified

    def _format_lus(self):
        """Return the lexical units grouped by SALDO part of speech."""
        by_pos = {}
        for lu in self.lus:
            # NOTE(review): the lookup uses the HTML-escaped LU, so an id
            # containing &, <, >, ' or " will not be found in saldo_dict.
            by_pos.setdefault(saldo_dict.get(lu, u"okänd"), []).append(lu)
        # Sorted so the output does not depend on dict iteration order.
        lines = [_POS_LABEL % pos + " " + " ".join(by_pos[pos])
                 for pos in sorted(by_pos)]
        return _LINE_BREAK.join(lines)

    def __str__(self):
        # The English and Swedish label sets of this script both resolve to
        # the Swedish word for the only label that is used, so the global
        # ``language`` switch has no visible effect here.
        l_comment = u"kommentar"

        examples = ""
        if self.examples:
            items = [_EXAMPLE_ITEM % (example,) for example in self.examples]
            examples = _EXAMPLES_ROW % "".join(items)

        created, modified = self._created_and_modified()

        entry_id = self.id or ""
        # Ids look like "prefix--Name"; fall back to the whole id instead of
        # raising IndexError when the separator is missing.
        id_parts = entry_id.split("--")
        name = id_parts[1] if len(id_parts) > 1 else entry_id
        if self.bid != "":
            header = _HEADING_BFN % (escape(entry_id), self.bid, escape(name))
        else:
            header = _HEADING % (escape(entry_id), escape(name))

        return "\n".join([
            header,
            _TABLE_OPEN % escape(entry_id),
            cond(u"domän", ", ".join(self.domains)),
            cond(u"semantisk typ", ", ".join(self.cemtypes)),
            cond(u"kärnelement", " ".join(self.c_elements)),
            cond(u"periferielement", " ".join(self.p_elements)),
            linked_cond("arv", self.inheritance),
            examples,
            cond(u"sms", " ".join(self.comps)),
            cond(u"sms-exempel", " ".join(self.comp_examples)),
            cond(u"lus", self._format_lus()),
            cond(u"lu-förslag", " ".join(self.lu_suggestions)),
            cond(l_comment, self.comment),
            cond("skapad av", self.created_by),
            # The two date rows are always emitted, even when empty.
            _ROW % (u'skapad', created),
            _ROW % (u'ändrad', modified),
            _TABLE_CLOSE])


def linked_cond(label, item):
    """Return a table row linking to *item*, or '' if *item* is empty/None."""
    if item is None or item == "":
        return ''
    target = _as_text(item)
    # The link target keeps the raw value so that no sense markup ends up
    # inside the attribute; only the visible text is decorated.
    return _LINK_ROW % (label, target, _SENSE_RE.sub(sensify, target))


def cond(label, item):
    """Return a label/value table row, or '' if *item* is empty/None.

    *item* may be a string or any object with a text form (for instance the
    Example stored by ``SweFNEntry.set_comment``).
    """
    if item is None or item == "":
        return ''
    return _ROW % (label, _SENSE_HYPHEN_RE.sub(sensify, _as_text(item)))


def sensify(matchobj):
    """Regex callback: render a ``word..N`` match as word plus sense number."""
    sense = matchobj.group(0)
    parts = sense.split("..")
    if len(parts) == 2:
        return _SENSE % (parts[0], parts[1])
    return sense


def escape(s):
    """Return *s* with the five HTML-special characters escaped.

    The ampersand is replaced first so the entities produced by the later
    replacements are not escaped a second time.
    """
    s = s.replace('&', '&amp;')
    # Numeric reference: unlike &apos; it is valid in every HTML version.
    s = s.replace("'", '&#39;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


class ConstructionElement:
    """A tag name plus its attributes, e.g. [("name", "Activity"), ("cat", "vb")]."""

    def __init__(self, tagname, contents):
        self.tagname = tagname
        self.contents = contents  # list of (attribute, value) pairs

    def __str__(self):
        out = [att + "=" + _SENSE_RE.sub(sensify, value)
               for (att, value) in self.contents]
        return self.tagname + ": " + " ".join(out)


class Example:
    """An annotated example sentence as produced by handle_markup()."""

    def __init__(self, contents):
        self.contents = contents

    def __str__(self):
        return self.make_string(self.contents)

    def make_string(self, contents):
        """Render a list of ``/freetext``, ``/leaf`` and ``/branch`` tuples."""
        out = []
        for p in contents:
            if len(p) == 2:
                if p[0] == "/freetext":
                    out.append(escape(p[1]))
            elif len(p) == 3:
                kind, attrib, payload = p
                if kind == "/leaf":
                    out.append(_bracketed(escape(payload), attrib))
                else:
                    # Top-level branch: brackets become tokens of their own,
                    # so they end up space-separated from the children.
                    out.append('[')
                    for item in payload:
                        if len(item) == 2:
                            out.append(escape(item[1]))
                        elif item[0] == "/branch":
                            out.append(_bracketed(self.make_string(item[2]),
                                                  item[1]))
                        else:
                            out.append(_bracketed(escape(item[2]), item[1]))
                    out.append(']' + _element_name(attrib))
        # "!|!" is the glue marker emitted by handle_markup(); removing it
        # together with the surrounding spaces joins the neighbouring tokens.
        return " ".join(out).replace(" !|! ", "")


#############################
#           INPUT           #
#############################

def date_string_to_int(ds):
    """Turn a dash-separated date such as "2014-05-06" into 20140506."""
    return int(ds.replace("-", ""))


def structify(s):
    """Apply the sense, quote and underscore substitutions to *s*.

    NOTE(review): the strings inserted by the three substitutions (and the
    guard string of the last one) are empty in the reviewed copy and are
    kept that way; they presumably held markup.  As written, the first
    substitution drops the "..", the second drops the double quotes and the
    third leaves the text unchanged.
    """
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.U silently capped the number
    # of replacements at 32 and applied no flag at all.
    s = re.sub(r'(\S+)\.\.(\d+)',
               lambda x: x.group(1) + "" + x.group(2) + "",
               s, flags=re.U)
    s = re.sub(r'"(.+?)"',
               lambda x: "" + x.group(1) + "",
               s, flags=re.U)
    s = re.sub(r'([^\s\]\/]+)_([^\s\]\/]+)',
               lambda x: (x.group(1) + '' + x.group(2) + ''
                          if '' not in x.group(0) else x.group(0)),
               s, flags=re.U)
    return s


def handle_markup(example):
    """Convert the children of an example element into render tuples.

    Returns a list of ``("/leaf", attrib, text)``,
    ``("/branch", attrib, children)`` and ``("/freetext", text)`` tuples.
    Tags are accepted both with and without the KARP namespace.
    """
    example_parts = []
    # Iterating an Element yields its direct children; getchildren() no
    # longer exists on Python 3.9+.
    for part in example:
        tag = part.tag
        if tag in (KARP + "e", "e"):
            if len(part) == 0:
                # An empty <e/> has text None; treat it as an empty leaf.
                example_parts.append(
                    ("/leaf", part.attrib, (part.text or "").strip()))
            else:
                example_parts.append(
                    ("/branch", part.attrib, handle_markup(part)))
        elif tag in (KARP + "text", "text"):
            if part.text:
                example_parts.append(("/freetext", part.text.strip()))
        elif tag in (KARP + "g", "g"):
            # Glue marker, removed again by Example.make_string().
            example_parts.append(("/freetext", "!|!"))
    return example_parts


def take(entry, tagname, resort):
    """Return the stripped text of child *tagname*, or *resort* if absent."""
    child = entry.find(tagname)
    if child is None or child.text is None:
        return resort
    return child.text.strip()


def dict_to_list(indict):
    """Return the (key, value) pairs of *indict*, leaving out key "uid"."""
    return [(key, indict[key]) for key in indict if key != "uid"]


def read_csv_from_file(path, num_of_fields):
    """Yield the first *num_of_fields* tab-separated fields of each line."""
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            yield line.rstrip('\r\n').split('\t')[0:num_of_fields]


def _parse_args(argv):
    """Return (mode, language) for the given command line."""
    selected_mode = 1 if len(argv) > 1 and argv[1] == "simplified" else 0
    # Without a language argument the script falls back to english (0), the
    # same value it uses when called with no arguments at all.
    selected_language = 1 if len(argv) > 2 and argv[2] != "english" else 0
    return selected_mode, selected_language


def _build_entry(lexical_entry, sense):
    """Create a SweFNEntry from a LexicalEntry element and its Sense."""
    entry = SweFNEntry()
    entry.id = sense.get("id", "")

    for feat in sense.findall("feat"):
        att = feat.get("att")
        val = feat.get("val")
        if val is None:
            continue  # nothing to render for a feat without a value
        val = escape(val)
        if att in _LIST_FEATS:
            getattr(entry, _LIST_FEATS[att]).append(val)
        elif att in _SCALAR_FEATS:
            setattr(entry, _SCALAR_FEATS[att], val)

    for modification in lexical_entry.findall(KARP + "modification"):
        stamp = modification.find("feat[@att='modificationDateTime']")
        if stamp is not None and stamp.get("val"):
            # Keep the first ten characters of the timestamp, i.e. the date.
            entry.modifications.append(stamp.get("val")[:10])

    # Examples are looked up with and without the namespace because the
    # elements are not always saved back with namespaces.
    for tag in (KARP + "example", "example"):
        for example in sense.findall(tag):
            entry.add_example(handle_markup(example))
    return entry


def main(argv=None):
    """Read ``saldo.txt`` and ``swefn.xml`` and print the page to stdout."""
    global mode, language
    if argv is None:
        argv = sys.argv
    mode, language = _parse_args(argv)

    tree = ElementTree()
    tree.parse(URL)

    for row in read_csv_from_file("saldo.txt", 7):
        if len(row) > 5:  # skip blank or truncated lines
            saldo_dict[row[0]] = row[5]

    lexicon = tree.find("Lexicon")
    if lexicon is None:
        raise ValueError("%s contains no <Lexicon> element" % URL)

    html = HTML()
    for le in lexicon.findall("LexicalEntry"):
        sense = le.find("Sense")
        if sense is None:
            continue  # an entry without a Sense has nothing to render
        html.add_entry(_build_entry(le, sense))

    page = _TEXT(html)
    # Python 3 needs the binary buffer for bytes; Python 2 writes them as is.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(page.encode("utf-8"))
    out.write(b"\n")


if __name__ == '__main__':
    main()