#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Render the entries of a Konstruktikon XML lexicon as one HTML page."""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

TODO = "todo"
KONST = "{http://spraakbanken.gu.se/swe/resurs/konstruktikon}"
KARP = "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}"
URL = "konstruktikon.xml"

# Rendering switches read by KonstruktikonEntry.__str__.  The script entry
# point overwrites them; the defaults keep the classes usable when this file
# is imported instead of executed.
mode = 0        # 1 = simplified entry layout, anything else = full layout
language = 0    # 0 = English field labels, anything else = Swedish

# Sense identifiers of the form "word..1"; compiled once instead of per call.
_SENSE_RE = re.compile(r'\w*\.\.\d+', re.UNICODE)

_LABELS_EN = {
    "comment": "comment",
    "structure": "structure",
    "examples": "examples",
    "coll": "common words",
    "status": "status",
    "cee": "construction evoking elements",
    "cat": "category",
    "evokes": "frame evoked",
}

_LABELS_SV = {
    "comment": "kommentar",
    "structure": "struktur",
    "examples": "exempel",
    "coll": "vanliga ord",
    "status": "status",
    "cee": "cee",
    "cat": "kategori",
    "evokes": "frame evoked",
}


def _field(label, body):
    """Return one labelled field; the label and body are emitted back to back.

    NOTE(review): the tag and class names are a reconstruction -- confirm
    them against the stylesheet that is used with the generated page.
    """
    return '<div class="field"><span class="label">%s</span>%s</div>' % (
        label, body)


def _list_section(label, items):
    """Return a labelled bullet list of *items*, or '' when there are none."""
    if not items:
        return ""
    rows = "".join(['<li>%s</li>' % (item,) for item in items])
    return _field(label, '<ul>%s</ul>' % rows)


class HTML:
    """The complete output document: a page skeleton around the entries."""

    def __init__(self):
        self.entries = []

    def __str__(self):
        # NOTE(review): minimal skeleton; any stylesheet or script links the
        # page relies on have to be added to the <head> here.
        return "\n".join([
            '<html>',
            '<head>',
            '<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8" />',
            '<title>Ett svenskt konstruktikon, utvecklingsversion</title>',
            '</head>',
            '<body>',
            "\n".join(["%s" % (e,) for e in self.entries]),
            '</body>',
            '</html>'])

    def add_entry(self, entry):
        """Append *entry*; it is formatted with ``%s`` when the page is built."""
        self.entries.append(entry)


class KonstruktikonEntry:
    """One construction entry and its HTML rendering."""

    def __init__(self):
        self.typee = ""
        self.cat = ""
        self.inheritance = ""
        self.evokes = ""
        self.definition = None      # Example or None
        self.structure = ""
        self.cee = []
        self.coll = []
        self.construction_elements_internal = []
        self.construction_elements_external = []
        self.examples = []
        self.comment = None         # plain string, Example or None
        self.status = ""
        self.reference = ""
        self.id = ""
        self.illustration = ""

    def add_example(self, contents):
        self.examples.append(Example(contents))

    def set_definition(self, contents):
        self.definition = Example(contents)

    def set_comment(self, contents):
        self.comment = Example(contents)

    def add_internal_construction_element(self, contents):
        self.construction_elements_internal.append(
            ConstructionElement(contents))

    def add_external_construction_element(self, contents):
        self.construction_elements_external.append(
            ConstructionElement(contents))

    def __str__(self):
        """Return the entry as HTML, laid out according to ``mode``/``language``."""
        labels = _LABELS_EN if language == 0 else _LABELS_SV

        # The heading shows the part of the id after the first "--"; ids
        # without that separator (or a missing id) are shown as they are
        # instead of raising IndexError/AttributeError.
        entry_id = self.id or ""
        id_parts = entry_id.split("--")
        short_name = id_parts[1] if len(id_parts) > 1 else entry_id

        if self.illustration != "":
            illustration_html = ' - %s' % (self.illustration)
        else:
            illustration_html = ''

        # NOTE(review): wrapper markup is reconstructed; confirm tag and
        # class names against the stylesheet.
        header = '<div class="entry" id="%s">\n<h2>%s%s</h2>' % (
            escape(entry_id), escape(short_name), illustration_html)
        anchor = '<a name="%s"></a>' % (escape(entry_id))
        definition = _field("definition", self.definition or "")

        if mode == 1:
            # Simplified layout: definition, structure, common words,
            # examples and comment only.
            rows = [
                header,
                anchor,
                definition,
                cond(labels["structure"], self.structure),
                cond(labels["coll"], " ".join(self.coll)),
                _list_section(labels["examples"], self.examples),
                cond(labels["comment"], self.comment),
                '</div>']
        else:
            rows = [
                header,
                anchor,
                cond("type", self.typee),
                cond(labels["cat"], self.cat),
                cond(labels["evokes"], self.evokes),
                definition,
                cond(labels["structure"], self.structure),
                linked_cond("inheritance", self.inheritance),
                cond(labels["cee"], " ".join(self.cee)),
                cond(labels["coll"], " ".join(self.coll)),
                _list_section("internal construction elements",
                              self.construction_elements_internal),
                _list_section("external construction elements",
                              self.construction_elements_external),
                _list_section("examples", self.examples),
                cond(labels["comment"], self.comment),
                cond("reference", self.reference),
                cond("status", self.status),
                '</div>']
        return "\n".join(rows)


def linked_cond(label, item):
    """Like ``cond`` but wraps the value in a link; '' for empty/None items.

    NOTE(review): the link target ("#" + raw item) is an assumption about
    how entries reference each other -- verify before relying on it.
    """
    if item != "" and item is not None:
        target = "%s" % (item,)
        text = _SENSE_RE.sub(sensify, target)
        return _field(label, '<a href="#%s">%s</a>' % (target, text))
    return ''


def cond(label, item):
    """Return a labelled field for *item*, or '' when it is '' or None.

    Sense identifiers ("word..1") inside the value are rewritten by
    ``sensify``.  Non-string items are formatted with ``%s`` first so that
    an Example object can be passed as well.
    """
    if item != "" and item is not None:
        return _field(label, _SENSE_RE.sub(sensify, "%s" % (item,)))
    return ''


def sensify(matchobj):
    """Regex callback: turn "word..N" into the word with N as a subscript.

    NOTE(review): the <sub> tag is assumed; the surviving code only shows
    that the word and the number are emitted back to back.
    """
    sense = matchobj.group(0)
    parts = sense.split("..")
    if len(parts) == 2:
        return '%s<sub>%s</sub>' % (parts[0], parts[1])
    return sense


def escape(s):
    """Return *s* with the HTML special characters & ' < > " escaped."""
    # '&' has to go first, otherwise the entities added below would be
    # escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&#39;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


class ConstructionElement:
    """A construction element given as a list of (attribute, value) pairs."""

    def __init__(self, contents):
        self.contents = contents  # [("name", "Activity"), ("cat", "vb")]

    def __str__(self):
        """Return "<name>: att=value att=value ..."; "aux" is shown as "other"."""
        out = []
        the_name = ""
        for (att, value) in self.contents:
            if att == "name":
                the_name = escape(value)
                continue
            if att == "aux":
                att = "other"
            # Escape first, then add sense markup, so the generated tags
            # are not escaped themselves.
            out.append(att + "=" + _SENSE_RE.sub(sensify, escape(value)))
        return the_name + ": " + " ".join(out)


class Example:
    """Annotated text: a list of freetext, leaf and branch tuples.

    Parts are ("/freetext", text), ("/leaf", attrib, text) or
    ("/branch", attrib, parts).
    """

    def __init__(self, contents):
        self.contents = contents

    def __str__(self):
        return self.make_string(self.contents)

    @staticmethod
    def _bracketed(inner, attrib):
        """Return ``[inner]`` followed by the element's name label."""
        return '[' + inner + ']' + Example._name_label(attrib)

    @staticmethod
    def _name_label(attrib):
        # NOTE(review): <sub> is assumed for the label after the bracket.
        return '<sub>' + escape(attrib.get("name", "")) + '</sub>'

    def make_string(self, contents):
        """Return *contents* as space-joined, escaped bracket notation."""
        out = []
        for part in contents:
            if len(part) == 2:
                if part[0] == "/freetext":
                    out.append(escape(part[1]))
            elif len(part) == 3:
                if part[0] == "/leaf":
                    out.append(self._bracketed(escape(part[2]), part[1]))
                else:
                    # A top-level branch emits its brackets and children as
                    # separate tokens; nested branches are rendered as one.
                    out.append('[')
                    for item in part[2]:
                        if len(item) == 2:
                            out.append(escape(item[1]))
                        elif item[0] == "/branch":
                            out.append(self._bracketed(
                                self.make_string(item[2]), item[1]))
                        else:
                            out.append(self._bracketed(
                                escape(item[2]), item[1]))
                    out.append(']' + self._name_label(part[1]))
        return " ".join(out)


#class Constructicon_Element:
    #def
styleForNode(node): ############################# # INPUT # ############################# def structify(s): s = re.sub(r'(\S+)\.\.(\d+)',lambda x: x.group(1)+""+x.group(2)+"",s,re.U) s = re.sub(r'"(.+?)"',lambda x: "" + x.group(1) + "",s,re.U) s = re.sub(r'([^\s\]\/]+)_([^\s\]\/]+)',lambda x: x.group(1) + '' + x.group(2) + '' if '' not in x.group(0) else x.group(0),s,re.U) s = re.sub(r',,([^\s|]+)', lambda x: '' + x.group(1) + '' if '' not in x.group(0) else x.group(0),s,re.U) return s # t = t.replace(/,,([^\s]+)/g, function(f, a) { # if( f.indexOf("") == -1 ) { # return '' + a + '' # } else { # return f; # } # }); """ def handle_example(example): example_parts = [] if example.text != None: first_text = example.text.strip() else: first_text = "" if first_text != "": example_parts.append(("/freetext", first_text)) for part in example: inners = part.findall("e") if inners == None or len(inners) == 0: example_parts.append(("/leaf", part.attrib, part.text.strip() )) else: example_parts.append(("/branch", part.attrib, handle_example(part) )) thetail = part.tail if thetail != None: thetail = thetail.strip() if thetail != "": example_parts.append(("/freetext", thetail)) return example_parts """ def handle_markup(example): example_parts = [] for part in example.getchildren(): if part.tag == KARP+"e" or part.tag == "e": if len(part.getchildren()) == 0: example_parts.append(("/leaf", part.attrib, part.text.strip() )) else: example_parts.append(("/branch", part.attrib, handle_markup(part) )) elif part.tag == KARP+"text" or part.tag == "text": example_parts.append(("/freetext", part.text.strip())) return example_parts def take(entry, tagname, resort): e2 = entry.find(tagname) if e2 != None: t = e2.text if t == None: return resort else: return t.strip() else: return resort """ def list_to_dict(alist): dict = {} for item in alist: dict[item[0]] = item[1] return dict """ def dict_to_list(indict): outlist = [] for item in indict: if item != "uid" and item != "type": 
outlist.append((item, indict[item])) return outlist if __name__ == '__main__': global mode global language if len(sys.argv) < 2: mode = 0 language = 0 # english else: if sys.argv[1] == "simplified": mode = 1 else: mode = 0 if sys.argv[2] == "english": language = 0 else: language = 1 lines = [] tree = ElementTree() tree.parse(URL) html = HTML() entries = tree.find("Lexicon").findall("LexicalEntry") for le in entries: skip = False sense = le.find("Sense") e = KonstruktikonEntry() e.id = sense.get("id") cees = [] colls = [] feats = sense.findall("feat") for feat in feats: feat_att = feat.get("att") feat_val = feat.get("val") if feat_att == "type": e.typee = escape(feat_val) elif feat_att == "cat": e.cat = escape(feat_val) elif feat_att == "evokes": e.evokes = escape(feat_val) elif feat_att == "structure": e.structure = structify(feat_val) elif feat_att == "cee": cees.append(escape(feat_val)) elif feat_att == "coll": colls.append(escape(feat_val)) elif feat_att == "reference": e.reference = escape(feat_val) elif feat_att == "internal_comment": # New, but should maybe not be rendered out pass elif feat_att == "comment": # Right now it's not a feat but I think it should be! 
e.comment = escape(feat_val) elif feat_att == "entry_status": # New field e.status = escape(feat_val) if e.status == "Suggestion": skip = True elif feat_att == "illustration": e.illustration = escape(feat_val) e.coll = colls e.cee = cees examples = sense.findall(KARP+"example", namespaces={"karp":"karp"}) for example in examples: e.add_example(handle_markup(example)) definition = sense.find(KARP+"definition", namespaces={"karp":"karp"}) if definition != None and definition != "": e.set_definition(handle_markup(definition)) #### These are because we don't save the elements back with namespaces at the moment ### examples = sense.findall("example", namespaces={"karp":"karp"}) for example in examples: e.add_example(handle_markup(example)) definition = sense.find("definition", namespaces={"karp":"karp"}) if definition != None and definition != "": e.set_definition(handle_markup(definition)) ######################################################################################## int_const_elements = sense.findall(KONST+"int_const_elem", namespaces={"konst":"konst"}) for int_const_element in int_const_elements: e.add_internal_construction_element(dict_to_list(int_const_element.attrib)) int_const_elements = sense.findall("int_const_elem", namespaces={"konst":"konst"}) for int_const_element in int_const_elements: e.add_internal_construction_element(dict_to_list(int_const_element.attrib)) ext_const_elements = sense.findall(KONST+"ext_const_elem", namespaces={"konst":"konst"}) for ext_const_element in ext_const_elements: e.add_external_construction_element(dict_to_list(ext_const_element.attrib)) ext_const_elements = sense.findall("ext_const_elem", namespaces={"konst":"konst"}) for ext_const_element in ext_const_elements: e.add_external_construction_element(dict_to_list(ext_const_element.attrib)) if not skip: html.add_entry(e) """ e.typee = take(entry, "type", "") e.cat = take(entry, "cat", "") e.inheritance = take(entry, "inheritance", "") e.evokes = take(entry, "evokes", "") 
definition = entry.find("definition") def_parts = [] if definition != None: if definition.text != None: first_text = definition.text.strip() if first_text != "": def_parts.append(("/freetext", first_text)) for atom in definition: if atom.tag == TODO: continue if atom.text != None: def_parts.append(("/leaf", list_to_dict(atom.items()), atom.text.strip())) thetail = atom.tail if thetail != None: thetail = thetail.strip() if thetail != "": def_parts.append(("/freetext", thetail)) #print def_parts #print e.set_definition(def_parts) e.structure = take(entry, "structure", "") e.cee = take(entry, "cee", "") e.coll = take(entry, "coll", "") c_e = entry.find("construction_elements") if c_e != None: internal = c_e.find("internal") if internal != None: for el in internal: if el.tag == TODO: continue e.add_internal_construction_element(el.tag, el.items()) external = c_e.find("external") if external != None: for el in external: if el.tag == TODO: continue e.add_external_construction_element(el.tag, el.items()) examples = entry.find("examples") for example in examples: if example != None: if example.tag == TODO: continue example_parts = handle_example(example) #print example_parts #print "---" e.add_example(example_parts) comment = entry.find("comment") com_parts = [] if comment != None: if comment.text != None: first_text = comment.text.strip() if first_text != "": com_parts.append(("/freetext", first_text)) for atom in comment: if atom.tag == TODO: continue if atom.text != None: com_parts.append(("/leaf", list_to_dict(atom.items()), atom.text.strip())) thetail = atom.tail if thetail != None: thetail = thetail.strip() if thetail != "": com_parts.append(("/freetext", thetail)) e.set_comment(com_parts) #e.comment = take(entry, "comment", "") e.reference = take(entry, "reference", "") if mode == 0 or e.typee == "cx" or e.typee == "Cx": html.add_entry(e) """ a = unicode(html) print a.encode("utf-8")