#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Render the Swedish constructicon XML database as a single HTML page.

Usage: script [simplified|<anything>] [english|<anything>]

The first argument selects the simplified view (only "cx"/"Cx" entries, with
fewer rows per entry); the second selects English labels, any other value
selects Swedish ones.  With no arguments the full view with English labels
is produced.  The page is written to standard output as UTF-8.

NOTE(review): the HTML markup inside the string literals of this file was
lost at some point and has been reconstructed from the placeholder counts
and the surviving text.  Tag names and attributes (and any stylesheet link
the page header may have had) should be checked against a pristine copy.
"""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

TODO = "todo"

# Location of the constructicon database, relative to the working directory.
DATA_PATH = "../konstruktikon-data/constructicon.xml"

# Fully qualified name of the xml:id attribute as ElementTree reports it.
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

# Sense identifiers such as "word..1"; compiled once instead of on every call.
SENSE_RE = re.compile(r'\w*\.\.\d+', re.UNICODE)

# Rendering settings.  main() overwrites them from the command line; the
# defaults make the classes usable when this file is imported as a module.
mode = 0      # 1 = simplified view, anything else = full view
language = 0  # 0 = English labels, anything else = Swedish labels

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str      # Python 3


class HTML:
    """The complete page: a fixed header and footer around the entries."""

    def __init__(self):
        self.entries = []

    def __str__(self):
        return "\n".join([
            '<html>',
            '<head>',
            '<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8" />',
            '<title>Ett svenskt konstruktikon, utvecklingsversion</title>',
            '</head>',
            '<body>',
            "\n".join(["%s" % (e) for e in self.entries]),
            '</body>',
            '</html>'])

    def add_entry(self, entry):
        """Append an object whose string form is the HTML of one entry."""
        self.entries.append(entry)


def _nested_table_row(label, items):
    """Return a table row holding *items* in a nested one-column table.

    Returns an empty string when there are no items, so that empty
    sections leave no row behind.
    """
    if not items:
        return ""
    inner = "".join(['<tr><td>%s</td></tr>' % (item) for item in items])
    return '<tr><td>%s</td><td><table>%s</table></td></tr>' % (label, inner)


class KonstruktikonEntry:
    """One constructicon entry and its HTML rendering."""

    def __init__(self):
        self.typee = ""
        self.cat = ""
        self.inheritance = ""
        self.evokes = ""
        self.definition = None
        self.structure = ""
        self.cee = ""
        self.coll = ""
        self.construction_elements_internal = []
        self.construction_elements_external = []
        self.examples = []
        self.comment = None
        self.reference = ""
        self.id = ""

    def add_example(self, contents):
        self.examples.append(Example(contents))

    def set_definition(self, contents):
        self.definition = Example(contents)

    def set_comment(self, contents):
        self.comment = Example(contents)

    def add_internal_construction_element(self, tagname, contents):
        self.construction_elements_internal.append(
            ConstructionElement(tagname, contents))

    def add_external_construction_element(self, tagname, contents):
        self.construction_elements_external.append(
            ConstructionElement(tagname, contents))

    def render(self, render_mode=None, render_language=None):
        """Return the HTML for this entry.

        render_mode: 1 for the simplified view, anything else for the full
            view; None means "use the module-level ``mode``".
        render_language: 0 for English labels, anything else for Swedish;
            None means "use the module-level ``language``".
        """
        if render_mode is None:
            render_mode = mode
        if render_language is None:
            render_language = language

        if render_language == 0:  # english
            l_comment = "comment"
            l_structure = "structure"
            l_examples = "examples"
            l_coll = "common words"
        else:  # swedish
            l_comment = "kommentar"
            l_structure = "struktur"
            l_examples = "exempel"
            l_coll = "vanliga ord"

        heading = '<h2><a name="%s">%s</a></h2>' % (self.id, self.id)
        table_open = '<table summary="%s">' % (self.id)
        definition_row = ('<tr><td>definition</td><td>%s</td></tr>'
                          % (self.definition))
        comment_row = ('<tr><td>%s</td><td>%s</td></tr>'
                       % (l_comment, self.comment))

        if render_mode == 1:
            rows = [
                heading,
                table_open,
                definition_row,
                cond(l_structure, self.structure),
                cond(l_coll, self.coll),
                _nested_table_row(l_examples, self.examples),
                comment_row,
                '</table>']
        else:
            # Apart from structure, common words and comment, the labels
            # of the full view are fixed English strings.
            rows = [
                heading,
                table_open,
                cond("type", self.typee),
                cond("category", self.cat),
                cond("evokes", self.evokes),
                definition_row,
                cond(l_structure, self.structure),
                linked_cond("inheritance", self.inheritance),
                cond("cee", self.cee),
                cond(l_coll, self.coll),
                _nested_table_row("internal construction elements",
                                  self.construction_elements_internal),
                _nested_table_row("external construction elements",
                                  self.construction_elements_external),
                _nested_table_row("examples", self.examples),
                comment_row,
                cond("reference", self.reference),
                '</table>']
        return "\n".join(rows)

    def __str__(self):
        return self.render()


def linked_cond(label, item):
    """Return a label/value row whose value links to the anchor ``#item``.

    Returns an empty string when *item* is empty or None.  The link target
    is the raw item; only the visible text gets the sense formatting, so
    no markup ends up inside the href attribute.
    """
    if item is None or item == "":
        return ''
    shown = SENSE_RE.sub(sensify, item)
    return ('<tr><td>%s</td><td><a href="#%s">%s</a></td></tr>'
            % (label, item, shown))


def cond(label, item):
    """Return a label/value row, or an empty string for an empty item."""
    if item is None or item == "":
        return ''
    return '<tr><td>%s</td><td>%s</td></tr>' % (label, SENSE_RE.sub(sensify, item))


def sensify(matchobj):
    """Turn a matched sense id ``word..N`` into ``word`` + superscript N."""
    sense = matchobj.group(0)
    parts = sense.split("..")
    if len(parts) == 2:
        return '%s<sup>%s</sup>' % (parts[0], parts[1])
    return sense


class ConstructionElement:
    """A construction element: a tag name and its attribute pairs."""

    def __init__(self, tagname, contents):
        self.tagname = tagname
        self.contents = contents  # [("name", "Activity"), ("cat", "vb")]

    def __str__(self):
        pairs = [att + "=" + SENSE_RE.sub(sensify, value)
                 for (att, value) in self.contents]
        return self.tagname + ": " + " ".join(pairs)


class Example:
    """Annotated text stored as a list of part tuples.

    Parts are ``("/freetext", text)``, ``("/leaf", attributes, text)`` or
    ``("/branch", attributes, parts)``, as produced by handle_example().
    """

    def __init__(self, contents):
        self.contents = contents

    def __str__(self):
        return self.make_string(self.contents)

    def make_string(self, contents):
        """Render parts as bracketed text with the name as a subscript."""
        out = []
        for p in contents:
            if len(p) == 2:
                if p[0] == "/freetext":
                    out.append(p[1])
            elif len(p) == 3:
                label = ']<sub>' + p[1].get("name", "") + '</sub>'
                if p[0] == "/leaf":
                    out.append('[' + p[2] + label)
                else:
                    # A branch at this level contributes its brackets as
                    # separate pieces, so they end up space-separated from
                    # the content; nested branches are bracketed tightly.
                    out.append('[')
                    for item in p[2]:
                        if len(item) == 2:
                            out.append(item[1])
                            continue
                        inner_label = (']<sub>' + item[1].get("name", "")
                                       + '</sub>')
                        if item[0] == "/branch":
                            out.append('[' + self.make_string(item[2])
                                       + inner_label)
                        else:
                            out.append('[' + item[2] + inner_label)
                    out.append(label)
        return " ".join(out)


#############################
#           INPUT           #
#############################

def handle_example(example):
    """Convert an example element into the part list used by Example.

    Children that themselves contain ``e`` elements become branches
    (converted recursively); all other children become leaves.  A leaf
    without text yields an empty string instead of raising.
    """
    example_parts = []
    first_text = (example.text or "").strip()
    if first_text != "":
        example_parts.append(("/freetext", first_text))
    for part in example:
        if part.findall("e"):
            example_parts.append(
                ("/branch", part.attrib, handle_example(part)))
        else:
            example_parts.append(
                ("/leaf", part.attrib, (part.text or "").strip()))
        thetail = (part.tail or "").strip()
        if thetail != "":
            example_parts.append(("/freetext", thetail))
    return example_parts


def take(entry, tagname, resort):
    """Return the stripped text of child *tagname*, or *resort*.

    *resort* is returned when the child is missing or has no text.
    """
    child = entry.find(tagname)
    if child is None or child.text is None:
        return resort
    return child.text.strip()


def list_to_dict(alist):
    """Build a dict from a sequence of (key, value, ...) items."""
    return {item[0]: item[1] for item in alist}


def _inline_parts(node):
    """Return the part list for a definition or comment element.

    *node* may be None (element absent), which gives an empty list.
    Children tagged TODO are skipped together with their tail text.
    """
    parts = []
    if node is None:
        return parts
    if node.text is not None:
        first_text = node.text.strip()
        if first_text != "":
            parts.append(("/freetext", first_text))
    for atom in node:
        if atom.tag == TODO:
            continue
        if atom.text is not None:
            parts.append(
                ("/leaf", list_to_dict(atom.items()), atom.text.strip()))
        thetail = (atom.tail or "").strip()
        if thetail != "":
            parts.append(("/freetext", thetail))
    return parts


def parse_entry(entry):
    """Build a KonstruktikonEntry from one ``entry`` element."""
    e = KonstruktikonEntry()
    e.id = entry.get(XML_ID)
    e.typee = take(entry, "type", "")
    e.cat = take(entry, "cat", "")
    e.inheritance = take(entry, "inheritance", "")
    e.evokes = take(entry, "evokes", "")
    e.set_definition(_inline_parts(entry.find("definition")))
    e.structure = take(entry, "structure", "")
    e.cee = take(entry, "cee", "")
    e.coll = take(entry, "coll", "")

    # Elements are compared with None explicitly: an element without
    # children is falsy, so a plain truth test would skip it.
    c_e = entry.find("construction_elements")
    if c_e is not None:
        for section, add in (
                ("internal", e.add_internal_construction_element),
                ("external", e.add_external_construction_element)):
            group = c_e.find(section)
            if group is None:
                continue
            for el in group:
                if el.tag == TODO:
                    continue
                add(el.tag, el.items())

    examples = entry.find("examples")
    if examples is not None:  # entries without examples are allowed
        for example in examples:
            if example.tag == TODO:
                continue
            e.add_example(handle_example(example))

    e.set_comment(_inline_parts(entry.find("comment")))
    e.reference = take(entry, "reference", "")
    return e


def parse_arguments(argv):
    """Return ``(mode, language)`` for a ``sys.argv``-style list.

    A missing language argument means English, the same as when no
    arguments are given at all.
    """
    simplified = len(argv) > 1 and argv[1] == "simplified"
    swedish = len(argv) > 2 and argv[2] != "english"
    return (1 if simplified else 0, 1 if swedish else 0)


def main(argv=None):
    """Read the database at DATA_PATH and write the page to stdout."""
    global mode
    global language
    if argv is None:
        argv = sys.argv
    mode, language = parse_arguments(argv)

    tree = ElementTree()
    tree.parse(DATA_PATH)
    html = HTML()
    for entry in tree.findall("entry"):
        e = parse_entry(entry)
        # The simplified view only lists proper constructions.
        if mode == 0 or e.typee == "cx" or e.typee == "Cx":
            html.add_entry(e)

    # Write encoded bytes directly so the output is UTF-8 whatever the
    # console encoding is (Python 3 exposes the byte stream as .buffer).
    payload = text_type(html).encode("utf-8") + b"\n"
    getattr(sys.stdout, "buffer", sys.stdout).write(payload)


if __name__ == '__main__':
    main()