#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
TODO = "todo"
KARP = "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}"
URL = "swefn.xml"
saldo_dict = {}
class HTML:
    """Collect SweFN entries and render them as one page string.

    NOTE(review): most literals in ``__str__`` are empty or whitespace-only,
    which suggests the surrounding markup is missing from this copy of the
    file -- confirm against the upstream version before relying on the output.
    """

    def __init__(self):
        # Entries are kept in insertion order; sorting happens at render time.
        self.entries = []

    def __str__(self):
        """Return the fixed header lines, the entries sorted by ``id``, then the footer lines."""
        # "%s" % e is kept as-is so the result has whatever string type the
        # entry's own __str__ produces.
        body = "\n".join(["%s" % (e) for e in sorted(self.entries, key=lambda x: x.id)])
        return "\n".join([
            '',
            '',
            # Written as an escape: a single-quoted literal cannot contain a
            # raw line break.
            '\n',
            'Utvecklingsversion av SweFN ',
            ' ',
            ' ',
            '',
            '',
            body,
            '',
            ''])

    def add_entry(self, entry):
        """Append *entry*; it must expose an ``id`` attribute for sorting."""
        self.entries.append(entry)
class SweFNEntry:
    """One lexicon entry plus the logic to render it as a string.

    The main script fills the attributes from the ``feat`` children of a
    ``Sense`` element and from the entry's modification elements.
    """
    def __init__(self):
        # Scalar fields, set from the feats named in the trailing comments.
        self.inheritance = ""      # "inheritance"
        # List fields; each matching feat appends one value.
        self.lus = []              # "LU"
        self.lu_suggestions = []   # "suggestionForLU"
        self.comps = []            # "compound"
        self.comp_examples = []    # "compoundExample"
        self.domains = []          # "domain"
        self.cemtypes = []         # "semanticType"
        self.c_elements = []       # "coreElement"
        self.p_elements = []       # "peripheralElement"
        self.examples = []         # Example objects, see add_example()
        self.created_by = ""       # "createdBy"
        self.comment = None        # "comment" (string) or an Example via set_comment()
        self.status = ""           # "entry_status"
        self.id = ""               # id attribute of the Sense element
        self.bid = ""              # "BFNID"
        self.modifications = []    # date strings (first 10 chars of each modification timestamp)
        self.createdDate = ""      # "createdDate"
        self.modifDate = ""        # "modifDate"
    def add_example(self, contents):
        """Wrap *contents* (a list produced by handle_markup) in an Example and store it."""
        self.examples.append(Example(contents))
    def set_comment(self, contents):
        """Replace the comment with an Example built from *contents*."""
        self.comment = Example(contents)
    def __str__(self):
        """Render the entry as a newline-joined sequence of labelled fields.

        Reads the module-level ``language`` (0 selects the first label set,
        anything else the second) and ``saldo_dict`` (LU -> part of speech).
        """
        global language
        # Only l_comment is referenced further down; the other l_* labels are
        # assigned but not used in this method.
        if language == 0: # english
            l_comment = u"kommentar"#u"comment"
            l_structure = u"structure"
            l_examples = u"exempel"#u"examples"
            l_coll = u"common words"
            l_status = u"status"
        else: # swedish
            l_comment = u"kommentar"
            l_structure = u"struktur"
            l_examples = u"exempel"
            l_coll = u"vanliga ord"
            l_status = u"status"
        examples = ""
        if len(self.examples) != 0:
            # NOTE(review): the left-hand literal has no %s placeholder, so this
            # formatting raises TypeError as written; the markup that held the
            # placeholder appears to be missing -- confirm against upstream.
            examples = 'exempel ' % ("".join( [ ('%s ' % (example)) for example in self.examples]))
        lus_formatted = ""
        if len(self.lus) != 0:
            lines = []
            lu_output = {}
            # Group the lexical units by the part of speech found in saldo_dict;
            # units missing from the dictionary are grouped under u"okänd".
            for lu in self.lus:
                pos = saldo_dict.get(lu, u"okänd")
                if pos not in lu_output:
                    lu_output[pos] = []
                lu_output[pos].append(lu)
            for p in lu_output:
                lines.append("" + p + " " + " ".join(lu_output[p]))
            lus_formatted = " ".join(lines)
        created = ""
        modified = ""
        latest_modif = ""
        first_modif = ""
        # Find the earliest and latest modification dates by comparing their
        # integer forms.
        for m in self.modifications:
            md = date_string_to_int(m)
            if first_modif == "" or md < date_string_to_int(first_modif):
                first_modif = m
            if latest_modif == "" or md > date_string_to_int(latest_modif):
                latest_modif = m
        # Creation date: the explicit createdDate wins, else the earliest
        # modification.
        if self.createdDate == "":
            created = first_modif
        else:
            created = self.createdDate
        # Modification date: latest modification, else modifDate, else
        # createdDate.
        if latest_modif != "":
            modified = latest_modif
        else:
            if self.modifDate:
                modified = self.modifDate
            else:
                modified = self.createdDate
        createdstring = u'skapad %s ' % (created)
        modificationstring = u'ändrad %s ' % (modified)
        # NOTE(review): both header literals and the one formatted with self.id
        # below are empty, so the % operations raise TypeError as written; the
        # markup appears to be missing -- confirm against upstream.
        # self.id.split("--")[1] requires the id to contain "--".
        if self.bid != "":
            header = '' % (self.id, self.bid, self.id.split("--")[1])
        else:
            header = '' % (self.id, self.id.split("--")[1])
        s = "\n".join([
            header,
            '' % (self.id),
            cond(u"domän", ", ".join(self.domains)),
            cond(u"semantisk typ", ", ".join(self.cemtypes)),
            cond(u"kärnelement", " ".join(self.c_elements)),
            cond(u"periferielement", " ".join(self.p_elements)),
            linked_cond("arv", self.inheritance),
            examples,
            cond(u"sms", " ".join(self.comps)),
            cond(u"sms-exempel", " ".join(self.comp_examples)),
            cond(u"lus", lus_formatted),
            #cond(u"lus", " ".join(self.lus)),
            cond(u"lu-förslag", " ".join(self.lu_suggestions)),
            cond(l_comment, self.comment),
            cond("skapad av", self.created_by),
            createdstring,
            modificationstring,
            # NOTE(review): the next literal is split across two physical lines,
            # which is not valid for a single-quoted string -- confirm the
            # intended content against upstream.
            '
'])
        return s#.encode('utf-8')
def linked_cond(label, item):
    """Variant of ``cond`` that hands *item* to the format string twice.

    Returns '' when *item* is '' or None. Otherwise every sense reference
    in *item* (word characters, two dots, digits) is rewritten by
    ``sensify`` before formatting.
    """
    if(item != "" and item != None):
        regexp = re.compile(r'\w*\.\.\d+', re.UNICODE)
        item = re.sub(regexp, sensify, item)
        # NOTE(review): three arguments are supplied for two %s placeholders,
        # so this raises TypeError as written; part of the literal appears to
        # be missing -- confirm against upstream.
        return '%s %s ' % (label, item, item)
    else:
        return ''
def cond(label, item):
    """Format *label* and *item* as one field, or return '' if there is no value.

    A value of '' or None counts as missing. Sense references in *item*
    (word characters or hyphens, two dots, digits) are passed through
    ``sensify`` before formatting.
    """
    if item is None or item == "":
        return ''
    sense_ref = re.compile(r'(\w|-)*\.\.\d+', re.UNICODE)
    return '%s %s ' % (label, sense_ref.sub(sensify, item))
def sensify(matchobj):
    """Replacement callback for ``re.sub``: reformat a ``word..N`` sense reference.

    The matched text is split on ".."; when that yields exactly two parts
    they are formatted together, otherwise the matched text is returned
    unchanged.
    """
    matched = matchobj.group(0)
    try:
        word, number = matched.split("..")
    except ValueError:
        # Not exactly two parts: leave the text alone.
        return matched
    return '%s%s ' % (word, number)
def escape(s):
    """Return *s* with the HTML-special characters replaced by entities.

    Handles ``&``, ``'``, ``<``, ``>`` and ``"``. The apostrophe is written
    as the numeric entity ``&#39;``.
    NOTE(review): the original entity spellings were not recoverable from
    this copy of the file; confirm ``&#39;`` against upstream.
    """
    # '&' must be replaced first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&#39;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
class ConstructionElement:
    """A tag name together with its attribute/value pairs."""

    def __init__(self, tagname, contents):
        self.tagname = tagname
        # Sequence of (attribute, value) pairs,
        # e.g. [("name", "Activity"), ("cat", "vb")].
        self.contents = contents

    def __str__(self):
        """Return ``tagname: att=value att=value ...``.

        Sense references inside each value are rewritten by ``sensify``.
        """
        sense_ref = re.compile(r'\w*\.\.\d+', re.UNICODE)
        rendered = [
            att + "=" + sense_ref.sub(sensify, value)
            for (att, value) in self.contents
        ]
        return self.tagname + ": " + " ".join(rendered)
class Example:
    """An annotated example sentence stored as a list of tuples.

    The tuples have the shapes built by ``handle_markup``:
    ``("/freetext", text)``, ``("/leaf", attrs, text)`` and
    ``("/branch", attrs, children)``.
    """

    def __init__(self, contents):
        self.contents = contents

    def __str__(self):
        return self.make_string(self.contents)

    def _bracket(self, text, attrs):
        # Bracketed text followed by the element's "name" attribute, if any.
        return '[ ' + text + '] ' + attrs.get("name", "") + ' '

    def make_string(self, contents):
        """Render *contents* as a single space-joined string.

        Two-item tuples contribute only when tagged "/freetext". Three-item
        tuples are leaves when tagged "/leaf" and branches otherwise; tuples
        of any other length are ignored. The " !|! " marker is removed from
        the result.
        """
        pieces = []
        for part in contents:
            size = len(part)
            if size == 2:
                if part[0] == "/freetext":
                    pieces.append(escape(part[1]))
                continue
            if size != 3:
                continue
            if part[0] == "/leaf":
                pieces.append(self._bracket(escape(part[2]), part[1]))
                continue
            # Branch: the opening bracket, each child and the closing label
            # are separate pieces, so they are all joined with single spaces.
            pieces.append('[ ')
            for child in part[2]:
                if len(child) == 2:
                    pieces.append(escape(child[1]))
                elif child[0] == "/branch":
                    # Nested branches are rendered recursively as one piece.
                    pieces.append(self._bracket(self.make_string(child[2]), child[1]))
                else:
                    pieces.append(self._bracket(escape(child[2]), child[1]))
            pieces.append('] ' + part[1].get("name", "") + ' ')
        return " ".join(pieces).replace(" !|! ", "")
#class Constructicon_Element:
#def styleForNode(node):
#############################
# INPUT #
#############################
def date_string_to_int(ds):
    """Turn a hyphenated date such as "2013-05-07" into the integer 20130507."""
    digits = "".join(ds.split("-"))
    return int(digits)
def structify(s):
    """Apply three regex rewrites to *s* and return the result.

    1. ``word..N`` sense references are rewritten from their two groups.
    2. Double-quoted spans are rewritten without the quotes.
    3. ``left_right`` tokens are passed to a third rewrite (see note below).

    ``re.U`` is passed as ``flags=``. The fourth positional parameter of
    ``re.sub`` is ``count``, so passing the flag positionally limited each
    call to 32 replacements and did not set the flag at all.
    """
    s = re.sub(r'(\S+)\.\.(\d+)', lambda x: x.group(1) + "" + x.group(2) + " ", s, flags=re.U)
    s = re.sub(r'"(.+?)"', lambda x: "" + x.group(1) + " ", s, flags=re.U)
    # NOTE(review): `'' not in ...` is always false (the empty string is
    # contained in every string), so this rewrite currently returns each match
    # unchanged. The literal being tested for appears to be missing from this
    # copy of the file -- confirm against upstream.
    s = re.sub(
        r'([^\s\]\/]+)_([^\s\]\/]+)',
        lambda x: x.group(1) + '' + x.group(2) + ' ' if '' not in x.group(0) else x.group(0),
        s,
        flags=re.U)
    return s
def handle_markup(example):
    """Convert the children of an example element into a list of tuples.

    For each child, with or without the KARP namespace prefix on its tag:

    * ``e`` without children -> ``("/leaf", attrib, stripped text)``
    * ``e`` with children    -> ``("/branch", attrib, handle_markup(child))``
    * ``text`` with text     -> ``("/freetext", stripped text)``
    * ``g``                  -> ``("/freetext", "!|!")``

    Children with any other tag are ignored.
    """
    example_parts = []
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, while iteration and len() work on old and new versions.
    for part in example:
        tag = part.tag
        if tag == "e" or tag == KARP + "e":
            if len(part) == 0:
                # An empty element has text None; treat that as "".
                example_parts.append(("/leaf", part.attrib, (part.text or "").strip()))
            else:
                example_parts.append(("/branch", part.attrib, handle_markup(part)))
        elif tag == "text" or tag == KARP + "text":
            if part.text:
                example_parts.append(("/freetext", part.text.strip()))
        elif tag == "g" or tag == KARP + "g":
            example_parts.append(("/freetext", "!|!"))
    return example_parts
def take(entry, tagname, resort):
    """Return the stripped text of the first *tagname* child of *entry*.

    *resort* is returned instead when there is no such child or the child
    has no text.
    """
    found = entry.find(tagname)
    # Compare with None explicitly: an element without children is falsy.
    if found is None or found.text is None:
        return resort
    return found.text.strip()
"""
def list_to_dict(alist):
dict = {}
for item in alist:
dict[item[0]] = item[1]
return dict
"""
def dict_to_list(indict):
    """Return the (key, value) pairs of *indict*, leaving out the "uid" key."""
    return [(key, indict[key]) for key in indict if key != "uid"]
def read_csv_from_file(path, num_of_fields):
    """Yield the first *num_of_fields* tab-separated fields of each line.

    The file at *path* is read as UTF-8. Lines with fewer fields yield a
    shorter list.

    One trailing line-terminator character is removed per line. The
    removal is now conditional, so a final line without a terminator no
    longer loses its last real character. A line ending in "\\r\\n" keeps
    its "\\r", as before.
    """
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            if line.endswith(('\n', '\r')):
                line = line[:-1]
            yield line.split('\t')[:num_of_fields]
if __name__ == '__main__':
    # Usage: script [mode] [language]
    #   mode:     "simplified" selects 1, anything else (or nothing) selects 0.
    #   language: "english" selects 0, anything else selects 1.
    # Both names are plain module-level globals; SweFNEntry.__str__ reads
    # `language`.
    mode = 1 if len(sys.argv) > 1 and sys.argv[1] == "simplified" else 0
    if len(sys.argv) > 2:
        language = 0 if sys.argv[2] == "english" else 1
    else:
        # A missing language argument used to raise IndexError when only the
        # mode was given; it now falls back to the no-argument default.
        language = 0

    tree = ElementTree()
    tree.parse(URL)
    html = HTML()

    # saldo.txt: the first column is the sense id, the sixth the part of speech.
    for (sid, _, _, _, _, pos, _) in read_csv_from_file("saldo.txt", 7):
        saldo_dict[sid] = pos

    # feat "att" name -> SweFNEntry list attribute (one value appended per feat).
    list_feats = {
        "semanticType": "cemtypes",
        "domain": "domains",
        "coreElement": "c_elements",
        "peripheralElement": "p_elements",
        "compound": "comps",
        "compoundExample": "comp_examples",
        "LU": "lus",
        "suggestionForLU": "lu_suggestions",
    }
    # feat "att" name -> SweFNEntry scalar attribute (last value wins).
    # "internal_comment" and any unknown feat are deliberately not rendered.
    scalar_feats = {
        "inheritance": "inheritance",
        "comment": "comment",
        "createdBy": "created_by",
        "createdDate": "createdDate",
        "modifDate": "modifDate",
        "entry_status": "status",
        "BFNID": "bid",
    }
    karp_ns = {"karp": "karp"}

    for le in tree.find("Lexicon").findall("LexicalEntry"):
        sense = le.find("Sense")
        if sense is None:
            # Without a Sense there is no id, and rendering needs an id
            # containing "--"; such entries are skipped.
            continue
        e = SweFNEntry()
        e.id = sense.get("id")

        for feat in sense.findall("feat"):
            feat_att = feat.get("att")
            if feat_att in list_feats:
                getattr(e, list_feats[feat_att]).append(escape(feat.get("val")))
            elif feat_att in scalar_feats:
                setattr(e, scalar_feats[feat_att], escape(feat.get("val")))

        for m in le.findall(KARP + "modification", namespaces=karp_ns):
            stamp = m.find("feat[@att='modificationDateTime']")
            val = stamp.get("val") if stamp is not None else None
            if val:
                # Keep only the date part (first 10 characters).
                e.modifications.append(val[:10])

        # Examples are looked up both with and without the KARP namespace,
        # because elements are currently saved back without namespaces.
        for tag in (KARP + "example", "example"):
            for example in sense.findall(tag, namespaces=karp_ns):
                e.add_example(handle_markup(example))

        html.add_entry(e)

    # This script targets Python 2: `unicode` is the Python 2 builtin.
    # sys.stdout.write emits the same bytes as the former print statement.
    sys.stdout.write(unicode(html).encode("utf-8") + "\n")