#!/usr/bin/env python
"""Extract base forms, Latin names and lemgrams from a lexicon XML document.

The XML document is read from standard input.  For every child element of
the root's ``Lexicon`` element, the ``feat`` elements found under
``Lemma/FormRepresentation`` are inspected, and one tab-separated line is
written to standard output per base form::

    <base form> TAB <latin names joined by "|", or "-"> TAB <lemgram, or "-">

Output is always encoded as UTF-8.  The code avoids version-specific syntax
so that it runs under both Python 2 and Python 3.

NOTE: This script is unfinished.  It is intended to be used to build raw
material from the Swedberg resource.
"""

import sys
import codecs
import re
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

# Placeholder written in an output column that has no value.
MISSING = "-"


def collect_entries(xml_tree):
    """Collect ``(lemgram, baseforms, latins)`` for each lexicon entry.

    Args:
        xml_tree: Root element of the parsed document.  Entries are the
            direct children of its ``Lexicon`` child.

    Returns:
        A list with one ``(lemgram, baseforms, latins)`` tuple per entry, in
        document order.  ``lemgram`` is the value of the last ``lemgram``
        feature, or ``None`` when the entry has none.  ``baseforms`` holds
        the ``writtenForm`` and ``syn`` values and ``latins`` the ``latin``
        values, each in document order.  An empty list is returned when the
        document has no ``Lexicon`` element.
    """
    lexicon = xml_tree.find("Lexicon")
    if lexicon is None:
        # find() returns None when nothing matches; there is nothing to do.
        return []

    entries = []
    for entry in lexicon:
        baseforms = []
        latins = []
        lemgram = None
        lemma = entry.find("Lemma")
        if lemma is not None:
            for form_rep in lemma.findall("FormRepresentation"):
                for feat in form_rep.findall("feat"):
                    value = feat.get("val")
                    if value is None:
                        # A feature without a value carries no usable data.
                        continue
                    att = feat.get("att")
                    if att in ("writtenForm", "syn"):
                        # Synonyms are listed alongside the written forms.
                        baseforms.append(value)
                    elif att == "lemgram":
                        lemgram = value
                    elif att == "latin":
                        latins.append(value)
        # Entries without a Lemma are still recorded (with empty lists);
        # they simply produce no output rows.
        entries.append((lemgram, baseforms, latins))
    return entries


def format_rows(entries):
    """Format collected entries as tab-separated text rows.

    Args:
        entries: Iterable of ``(lemgram, baseforms, latins)`` tuples as
            returned by :func:`collect_entries`.

    Returns:
        A list of strings (without trailing newline), one per base form.
        Entries with no base forms yield no rows.  A missing lemgram or an
        empty list of Latin names is rendered as ``"-"``.
    """
    rows = []
    for lemgram, baseforms, latins in entries:
        latin_column = "|".join(latins) if latins else MISSING
        lemgram_column = lemgram if lemgram is not None else MISSING
        for baseform in baseforms:
            rows.append(baseform + "\t" + latin_column + "\t" + lemgram_column)
    return rows


def main():
    """Read the XML document from stdin and write the rows to stdout."""
    # Use the underlying binary streams where they exist (Python 3): the XML
    # parser then honours the document's own encoding declaration, and the
    # output is UTF-8 regardless of the locale.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    xml_tree = fromstring(stdin.read())
    for row in format_rows(collect_entries(xml_tree)):
        stdout.write((row + "\n").encode("utf-8"))


if __name__ == "__main__":
    main()