#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import codecs
import re
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
### NAMESPACES: ################ NOT USED RIGHT NOW BUT THE MECHANISM IS (PARTLY) AVAILABLE
ns_saldo_pos = 'saldo_pos'
ns_saldo_sense = 'saldo_sense'
ns_saldo_lemgram = 'saldo_lemgram'
ns_saldo_paradigm = 'saldo_pdgm'
ns_simple_semtype = 'simple_semtype'
ns_kelly_id = 'kelly_id'
ns_lwt_id = 'lwt_id'
ns_simple_class = 'simple_class'
ns_simple_domain = 'simple_domain'
################################
class LMF:
    """Container for one LMF lexicon: a language code, its lexical entries
    and its semantic predicates.  __str__ renders the whole resource.

    NOTE(review): most string literals in this class (and file) appear
    damaged -- the XML tags they presumably contained have been stripped,
    leaving empty '' literals and %-format strings without conversion
    specifiers, so rendering raises TypeError.  Restore from the original
    source before relying on the output.
    """
    def __init__(self, lang):
        self.lang = lang                   # language code, e.g. 'swe'
        self.lexical_entries = []
        self._lexical_entries_set = set()  # "pos.writtenform" keys, lookup speed hack
        self._le_senses = set()            # (sense_id, LexicalEntry) pairs for lookups
        self.useNamespace = False
        self.semantic_predicates = []
    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # Ugly hack to speed it up a bit (replace with something better in the future)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))
    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)
    def __str__(self):
        # Header/body/footer; the bare '' items are the damaged literals
        # described in the class docstring.
        return "\n".join([
            '',
            '',
            '',
            '',
            ' ',
            '',
            '' if not self.useNamespace else '',
            # NOTE(review): '' % self.lang raises TypeError ("not all
            # arguments converted") -- original presumably contained %s.
            ' ' % self.lang,
            "\n".join([str(e) for e in self.lexical_entries]),
            "\n".join([str(s) for s in self.semantic_predicates]),
            '',
            ''])
class LexicalEntry:
    """One LMF LexicalEntry: a lemma, inflected word forms, senses,
    att/val features and links to SALDO senses."""
    def __init__(self):
        self.features = []
        self.lemma = None      # a Lemma instance, set by the builder functions
        self.wordforms = []
        self.senses = []
        self.saldolinks = []
        self._pos = ""         # cached part of speech (used by LMF's dedup hack)
        self._wf = ""          # cached written form (used by LMF's dedup hack)
        self.idattr = ""       # optional id attribute for the rendered element
    def add_sense(self, sense):
        self.senses.append(sense)
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Append only if no feature with the same att/val pair exists yet.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_wordform(self, wordform):
        self.wordforms.append(wordform)
    def add_saldoLink(self, saldoLink):
        self.saldolinks.append(saldoLink)
    def __str__(self):
        le_string = ''
        if(self.idattr):
            # NOTE(review): damaged literal -- '' % (self.idattr) raises
            # TypeError; originally presumably a start tag with an id attribute.
            le_string = '' % (self.idattr)
        return "\n".join([
            le_string,
            '\n'.join([str(f) for f in self.features]),
            str(self.lemma),
            '\n'.join([str(w) for w in self.wordforms]),
            '\n'.join([str(s) for s in self.senses]),
            '\n'.join([str(f) for f in self.saldolinks]),
            ''])
class SaldoLink:
    """Link from a lexical entry to a SALDO sense id."""
    def __init__(self, saldo_id):
        self.saldo_id = saldo_id
    def __str__(self):
        # NOTE(review): damaged literal -- '' % (arg) raises TypeError.
        return '' % (self.saldo_id)
"""
class Lemma:
def __init__(self):
self.features = [] # now including writtenForm and partOfSpeech!
def add_feature(self, feature):
self.features.append(feature)
def add_feature_unique(self, feature):
for existing_feature in self.features:
if(existing_feature.att == feature.att and existing_feature.val == feature.val):
return
self.add_feature(feature)
def __str__(self):
if self.features:
return "\n".join(['\n',
'\n'.join([str(f) for f in self.features]),
'\n'])
else:
return ''
"""
class Lemma:
    """LMF Lemma: holds FormRepresentations.  Features can be added but are
    currently never rendered by __str__ (only the form representations are)."""
    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Append only if no feature with the same att/val pair exists yet.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)
    def __str__(self):
        if self.features or self.form_representations:
            # NOTE: self.features is only used in the emptiness check above.
            return "\n".join(['', '\n'.join(str(fr) for fr in self.form_representations),''])
            #return "\n".join(['\n',
            #                  '\n'.join([str(f) for f in self.features]),
            #                  '\n'])
        else:
            return ''
class WordForm:
    """LMF WordForm: one inflected form with its features and
    form representations."""
    def __init__(self):
        self.features = []
        self.form_representations = []
    def add_feature(self, feature):
        self.features.append(feature)
    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)
    def __str__(self):
        return "\n".join(['',
                          '\n'.join(str(fr) for fr in self.form_representations),
                          '\n'.join([str(f) for f in self.features]),
                          ''])
class FormRepresentation:
    """LMF FormRepresentation: a bag of att/val features describing a form."""
    def __init__(self):
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Append only if no feature with the same att/val pair exists yet.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def __str__(self):
        if self.features:
            return "\n".join(['','\n'.join([str(f) for f in self.features]),''])
        else:
            return ''
class Feature:
    """One LMF feat element: attribute name plus XML-escaped value."""
    def __init__(self, att, val):
        self.att = att
        self.val = val
    def __str__(self):
        # NOTE(review): damaged literal -- '' with two %-args raises
        # TypeError; originally presumably '<feat att="%s" val="%s"/>'.
        return '' % (self.att, escape(self.val))
class Sense:
    """LMF Sense: a sense id plus its relations, predicative
    representations, examples and features."""
    def __init__(self, sense):
        self.sense = sense  # the sense id string
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []
    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)
    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)
    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)
    def add_feature(self, feature):
        self.features.append(feature)
    def __str__(self):
        # Render a self-closing element when the sense has no children,
        # otherwise a full element.  NOTE(review): the literals are damaged
        # ('' % arg raises TypeError).
        if not self.relations and not self.predicative_representations and not self.sense_examples and not self.features:
            return '' % (self.sense)
        else:
            return "\n".join(['' % (self.sense),
                              '\n'.join([str(f) for f in self.features]),
                              "\n".join([str(pre) for pre in self.predicative_representations]),
                              "\n".join([str(rel) for rel in self.relations]),
                              "\n".join([str(ex) for ex in self.sense_examples]),
                              ''
                              ])
class SenseRelation:
    """LMF SenseRelation: target sense id, one or more relation type
    labels, and optional features."""
    def __init__(self, target, relation_types):
        self.target = target
        self.relation_types = relation_types
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def __str__(self):
        # NOTE(review): damaged literals -- '' % arg raises TypeError.
        return "\n".join(['' % (self.target),
                          '\n'.join(['' % t for t in self.relation_types]),
                          '\n'.join([str(f) for f in self.features]),
                          ''
                          ])
class SenseExample:
    """LMF SenseExample wrapping one (already XML-escaped) example string."""
    def __init__(self, example):
        self.example = example
    def __str__(self):
        # NOTE(review): damaged literal -- '' % arg raises TypeError.
        return "\n".join([
            '',
            '' % (self.example),
            ''
            ])
class SemanticPredicate:
    """LMF SemanticPredicate: id, semantic type labels, arguments and
    features.  A non-empty domain is stored as a 'domain' feature."""
    def __init__(self, id, domain, semantic_types):
        self.id = id
        #self.domain = domain
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        if domain != None and domain != "":
            self.add_feature(Feature("domain", domain))
    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)
    def add_feature(self, feature):
        self.features.append(feature)
    def generateFeatures(self, att, vals):
        # One feature per value, whitespace-stripped.
        for val in vals:
            self.add_feature(Feature(att, val.strip()))
    def __str__(self):
        # NOTE(review): 'extras' is built but never used, and the string
        # literals are damaged ('' % arg raises TypeError).
        extras = ""
        for st in self.semantic_types:
            extras += ''
        return "\n".join([
            '' % (self.id),
            "\n".join(['\n' % (st) for st in self.semantic_types]),
            "\n".join([str(fe) for fe in self.features]),
            "\n".join([str(sa) for sa in self.semantic_arguments]),
            ''
            ])
class SemanticArgument:
    """LMF SemanticArgument: a semantic role plus its core type."""
    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type
    def __str__(self):
        # NOTE(review): damaged literal -- '' with two %-args raises TypeError.
        return '' % (self.semantic_role, self.core_type)
class PredicativeRepresentation:
    """LMF PredicativeRepresentation pointing at a SemanticPredicate id."""
    def __init__(self, idref):
        self.idref = idref
    def __str__(self):
        # NOTE(review): damaged literal -- '' with two %-args raises TypeError.
        return '' % (self.idref, self.idref)
# HELPER FUNCTIONS -------------------------------------------------------------------------------------
def escape(s):
    """Return s with XML special characters replaced by their entities.

    The previous version replaced every character with itself -- the entity
    names (&amp; &apos; &lt; &gt; &quot;) had been lost -- so values went
    into the output unescaped.  Ampersand is replaced first so already
    produced entities are not double-escaped.
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
def read_csv(num_of_fields, tolerates=-1):
    """Yield tab-separated records from stdin, padded to num_of_fields.

    A record with exactly num_of_fields fields is yielded as-is.  A record
    with at least `tolerates` fields (default: num_of_fields) but fewer
    than num_of_fields is padded with empty strings.  All other records
    are silently skipped.
    """
    minimum = num_of_fields if tolerates == -1 else tolerates
    for raw in sys.stdin:
        fields = raw[:-1].split('\t')
        count = len(fields)
        if count == num_of_fields:
            yield fields
        elif minimum <= count < num_of_fields:
            yield fields + [""] * (num_of_fields - count)
    return
def read_csv_from_file(path, num_of_fields):
    """Yield the first num_of_fields tab-separated fields of every line in
    a UTF-8 encoded file, each field re-encoded to UTF-8 bytes."""
    with codecs.open(path, encoding='utf-8') as handle:
        for raw in handle:
            fields = raw[:-1].split('\t')
            #fields kept as unicode would be: raw[:-1].split('\t')
            yield [field.encode("utf-8") for field in fields[0:num_of_fields]]
def give_namespace(namespace, identifier):
    """Return the identifier unchanged; namespace prefixing is disabled.

    The namespace parameter is kept so all call sites stay valid should
    prefixing be switched back on.
    """
    #return '%s:%s' % (namespace, identifier)
    return identifier
def search_for_le_with_sense(lmf, sense):
    """Return the LexicalEntry registered in lmf._le_senses for the given
    sense id, or None when no entry carries that sense."""
    hits = (entry for (sense_id, entry) in lmf._le_senses if sense_id == sense)
    return next(hits, None)
def dequote(s):
    """Strip one pair of surrounding quotes (single or double) from s.

    Returns s unchanged when it is not quoted.  The previous version
    crashed with IndexError on the empty string (s[0]) and turned a lone
    quote character into ""; both are now returned unchanged.
    """
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        return s[1:-1]
    return s
# --- SALDO ------------------------------------------------
##def sorting_by_sense(a, b):
## if a[0] == b[0]:
## return 0
## elif a[0] < b[0]:
## return -1
## else:
## return 1
# a sense is unique for an entry.
def saldo_data():
    """Read SALDO records from stdin and group them.

    Returns (forms, senses): forms maps a sense id to a list of
    (lemgram, gf, pos, paradigm) tuples, and senses is the list of unique
    (saldo, primary, secondary) triples sorted on the sense id.
    """
    forms = {}
    senses = set()
    for (saldo, primary, secondary, lemgram, gf, pos, paradigm) in read_csv(num_of_fields=7):
        forms.setdefault(saldo, []).append((lemgram, gf, pos, paradigm))
        senses.add((saldo, primary, secondary))
    return (forms, sorted(senses, key=lambda triple: triple[0]))
def saldo():
    """Build the SALDO lexicon LMF from stdin data and return it as a string."""
    lmf = LMF('swe')
    (forms, senses) = saldo_data()
    # Artificial root entry for the SALDO top sense PRIM..1.
    lexical_entry = LexicalEntry()
    sense = Sense(give_namespace(ns_saldo_sense,'PRIM..1'))
    lexical_entry.lemma = Lemma()
    lexical_entry.add_sense(sense)
    lmf.add_lexical_entry(lexical_entry)
    counter = 0
    for (saldo, primary, secondary) in senses:
        #saldo = give_namespace(ns_saldo_sense, saldo.strip())
        counter += 1
        lexical_entry = LexicalEntry()
        lemma = Lemma()
        # One FormRepresentation per lemgram attached to this sense.
        for (lemgram, gf, pos, paradigm) in forms[saldo]:
            form_representation = FormRepresentation()
            form_representation.add_feature(Feature("writtenForm", gf))
            form_representation.add_feature(Feature("partOfSpeech", give_namespace(ns_saldo_pos, pos)))
            form_representation.add_feature(Feature("lemgram", give_namespace(ns_saldo_lemgram, lemgram)))
            form_representation.add_feature(Feature("paradigm", give_namespace(ns_saldo_paradigm, paradigm)))
            lemma.add_form_representation(form_representation)
        lexical_entry.lemma = lemma
        sense = Sense(give_namespace(ns_saldo_sense, saldo))
        # Every sense gets a 'primary' relation; 'secondary' relations are
        # space-separated and skipped when they are just the root sense.
        #if primary != 'PRIM..1':
        primary_relation = SenseRelation(give_namespace(ns_saldo_sense, primary), ['primary'])
        sense.add_sense_relation(primary_relation)
        if secondary != 'PRIM..1':
            for sec in secondary.split(' '):
                secondary_relation = SenseRelation(give_namespace(ns_saldo_sense, sec), ['secondary'])
                sense.add_sense_relation(secondary_relation)
        lexical_entry.add_sense(sense)
        lmf.add_lexical_entry(lexical_entry)
    #return counter
    #return len(lmf.lexical_entries)
    return str(lmf)
#def search_for_lexical_entry(lmf, pos, saldo):
# #saldo = saldo.split("..")[0]
# #if (pos + "." + saldo) in lmf._lexical_entries_set: # Fulhack for att gora det snabbare, ersatt med nat battre i framtiden.
# if saldo in
# for le in lmf.lexical_entries:
# if pos == le._pos and saldo == le._wf:
# return le
# return None
#def search_for_sense(lexical_entry, saldo):
# for s in lexical_entry.senses:
# if s.sense == saldo:
# return s
# return None
# ------- SALDO EXAMPLES -----------------------------------
def saldo_examples_data():
    """Read (sense id, example) pairs from stdin, dropping empty examples
    and the '*' placeholder."""
    collected = []
    for row in read_csv(num_of_fields=6):
        example_text = row[5].strip()
        if example_text and example_text != "*":
            sense_id = give_namespace(ns_saldo_sense, row[0].strip())
            collected.append((sense_id, example_text))
    return collected
def saldo_examples():
    """Build an LMF with SALDO sense examples read from stdin."""
    lmf = LMF('swe')
    added_examples = []
    for (saldo, example) in saldo_examples_data():
        le = search_for_le_with_sense(lmf, saldo)
        if not le:
            # First time we see this sense: create a bare entry for it.
            le = LexicalEntry()
            le.lemma = Lemma()
            lmf._le_senses.add((saldo, le))
            sense = Sense(saldo)
            le.add_sense(sense)
            lmf.add_lexical_entry(le)
        #sense = search_for_sense(le, saldo)
        sense = le.senses[0]
        if not (example, saldo) in added_examples: # We don't want duplicates if we have different sources
            added_examples.append((example, saldo))
            sense.add_sense_example(SenseExample(escape(example)))
    return str(lmf)
# ------- SWESAURUS ----------------------------------------
def swesaurus_data():
    """Swesaurus input: 5-column TSV records from stdin."""
    return read_csv(num_of_fields=5)
def swesaurus():
    """Build the Swesaurus LMF (relations between SALDO senses) from stdin."""
    lmf = LMF('swe')
    synsets = swesaurus_data()
    for (saldo1, saldo2, type_of, degree, source) in synsets:
        saldo1 = give_namespace(ns_saldo_sense, saldo1.strip())
        saldo2 = give_namespace(ns_saldo_sense, saldo2.strip())
        saldo = [saldo1, saldo2]
        le = [None, None]
        # The relations are not always symmetric, but since a SenseRelation's IDREF has to point at a Sense ID,
        # we have to make sure that there is an inverse relation (y-to-x for each x-to-y)
        n = 2 if type_of == "syn" else 1
        for i in range(n):
            le[i] = search_for_le_with_sense(lmf, saldo[i])
            if le[i] == None:
                # First occurrence of this sense: create its entry.
                le[i] = LexicalEntry()
                le[i].lemma = Lemma()
                lmf._le_senses.add((saldo[i], le[i]))
                lmf.add_lexical_entry(le[i])
                sense = Sense(saldo[i])
                le[i].add_sense(sense)
            else:
                sense = le[i].senses[0]
            # Relation from this sense to the other member of the pair.
            sense_relation = SenseRelation(saldo[(i+1) % 2], [type_of])
            sense_relation.add_feature(Feature("degree", degree))
            sense_relation.add_feature(Feature("source", source))
            sense.add_sense_relation(sense_relation)
    return str(lmf)
# ---------- LWT -------------------------------------------
def lwt_data():
    """LWT input: 5-column TSV records from stdin."""
    return read_csv(num_of_fields=5)
def lwt(): # LexicalEntry acts more like a semantic entry right now
    """Build the LWT (loanword typology) LMF from stdin."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = lwt_data()
    for (s_id, saldo, eng, definition, example) in entries:
        # NOTE(review): eng2 (gloss normalized by stripping quotes, question
        # marks and parenthesised/slashed alternatives) is computed but
        # never used.
        eng2 = eng.replace("'", "").replace("?", "")
        if "(" in eng2:
            eng2 = eng2.split("(")[0].strip()
        if "/" in eng2:
            eng2 = eng2.split("/")[0].strip()
        le = LexicalEntry()
        sense = Sense("lwt--" + s_id)
        saldo = saldo.strip()
        for s in saldo.split():
            s = s.strip()
            if s != 'PRIM..1':
                le.add_saldoLink(SaldoLink(s))
                sense.add_feature(Feature("saldoSense", s))
        le.lemma = Lemma()
        le.add_sense(sense)
        lmf.add_lexical_entry(le)
        form_representation = FormRepresentation()
        form_representation.add_feature(Feature("lwtID", s_id))
        form_representation.add_feature(Feature("english", eng))
        if definition and definition != "--":
            form_representation.add_feature(Feature("definition", dequote(definition)))
        if example and example != "--":
            form_representation.add_feature(Feature("example", dequote(example))) # Maybe the apostrophes around (most of) the examples should be stripped of?
        le.lemma.add_form_representation(form_representation)
    return str(lmf)
# ---------- PAROLE ----------------------------------------
def parole_data():
    """Parole input: 5-column TSV records from stdin."""
    return read_csv(num_of_fields=5)
def parole():
    """Build the Parole LMF from stdin, merging rows that share a parole id."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = parole_data()
    parole_entries = {}
    # First pass: group rows per parole id, patching known data glitches.
    for (baseform, saldo, pos, valency, paroleid) in entries:
        if paroleid == "zz":
            paroleid = baseform + "_zz"  # make the placeholder id unique
        saldo = saldo.strip()
        # sometimes the tabs have become spaces, so this is a fix for that:
        if saldo == "av arbeta_av..1":
            baseform = "arbeta av"
            saldo = "arbeta_av..1"
        elif saldo == "bort arbeta_bort..1":
            baseform = "arbeta bort"
            saldo = "arbeta_bort..1"
        elif saldo == " epilera..1":
            saldo = "epilera..1"
        elif saldo == "frottera _sig..1":
            saldo = "frottera_sig..1"
        if baseform == "gille(s)stuga":
            baseform = "gillestuga"
        if paroleid in parole_entries:
            # Repeated id: only collect the extra SALDO sense.
            if saldo != "PRIM..1":
                parole_entries[paroleid]["saldo"].append(saldo)
        else:
            parole_entries[paroleid] = {}
            parole_entries[paroleid]["pos"] = pos
            parole_entries[paroleid]["baseform"] = baseform
            parole_entries[paroleid]["valency"] = valency
            parole_entries[paroleid]["paroleid"] = paroleid
            parole_entries[paroleid]["saldo"] = [saldo] if saldo != "PRIM..1" else []
    # Second pass: one LexicalEntry per grouped parole entry.
    for pe in parole_entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        s = Sense("parole--" + parole_entries[pe]["paroleid"])
        le.add_sense(s)
        fr.add_feature(Feature("partOfSpeech", parole_entries[pe]["pos"]))
        fr.add_feature(Feature("writtenForm", parole_entries[pe]["baseform"]))
        fr.add_feature(Feature("valency", parole_entries[pe]["valency"]))
        fr.add_feature(Feature("paroleID", parole_entries[pe]["paroleid"]))
        for sid in parole_entries[pe]["saldo"]:
            le.add_saldoLink(SaldoLink(sid))
            s.add_feature(Feature("saldoSense", sid))
        lmf.add_lexical_entry(le)
    # NOTE(review): a long block of commented-out legacy code (an earlier
    # per-SALDO-sense rendering) was removed here.
    return str(lmf)
# ---------- SIMPLE ----------------------------------------
# Expansion of the SIMPLE '++xxx' ontology abbreviations to full names.
simple_semantic_types = {'++ext' : 'Extensional',
                         '++psy' : 'Psychological_property',
                         '++phy' : 'Physical_property',
                         '++soc' : 'Social_property',
                         '++tem' : 'Temporal_property',
                         '++inp' : 'Intensifying_property',
                         '++rel' : 'Relational_property'}
# SIMPLE argument-pattern code -> number of arguments (as a string).
simple_argmap = {'a_00' : '0',
                 'a0' : '1',
                 'a0 a1' : '2',
                 'a0 a1a' : '2', # markup error?
                 'a0 a' : '2', # markup error?
                 'a0 a1 a2' : '3',
                 'a0 a1a2' : '3'} # markup error?
def simple_expand_semantic_type(abbrev):
    """Expand a SIMPLE ontology abbreviation (e.g. '++psy') to its full
    name, then strip a single remaining leading '+'.

    Unknown abbreviations are passed through (minus one leading '+').
    Uses startswith instead of abbrev[0] so an empty string no longer
    raises IndexError.
    """
    abbrev = simple_semantic_types.get(abbrev, abbrev)
    if abbrev.startswith("+"):
        abbrev = abbrev[1:]
    return abbrev
def simple_data():
    """SIMPLE input: 17-column TSV records from stdin."""
    return read_csv(num_of_fields=17)
def simple():
    """Build the SIMPLE lexicon LMF from stdin."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = simple_data()
    for (baseform,paroleid,_,ssensen,gldb,bc,ontology,domain,lexiquest,gldbex,usynsemu,args,argreal,predfornoun,verbnoun,pos,saldo) in entries:
        le = LexicalEntry()
        # Sense id: parole id plus the SIMPLE sense number (ssensen minus
        # its '<<' prefix).
        newID = "simple--" + paroleid + "-" + ssensen[2:]
        sense = Sense(newID)
        le.add_sense(sense)
        if domain == "g":
            domain = "Gen"
        ## SemanticType (= ontology)
        sense.add_feature(Feature("semanticType", simple_expand_semantic_type(ontology)))
        sense.add_feature(Feature("domain", domain))
        if ssensen[0:2] == "<<":
            sense.add_feature(Feature("simpleSenseNumber", ssensen[2:]))
        sense.add_feature(Feature("GLDB", gldb))
        if gldbex != "-":
            sense.add_feature(Feature("GLDBExample", gldbex)) # lemma/sense/nuance
        ## Basic Concepts
        if bc != "ZZ":
            sense.add_feature(Feature("basicConcept", bc)) # lemma/sense/nuance
        ## LexiQuest
        lexiquest = lexiquest.strip()
        for c in lexiquest.split("@"):
            sense.add_feature(Feature("class", c))
        ## Codes(?), number of links between an usyn construction and corresponding semu specifications
        usynsemu = usynsemu.strip()
        if usynsemu != "-":
            if usynsemu[0] == "p":
                usynsemu = usynsemu[1:]
            sense.add_feature(Feature("numberOfUsynSemuLinks", usynsemu))
        ## Arguments
        if args != "-":
            args = simple_argmap[args]
            sense.add_feature(Feature("numberOfArguments", args))
        ## Argument realisations:
        argreal = argreal.strip()
        if argreal != "aa_00":
            #for ar in argreal.split("_OR_"): <-- This can be realised in the future
            # ...
            sense.add_feature(Feature("argumentRealisation", argreal))
        ## Predicate for noun
        if predfornoun != "-":
            sense.add_feature(Feature("predicate", predfornoun))
        ## Type of verb_noun
        # type of verbal noun: l_n (verb nominalisation); l_ag (agent nominalisation); l_pa (process nominalisation); otherwise l_00
        if verbnoun != "-" and verbnoun != "l_00":
            verbnoun = verbnoun[2:]
            sense.add_feature(Feature("verbalizedNounType", verbnoun))
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("partOfSpeech", pos))
        fr.add_feature(Feature("writtenForm", baseform))
        fr.add_feature(Feature("paroleID", paroleid))
        if saldo != "-":
            for s in saldo.split(";"):
                le.add_saldoLink(SaldoLink(s))
                sense.add_feature(Feature("saldoSense", s))
        lmf.add_lexical_entry(le)
    # NOTE(review): a long block of commented-out legacy code (an earlier
    # rendering that stored the same data as FormRepresentation features)
    # was removed here.
    return str(lmf)
# ---------- KELLY -----------------------------------------
# Kelly part-of-speech labels -> SALDO part-of-speech tags.
kelly_to_saldo = {'verb' : 'vb',
                  'noun' : 'nn',
                  'noun-en' : 'nn',
                  'noun-ett' : 'nn',
                  'noun-en/-ett' : 'nn',
                  'adjective' : 'av',
                  'numeral' : 'nl',
                  'proper name' : 'pm',
                  'adverb' : 'ab',
                  'aux verb' : 'vb',
                  'conj' : 'kn',
                  'det' : 'pn',
                  'interj' : 'in',
                  'particip' : 'vb',
                  'particle' : 'ab',
                  'prep' : 'pp',
                  'pronoun' : 'pn',
                  'subj' : 'sn'}
def map_kelly_pos_to_saldo(pos):
    """Map a Kelly POS label to a SALDO POS tag (unknown labels pass through)."""
    return kelly_to_saldo.get(pos, pos)
def kelly_data():
    """Kelly input: 10-column TSV records from stdin; rows with only 9
    fields are tolerated and padded."""
    return read_csv(num_of_fields=10, tolerates=9)
def kelly():
    """Build the Kelly word-list LMF from stdin."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = kelly_data()
    for (id_num, raw, wpm, cefr, source, grammar, baseform, saldo, pos, example) in entries:
        saldo = saldo.strip()
        # Split a trailing parenthesised addition off the baseform,
        # e.g. "foo (bar)" -> baseform "foo", extrainfo "(bar)".
        if "(" in baseform:
            extrainfo = "(" + baseform.split("(")[1]
            baseform = baseform.split("(")[0].strip()
        else:
            extrainfo = None
        le = LexicalEntry()
        sense = Sense("kelly--" + baseform)
        le.add_sense(sense)
        for s in saldo.split():
            le.add_saldoLink(SaldoLink(s.strip()))
            sense.add_feature(Feature("saldoSense", s))
        kellyid = "" # TODO: FIX
        lemma = Lemma()
        le.lemma = lemma
        form_representation = FormRepresentation()
        lemma.add_form_representation(form_representation)
        form_representation.add_feature(Feature("writtenForm", baseform))
        if extrainfo:
            form_representation.add_feature(Feature("formInformation", extrainfo))
        form_representation.add_feature(Feature("partOfSpeech", map_kelly_pos_to_saldo(pos)))
        form_representation.add_feature(Feature("kellyPartOfSpeech", pos))
        form_representation.add_feature(Feature("kellyIdentifier", id_num))
        form_representation.add_feature(Feature("raw", raw))
        form_representation.add_feature(Feature("wpm", wpm))
        form_representation.add_feature(Feature("cefr", cefr))
        form_representation.add_feature(Feature("source", source))
        if grammar: # Maybe both grammar and example should always be there but empty?
            form_representation.add_feature(Feature("grammar", grammar))
        if example:
            form_representation.add_feature(Feature("example", example)) # Maybe "e.g. " in the start of the sentences should be deleted?
        lmf.add_lexical_entry(le)
    # NOTE(review): a long block of commented-out legacy code (an earlier
    # per-SALDO-sense rendering) was removed here.
    return str(lmf)
# ---------- WORDNET ---------------------------------------
# Wordnet is a little special because it operates on the files 'wn3_synsets.txt' and 'wordnet-saldo.txt' already in the directory
# WordNet POS tag -> SALDO POS tag.
saldo_pos_from_wordnet = { "n" : "nn", "s" : "av", "v" : "vb", "r" : "ab", "a" : "av"} # adjective satellites (s) can also be numerals etc.
def wordnet_data():
    """Read 'wn3_synsets.txt' and 'wordnet-saldo.txt' from the current
    directory; return a dict keyed on synset id with merged data."""
    synsets = {}
    with codecs.open('wn3_synsets.txt', encoding='utf-8') as f:
        for line in f:
            e = [x.encode("utf-8") for x in line[:-1].split('\t')]
            synsets[e[0]] = {}
            syn = synsets[e[0]]
            syn["gloss"] = e[2]
            syn["pos"] = e[3]
            syn["definition"] = e[4] # The definition may contain one or more examples, grab these later! FORM: definition; "[example]"; "[example]" ...
    entries = {}
    with codecs.open('wordnet-saldo.txt', encoding='utf-8') as f:
        for line in f:
            e = [x.encode("utf-8") for x in line[:-1].split('\t')]
            if not e[0] in entries:
                entries[e[0]] = {}
            ent = entries[e[0]]
            ent["saldo"] = e[1]
            ent["synset"] = e[0]
            ent["type"] = e[2]
            ent["core"] = e[6]
            ent["freq"] = e[4]
            # Merge in the synset-level data read above.
            ent["gloss"] = synsets[e[0]]["gloss"]
            ent["pos"] = synsets[e[0]]["pos"]
            ent["definition"] = synsets[e[0]]["definition"]
    #with codecs.open('wordnet-saldo-relations.txt', encoding='utf-8') as f:
    #    for line in f:
    #        e = [x.encode("utf-8") for x in line[:-1].split('\t')]
    #        if not "relations" in entries[e[0]]:
    #            entries[e[0]]["relations"] = []
    #        rels = entries[e[0]]["relations"]
    #        rels.append((e[1], e[2]))
    return entries
def wordnet():
    """Build the Swedish WordNet LMF from the data files in the current
    directory."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    objects = wordnet_data()
    for key in objects:
        item = objects[key]
        le = LexicalEntry()
        lmf.add_lexical_entry(le)
        # Make the synset key usable as an id (':' and '%' are replaced).
        sense = Sense("wordnet--" + key.replace(":","_").replace("%","_"))
        le.add_sense(sense)
        le.add_saldoLink(SaldoLink(item["saldo"]))
        sense.add_feature(Feature("saldoSense", item["saldo"]))
        #if "relations" in item:
        #    for rel in item["relations"]:
        #        s.add_sense_relation(SenseRelation(rel[1], [rel[0]]))
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        glosses = item["gloss"].split(", ")
        for gl in glosses:
            fr.add_feature(Feature("gloss", gl))
        fr.add_feature(Feature("partOfSpeech", saldo_pos_from_wordnet[item["pos"]]))
        fr.add_feature(Feature("wordnetPartOfSpeech", item["pos"]))
        # Definition field format: definition; "[example]"; "[example]" ...
        def_and_examples = [x.strip() for x in item["definition"].split(";")]
        if len(def_and_examples) >= 1:
            sense.add_feature(Feature("definition", def_and_examples[0]))
        if len(def_and_examples) > 1:
            for examp in def_and_examples[1:]:
                if examp != "":
                    sense.add_feature(Feature("example", dequote(examp)))
        sense.add_feature(Feature("synset", item["synset"]))
        sense.add_feature(Feature("type", item["type"]))
        sense.add_feature(Feature("core", item["core"]))
        sense.add_feature(Feature("frequency", item["freq"]))
        lemma.add_form_representation(fr)
    return str(lmf)
# ---------- CROSS PIVOT -----------------------------------
# Uses the raw material from dalin and fsv to make a cross pivot resource
# allowing searching on for example "brev" to find "bref" etc.
def cp_fsvbase_data():
    """Old Swedish (fornsvenska) base data: 10-column TSV file."""
    return read_csv_from_file("../fsv/fsv.txt", 10)
def cp_dalinbase_data():
    """Dalin base data: 10-column TSV file."""
    return read_csv_from_file("dalin_saldo.txt", 10)
def crosspivot():
    """Build a cross-pivot LMF from the Dalin and fornsvenska (FSV) files,
    linking historical lemgrams to their modern SALDO lemgram so that e.g.
    a search for "brev" also finds "bref"."""
    lmf = LMF('swe')
    pivots = {}
    # Data from Dalin
    entries = cp_dalinbase_data()
    for (old_spelling, new_spelling, pos, dalin_gram, dalin_lemgram, le_type, pattern, saldo_lemgram, saldo_senses, skos) in entries:
        if saldo_lemgram != "--" and saldo_lemgram != "PRIM..1":
            if saldo_lemgram in pivots:
                pivots[saldo_lemgram].append(("_1800", dalin_lemgram, skos))
            else:
                pivots[saldo_lemgram] = [("_1800", dalin_lemgram, skos)]
    # Data from FSV
    entries = cp_fsvbase_data()
    for (old_spelling, new_spelling, pos, _, fsv_lemgram, le_type, pattern, saldo_lemgram, saldo_senses, skos) in entries:
        if saldo_lemgram != "--" and saldo_lemgram != "PRIM..1":
            # An FSV row may carry several space-separated SALDO lemgrams.
            if " " in saldo_lemgram:
                saldo_lemgrams = saldo_lemgram.split(" ")
            else:
                saldo_lemgrams = [saldo_lemgram]
            for sl in saldo_lemgrams:
                if not sl in pivots:
                    pivots[sl] = []
                pivots[sl].append(("old", fsv_lemgram, skos))
    # One entry per modern lemgram, with one FormRepresentation per
    # historical variant.
    for pivot in pivots:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        saldo_fr = FormRepresentation()
        saldo_fr.add_feature(Feature("category", "modern"))
        saldo_fr.add_feature(Feature("lemgram", pivot))
        lemma.add_form_representation(saldo_fr)
        for post in pivots[pivot]:
            fr = FormRepresentation()
            fr.add_feature(Feature("category", post[0]))
            fr.add_feature(Feature("lemgram", post[1]))
            fr.add_feature(Feature("match", post[2]))
            lemma.add_form_representation(fr)
        lmf.add_lexical_entry(le)
    return str(lmf)
# ---------- DALIN BASE MATERIAL -----------------------------------
def dalinbase_data():
    """Dalin base input: 10-column TSV records from stdin."""
    return read_csv(num_of_fields=10)
def dalinbase():
    """Build the Dalin base-material LMF from stdin."""
    lmf = LMF('swe')
    entries = dalinbase_data()
    for (old_spelling, new_spelling, pos, dalin_gram, dalin_lemgram, le_type, pattern, saldo_lemgram, saldo_senses, match_type) in entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        fr.add_feature(Feature("lemgram", dalin_lemgram))
        fr.add_feature(Feature("oldSpelling", old_spelling))
        fr.add_feature(Feature("newSpelling", new_spelling))
        fr.add_feature(Feature("xref", le_type))
        fr.add_feature(Feature("partOfSpeech", pos))
        if pattern != "--":
            fr.add_feature(Feature("paradigm", pattern))
        lemma.add_form_representation(fr)
        lmf.add_lexical_entry(le)
    return str(lmf)
# ----------- SWEDBERG "FAKE" MORPHOLOGY -------------------------
def swedbergm():
    """Build a 'fake' morphology LMF for Swedberg: read an LMF-like XML
    document from stdin, collect (lemgram, written forms, pos) per entry,
    and emit each written form as a WordForm with msd/paradigm 'prim'."""
    total = []
    xml_tree = fromstring(sys.stdin.read())
    for entry in xml_tree.find("Lexicon"):
        lemma = entry.find("Lemma")
        baseforms = []
        lem = None
        pos = "prim"  # placeholder until a partOfSpeech feat is found
        if lemma != None:
            freps = lemma.findall("FormRepresentation")
            if freps != None:
                for fr in freps:
                    allfeats = fr.findall("feat")
                    if allfeats != None:
                        for feat in allfeats:
                            if feat.attrib["att"] == "writtenForm":
                                baseforms.append(feat.attrib["val"].encode('utf-8'))
                            elif feat.attrib["att"] == "lemgram":
                                lem = feat.attrib["val"].encode('utf-8')
                            elif feat.attrib["att"] == "partOfSpeech":
                                pos = feat.attrib["val"].encode('utf-8')
        total.append((lem, baseforms, pos))
    lmf = LMF('swe')
    for (lemg, wflist, pos) in total:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        if lemg != None:
            fr.add_feature(Feature("lemgram", lemg))
        for wf in wflist:
            # NOTE(review): stray 'pass' kept from the original; the file's
            # indentation was lost, so whether the statements below belonged
            # inside this loop is reconstructed -- confirm against the
            # original source.
            pass
            fr.add_feature(Feature("writtenForm", wf))
            wordform = WordForm()
            wordform.add_feature(Feature("writtenForm", wf))
            wordform.add_feature(Feature("msd", "prim"))
            le.add_wordform(wordform)
        if pos != "prim":
            fr.add_feature(Feature("partOfSpeech", pos))
        fr.add_feature(Feature("paradigm", "prim"))
        lmf.add_lexical_entry(le)
    return str(lmf)
# ----------- AKADEMISK ORDLISTA----------------------------
def ao_data():
    """Akademisk ordlista input: 3-column TSV records from stdin."""
    return read_csv(num_of_fields=3)
def ao():
    """Build the Akademisk ordlista (academic word list) LMF from stdin.
    Entries get a rank feature reflecting their input order."""
    # Swedish POS names -> SALDO POS tags.
    pos_conversion = {"adverb" : "ab",
                      "substantiv" : "nn",
                      "adjektiv" : "av",
                      "verb" : "vb",
                      "preposition" : "pp",
                      "konjunktion" : "kn",
                      "particip" : "av",
                      "frågande/relativt_possesivuttryck" : "pn",
                      "partikel" : "pp",
                      "possessivuttryck" : "pn",
                      "pronomen" : "pn",
                      "subjunktion" : "sn",
                      }
    lmf = LMF('swe')
    entries = ao_data()
    rank = 1
    for (ao_lemma, pos, saldo_id) in entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("writtenForm", ao_lemma.strip()))
        # The saldo_id column may hold several |-separated lemgrams.
        lemgrams = saldo_id.split("|")
        for l in lemgrams:
            fr.add_feature(Feature("lemgram", l.strip()))
        fr.add_feature(Feature("partOfSpeech", pos_conversion[pos.strip()]))
        fr.add_feature(Feature("nativePartOfSpeech", pos.strip()))
        fr.add_feature(Feature("rank", str(rank)))
        sense = Sense("ao--" + ao_lemma.strip())
        le.add_sense(sense)
        rank += 1
        lmf.add_lexical_entry(le)
    return str(lmf)
# ----------------------------------------------------------
if __name__ == '__main__':
    # Dispatch on the resource name given as the first command-line argument.
    # NOTE(review): Python 2 print statements -- this file is not Python 3
    # compatible.  Also, 'saldo' and 'saldoe' use two separate `if`s instead
    # of one if/elif chain; harmless here since the conditions are mutually
    # exclusive, but inconsistent with the rest of the chain.
    if len(sys.argv) > 1:
        resource = sys.argv[1]
        if resource == 'saldo':
            print saldo()
        if resource == 'saldoe':
            print saldo_examples()
        elif resource == 'swesaurus':
            print swesaurus()
        elif resource == 'lwt':
            print lwt()
        elif resource == 'parole':
            print parole()
        elif resource == 'simple':
            print simple()
        elif resource == 'kelly':
            print kelly()
        elif resource == 'wordnet':
            print wordnet()
        elif resource == 'crosspivot':
            print crosspivot()
        elif resource == 'dalinbase':
            print dalinbase()
        elif resource == 'swedbergm':
            print swedbergm()
        elif resource == 'ao':
            print ao()