import sys
import cjson
import codecs
# svnhtml(s): interpolate `s` into a page template with %-formatting and
# return the result encoded as UTF-8.
# NOTE(review): the template literal below spans several lines inside single
# quotes and contains only the bare text "TXT"; it looks like the surrounding
# markup was lost at some point -- TODO restore the original template from
# version control before relying on this function.
def svnhtml(s):
return ('
TXT
%s' % s).encode('UTF-8')
def rendertxt(tbl):
    """Print the table as tab-separated text, one entry per line.

    `tbl` is iterated as (p, freqs) pairs, where each element of `freqs`
    unpacks as (f, (w, c)).  For every pair, the lines "w<TAB>p<TAB>f<TAB>c"
    are joined with newlines, encoded as UTF-8 and printed.  Nothing is
    returned; the output goes straight to stdout.
    """
    for (p, freqs) in tbl:
        rows = []
        for (f, (w, c)) in freqs:
            rows.append("\t".join([w, p, str(f), c]))
        print("\n".join(rows).encode('UTF-8'))
# renderhtml(tbl): build one string from a table of (p, freqs) pairs.
# The string starts with a space-joined header containing one item per `p`
# that still has entries after filtering, followed by render(p, freqs) for
# each such `p`.
# NOTE(review): the string literals in this function look damaged: as written,
# '%s' % (p,p) supplies two arguments for one placeholder, and the literal
# that closes the header spans two lines.  Presumably markup (e.g. a link
# using `p` twice) was stripped -- TODO restore from version control.
def renderhtml(tbl):
# For every p keep only the entries with f > 9, cut that list to its first
# 1000 items, and sort the resulting (p, entries) pairs.
tbl = sorted([(p,[(f,x) for (f,x) in freqs if f > 9][:1000]) for (p,freqs) in tbl])
# Header: one formatted item per p whose filtered list is non-empty.
s = '' + " ".join(['%s' % (p,p) for (p,fs) in tbl if len(fs) > 0]) + '
'
# Body: append the rendered section of every p whose filtered list is non-empty.
for (p,freqs) in tbl:
if len(freqs) > 0:
s += render(p,freqs)
return s
def pos(p):
    """Map a tag to a coarse part-of-speech label.

    Only the first two characters of `p` are inspected.  Known prefixes are
    translated as listed in the table below; any other tag (including an
    empty or one-character string) is returned unchanged.

    The original if/elif chain tested the prefix 'AF' twice.  The second
    test (returning 'VB') could never be reached, so it has been dropped;
    the observable mapping is unchanged.
    NOTE(review): the dead branch may have been meant for a different
    prefix -- confirm against the tagset documentation.
    """
    coarse_by_prefix = {
        'NC': 'NN',
        'NP': 'PM',
        'AF': 'PC',
        'AQ': 'JJ',
        'V@': 'VB',
    }
    return coarse_by_prefix.get(p[0:2], p)
# render(p, freqs): return the newline-joined section for one tag `p`.
# The section consists of a heading formatted from `p`, a fixed string, one
# row per element of `freqs` (each unpacked as (f, (w, c)) and numbered from
# 1 via enumerate, using the pattern "%d. | %s | %s | %s |"), and a closing
# string.
# NOTE(review): the literals here look damaged: '' % (p,p) has no placeholder
# for its two arguments, and several quoted fragments span multiple lines.
# Presumably the markup was stripped -- TODO restore from version control.
def render(p,freqs):
return "\n".join([
'' % (p,p),
'',
"\n".join(['%d. | %s | %s | %s |
' % (n+1,w,f,c) for (n,(f,(w,c))) in enumerate(freqs)]),
'
'
])
def saldo():
    """Load the set of known words from the file 'saldo.lex'.

    The file is opened as UTF-8 and every line is decoded with
    cjson.decode.  Each decoded entry is indexed by 'param' and 'word';
    the 'word' value is collected unless 'param' is one of 'c', 'ci' or
    'cm'.

    Returns:
        A set containing the collected 'word' values.

    Any error raised while opening, reading or decoding propagates to the
    caller; the file is closed in either case.
    """
    skipped_params = ('c', 'ci', 'cm')
    words = set()
    f = codecs.open('saldo.lex', 'r', 'utf-8')
    try:
        for line in f.readlines():
            entry = cjson.decode(line)
            if entry['param'] not in skipped_params:
                words.add(entry['word'])
    finally:
        # Close the handle even if a line fails to decode or lacks a key.
        f.close()
    return words
def readdata(words):
    """Read tab-separated records from stdin and group unknown words by tag.

    Every input line is decoded as UTF-8 and split on tabs into exactly
    four fields (w, p, f, c).  `p` is passed through pos() and `f` is
    converted with int().  A record is kept only if `w` is purely
    alphabetic and none of `w`, `w.lower()` and `w.capitalize()` is in
    `words`.

    Args:
        words: container supporting `in`, holding the words to exclude.

    Returns:
        A list of (p, entries) pairs, where `entries` is the list of
        (f, (w, c)) tuples for that tag sorted in descending order.

    Raises:
        ValueError: if a line does not split into four fields or `f` is
            not an integer literal.

    NOTE(review): when the same (p, w) occurs more than once, the later
    record overwrites the earlier one rather than adding to it -- confirm
    this is intended.
    """
    tbl = {}
    for line in sys.stdin:
        # Strip only a real line terminator.  Cutting the last character
        # unconditionally would truncate the final field when the last
        # input line has no trailing newline.
        if line.endswith('\n'):
            line = line[:-1]
        (w, p, f, c) = line.decode('UTF-8').split('\t')
        p = pos(p)
        f = int(f)
        if not w.isalpha():
            continue
        if w in words or w.lower() in words or w.capitalize() in words:
            continue
        tbl.setdefault(p, {})[w] = (f, c)
    return [
        (p, sorted([(f, (w, c)) for (w, (f, c)) in fs.items()], reverse=True))
        for (p, fs) in tbl.items()
    ]
if __name__ == '__main__':
    # The first command-line argument selects the output format ('html' or
    # 'txt').  Without an argument nothing is read or written; with any
    # other argument the input is still consumed but nothing is printed.
    if len(sys.argv) > 1:
        words = saldo()
        tbl = readdata(words)
        if sys.argv[1] == 'html':
            print(svnhtml(renderhtml(tbl)))
        elif sys.argv[1] == 'txt':
            # rendertxt() prints its output itself and returns None, so
            # printing its result emitted a stray "None" line.
            rendertxt(tbl)