"""Report words from a tagged frequency list that are missing from a lexicon.

Tab-separated rows ``word<TAB>tag<TAB>frequency<TAB>c`` are read from stdin,
words present in ``saldo.lex`` are dropped, and the remainder is written to
stdout as HTML (``html``) or tab-separated text (``txt``), selected by the
first command-line argument.

Written for Python 2 (byte-oriented stdin/stdout).  ``print(x)`` is only ever
used with a single argument so that the file also parses under Python 3.

NOTE(review): the markup inside the HTML string literals was missing from the
copy of this file that was reviewed (the literals had no tags and their
placeholder counts did not match their arguments).  The tags below are
reconstructed from the surviving text and the format arguments -- confirm
them against the intended page layout.
"""
import codecs
import json
import sys

try:
    import cjson
except ImportError:
    # python-cjson is an optional C extension; saldo() falls back to the
    # standard-library json module when it is not installed.
    cjson = None


# Page skeleton; "TXT" is the only text that survived in the original literal.
_PAGE = ('<html>\n<head>\n<meta charset="UTF-8">\n<title>TXT</title>\n</head>\n'
         '<body>\n%s\n</body>\n</html>')

# One table row: rank, word, frequency, c.
_ROW = '<tr><td>%d.</td><td>%s</td><td>%s</td><td>%s</td></tr>'

# Two-character tag prefix -> coarse tag.
# NOTE(review): the original if/elif chain tested 'AF' twice; the second test
# (returning 'VB') could never run and is dropped here.  It may have been a
# typo for another prefix -- confirm against the tagset.
_POS_BY_PREFIX = {
    'NC': 'NN',
    'NP': 'PM',
    'AF': 'PC',
    'AQ': 'JJ',
    'V@': 'VB',
}


def _escape(text):
    """Return *text* with the HTML special characters & < > " escaped."""
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def svnhtml(s):
    """Wrap the HTML fragment *s* in a full page and return it UTF-8 encoded."""
    return (_PAGE % s).encode('UTF-8')


def rendertxt(tbl):
    """Print every entry of *tbl* as ``word<TAB>tag<TAB>frequency<TAB>c``.

    *tbl* is a sequence of ``(tag, [(frequency, (word, c)), ...])`` pairs as
    returned by readdata().  Output goes to stdout, one UTF-8 encoded chunk
    per tag; nothing is returned.
    """
    for (p, freqs) in tbl:
        text = "\n".join(["\t".join([w, p, str(f), c]) for (f, (w, c)) in freqs])
        print(text.encode('UTF-8'))


def renderhtml(tbl):
    """Return an HTML fragment: a tag index followed by one table per tag.

    Only entries with a frequency above 9 are shown, at most 1000 per tag,
    and tags left without entries are omitted from both index and body.
    """
    tbl = sorted([(p, [(f, x) for (f, x) in freqs if f > 9][:1000])
                  for (p, freqs) in tbl])
    shown = [(p, freqs) for (p, freqs) in tbl if len(freqs) > 0]
    links = ['<a href="#%s">%s</a>' % (_escape(p), _escape(p)) for (p, _) in shown]
    parts = ['<p>' + " ".join(links) + '</p>']
    parts.extend([render(p, freqs) for (p, freqs) in shown])
    return "".join(parts)


def pos(p):
    """Map tag *p* to a coarse tag by its first two characters.

    Tags whose prefix is not in the mapping are returned unchanged.
    """
    return _POS_BY_PREFIX.get(p[0:2], p)


def render(p, freqs):
    """Return the HTML section for tag *p*: an anchored heading and a table.

    *freqs* is a sequence of ``(frequency, (word, c))``; rows are numbered
    from 1 in the order given.
    """
    tag = _escape(p)
    rows = [_ROW % (rank, _escape(w), f, _escape(c))
            for (rank, (f, (w, c))) in enumerate(freqs, 1)]
    return "\n".join([
        '<h2><a name="%s">%s</a></h2>' % (tag, tag),
        '<table>',
        "\n".join(rows),
        '</table>',
    ])


def saldo(path='saldo.lex'):
    """Return the set of word forms listed in the lexicon file *path*.

    The file holds one JSON object per line with at least the keys ``word``
    and ``param``.  Entries whose ``param`` is 'c', 'ci' or 'cm' are skipped
    (presumably compound-only forms -- confirm against the lexicon format).
    """
    decode = cjson.decode if cjson is not None else json.loads
    skipped = ('c', 'ci', 'cm')
    words = set()
    # 'with' closes the file even if a line fails to decode; iterating the
    # stream avoids loading the whole lexicon with readlines().
    with codecs.open(path, 'r', 'utf-8') as lexicon:
        for line in lexicon:
            entry = decode(line)
            if entry['param'] not in skipped:
                words.add(entry['word'])
    return words


def readdata(words, lines=None):
    """Collect the alphabetic words from *lines* that are not in *words*.

    *lines* yields ``word<TAB>tag<TAB>frequency<TAB>c`` rows and defaults to
    sys.stdin.  Byte strings are decoded as UTF-8; text is used as is.  A
    word is dropped when it, its lower-cased form or its capitalized form is
    in *words*.  Tags are normalised with pos(); a later row with the same
    word and normalised tag replaces the earlier one.

    Returns a list of ``(tag, [(frequency, (word, c)), ...])`` with each
    inner list sorted in descending order.
    """
    if lines is None:
        lines = sys.stdin
    tbl = {}
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('UTF-8')
        # Strip only the newline: slicing off the last character would eat a
        # real character when the final line has no trailing newline.
        (w, p, f, c) = line.rstrip('\n').split('\t')
        p = pos(p)
        f = int(f)
        known = w in words or w.lower() in words or w.capitalize() in words
        if w.isalpha() and not known:
            tbl.setdefault(p, {})[w] = (f, c)
    return [(p, sorted([(f, (w, c)) for (w, (f, c)) in fs.items()], reverse=True))
            for (p, fs) in tbl.items()]


if __name__ == '__main__':
    if len(sys.argv) > 1:
        words = saldo()
        tbl = readdata(words)
        if sys.argv[1] == 'html':
            print(svnhtml(renderhtml(tbl)))
        elif sys.argv[1] == 'txt':
            # rendertxt() prints by itself and returns None; printing its
            # result would append a stray "None" line to the output.
            rendertxt(tbl)