# -*- encoding: utf-8 -*-
"""Render word-pair result files as one HTML report per year span.

Reads ``data/kubhist.txt`` (year -> newspaper list) and every other
``data/<first>-<last>.txt`` file (one word-pair hit per line), and writes
``<first>-<last>.html`` to the current directory for each of them.
"""
import codecs
import glob
import os
from collections import defaultdict

DATA_DIR = 'data'
YEAR_INDEX = 'kubhist.txt'

# NOTE(review): the reviewed copy of this script had the markup stripped out
# of its templates, leaving only the visible text ("Experiment 1: %s",
# "Tidningar: %s", the column captions and the "%s %s %d %d %s" row).  The
# tags below are a reconstruction around that text -- confirm against the
# intended page layout.
PAGE_HEAD = u"""<html>
<head>
<meta charset="utf-8">
<title>Experiment 1: %s</title>
</head>
<body>
<h1>Experiment 1: %s</h1>
<p>Tidningar: %s</p>
<table>
<tr><th>ord</th><th>ord</th><th>frekvens</th><th>år</th><th>exempel</th></tr>
"""

# One table row: word, word, frequency, year, example.
ROW = u"""<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%s</td></tr>
"""

PAGE_TAIL = u"""</table>
</body>
</html>
"""


def load_papers_by_year(path):
    """Return a mapping from year (int) to the list of paper names.

    Each line of *path* holds three tab-separated fields: the year, a field
    that is ignored, and a bracketed ``", "``-separated list of identifiers.
    Only the part after the first ``-`` of each identifier is kept.
    """
    papers_by_year = defaultdict(list)
    with codecs.open(path, encoding='utf-8') as index_file:
        for line in index_file:
            year, _, papers = line.strip().split('\t')
            names = papers.replace('[', '').replace(']', '').split(', ')
            papers_by_year[int(year)] = [name.split('-')[1] for name in names]
    return papers_by_year


def parse_hit_line(line):
    """Parse one tab-separated hit line.

    The expected fields are: word, word, frequency, and a fourth field made
    of the year followed by a space and the example text.  Any further
    fields are ignored.

    Returns ``((word1, word2), frequency, year, example)`` with the numbers
    converted to ``int``, or ``None`` when the line is malformed.
    """
    # Strip only the line terminator; slicing off the last character would
    # eat real data on a final line that has no trailing newline.
    fields = line.rstrip('\r\n').split('\t')
    try:
        word1, word2, freq = fields[0], fields[1], fields[2]
        year, example = fields[3].split(' ', 1)
        return (word1, word2), int(float(freq)), int(year), example.strip()
    except (IndexError, ValueError):
        return None


def strip_markers(text):
    """Remove every ``[`` and ``]`` from *text*.

    This covers the ``[[[...]]]``, ``[[...]]`` and ``[...]`` markers in one
    pass, since all of them consist solely of bracket characters.
    """
    return text.replace('[', '').replace(']', '')


def render_report(span, papers, rows):
    """Build the HTML page for one year span and return it as text.

    *span* is the ``first-last`` label, *papers* an iterable of paper names
    and *rows* an iterable of tuples as returned by ``parse_hit_line``.
    """
    # NOTE(review): values are interpolated without HTML escaping.
    parts = [PAGE_HEAD % (span, span, u", ".join(papers))]
    for (word1, word2), freq, year, example in rows:
        parts.append(ROW % (word1, word2, freq, year, strip_markers(example)))
    parts.append(PAGE_TAIL)
    return u"".join(parts)


def main():
    """Generate one HTML report per result file found in ``data/``."""
    papers_by_year = load_papers_by_year(os.path.join(DATA_DIR, YEAR_INDEX))

    for path in glob.glob(os.path.join(DATA_DIR, '*.txt')):
        # basename() works regardless of the platform's path separator.
        name = os.path.basename(path)
        if name == YEAR_INDEX:
            continue

        span = name.split('.')[0]
        first, last = span.split('-')
        # Sorted so that the paper list is stable from run to run.
        papers = sorted(set(
            paper
            for year in range(int(first), int(last) + 1)
            for paper in papers_by_year.get(year, [])
        ))

        rows = []
        with codecs.open(path, encoding='utf-8') as hits:
            for line in hits:
                row = parse_hit_line(line)
                if row is None:
                    # Report the malformed line and skip it instead of
                    # re-using the values parsed from the previous line.
                    print(line)
                    continue
                rows.append(row)

        with codecs.open(span + '.html', encoding='utf-8', mode='w') as out:
            out.write(render_report(span, papers, rows))


if __name__ == '__main__':
    main()