"""List the papers covered by each ``<first>-<last>.txt`` year-span file.

Reads the tab-separated index ``kubhist.txt`` (year, an ignored middle
column, and a bracketed ``", "``-separated list of ``prefix-name`` tokens),
then prints one entry per other ``*.txt`` file in the current directory,
naming the papers the index lists for any year in that file's span.
"""
import codecs
import glob
from collections import defaultdict

INDEX_FILE = 'kubhist.txt'

# The original template had two placeholders for three arguments, which
# raises TypeError at the first print; one placeholder per argument now.
# NOTE(review): any markup that originally surrounded the placeholders is
# not visible in this file -- confirm the intended layout.
ENTRY_TEMPLATE = """ %s %s %s """


def parse_index_line(line):
    """Split one index line into ``(year, [paper names])``.

    The line must hold exactly three tab-separated fields; the middle one
    is ignored.  Square brackets are removed from the last field, it is
    split on ``", "``, and the second ``-``-separated segment of every
    token is kept as the paper name.

    Raises ValueError if the field count is wrong or the year is not an
    integer, and IndexError if a token contains no ``-``.
    """
    # Strip only the line terminator: slicing off the last character would
    # leave a stray '\r' on CRLF input and eat real data on a final line
    # that has no newline.
    year, _, papers = line.rstrip('\r\n').split('\t')
    tokens = papers.replace('[', '').replace(']', '').split(', ')
    return int(year), [token.split('-')[1] for token in tokens]


def load_index(path):
    """Return a ``defaultdict(list)`` mapping year -> paper names.

    A later line for the same year replaces the earlier one.  Blank lines
    are skipped.
    """
    index = defaultdict(list)
    with codecs.open(path, encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue
            year, names = parse_index_line(line)
            index[year] = names
    return index


def papers_for_span(index, first_year, last_year):
    """Return the sorted, de-duplicated papers for first_year..last_year.

    Both bounds are inclusive.  Years missing from ``index`` contribute
    nothing (``index`` is expected to be the defaultdict from load_index).
    """
    found = {
        name
        for year in range(first_year, last_year + 1)
        for name in index[year]
    }
    # Sorted so that repeated runs print the papers in the same order.
    return sorted(found)


def format_entry(span, papers):
    """Render the printed entry for one span file."""
    return ENTRY_TEMPLATE % (span, span, ", ".join(papers))


def main():
    """Print one entry for every ``*.txt`` file except the index itself."""
    index = load_index(INDEX_FILE)
    for filename in glob.glob('*.txt'):
        if filename == INDEX_FILE:
            continue
        # File names are expected to look like "<first>-<last>.txt".
        span = filename.split('.')[0]
        first_year, last_year = span.split('-')
        papers = papers_for_span(index, int(first_year), int(last_year))
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print(format_entry(span, papers))


if __name__ == '__main__':
    main()