#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Append matching sids from ``parole.txt`` to tab-separated rows on stdin.

``parole.txt`` is read as UTF-8 and split on tabs; lines with exactly five
columns contribute their second column (``sid``) and fifth column (``pid``)
to a ``pid -> [sid, ...]`` table.  Every row read from stdin is written to
stdout with one extra trailing column: the sids registered for the row's
second column joined with ";", or "-" when there are none.

This is Python 2 code: stdin rows are handled as byte strings, while the
table read through ``codecs`` holds unicode strings.
"""
import sys
import cjson
import urllib
import codecs


def _load_parole(path):
    """Return a dict mapping each pid in *path* to its sids, first-seen order.

    Lines that do not split into exactly five tab-separated columns are
    skipped.
    """
    mapping = {}
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            try:
                # Strip only the line terminator; slicing with [:-1] would
                # eat a real character when the last line has no newline.
                (_, sid, _, _, pid) = line.rstrip('\r\n').split('\t')
            except ValueError:
                # Wrong number of columns: ignore this line.
                continue
            sids = mapping.setdefault(pid, [])
            if sid not in sids:
                sids.append(sid)
    return mapping


parole = _load_parole('parole.txt')

#char_stream = codecs.getreader("utf-8")(sys.stdin)

for line in sys.stdin:
    alist = line.rstrip('\r\n').split('\t')
    # Rows with fewer than two columns have no pid; they are passed through
    # with "-" instead of aborting the whole run with an IndexError.
    pid = alist[1] if len(alist) > 1 else None
    # NOTE: pid is a byte string while the table keys are unicode; the lookup
    # relies on Python 2 treating ASCII-only str and unicode values as equal.
    sids = parole.get(pid)
    if sids:
        extra = ";".join(sids).encode("utf-8")
    else:
        extra = "-"
    sys.stdout.write("\t".join(alist + [extra]) + "\n")

# Disabled earlier variant, kept for reference (original indentation unknown):
#with codecs.open('SIMPLEn_SE_2009.txt', encoding='UTF-8') as f:
#with char_stream as f:
#    for line in f:
#        print line
#try:
#    (word,pid,_,_,_,_,ontology,domain,cl,_,_,_,_) = line[:-1].split('\t')
#    for sid in parole[pid]:
#        s = '%s\t%s\tnn\t%s\t%s\t%s' % (word,sid,ontology,domain,cl)
#        print s.encode('UTF-8')
#except:
#    pass