require 'fileutils'

# Reads the annotated corpus from "P_annotation.tsv" (tab-separated, first line
# is the header row) and writes train/dev/test splits into a freshly created
# directory named "split<nfolds>_<sampling>_<scale><addendum>".
#
# The script is driven entirely by the configuration variables below.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
scale = "average"        # "average" or "simple"
sampling = "consecutive" # "random" or "consecutive"
annotator = false        # false: usual split; otherwise an Array of annotator ids
# split_mode (only relevant when annotator is not false):
#   "single":   a test set consists only of the labels by the specified annotator
#   "multiple": several test sets: one consists of the averaged labels of all
#               specified annotators, the others contain the labels of every
#               single specified annotator
#   "divided":  test set consists of labels by the specified annotator (has to be
#               only one); several train sets: one contains all labels, another
#               all but the labels by this annotator
#   "ultimate": per-annotator train/dev/test sets plus a combined set ("99")
split_mode = ""
# "clean": include in the test sets only the labels from the specified
# annotators. The alternative "" is not supported anymore.
single_annot_mode = "clean"
# "clean": rows carrying a label from any of the specified annotators are kept
# out of the training bank in the "single"/"multiple" modes.
mult_annot_mode = "clean"
nsents_to_train = 500    # relevant for the "ultimate" mode
filtered = true          # "ultimate" mode: drop test rows whose document also occurs in a bank

# Annotator ids that have a column of their own (named after the id) in the input.
ANNOTATOR_IDS = [0, 1, 3, 6, 7, 8, 9, 10, 11].freeze
# Header row of every per-annotator output file.
CSV_HEADER = "sentence\tlabel"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Formats an annotator id for use in a file name, zero-padded to two digits.
# Values of 100 and above are reported on STDERR (as before) but are now
# returned unpadded instead of nil, so file names cannot collide.
def convert(integer)
  STDERR.puts "ERROR" unless integer < 100
  integer.to_s.rjust(2, "0")
end

# Partitions the row indices 0...totalsize into nfolds samples.
#
# sampling   - "consecutive": contiguous slices of samplesize indices, the last
#              fold takes whatever remains; "random": nfolds-1 random samples of
#              samplesize indices, the last fold takes the remainder.
# nfolds     - number of samples to create.
# samplesize - number of indices per sample (except the last one).
# totalsize  - number of rows available.
#
# Returns an Array of Arrays of indices (empty for an unknown sampling value).
def create_samples(sampling, nfolds, samplesize, totalsize)
  # Exclusive range: index `totalsize` itself does not exist in the data and
  # used to produce a blank output row.
  samplesource = (0...totalsize).to_a
  samples = []
  if sampling == "consecutive"
    nfolds.times do |i|
      start = i * samplesize
      last_fold = i == nfolds - 1
      finish = last_fold ? totalsize - 1 : start + samplesize - 1
      # Slice by (start, length) so that a zero sample size yields an empty
      # fold instead of the whole array (what start..-1 would select).
      chunk = last_fold ? samplesource[start..-1] : samplesource[start, samplesize]
      samples << (chunk || [])
      STDERR.puts "#{i}, #{start}, #{finish}"
    end
  elsif sampling == "random"
    (nfolds - 1).times do
      currentsample = samplesource.sample(samplesize)
      samplesource -= currentsample
      samples << currentsample
    end
    STDERR.puts samplesource.length
    samples << samplesource
  end
  samples
end

# Returns a copy of array with removesize randomly chosen elements removed;
# the remaining elements keep their original order.
def remove_sample(array, removesize)
  kept = (0...array.length).to_a.sample(array.length - removesize).sort
  kept.map { |position| array[position] }
end

# True when a cell holds an annotation. A nil cell counts as missing too:
# String#split drops trailing empty fields, so an empty annotator column at
# the end of a row comes back as nil rather than "".
def annotation_present?(cell)
  !(cell.nil? || cell.empty?)
end

# Collects the judgments present in a row as {annotator id => Float},
# in the order of ANNOTATOR_IDS.
def collect_annotations(row, headers)
  ANNOTATOR_IDS.each_with_object({}) do |id, values|
    cell = row[headers[id.to_s]]
    values[id] = cell.to_f if annotation_present?(cell)
  end
end

# Sums all judgments except the one by `excluded` and divides by the number
# of judgments minus one.
def average_without(values, excluded)
  total = 0.0
  values.each_pair { |id, judgment| total += judgment if id != excluded }
  total / (values.length - 1)
end

# Looks up the rows of `source` at the given indices.
def rows_at(source, indices)
  indices.map { |lineindex| source[lineindex] }
end

# Writes header followed by one line per row; the file is closed even when
# writing fails. An empty row list yields a header-only file.
def write_rows(path, header, rows)
  File.open(path, "w:utf-8") do |out|
    out.puts header
    rows.each { |row| out.puts row }
  end
end

# ---------------------------------------------------------------------------
# Derived settings
# ---------------------------------------------------------------------------
if annotator == false
  addendum = ""
else
  # Column index of each annotator's labels, used by the "single"/"multiple" modes.
  # NOTE(review): hard-coded positions, whereas the other modes look columns
  # up by header name - confirm they agree with the input file.
  annot_hash = {1 => 9, 3 => 10, 4 => 11, 6 => 12, 7 => 13, 8 => 14, 9 => 15, 10 => 16, 11 => 17}
  # File-name suffix per annotator.
  annot_hash2 = {"all" => "xx", 1 => "a", 3 => "bb", 6 => "e", 7 => "f", 8 => "g", 9 => "h", 10 => "i", 11 => "j"}
  addendum = "_annot#{annotator.join("_")}#{split_mode}"
end
addendum << "_filtered" if filtered && annotator && split_mode == "ultimate"

# scale2 is the name of the input column that holds the label.
case scale
when "simple"
  scale2 = "vote"
  # Maps the vote values onto the same numeric scale as the averaged labels.
  temphash = {"1"=>"3.0","0"=>"2.0","-1"=>"1.0"}
when "average"
  scale2 = scale
end

# ---------------------------------------------------------------------------
# Reading
# ---------------------------------------------------------------------------
STDERR.puts "Reading file"
bank = []  # rows for train and dev (all modes)
bank2 = [] # "divided": labels averaged without the test annotator
bank3 = [] # "divided": labels of the test annotator only
test = []
tests = {"all" => []}
banks_u = Hash.new { |hash, key| hash[key] = [] }
tests_u = Hash.new { |hash, key| hash[key] = [] }
texts_in_test = Hash.new { |hash, key| hash[key] = [] }
texts_in_bank = Hash.new { |hash, key| hash[key] = [] }
sents_per_text = Hash.new(0)
trainlimits = Hash.new(0)
annotator.each { |annotator1| tests[annotator1] = [] } if annotator
headers = {}

File.open("P_annotation.tsv", "r:utf-8") do |f|
  f.each_line.with_index do |line, index|
    line1 = line.strip.split("\t")

    # The first line names the columns.
    if index.zero?
      line1.each_with_index { |header, header_index| headers[header] = header_index }
      next
    end

    doc = line1[headers["doc"]]
    sents_per_text[doc] += 1
    line1[headers[scale2]] = temphash[line1[headers[scale2]]] if scale == "simple"
    text = line1[headers["text"]]
    labelled = "#{text}\t#{line1[headers[scale2]]}"

    if annotator == false
      bank << "#{doc}\t#{line1[headers["par"]]}\t#{labelled}"
    elsif split_mode == "divided"
      annotated_values = collect_annotations(line1, headers)
      annotated_by = annotated_values.keys
      annotator.each do |current_annotator|
        if annotated_by.length == 1 && annotated_by[0] == current_annotator
          # Labelled by the test annotator alone: goes to the test set.
          test << labelled
        elsif annotated_by.include?(current_annotator)
          bank << labelled
          bank2 << "#{text}\t#{average_without(annotated_values, annotator[0])}"
          bank3 << "#{text}\t#{line1[headers[annotator[0].to_s]]}"
        else
          bank << labelled
          bank2 << labelled
          bank3 << labelled
        end
      end
    elsif split_mode == "ultimate"
      annotated_values = collect_annotations(line1, headers)
      annotated_by = annotated_values.keys
      if annotated_by.length == 1
        sole = annotated_by[0]
        if !annotator.include?(sole)
          annotator.each do |current_annotator|
            banks_u[current_annotator] << labelled
            texts_in_bank[current_annotator] << doc
          end
          bank << labelled
        elsif trainlimits[sole] < nsents_to_train
          # The first nsents_to_train rows of each annotator feed the banks
          # of all the other annotators.
          annotator.each do |current_annotator|
            next if current_annotator == sole
            banks_u[current_annotator] << labelled
            texts_in_bank[current_annotator] << doc
          end
          bank << labelled
          trainlimits[sole] += 1
        else
          # The document id is kept as a third field so that the row can be
          # filtered by document later on; it is stripped again before output.
          tests_u[sole] << (filtered ? "#{labelled}\t#{doc}" : labelled)
          texts_in_test[sole] << doc
        end
      else
        bank << labelled
        annotator.each do |current_annotator|
          row = if annotated_by.include?(current_annotator)
                  "#{text}\t#{average_without(annotated_values, current_annotator)}"
                else
                  labelled
                end
          banks_u[current_annotator] << row
          texts_in_bank[current_annotator] << doc
        end
      end
    else
      # "single" / "multiple"
      cells = annotator.map { |annotator1| line1[annot_hash[annotator1]] }
      if cells.all? { |cell| annotation_present?(cell) }
        if single_annot_mode == "clean"
          annot_values = cells.map(&:to_f)
          average = annot_values.reduce(0.0) { |acc, value| acc + value } / annot_values.length
          tests["all"] << "#{text}\t#{average}"
          annotator.each_with_index do |annotator1, position|
            tests[annotator1] << "#{text}\t#{annot_values[position]}"
          end
        end
      elsif mult_annot_mode != "clean" || cells.none? { |cell| annotation_present?(cell) }
        bank << labelled
      end
    end
  end
end

annotator.each { |id| STDERR.puts banks_u[id].length } if annotator

# ---------------------------------------------------------------------------
# "ultimate" mode: drop test rows from documents that also occur in a bank
# ---------------------------------------------------------------------------
if annotator && split_mode == "ultimate"
  overlapping_texts = []
  annotator.each do |annotator1|
    annotator.each do |annotator2|
      overlapping_texts.concat(texts_in_test[annotator1].uniq & texts_in_bank[annotator2].uniq)
    end
  end
  if filtered
    tests_u_filtered = Hash.new { |hash, key| hash[key] = [] }
    overlapping_texts.uniq!
    STDERR.puts "#{overlapping_texts}"
    annotator.each do |annotator1|
      tests_u[annotator1].each do |par|
        fields = par.split("\t")
        next if overlapping_texts.include?(fields[2])
        tests_u_filtered[annotator1] << fields[0..1].join("\t")
      end
    end
    tests_u = tests_u_filtered
  end
end

# ---------------------------------------------------------------------------
# Sampling
# ---------------------------------------------------------------------------
STDERR.puts "Creating samples"
nfolds = 10
if split_mode == "ultimate"
  # NOTE(review): the fold size is derived from annotator 6's bank only, yet
  # the resulting indices are applied to every annotator's bank - confirm the
  # banks have the same length, or that 6 is always among the annotators.
  totalsize = banks_u[6].length
  STDERR.puts totalsize
  totalsize2 = bank.length
  STDERR.puts totalsize2
  samplesize2 = (totalsize2 / nfolds.to_f).round
  samples2 = create_samples(sampling, nfolds, samplesize2, totalsize2)
else
  totalsize = bank.length
end
samplesize = (totalsize / nfolds.to_f).round
STDERR.puts samplesize
if annotator == false || %w[multiple divided ultimate].include?(split_mode)
  samples = create_samples(sampling, nfolds, samplesize, totalsize)
else
  # Only for the "single" split_mode.
  samples = create_samples(sampling, nfolds - 1, samplesize, totalsize - samplesize)
end

# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------
STDERR.puts "Output"
dirname = "split#{nfolds}_#{sampling}_#{scale}#{addendum}"
FileUtils.rm_r(dirname) if File.exist?(dirname)
Dir.mkdir(dirname)

if annotator == false
  # Rotating folds: fold i is the test set, the following fold (wrapping
  # around) is the dev set, all remaining folds form the train set.
  nfolds.times do |i|
    STDERR.puts i
    j = i == nfolds - 1 ? 0 : i + 1
    train = samples.each_index.reject { |k| k == i || k == j }.flat_map { |k| samples[k] }
    fold = i < 10 && nfolds >= 10 ? "0#{i}" : i.to_s
    { "test" => samples[i], "dev" => samples[j], "train" => train }.each_pair do |setname, set|
      write_rows(File.join(dirname, "#{setname}#{fold}.tsv"), "doc\tpar\ttext\tlabel", rows_at(bank, set))
    end
  end
elsif split_mode == "multiple"
  { "dev" => samples[0], "train" => samples[1..-1].flatten }.each_pair do |setname, set|
    write_rows(File.join(dirname, "#{setname}xx.csv"), CSV_HEADER, rows_at(bank, set))
  end
  ["all", annotator].flatten.each do |annotator1|
    write_rows(File.join(dirname, "test#{annot_hash2[annotator1]}.csv"), CSV_HEADER, tests[annotator1])
  end
elsif split_mode == "divided"
  suffix = annot_hash2[annotator[0]]
  dev_indices = samples[0]
  train_indices = samples[1..-1].flatten
  write_rows(File.join(dirname, "test#{suffix}.csv"), CSV_HEADER, test)
  write_rows(File.join(dirname, "dev#{suffix}.csv"), CSV_HEADER, rows_at(bank, dev_indices))
  write_rows(File.join(dirname, "train#{suffix}.csv"), CSV_HEADER, rows_at(bank, train_indices))
  write_rows(File.join(dirname, "train#{suffix}2.csv"), CSV_HEADER, rows_at(bank2, train_indices))
  write_rows(File.join(dirname, "train#{suffix}3.csv"), CSV_HEADER, rows_at(bank3, train_indices))
elsif split_mode == "single"
  suffix = annot_hash2[annotator[0]]
  # NOTE(review): nothing visible in this script fills `test` in this mode,
  # so the test file will contain only the header - confirm this is intended.
  write_rows(File.join(dirname, "test#{suffix}.csv"), CSV_HEADER, test)
  write_rows(File.join(dirname, "dev#{suffix}.csv"), CSV_HEADER, rows_at(bank, samples[0]))
  write_rows(File.join(dirname, "train#{suffix}.csv"), CSV_HEADER, rows_at(bank, samples[1..-1].flatten))
elsif split_mode == "ultimate"
  sethash = { "dev" => samples[0], "train" => samples[1..-1].flatten }
  annotator.each do |id|
    sethash.each_pair do |setname, set|
      write_rows(File.join(dirname, "#{setname}#{convert(id)}.csv"), CSV_HEADER, rows_at(banks_u[id], set))
    end
  end
  annotator.each do |id|
    write_rows(File.join(dirname, "test#{convert(id)}.csv"), CSV_HEADER, tests_u[id])
  end
  # "99" marks the sets drawn from the combined bank.
  { "dev" => samples2[0], "train" => samples2[1..-1].flatten }.each_pair do |setname, set|
    write_rows(File.join(dirname, "#{setname}99.csv"), CSV_HEADER, rows_at(bank, set))
  end
end