require 'fileutils'

# Label scale. "average" uses the "average" column as the label; "simple" uses the
# "vote" column with the votes "1"/"0"/"-1" remapped to "3.0"/"2.0"/"1.0".
scale = "average" #average or simple
# How row indices are assigned to folds: "random" or "consecutive".
sampling = "consecutive" #random or consecutive
# false: ordinary rotating test/dev/train split over all rows. Otherwise the code
# below iterates over it and calls `join` on it, i.e. it expects an array of annotator ids.
annotator = false #if false: usual split
# Split strategy used when `annotator` is not false:
#   "single"   - the test set consists only of the labels by the specified annotator;
#   "multiple" - several test sets: one holds the averaged labels of all specified
#                annotators, the others hold the labels of each specified annotator;
#   "divided"  - the test set consists of labels by the specified annotator (exactly one
#                must be given); two train sets: one contains all labels, the other all
#                labels except those by this annotator (the output code also writes a
#                third train file from `bank3`);
#   "ultimate" - per-annotator train/dev/test files plus a pooled train/dev pair
#                written with the suffix 99.
split_mode = "" #single: a test set consists only of the labels by the specified annotator; multiple: several test sets: one consists of the averaged labels of all specified annotators, other contain labels by every single specified annotator; divided: test set consists of labels by the specified annotator (has to be only one); two train sets: one contains all labels, another all but the labels by this annotator; "ultimate".
# "clean": include in the test set only the labels from the specified annotator(s).
# The alternative value "" is no longer supported.
single_annot_mode = "clean" #or: "" (not supported anymore). If "clean"> include in the test set only the labels from this annotator.
# "clean": a row that carries a label from any of the specified annotators (but not
# from all of them) is left out of `bank`, the pool that train/dev are drawn from.
mult_annot_mode = "clean" #clean: no labels produced by specified annotators are included in the test set
# "ultimate" mode only: number of rows labelled solely by a given listed annotator that
# go to the training pool before further such rows go to that annotator's test set.
nsents_to_train = 500 #relevant for the "ultimate" mode

# Produces the two-character annotator suffix used in output file names.
#
# integer - numeric annotator id.
#
# Returns "0N" (a String) for values below 10 and the value itself, unchanged,
# for values below 100. For anything else "ERROR" is printed to STDERR and
# nil is returned.
def convert(integer)
    return "0#{integer}" if integer < 10
    return integer if integer < 100

    STDERR.puts "ERROR"
    nil
end


# Suffix appended to the output directory name. With annotators configured it
# encodes their ids and the split mode; the two lookup tables are only needed
# (and only assigned) in that case.
if annotator != false
    # annotator id => column index of that annotator's label in a data row
    annot_hash = {1 => 9, 3 => 10, 4 => 11, 6 => 12, 7 => 13, 8 => 14, 9 => 15, 10 => 16, 11 => 17}
    # annotator id (or "all") => suffix used in output file names
    annot_hash2 = {"all" => "xx", 1 => "a", 3 => "bb", 6 => "e", 7 => "f", 8 => "g", 9 => "h", 10 => "i", 11 => "j"}
    addendum = "_annot#{annotator.join("_")}#{split_mode}"
else
    addendum = ""
end

# When set, "ultimate" test sets are later stripped of paragraphs whose document
# also occurs in a training bank; the directory name records that.
filtered = true
addendum << "_filtered" if filtered && annotator && split_mode == "ultimate"

# scale2 is the header name of the label column; temphash remaps raw votes.
# valueindex is assigned here but not read anywhere in the visible code.
case scale
when "simple"
    scale2 = "vote"
    valueindex = 7
    temphash = {"1"=>"3.0","0"=>"2.0","-1"=>"1.0"}
when "average"
    valueindex = 5
    scale2 = scale
end





# Partitions the row indices 0...totalsize into folds.
#
# sampling   - "consecutive": every fold is a contiguous run of samplesize
#              indices and the last fold takes everything that remains;
#              "random": the first nfolds-1 folds are random draws of samplesize
#              indices without replacement and the last fold takes the rest.
# nfolds     - number of folds to produce.
# samplesize - size of every fold except the last one.
# totalsize  - number of rows being split. Callers pass an array length and use
#              the returned values as indices into that array, so the largest
#              index handed out is totalsize - 1.
#
# Progress information is printed to STDERR.
#
# Returns an Array of Arrays of Integer indices; an unrecognised sampling value
# yields an empty Array. A fold is never nil: a fold that starts beyond the
# last row is returned as [].
def create_samples(sampling,nfolds,samplesize,totalsize)
    samplesource = (0...totalsize).to_a
    samples = []
    if sampling == "consecutive"
        nfolds.times do |i|
            start = i*samplesize
            finish = i == nfolds-1 ? totalsize - 1 : start + samplesize - 1
            # Array#[] returns nil when start lies past the end of the array.
            samples << (samplesource[start..finish] || [])
            STDERR.puts "#{i}, #{start}, #{finish}"
        end
    elsif sampling == "random"
        (nfolds-1).times do
            currentsample = samplesource.sample(samplesize)
            samplesource -= currentsample
            samples << currentsample
        end
        STDERR.puts samplesource.length
        samples << samplesource
    end
    return samples
end

STDERR.puts "Reading file"
f = File.open("P_annotation.tsv", "r:utf-8")

# Output rows collected while reading the annotation file.
bank = []   # pool that train and dev are drawn from
bank2 = []  # "divided" mode: rows labelled with the average computed without the test annotator
bank3 = []  # "divided" mode: rows labelled with the test annotator's own judgment
test = []   # "divided" mode: rows labelled by the test annotator alone
tests = {"all" => []}  # one test set for the averaged labels, plus one per listed annotator

# "ultimate" mode: per-annotator training banks, test sets and the document ids
# seen in each of them. Missing keys start out as an empty array.
banks_u = Hash.new { |store, annot| store[annot] = [] }
tests_u = Hash.new { |store, annot| store[annot] = [] }
texts_in_test = Hash.new { |store, annot| store[annot] = [] }
texts_in_bank = Hash.new { |store, annot| store[annot] = [] }

sents_per_text = Hash.new(0)  # number of data rows per document id
trainlimits = Hash.new(0)     # "ultimate" mode: rows already sent to training, per annotator

annotator.each { |annot| tests[annot] = [] } if annotator

# Reads the annotation file row by row and distributes the rows over the
# accumulators (bank, bank2, bank3, test, tests, banks_u, tests_u, ...)
# according to `annotator` and `split_mode`. The first row is the header and
# fills `headers` (column name => column index).
headers = {}

# Annotator ids whose columns are inspected in the "divided" and "ultimate" modes.
known_annotators = [0, 1, 3, 6, 7, 8, 9, 10, 11]

# Returns [ids of the annotators with a non-empty cell in row,
#          {annotator id => judgment as Float}].
collect_annotations = lambda do |row|
    values = {}
    known_annotators.each do |id|
        cell = row[headers[id.to_s]]
        values[id] = cell.to_f if cell != ""
    end
    [values.keys, values]
end

# Sum of all judgments except the excluded annotator's, divided by the number
# of judgments minus one.
average_without = lambda do |values, excluded|
    total = 0.0
    values.each_pair do |id, judgment|
        total += judgment if id != excluded
    end
    total / (values.length - 1)
end

f.each_line.with_index do |line,index|
    line1 = line.strip.split("\t")
    if index == 0
        line1.each.with_index do |header, header_index|
            headers[header] = header_index
        end
        next
    end

    # strip/split drop trailing empty cells, which would then read as nil and
    # pass every `!= ""` check below (nil.to_f is 0.0). Restore them as "".
    line1 << "" while line1.length < headers.length

    sents_per_text[line1[headers["doc"]]] += 1
    if scale == "simple"
        line1[headers[scale2]] = temphash[line1[headers[scale2]]]
    end

    # "text<TAB>label" row; built lazily so that a branch which does not need
    # the label column never touches it.
    plain_row = lambda { "#{line1[headers["text"]]}\t#{line1[headers[scale2]]}" }

    if annotator == false
        bank << "#{line1[headers["doc"]]}\t#{line1[headers["par"]]}\t#{line1[headers["text"]]}\t#{line1[headers[scale2]]}"
    elsif split_mode == "divided"
        annotated_by, annotated_values = collect_annotations.call(line1)

        annotator.each do |current_annotator|
            if annotated_by == [current_annotator]
                # Labelled by the test annotator alone: test material.
                test << plain_row.call
            elsif annotated_by.include?(current_annotator)
                # Labelled by the test annotator and others: keep the row in all
                # three banks, each with a different label.
                bank << plain_row.call
                averaged_without = average_without.call(annotated_values, annotator[0])
                bank2 << "#{line1[headers["text"]]}\t#{averaged_without}"
                bank3 << "#{line1[headers["text"]]}\t#{line1[headers[annotator[0].to_s]]}"
            else
                bank << plain_row.call
                bank2 << plain_row.call
                bank3 << plain_row.call
            end
        end
    elsif split_mode == "ultimate"
        annotated_by, annotated_values = collect_annotations.call(line1)

        if annotated_by.length == 1
            sole_annotator = annotated_by[0]
            if annotator.include?(sole_annotator)
                if trainlimits[sole_annotator] < nsents_to_train
                    # Still below the training quota: the row trains every
                    # other annotator's model and the pooled model.
                    annotator.each do |current_annotator|
                        if current_annotator != sole_annotator
                            banks_u[current_annotator] << plain_row.call
                            texts_in_bank[current_annotator] << line1[headers["doc"]]
                        end
                    end
                    bank << plain_row.call
                    trainlimits[sole_annotator] += 1
                else
                    # Quota reached: the row becomes test material. With
                    # `filtered` set the document id is appended as a third
                    # column so the row can be filtered later.
                    if filtered
                        tests_u[sole_annotator] << "#{plain_row.call}\t#{line1[headers["doc"]]}"
                    else
                        tests_u[sole_annotator] << plain_row.call
                    end
                    texts_in_test[sole_annotator] << line1[headers["doc"]]
                end
            else
                annotator.each do |current_annotator|
                    banks_u[current_annotator] << plain_row.call
                    texts_in_bank[current_annotator] << line1[headers["doc"]]
                end
                bank << plain_row.call
            end
        else
            bank << plain_row.call
            annotator.each do |current_annotator|
                if annotated_by.include?(current_annotator)
                    # This annotator's own judgment is left out of the label
                    # that goes into this annotator's bank.
                    averaged_without = average_without.call(annotated_values, current_annotator)
                    banks_u[current_annotator] << "#{line1[headers["text"]]}\t#{averaged_without}"
                else
                    banks_u[current_annotator] << plain_row.call
                end
                texts_in_bank[current_annotator] << line1[headers["doc"]]
            end
        end
    else
        # Remaining split modes: annotator columns are addressed through the
        # fixed column indices in annot_hash.
        all_annots_present = true
        annot_values = []
        annotator.each do |annotator1|
            if line1[annot_hash[annotator1]] == ""
                all_annots_present = false
                break
            else
                annot_values << line1[annot_hash[annotator1]].to_f
            end
        end
        if all_annots_present
            if single_annot_mode == "clean"
                average = annot_values.inject(0.0) { |sum, value| sum + value } / annot_values.length
                tests["all"] << "#{line1[headers["text"]]}\t#{average}"
                annotator.each do |annotator1|
                    tests[annotator1] << "#{line1[headers["text"]]}\t#{line1[annot_hash[annotator1]].to_f}"
                end
            end
        else
            # In "clean" mode a row touched by any listed annotator is dropped
            # from the train/dev pool.
            one_annot_present = mult_annot_mode == "clean" &&
                                annotator.any? { |annotator1| line1[annot_hash[annotator1]] != "" }
            bank << plain_row.call unless one_annot_present
        end
    end
end
# The file has been read to the end; release the handle.
f.close
#STDERR.puts "#{trainlimits}"

# Debug output: number of rows in each listed annotator's "ultimate"-mode bank.
annotator.each { |annot| STDERR.puts banks_u[annot].length } if annotator


# Returns a new array with removesize randomly chosen elements of array left
# out; the surviving elements keep their original relative order. The input
# array is not modified.
#
# array      - source Array.
# removesize - number of elements to drop.
#
# Raises ArgumentError (from Array#sample) when removesize exceeds array.length.
def remove_sample(array, removesize)
    kept_positions = (0...array.length).to_a.sample(array.length - removesize).sort
    # The positions are already sorted, so mapping them back keeps the original
    # order without scanning the position list once per element.
    kept_positions.map { |position| array[position] }
end

# "ultimate" mode: find the documents that occur both in some listed
# annotator's test set and in some listed annotator's training bank and, when
# `filtered` is set, drop the test paragraphs that belong to those documents.
if annotator and split_mode == "ultimate"
    overlapping_texts = []
    annotator.each do |annotator1|
        annotator.each do |annotator2|
            # Array#& yields the distinct common elements in the order of the
            # left operand.
            overlapping_texts.concat(texts_in_test[annotator1] & texts_in_bank[annotator2])
        end
    end

    if filtered
        tests_u_filtered = Hash.new{|hash, key| hash[key] = Array.new}
        overlapping_texts.uniq!
        STDERR.puts "#{overlapping_texts}"
        annotator.each do |annotator1|
            tests_u[annotator1].each do |par|
                # Test rows carry the document id as a third column in this mode.
                fields = par.split("\t")
                unless overlapping_texts.include?(fields[2])
                    tests_u_filtered[annotator1] << fields[0..1].join("\t")
                end
            end
        end
        tests_u = tests_u_filtered
    end
end


STDERR.puts "Creating samples"
nfolds = 10

# Number of rows the folds are drawn over. In "ultimate" mode a second set of
# folds (samples2) is drawn over the pooled bank as well.
if split_mode != "ultimate"
    totalsize = bank.length
else
    # NOTE(review): annotator 6 is hard-coded as the reference bank here - confirm
    # that this is intended when 6 is not among the configured annotators.
    totalsize = banks_u[6].length
    STDERR.puts totalsize
    totalsize2 = bank.length
    STDERR.puts totalsize2
    samplesize2 = (totalsize2.to_f / nfolds).round
    samples2 = create_samples(sampling, nfolds, samplesize2, totalsize2)
end

samplesize = (totalsize.to_f / nfolds).round
STDERR.puts samplesize

# Every configuration except the remaining "single"-style one splits all rows
# into nfolds folds; that one uses one fold fewer over correspondingly fewer rows.
all_rows_split = annotator == false || ["multiple", "divided", "ultimate"].include?(split_mode)
if all_rows_split
    samples = create_samples(sampling, nfolds, samplesize, totalsize)
else
    samples = create_samples(sampling, nfolds - 1, samplesize, totalsize - samplesize)
end


#STDERR.puts "#{bank.length} lines read"
#STDOUT.puts "#{tests}"
STDERR.puts "Output"

# Start from an empty output directory: an existing one with the same name is
# removed recursively before it is recreated.
dirname = "split#{nfolds}_#{sampling}_#{scale}#{addendum}"
# File.exists? was removed in Ruby 3.2; File.exist? works on every Ruby version.
if File.exist?(dirname)
    FileUtils.rm_r(dirname)
end
Dir.mkdir(dirname)


# Writes the train/dev/test files for the configured split into `dirname`.
# Every file starts with a header line. Train and dev sets are lists of row
# indices that are resolved against the matching bank; test sets are written
# from the row lists collected while reading.

# Path of a file inside the output directory. File.join keeps the path valid on
# every platform (a literal backslash is only a separator on Windows).
out_path = lambda { |filename| File.join(dirname, filename) }

if annotator == false
    # Rotating split: fold i is the test set, the following fold (wrapping
    # around) is the dev set and all remaining folds form the train set.
    nfolds.times do |i|
        STDERR.puts i

        j = i == nfolds-1 ? 0 : i + 1
        train_folds = []
        samples.each.with_index do |sample, index|
            train_folds << sample unless [i,j].include?(index)
        end
        sethash = {
            "test" => samples[i],
            "dev" => samples[j],
            # flatten (not flatten!) so that an empty list stays an array.
            "train" => train_folds.flatten
        }

        # Fold numbers are zero-padded to two characters when there are ten or more folds.
        suffix = (i < 10 and nfolds >= 10) ? "0#{i}" : "#{i}"
        sethash.each_pair do |setname,set|
            File.open(out_path.call("#{setname}#{suffix}.tsv"),"w:utf-8") do |o|
                o.puts "doc\tpar\ttext\tlabel"
                set.each do |lineindex|
                    o.puts bank[lineindex]
                end
            end
        end
    end
elsif split_mode == "multiple"
    sethash = {}
    sethash["dev"] = samples[0]
    sethash["train"] = samples[1..-1].flatten
    sethash.each_pair do |setname,set|
        File.open(out_path.call("#{setname}xx.csv"),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            set.each do |lineindex|
                o.puts bank[lineindex]
            end
        end
    end

    # One test file for the averaged labels ("all") and one per listed annotator.
    ["all",annotator].flatten.each do |annotator1|
        File.open(out_path.call("test#{annot_hash2[annotator1]}.csv"),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            o.puts tests[annotator1]
        end
    end
elsif split_mode == "divided"
    annotator_suffix = annot_hash2[annotator[0]]
    sethash = {}
    sethash["test"] = test
    sethash["dev"] = samples[0]
    sethash["train"] = samples[1..-1].flatten
    sethash["train2"] = samples[1..-1].flatten
    sethash["train3"] = samples[1..-1].flatten

    # The extra train sets use the same row indices but take their rows from
    # bank2 / bank3.
    row_sources = {"train2" => bank2, "train3" => bank3}

    sethash.each_pair do |setname,set|
        if setname.length > 5
            # "train2"/"train3" => train<annotator suffix><2|3>.csv
            filename = "train#{annotator_suffix}#{setname[-1]}.csv"
        else
            filename = "#{setname}#{annotator_suffix}.csv"
        end
        File.open(out_path.call(filename),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            if setname == "test"
                o.puts test
            else
                rows = row_sources.fetch(setname, bank)
                set.each do |lineindex|
                    o.puts rows[lineindex]
                end
            end
        end
    end
elsif split_mode == "single"
    # NOTE(review): the visible reading code only fills `test` in "divided" mode,
    # so the test file written here appears to contain no data rows - confirm.
    sethash = {}
    sethash["test"] = test
    sethash["dev"] = samples[0]
    sethash["train"] = samples[1..-1].flatten

    sethash.each_pair do |setname,set|
        File.open(out_path.call("#{setname}#{annot_hash2[annotator[0]]}.csv"),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            if setname != "test"
                set.each do |lineindex|
                    o.puts bank[lineindex]
                end
            else
                o.puts test
            end
        end
    end
elsif split_mode == "ultimate"
    sethash = {}
    sethash["dev"] = samples[0]
    sethash["train"] = samples[1..-1].flatten

    # Per-annotator dev/train files, resolved against that annotator's bank.
    annotator.each do |i|
        sethash.each_pair do |setname,set|
            File.open(out_path.call("#{setname}#{convert(i)}.csv"),"w:utf-8") do |o|
                o.puts "sentence\tlabel"
                set.each do |lineindex|
                    o.puts banks_u[i][lineindex]
                end
            end
        end
    end

    # Per-annotator test files.
    annotator.each do |i|
        File.open(out_path.call("test#{convert(i)}.csv"),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            o.puts tests_u[i]
        end
    end

    # Pooled dev/train files (suffix 99), resolved against the full bank.
    sethash2 = {}
    sethash2["dev"] = samples2[0]
    sethash2["train"] = samples2[1..-1].flatten
    sethash2.each_pair do |setname,set|
        File.open(out_path.call("#{setname}99.csv"),"w:utf-8") do |o|
            o.puts "sentence\tlabel"
            set.each do |lineindex|
                o.puts bank[lineindex]
            end
        end
    end

end