/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.resources;

import de.julielab.geneexpbase.data.DocumentSourceFileRegistry;
import de.julielab.geneexpbase.data.DocumentSourceFiles;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.genemapper.Configuration;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.classification.TransformerDisambiguationDataUtils;
import de.julielab.genemapper.utils.GeneMapperException;
import de.julielab.genemapper.utils.GeneMapperInitializationException;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line helper that writes transformer disambiguation data for the BC3 train set,
 * split into a training file and a development file according to a corpus split mapping.
 */
public class TransformerDisambiguationBC3DataWriter {
    private static final Logger log = LoggerFactory.getLogger(TransformerDisambiguationBC3DataWriter.class);

    /**
     * Entry point. Uses hard-coded paths for the configuration, the output file and the
     * split mapping, and delegates to
     * {@link #createDisambiguationData(DocumentSourceFiles, GeneMapper, File, File)}.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) throws IOException, GeneMapperException, ExecutionException, GeneMapperInitializationException {
        // NOTE(review): the configuration is constructed but never passed on in the visible code;
        // the constructor call is kept in case it has side effects - confirm whether it is needed.
        Configuration configuration = new Configuration(new File("data/eval_data/genemapper_for_disambig_opt.properties"));
        DocumentSourceFiles documentSourceFiles = DocumentSourceFileRegistry.bc3Trainset1InferredMentionIds();
        // NOTE(review): the mapper is passed on as null; confirm that the downstream writer tolerates this.
        GeneMapper geneMapper = null;
        String goldTaxMode = "goldTax";
        File outputFile = new File("transformerDisambiguationData-bc3trainset1-v23-" + goldTaxMode + ".tsv");
        File corpusSplitMapping = new File("splitmappings/bc3-trainset1-10split-5devfreq.txt");
        TransformerDisambiguationBC3DataWriter.createDisambiguationData(documentSourceFiles, geneMapper, outputFile, corpusSplitMapping);
    }

    /**
     * Splits the corpus documents into a training and a development portion and hands each portion to
     * {@code TransformerDisambiguationDataUtils.writeData}.
     * <p>
     * Documents whose ID is marked as {@code dev} in {@code corpusSplitMapping} go to a sibling file of
     * {@code outputFile} whose name ends in {@code -dev.tsv}; all other documents go to {@code outputFile}.
     *
     * @param sourceFiles        the corpus description; only its name is read here (for logging)
     * @param mapper             passed through unchanged to the data writer
     * @param outputFile         destination for the training portion
     * @param corpusSplitMapping UTF-8 text file with whitespace-separated lines; the first field is the
     *                           document ID and the second field the split name
     * @throws IOException          if the split mapping cannot be read
     * @throws NullPointerException if no document collection is available (currently always, see the
     *                              review note in the method body)
     */
    public static void createDisambiguationData(DocumentSourceFiles sourceFiles, GeneMapper mapper, File outputFile, File corpusSplitMapping) throws IOException, GeneMapperException, ExecutionException {
        Set<String> devDocIds = readDevDocIds(corpusSplitMapping);
        log.info("Writing transformer training data for corpus {} to {}", sourceFiles.getName(), outputFile);
        File devFile = deriveDevFile(outputFile);
        log.info("Got {} dev docs from {} that will be omitted from the training data and written to {}.", devDocIds.size(), corpusSplitMapping, devFile);
        // NOTE(review): the visible source never populates the document collection - the code that
        // loads the documents from sourceFiles is missing here. Until it is restored, fail with an
        // explicit message instead of an anonymous NullPointerException on the first dereference.
        Collection<GeneDocument> documents = null;
        if (documents == null) {
            throw new NullPointerException("No documents were loaded for corpus " + sourceFiles.getName()
                    + ": the document collection is never initialized in createDisambiguationData.");
        }
        Stream<GeneDocument> trainStream = documents.stream().filter(d -> !devDocIds.contains(d.getId()));
        Stream<GeneDocument> devStream = documents.stream().filter(d -> devDocIds.contains(d.getId()));
        TransformerDisambiguationDataUtils.writeData(mapper, outputFile, trainStream);
        TransformerDisambiguationDataUtils.writeData(mapper, devFile, devStream);
    }

    /**
     * Reads the split mapping and returns the IDs (first field) of all lines whose second field is
     * {@code dev}. Blank lines and lines with fewer than two fields are skipped.
     */
    private static Set<String> readDevDocIds(File corpusSplitMapping) throws IOException {
        return FileUtils.readLines(corpusSplitMapping, StandardCharsets.UTF_8).stream()
                // Trim first so that leading whitespace does not produce an empty first field.
                .map(line -> line.trim().split("\\s+"))
                .filter(fields -> fields.length >= 2 && fields[1].equals("dev"))
                .map(fields -> fields[0])
                .collect(Collectors.toSet());
    }

    /**
     * Derives the development file from the output file by replacing the file-name extension with
     * {@code -dev.tsv}; if the file name has no extension, the suffix is simply appended.
     */
    private static File deriveDevFile(File outputFile) {
        String outputPath = outputFile.getAbsolutePath();
        int nameStart = outputPath.lastIndexOf(File.separatorChar) + 1;
        int extensionDot = outputPath.lastIndexOf('.');
        // Only a dot inside the file name (and not at its very start) marks an extension; a dot in a
        // parent directory name must not truncate the path.
        String basePath = extensionDot > nameStart ? outputPath.substring(0, extensionDot) : outputPath;
        return new File(basePath + "-dev.tsv");
    }
}

