/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.classification;

import com.google.common.collect.Multimap;
import de.julielab.gene.candidateretrieval.CandidateRetrieval;
import de.julielab.gene.candidateretrieval.GeneRecordHit;
import de.julielab.gene.candidateretrieval.LuceneCandidateRetrieval;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.geneexpbase.configuration.Parameters;
import de.julielab.geneexpbase.data.GeneInformation;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.geneexpbase.genemodel.PosTag;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.mappingcores.DypsisCandidateSetter;
import de.julielab.genemapper.mappingcores.DypsisMappingCore;
import de.julielab.genemapper.utils.GeneMapperException;
import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TransformerDisambiguationDataUtils {
    // Configuration switches of the data export.
    // NOTE(review): none of the flags below is referenced by name anywhere in this class as shown. Being
    // compile-time constants they were presumably inlined by the compiler, so changing a value here may have
    // no effect unless the code paths they once guarded are restored - confirm against the original sources.

    // Presumably the reason writeData passes the gold taxonomy IDs of a mention to the candidate retrieval - TODO confirm.
    public static final boolean USE_GOLD_TAX_FOR_CANDIDATE_RETRIEVAL = true;
    // The following four flags are false; no GeneRIF, interaction, summary or description text is added by the visible code.
    public static final boolean ADD_GENERIF = false;
    public static final boolean ADD_INTERACTIONS = false;
    public static final boolean ADD_SUMMARY = false;
    public static final boolean ADD_DESC = false;
    // Same value as the literal context size that writeData passes to getGmMarkedDocumentText.
    public static final int MAX_DOC_CONTEXT_SIZE = 256;
    // Presumably corresponds to the "symbol:", "synonyms:", ... labels emitted by getCandidateQueryString - TODO confirm.
    public static final boolean ADD_NAME_TYPES = true;
    // Presumably corresponds to the original-names lookup at the start of getCandidateQueryString - TODO confirm.
    public static final boolean USE_ORIGINAL_QUERY_NAMES = true;
    // Presumably selects between GeneMention#getText and GeneMention#getNormalizedText for context genes - TODO confirm.
    public static final boolean NORMALIZE_CONTEXT_GENES = false;
    public static final boolean EXCLUDE_FP_GM = false;
    public static final boolean ONLY_EXACT_MATCHES = false;
    // Presumably the reason writeData skips mentions for which any hit is an exact match - TODO confirm.
    public static final boolean ONLY_APPROX_MATCHES = true;
    // NOTE(review): looks like a version number of the generated data format - confirm.
    public static final int VERSION = 23;
    // Constructed when this class is initialized; the path is relative, so it is resolved against the working directory.
    public static final Parameters CANDIDATE_SETTER_PARAMS = new Parameters(new File("configurations/transformer_candidate_setter_parameters.properties"));
    private static final Logger log = LoggerFactory.getLogger(TransformerDisambiguationDataUtils.class);

    /**
     * Writes the disambiguation data of all given documents to a single tab-separated file.
     * <p>
     * The file starts with the header row {@code gene_desc}, {@code doc_context}, {@code label}. The rows of
     * each document are then appended via {@link #writeData(BufferedWriter, GeneMapper, GeneDocument)}, and a
     * running document count is logged after each document.
     *
     * @param mapper             the gene mapper handed on to the per-document writer
     * @param outputFile         the file the writer is opened on
     * @param geneDocumentStream the documents to export; consumed through its iterator
     * @throws IOException          if writing to the output file fails
     * @throws ExecutionException   propagated from the per-document writer
     * @throws GeneMapperException  propagated from the per-document writer
     */
    public static void writeData(GeneMapper mapper, File outputFile, Stream<GeneDocument> geneDocumentStream) throws IOException, ExecutionException, GeneMapperException {
        try (BufferedWriter writer = FileUtilities.getWriterToFile(outputFile)) {
            writer.write(String.join("\t", "gene_desc", "doc_context", "label"));
            writer.newLine();
            int documentCount = 0;
            // An explicit iterator is used because the per-document writer throws checked exceptions.
            for (Iterator<GeneDocument> documents = geneDocumentStream.iterator(); documents.hasNext(); ) {
                GeneDocument document = documents.next();
                TransformerDisambiguationDataUtils.writeData(writer, mapper, document);
                ++documentCount;
                log.info("Wrote data for document {}", documentCount);
            }
        }
    }

    /*
     * WARNING - void declaration
     */
    public static void writeData(BufferedWriter bw, GeneMapper mapper, GeneDocument doc) throws IOException, ExecutionException, GeneMapperException {
        DypsisCandidateSetter candidateSetter = ((DypsisMappingCore)mapper.getMappingCore()).getCandidateSetter();
        CandidateRetrieval candidateRetrieval = mapper.getMappingCore().getCandidateRetrieval();
        Iterator geneIt = doc.getGenes().iterator();
        while (geneIt.hasNext()) {
            GeneMention gm = (GeneMention)geneIt.next();
            if (!doc.isGoldHasOffsets()) {
                throw new IllegalStateException("The gold mentions must have their offsets. If the document has only document level annotations, use DocumentLoader#inferDocumentLevelLabelsToMentions to derive mention level gold annotations.");
            }
            List<String> allGoldIds = gm.getAllGoldIdsAsList();
            List<SynHit> positiveHits = ((LuceneCandidateRetrieval)candidateRetrieval).getIndexRecords(allGoldIds);
            String docText = TransformerDisambiguationDataUtils.getGmMarkedDocumentText(gm, 256, true, false);
            List<SynHit> negativeHits = candidateRetrieval.getCandidates(gm, gm.getAllGoldTaxonomyIdsAsList(), LuceneCandidateRetrieval.GENE_RECORDS_CNF);
            if (negativeHits.stream().anyMatch(SynHit::isExactMatch) || positiveHits.stream().anyMatch(SynHit::isExactMatch)) continue;
            if ((negativeHits = negativeHits.stream().filter(Predicate.not(SynHit::isRejectionCandidate)).filter(sh -> !allGoldIds.contains(sh.getId())).collect(Collectors.toList())).isEmpty()) {
                for (SynHit synHit : positiveHits) {
                    String query = TransformerDisambiguationDataUtils.getCandidateQueryString(synHit, candidateRetrieval);
                    if (query.isBlank()) continue;
                    bw.write(String.join((CharSequence)"\t", query, docText, "1"));
                    bw.newLine();
                }
            } else if (!positiveHits.isEmpty()) {
                int posIndex = 0;
                for (SynHit sh3 : negativeHits) {
                    SynHit positiveSh;
                    String positiveQuery;
                    String query = TransformerDisambiguationDataUtils.getCandidateQueryString(sh3, candidateRetrieval);
                    if (!query.isBlank()) {
                        bw.write(String.join((CharSequence)"\t", query, docText, "0"));
                        bw.newLine();
                    }
                    if ((positiveQuery = TransformerDisambiguationDataUtils.getCandidateQueryString(positiveSh = positiveHits.get(posIndex++ % positiveHits.size()), candidateRetrieval)).isBlank()) continue;
                    bw.write(String.join((CharSequence)"\t", positiveQuery, docText, "1"));
                    bw.newLine();
                }
            } else if (doc.isCompletelyAnnotated()) {
                void var12_18;
                int index = 0;
                SynHit synHit = negativeHits.get(index);
                double bestScore = synHit.getLexicalScore();
                for (int written = 0; var12_18 != null && (Double.compare(var12_18.getLexicalScore(), bestScore) == 0 || written < 3); ++written) {
                    String query = TransformerDisambiguationDataUtils.getCandidateQueryString((SynHit)var12_18, candidateRetrieval);
                    if (!query.isBlank()) {
                        bw.write(String.join((CharSequence)"\t", query, docText, "0"));
                        bw.newLine();
                    }
                    if (index + 1 < negativeHits.size()) {
                        SynHit synHit2 = negativeHits.get(++index);
                        continue;
                    }
                    Object var12_20 = null;
                }
            }
            gm.setMentionMappingResult(null);
        }
    }

    /**
     * Creates the document context string for a gene mention, with the mention enclosed in
     * {@code <<} and {@code >>} markers.
     * <p>
     * Three output modes exist:
     * <ul>
     * <li>{@code uniqueGenes == true} (regardless of {@code onlyGenes}): the space-normalized texts of the
     * document's gene mentions, each distinct text once, in iteration order; the text equal to the focus
     * mention's text is marked.</li>
     * <li>{@code onlyGenes == true, uniqueGenes == false}: the space-normalized texts of all gene mentions of
     * the document; every mention equal to {@code gm} is marked.</li>
     * <li>both {@code false}: the document text around the mention, truncated to roughly
     * {@code maxContextTokens} POS-tagged tokens when the document is longer than that.</li>
     * </ul>
     * The gene-list modes keep the spacing of earlier versions: a leading space before the opening
     * marker and a trailing space after the last gene.
     *
     * @param gm               the focus gene mention; its document is obtained via {@code gm.getGeneDocument()}
     * @param maxContextTokens token budget for the full-text mode; ignored by the gene-list modes
     * @param onlyGenes        whether to output gene mention texts instead of the document text
     * @param uniqueGenes      whether to output each distinct gene mention text only once
     * @return the context string containing the marked mention
     */
    public static String getGmMarkedDocumentText(GeneMention gm, int maxContextTokens, boolean onlyGenes, boolean uniqueGenes) {
        GeneDocument doc = gm.getGeneDocument();
        if (uniqueGenes) {
            return markAmongUniqueGeneTexts(doc, gm);
        }
        if (onlyGenes) {
            return markAmongAllGeneTexts(doc, gm);
        }
        return markInDocumentText(doc, gm, maxContextTokens);
    }

    /** Returns the space-normalized surface text of a gene mention. */
    private static String normalizedMentionText(GeneMention mention) {
        return StringUtils.normalizeSpace(mention.getText());
    }

    /** Full-text mode: document text around the mention, cut to the token budget, with the mention marked. */
    private static String markInDocumentText(GeneDocument doc, GeneMention gm, int maxContextTokens) {
        String docText = doc.getDocumentText();
        Collection<PosTag> leftContext = doc.getOverlappingPosTags(Range.between(0, gm.getBegin() - 1));
        Collection<PosTag> rightContext = doc.getOverlappingPosTags(Range.between(gm.getEnd() + 1, docText.length()));
        int gmTokens = doc.getOverlappingPosTags(gm.getOffsets()).size();
        int leftCharOffset = 0;
        int rightCharOffset = docText.length();
        // The "+ 2" presumably accounts for the two marker tokens.
        if (leftContext.size() + rightContext.size() + gmTokens + 2 > maxContextTokens) {
            int leftContextSize = 0;
            int rightContextSize = 0;
            // Grow both sides alternately until the budget is reached. This terminates because the total number
            // of available tokens exceeds the budget, so at least one side can still grow while below it.
            while (leftContextSize + rightContextSize + gmTokens + 2 < maxContextTokens) {
                if (leftContextSize < leftContext.size()) {
                    ++leftContextSize;
                }
                if (rightContextSize < rightContext.size()) {
                    ++rightContextSize;
                }
            }
            if (leftContextSize > 0) {
                // Start at the first of the last leftContextSize tokens before the mention.
                leftCharOffset = tagAt(leftContext, leftContext.size() - leftContextSize).getBegin();
            } else if (!leftContext.isEmpty()) {
                // The mention alone exhausts the budget: no left context at all. Previously the offset stayed 0
                // in this case, which included the complete text before the mention.
                leftCharOffset = gm.getBegin();
            }
            if (rightContextSize > 0) {
                rightCharOffset = tagAt(rightContext, rightContextSize - 1).getEnd();
            } else if (!rightContext.isEmpty()) {
                // Same as for the left side: previously the complete text after the mention was included.
                rightCharOffset = gm.getEnd();
            }
        }
        StringBuilder sb = new StringBuilder();
        sb.append(StringUtils.normalizeSpace(docText.substring(leftCharOffset, gm.getBegin())));
        sb.append(" << ");
        sb.append(normalizedMentionText(gm));
        sb.append(" >> ");
        sb.append(StringUtils.normalizeSpace(docText.substring(gm.getEnd(), rightCharOffset)));
        return sb.toString();
    }

    /** Returns the element at the given position of the collection's iteration order. */
    private static PosTag tagAt(Collection<PosTag> tags, int index) {
        Iterator<PosTag> it = tags.iterator();
        for (int i = 0; i < index; ++i) {
            it.next();
        }
        return it.next();
    }

    /** Gene-list mode: texts of all gene mentions, with every mention equal to the focus mention marked. */
    private static String markAmongAllGeneTexts(GeneDocument doc, GeneMention gm) {
        StringBuilder sb = new StringBuilder();
        for (GeneMention mention : doc.getGenesIterable()) {
            boolean isFocusGene = mention.equals(gm);
            if (isFocusGene) {
                sb.append(" << ");
            }
            sb.append(normalizedMentionText(mention));
            if (isFocusGene) {
                sb.append(" >>");
            }
            sb.append(" ");
        }
        return sb.toString();
    }

    /** Unique-gene mode: each distinct gene mention text once, with the focus mention's text marked. */
    private static String markAmongUniqueGeneTexts(GeneDocument doc, GeneMention gm) {
        StringBuilder sb = new StringBuilder();
        HashSet<String> alreadySeen = new HashSet<>();
        String focusText = normalizedMentionText(gm);
        for (GeneMention mention : doc.getGenesIterable()) {
            String mentionText = normalizedMentionText(mention);
            if (!alreadySeen.add(mentionText)) {
                continue;
            }
            // Each distinct text passes the de-duplication only once, so the focus text is marked at most once
            // without any additional bookkeeping.
            boolean isFocusGene = mentionText.equals(focusText);
            if (isFocusGene) {
                sb.append(" << ");
            }
            sb.append(mentionText);
            if (isFocusGene) {
                sb.append(" >>");
            }
            sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Builds the textual description of a gene candidate that is written to the {@code gene_desc} column.
     * <p>
     * The candidate is first looked up again via
     * {@code candidateRetrieval.getOriginalNamesIndexRecords}; if that lookup returns a record, the first
     * one replaces the given hit, otherwise a warning is logged and the given hit is used as is. The
     * description consists of the gene ID followed by the labelled parts {@code symbol:},
     * {@code symbolnomenclat:}, {@code synonyms:}, {@code chromosome:}, {@code maploc:}, {@code fullnames:}
     * and {@code otherdes:}. {@code null} and blank values are dropped and the remaining parts are joined by
     * single spaces; a label is emitted even when its values are missing.
     *
     * @param sh                 the candidate to describe
     * @param candidateRetrieval the retrieval used to look up the original-names record of the candidate
     * @return the space-separated description
     * @throws IllegalArgumentException if the hit that is finally used is not a {@link GeneRecordHit}
     * @throws ExecutionException       declared for callers; presumably raised by the index lookup
     */
    public static String getCandidateQueryString(SynHit sh, CandidateRetrieval candidateRetrieval) throws ExecutionException {
        // An explicit emptiness check replaces the former catch of IndexOutOfBoundsException, which would also
        // have hidden an unrelated IndexOutOfBoundsException thrown inside the lookup itself.
        List<? extends SynHit> originalNameRecords = candidateRetrieval.getOriginalNamesIndexRecords(List.of(sh.getId()));
        SynHit hit = sh;
        if (originalNameRecords.isEmpty()) {
            log.warn("Got no original names for gene ID {}", sh.getId());
        } else {
            hit = originalNameRecords.get(0);
        }
        if (!(hit instanceof GeneRecordHit)) {
            throw new IllegalArgumentException("This code requires the use of the gene record candidate index to obtain all the gene record information from. However, the given SynHit is not a GeneRecordHit: " + hit);
        }
        GeneRecordHit rh = (GeneRecordHit) hit;
        Stream.Builder<String> parts = Stream.builder();
        parts.add(hit.getId());
        parts.add("symbol:");
        parts.add(rh.getSymbol());
        parts.add("symbolnomenclat:");
        parts.add(rh.getSymbolFromNomenclature());
        parts.add("synonyms:");
        Stream.of(rh.getSynonyms()).forEach(parts::add);
        parts.add("chromosome:");
        parts.add(rh.getChromosome());
        parts.add("maploc:");
        parts.add(rh.getMapLocation());
        parts.add("fullnames:");
        Stream.of(rh.getFullNames()).forEach(parts::add);
        parts.add("otherdes:");
        Stream.of(rh.getOtherDesignations()).forEach(parts::add);
        // Missing values were added as null or blank entries above and are removed here.
        return parts.build()
                .filter(Objects::nonNull)
                .filter(Predicate.not(String::isBlank))
                .collect(Collectors.joining(" "));
    }

    /**
     * Sets the document-level gold gene IDs of a document.
     * <p>
     * The IDs registered for the document's ID in {@code docid2geneid} are each passed through
     * {@code GeneInformation.REPLACED}: an ID with an entry there is exchanged for the mapped value, any other
     * ID is kept unchanged. The resulting set is stored via {@code document.setGoldIds}.
     *
     * @param document     the document to annotate
     * @param docid2geneid gene IDs keyed by document ID
     */
    public static void addDocumentLevelGeneAnnotations(GeneDocument document, Multimap<String, String> docid2geneid) {
        HashSet<String> currentGoldIds = docid2geneid.get(document.getId()).stream()
                .map(geneId -> GeneInformation.REPLACED.getOrDefault(geneId, geneId))
                .collect(Collectors.toCollection(HashSet::new));
        document.setGoldIds(currentGoldIds);
    }

    /**
     * Returns the space-normalized <em>normalized</em> text of a gene mention.
     * <p>
     * NOTE(review): nothing in this class as shown calls this method, and its name follows the pattern of
     * compiler-generated lambda bodies, so it is presumably a decompiler leftover - confirm and consider
     * removing it.
     */
    private static /* synthetic */ String lambda$getGmMarkedDocumentText$1(GeneMention mention) {
        String normalizedText = mention.getNormalizedText();
        return StringUtils.normalizeSpace(normalizedText);
    }
}

