/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.geneexpbase.data;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval;
import de.julielab.geneexpbase.candidateretrieval.QueryGenerator;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.geneexpbase.configuration.Configuration;
import de.julielab.geneexpbase.data.CorpusReader;
import de.julielab.geneexpbase.data.DocumentLoadingException;
import de.julielab.geneexpbase.data.DocumentSourceFiles;
import de.julielab.geneexpbase.data.GeneInformation;
import de.julielab.geneexpbase.genemodel.Acronym;
import de.julielab.geneexpbase.genemodel.Apposition;
import de.julielab.geneexpbase.genemodel.CoreferenceSet;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.geneexpbase.genemodel.GeneOrthologs;
import de.julielab.geneexpbase.genemodel.MeshHeading;
import de.julielab.geneexpbase.genemodel.PosTag;
import de.julielab.geneexpbase.genemodel.SpeciesCandidates;
import de.julielab.geneexpbase.genemodel.SpeciesMention;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builds {@link GeneDocument} instances from the annotation files referenced by a
 * {@link DocumentSourceFiles} object.
 *
 * <p>Gold gene annotations are read with offsets when possible; otherwise only gold IDs are read
 * and, if requested by the source file configuration, mention-level gold annotations are inferred
 * from them via candidate retrieval.
 */
public class DocumentLoader {
    private static final Logger log = LoggerFactory.getLogger(DocumentLoader.class);
    /** Maximum number of gold IDs handed to the candidate retrieval in a single call. */
    private static final int GOLD_ID_BATCH_SIZE = 1024;
    private final CandidateRetrieval candidateRetrieval;
    private final QueryGenerator queryGeneratorForMentionInference;
    private final GeneOrthologs geneOrthologs;
    private final Configuration configuration;
    private final TermNormalizer normalizer;

    /**
     * Creates a loader with its injected collaborators.
     *
     * @param candidateRetrieval                used for gene-ID-to-taxonomy lookups and for candidate retrieval
     * @param queryGeneratorForMentionInference query generator passed to the candidate retrieval when
     *                                          mention-level gold annotations are inferred
     * @param geneOrthologs                     passed through to {@link #getGeneDocument}
     * @param normalizer                        term normalizer set on every created document
     * @param configuration                     stored, but not read anywhere in this class
     */
    @Inject
    public DocumentLoader(CandidateRetrieval candidateRetrieval, @Named(value="IdInference") QueryGenerator queryGeneratorForMentionInference, GeneOrthologs geneOrthologs, TermNormalizer normalizer, Configuration configuration) {
        this.candidateRetrieval = candidateRetrieval;
        this.queryGeneratorForMentionInference = queryGeneratorForMentionInference;
        this.geneOrthologs = geneOrthologs;
        this.normalizer = normalizer;
        this.configuration = configuration;
    }

    /**
     * Sets, on every predicted mention of a known document that has gold data, the gold mentions
     * whose offsets overlap the predicted mention.
     *
     * @param goldData              gold mentions, grouped in lists
     * @param predictedGeneMentions all predicted mentions; every value is inspected
     * @param finalDocIds           only predicted mentions whose document ID is in this set are touched
     */
    private static void setGoldMentionsToPredictions(Collection<List<GeneMention>> goldData, Multimap<String, GeneMention> predictedGeneMentions, Set<String> finalDocIds) {
        // Index the gold mentions: document ID -> offsets -> gold mentions at exactly these offsets.
        Map<String, OffsetMap<List<GeneMention>>> goldByDocument = new HashMap<>();
        for (List<GeneMention> goldList : goldData) {
            for (GeneMention gold : goldList) {
                goldByDocument
                        .computeIfAbsent(gold.getDocId(), docId -> new OffsetMap<>())
                        .computeIfAbsent(gold.getOffsets(), offsets -> new ArrayList<>())
                        .add(gold);
            }
        }
        // NOTE(review): this walks ALL predicted mentions, although the caller passes the gold data
        // of a single document at a time - confirm whether restricting to that document is safe.
        for (GeneMention predicted : predictedGeneMentions.values()) {
            if (!finalDocIds.contains(predicted.getDocId())) {
                continue;
            }
            OffsetMap<List<GeneMention>> goldOfDocument = goldByDocument.get(predicted.getDocId());
            if (goldOfDocument == null) {
                continue;
            }
            List<GeneMention> overlappingGold = new ArrayList<>();
            for (List<GeneMention> goldAtOffsets : goldOfDocument.getOverlapping(predicted).values()) {
                overlappingGold.addAll(goldAtOffsets);
            }
            predicted.setOverlappingGoldMentions(overlappingGold);
        }
    }

    /**
     * Collects the IDs of all given mentions into one set.
     */
    private static Set<String> collectIds(Collection<GeneMention> mentions) {
        return mentions.stream().map(GeneMention::getIds).flatMap(Collection::stream).collect(Collectors.toSet());
    }

    /**
     * Reads all annotation sources referenced by {@code files} and returns a stream with one
     * {@link GeneDocument} per document ID found in the document text source. The documents are
     * assembled lazily while the stream is consumed.
     *
     * @param files locations and options of the annotation sources
     * @return a stream of the assembled documents
     * @throws DocumentLoadingException if an {@link IOException} occurs while reading the sources
     * @throws IllegalArgumentException if no predicted gene mention could be read
     */
    public Stream<GeneDocument> getDocuments(DocumentSourceFiles files) throws DocumentLoadingException {
        try {
            Multimap<String, GeneMention> goldData;
            boolean goldHasOffsets = false;
            boolean inferDocLevelAnnotationsToMentions = false;
            try {
                goldData = CorpusReader.readMentionsWithOffsets(files.getGoldGeneList());
                goldHasOffsets = true;
            }
            catch (Exception e) {
                // The gold list could not be read as mentions with offsets; fall back to plain gold IDs.
                log.debug("Reading gold genes with offsets from {} failed; falling back to gold IDs without offsets.", files.getGoldGeneList(), e);
                goldData = CorpusReader.readGoldIds(files.getGoldGeneList());
                inferDocLevelAnnotationsToMentions = files.getInferDocumentLevelLabelsToMentions();
            }
            // Drop gold mentions that carry no ID or whose first ID is the "NoId" placeholder.
            goldData.values().removeIf(gm -> gm.getIds().isEmpty() || gm.getIds().get(0).equals("NoId"));
            goldData.values().forEach(gm -> gm.setTagger(GeneMention.GeneTagger.GOLD));
            goldData.values().forEach(gm -> gm.setSpecificType(gm.getSpecificType() != null ? gm.getSpecificType() : files.getDefaultEntityType()));

            Multimap<String, GeneMention> predictedGeneMentions = CorpusReader.readMixedFileForGenesWithOffsets(files.getPredictedGenesPath(), files.getAllowedGeneTypes(), files.getTaggersToUse());
            Multimap<String, Acronym> acronyms = CorpusReader.readAcronymAnnotations(files.getAcronymsPath());
            Multimap<String, CoreferenceSet> coreferences = CorpusReader.readCoreferenceAnnotations(files.getCorefPath());
            Multimap<String, Apposition> appositions = CorpusReader.readAppositionAnnotations(files.getAppositionsPath());
            Map<String, String> documentContexts = CorpusReader.readGeneContexts(files.getDocTextPath());
            Multimap<String, Range<Integer>> sentences = CorpusReader.readMixedFileForSentenceOffsets(files.getSentencesPath());
            // NOTE(review): non-gene phrases are read from the sentences path, like the sentence
            // offsets above - confirm this is intended.
            Multimap<String, Range<Integer>> nonGenePhrases = CorpusReader.readMixedFileForNonGenePhraseOffsets(files.getSentencesPath());
            Map<String, OffsetMap<SpeciesMention>> species = CorpusReader.readMixedFileForTextSpecies(files.getSpeciesPath());
            Map<String, OffsetMap<String>> chunks = CorpusReader.readMixedFileForChunkOffsets(files.getChunksPath());
            Map<String, OffsetMap<String>> ontologyClassMentions = CorpusReader.readMixedFileForOntologyClassMentions(files.getOntologyMentionsPath());
            Multimap<String, PosTag> posTags = CorpusReader.readMixedFileForPosTags(files.getPosPath());
            Multimap<String, MeshHeading> meshHeadings = HashMultimap.create();
            if (files.hashMesh()) {
                meshHeadings = CorpusReader.readMeshHeadings(files.getMeshPath());
            }
            if (files.hasSubstances()) {
                meshHeadings.putAll(CorpusReader.readMeshHeadings(files.getSubstancesPath()));
            }
            if (predictedGeneMentions.isEmpty()) {
                // Report the type filter that was actually passed to the reader.
                throw new IllegalArgumentException("Could not find any entity of types '" + files.getAllowedGeneTypes() + "' of tagger '" + files.getTaggersToUse() + "' in " + files.getPredictedGenesPath() + ".");
            }

            boolean isSpeciesCorpus = files.isSpeciesCorpus();
            Set<String> docIds = documentContexts.keySet();
            if (files.isHasGeneIds()) {
                for (GeneMention gm : goldData.values()) {
                    if (!docIds.contains(gm.getDocId())) {
                        continue;
                    }
                    // Map replaced gene IDs to their current ID; unknown IDs stay as they are.
                    gm.setIds(gm.getIds().stream().map(id -> GeneInformation.REPLACED.getOrDefault(id, id)).collect(Collectors.toList()));
                    gm.setTaxonomyId(this.candidateRetrieval.mapGeneIdToTaxId(gm.getGoldMentionId()));
                    String taxonomyId = gm.getTaxonomyId();
                    if (taxonomyId == null || taxonomyId.isBlank()) {
                        log.warn("Could not retrieve the taxonomy of the gold gene ID {}", gm.getIds());
                    }
                }
            } else if (isSpeciesCorpus) {
                goldData.values().forEach(gm -> gm.setTaxonomyIds(gm.getIds()));
            }

            // Effectively final copies for use inside the lambda below.
            final Multimap<String, GeneMention> finalGoldData = goldData;
            final Multimap<String, MeshHeading> finalMeshHeadings = meshHeadings;
            final boolean finalGoldHasOffsets = goldHasOffsets;
            final boolean finalInferDocToMention = inferDocLevelAnnotationsToMentions;
            return docIds.stream().map(docId -> {
                GeneDocument document = this.getGeneDocument(this.normalizer, this.candidateRetrieval, finalGoldData, predictedGeneMentions, acronyms, coreferences, appositions, documentContexts, sentences, nonGenePhrases, species, chunks, posTags, ontologyClassMentions, finalMeshHeadings, isSpeciesCorpus, this.geneOrthologs, finalGoldHasOffsets, finalInferDocToMention, docId);
                if (finalGoldHasOffsets || finalInferDocToMention) {
                    DocumentLoader.setGoldMentionsToPredictions(document.getGoldGenes().values(), predictedGeneMentions, docIds);
                }
                document.setCompletelyAnnotated(files.isCompletelyAnnotated());
                return document;
            });
        }
        catch (IOException e) {
            throw new DocumentLoadingException(e);
        }
    }

    /**
     * Assembles a single {@link GeneDocument} for {@code docId} from the given annotation collections.
     *
     * <p>The document text is split into its non-blank lines. With more than one line, the first is
     * used as title and the second as abstract. With exactly one line, it is used as abstract if it
     * is longer than 300 characters and as title otherwise.
     *
     * <p>All predicted mentions of the document get the document text as context, the ID
     * {@code "NoId"} and a {@code null} taxonomy ID.
     *
     * @param normalizer                    term normalizer to set on the document
     * @param candidateRetrieval            used to map gold gene IDs to taxonomy IDs and for label inference
     * @param finalGoldData                 gold mentions keyed by document ID
     * @param predictedGeneMentions         predicted mentions keyed by document ID
     * @param acronyms                      acronyms keyed by document ID
     * @param coreferences                  coreference sets keyed by document ID
     * @param appositions                   appositions keyed by document ID
     * @param documentContexts              document texts keyed by document ID; must contain {@code docId}
     * @param sentences                     sentence offsets keyed by document ID
     * @param nonGenePhrases                non-gene phrase offsets keyed by document ID
     * @param species                       species mentions keyed by document ID
     * @param chunks                        chunks keyed by document ID
     * @param posTags                       part-of-speech tags keyed by document ID
     * @param ontologyClassMentions         ontology class mentions keyed by document ID
     * @param meshHeadings                  MeSH headings keyed by document ID
     * @param isSpeciesCorpus               if {@code true}, the IDs of the gold mentions are moved to their
     *                                      taxonomy IDs and the document's gold IDs are used as gold taxonomy IDs
     * @param geneOrthologs                 passed on to {@link #inferDocumentLevelLabelsToMentions}
     * @param goldHasOffsets                whether the gold mentions carry offsets
     * @param inferDocLevelLabelsToMentions whether to infer mention-level gold annotations when the gold
     *                                      data has no offsets
     * @param docId                         ID of the document to assemble
     * @return the assembled document
     */
    public GeneDocument getGeneDocument(TermNormalizer normalizer, CandidateRetrieval candidateRetrieval, Multimap<String, GeneMention> finalGoldData, Multimap<String, GeneMention> predictedGeneMentions, Multimap<String, Acronym> acronyms, Multimap<String, CoreferenceSet> coreferences, Multimap<String, Apposition> appositions, Map<String, String> documentContexts, Multimap<String, Range<Integer>> sentences, Multimap<String, Range<Integer>> nonGenePhrases, Map<String, OffsetMap<SpeciesMention>> species, Map<String, OffsetMap<String>> chunks, Multimap<String, PosTag> posTags, Map<String, OffsetMap<String>> ontologyClassMentions, Multimap<String, MeshHeading> meshHeadings, boolean isSpeciesCorpus, GeneOrthologs geneOrthologs, boolean goldHasOffsets, boolean inferDocLevelLabelsToMentions, String docId) {
        GeneDocument document = new GeneDocument(docId);
        document.setTermNormalizer(normalizer);
        document.setAcronyms(new HashSet<Acronym>(acronyms.get(docId)));
        document.setCoreferenceRelations(coreferences.get(docId));
        document.setAppositions(appositions.get(docId));

        String documentText = documentContexts.get(docId);
        String[] textSplit = Stream.of(documentText.split("\\n")).filter(Predicate.not(String::isBlank)).toArray(String[]::new);
        String title = null;
        String abstractText = null;
        if (textSplit.length > 1) {
            title = textSplit[0];
            abstractText = textSplit[1];
        } else if (textSplit.length == 1) {
            // A single long line is taken to be an abstract without title.
            if (textSplit[0].length() > 300) {
                abstractText = textSplit[0];
            } else {
                title = textSplit[0];
            }
        }
        if (title != null) {
            document.setTitleOffsets(Range.between(0, title.length()));
        }
        // NOTE(review): when no title was found, this relies on GeneDocument providing non-null
        // title offsets by default - confirm, otherwise this line and the species line below fail.
        document.setAbstractOffsets(Range.between(document.getTitleOffsets().getMaximum() + 1, documentText.length()));
        document.setDocumentTitle(title);
        document.setDocumentAbstract(abstractText);
        document.setDocumentText(documentText);
        document.setChunks(chunks.get(docId));
        document.setOntologyClassMentions(ontologyClassMentions.get(docId));
        document.setPosTags(posTags.get(docId));
        document.setSpecies(new SpeciesCandidates(document.getTitleOffsets().getMinimum(), document.getTitleOffsets().getMaximum(), Collections.emptySet(), species.get(docId)));
        document.setSentences(new OffsetSet(sentences.get(docId)));
        document.setNonGenePhrases(new OffsetSet(nonGenePhrases.get(docId)));
        document.setMeshHeadings(meshHeadings.get(docId));
        document.setGenes(new HashSet<GeneMention>(predictedGeneMentions.get(docId)));
        document.getAllGenes().forEach(gm -> {
            gm.setDocumentContext(document.getDocumentText());
            gm.setId("NoId");
            gm.setTaxonomyId(null);
        });
        document.selectAllGenes();

        Collection<GeneMention> goldMentions = finalGoldData.get(docId);
        if (goldHasOffsets) {
            goldMentions.forEach(document::putGoldGene);
            document.setGoldIds(collectIds(goldMentions));
        } else {
            Set<String> goldIds = collectIds(goldMentions);
            if (inferDocLevelLabelsToMentions) {
                this.inferDocumentLevelLabelsToMentions(document, goldIds, candidateRetrieval, geneOrthologs, isSpeciesCorpus);
            }
            document.setGoldIds(goldIds);
            document.setGoldTaxonomyIds(goldIds.stream().map(id -> candidateRetrieval.mapGeneIdToTaxId(id)).collect(Collectors.toSet()));
        }
        if (isSpeciesCorpus) {
            // In a species corpus the gold "IDs" are taxonomy IDs.
            document.getGoldGenes().values().stream().flatMap(Collection::stream).forEach(goldgm -> {
                goldgm.setTaxonomyIds(goldgm.getIds());
                goldgm.setId(null);
            });
            document.setGoldTaxonomyIds(new HashSet<String>(document.getGoldIds()));
        }
        document.addState(GeneDocument.State.REFERENCE_SPECIES_ADDED);
        document.setGoldMentionsWithOffsets(goldHasOffsets || inferDocLevelLabelsToMentions);
        return document;
    }

    /**
     * Derives mention-level gold annotations from gold IDs that are only known for the whole document.
     *
     * <p>For every gene mention of the document, candidates are retrieved for the gold IDs in batches
     * of at most {@value #GOLD_ID_BATCH_SIZE} IDs. If there is at least one candidate, a copy of the
     * mention carrying the ID of the first candidate is stored as gold mention and set as the
     * mention's only overlapping gold mention. Mentions without any candidate are removed from the
     * document. Finally, the document is flagged as having gold mentions with inferred offsets.
     *
     * @param document           the document whose gene mentions are processed; modified in place
     * @param goldIds            the gold gene IDs of the document
     * @param candidateRetrieval used to retrieve candidates and to map gene IDs to taxonomy IDs
     * @param geneOrthologs      not consulted by this implementation; kept for signature compatibility
     * @param isSpeciesCorpus    not consulted by this implementation; kept for signature compatibility
     */
    public void inferDocumentLevelLabelsToMentions(GeneDocument document, Set<String> goldIds, CandidateRetrieval candidateRetrieval, GeneOrthologs geneOrthologs, boolean isSpeciesCorpus) {
        boolean multipleBatches = goldIds.size() > GOLD_ID_BATCH_SIZE;
        if (multipleBatches) {
            log.debug("Document {} has {} goldIds", document.getId(), goldIds.size());
        }
        // Collected first and removed after the loop to avoid modifying the genes while iterating them.
        List<GeneMention> unclearGms = new ArrayList<>();
        for (GeneMention gm : document.getGenesIterable()) {
            List<SynHit> candidates = new ArrayList<>();
            Iterator<String> goldIdIt = goldIds.iterator();
            while (goldIdIt.hasNext()) {
                List<String> batch = new ArrayList<>(GOLD_ID_BATCH_SIZE);
                for (int i = 0; i < GOLD_ID_BATCH_SIZE && goldIdIt.hasNext(); ++i) {
                    batch.add(goldIdIt.next());
                }
                try {
                    candidates.addAll(candidateRetrieval.getCandidates(gm, batch, Collections.emptySet(), this.queryGeneratorForMentionInference));
                }
                catch (Exception e) {
                    log.error("Could not retrieve candidates for gene {} and goldIds {}.", gm, goldIds);
                    throw e;
                }
                if (multipleBatches) {
                    log.debug("Retrieved {} candidates for gene {} in document {}", candidates.size(), gm.getText(), document.getId());
                }
            }
            if (candidates.isEmpty()) {
                unclearGms.add(gm);
                continue;
            }
            // NOTE(review): with more than one batch, this is the first hit of the first batch that
            // returned anything, not necessarily the best hit over all batches - confirm.
            SynHit bestHit = candidates.get(0);
            GeneMention gold = new GeneMention(gm);
            gold.setDocId(document.getId());
            gold.setIds(List.of(bestHit.getId()));
            gold.setTaxonomyIds(gold.getIds().stream().map(id -> candidateRetrieval.mapGeneIdToTaxId(id)).collect(Collectors.toList()));
            document.putGoldGene(gold);
            gm.setOverlappingGoldMentions(Collections.singletonList(gold));
        }
        for (GeneMention unclear : unclearGms) {
            document.removeGene(unclear);
        }
        document.setGoldMentionsWithOffsets(true);
        document.setGoldOffsetsInferred(true);
    }
}

