/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.uima;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import de.julielab.geneexpbase.genemodel.GeneEmbeddingCentroid;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.EmbeddingVector;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.Token;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.jcore.utility.index.Comparators;
import de.julielab.jcore.utility.index.IndexTermGenerator;
import de.julielab.jcore.utility.index.JCoReTreeMapAnnotationIndex;
import de.julielab.jcore.utility.index.TermGenerators;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name="JCoRe Gene Embedding Centroid Writer", description="Expects Token annotations with EmbeddingVectors set to their respective feature. There should be EntityMentions representing the exact gazetteer synonym mentions of gene synonyms. The Gene annotations should come from a more sophisticated method like flair. Then, all EntityMention annotations that overlap with a Gene annotation will add the embedding vectors of their overlapping tokens to the centroid of the synonym.")
@TypeCapability(inputs={"de.julielab.jcore.types.Token", "de.julielab.jcore.types.EmbeddingVector", "de.julielab.jcore.types.EntityMention", "de.julielab.jcore.types.Gene", "de.julielab.jcore.types.Header"})
public class GeneSynonymEmbeddingCentroidWriter
extends JCasAnnotator_ImplBase {
    public static final String PARAM_DB_DIR = "EmbeddingDbDir";
    private static final Logger log = LoggerFactory.getLogger(GeneSynonymEmbeddingCentroidWriter.class);
    private static final Cache<String, GeneEmbeddingCentroid> centroidDb = CacheBuilder.newBuilder().maximumSize(100000L).build();
    private static final Map<String, Boolean> seenDocumentsDb = new ConcurrentHashMap<String, Boolean>();
    @ConfigurationParameter(name="EmbeddingDbDir", description="The directory in which the Map.db files will be stored that contain the summed embeddings.")
    private String embeddingDbDir;
    private File embeddingOutputFile;

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);
        this.embeddingDbDir = (String)aContext.getConfigParameterValue(PARAM_DB_DIR);
        this.embeddingOutputFile = new File(this.embeddingDbDir);
        Map<String, Boolean> map = seenDocumentsDb;
        synchronized (map) {
            if (this.embeddingOutputFile.exists()) {
                log.info("Deleting existing embedding centroids file {}", (Object)this.embeddingDbDir);
                this.embeddingOutputFile.delete();
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        Map<String, Boolean> map = seenDocumentsDb;
        synchronized (map) {
            log.info("Writing embedding centroids for {} centroid vectors collected in {} documents to {}.", new Object[]{centroidDb.size(), seenDocumentsDb.size(), this.embeddingOutputFile});
            try (ObjectOutputStream os = new ObjectOutputStream(FileUtilities.getOutputStreamToFile((File)this.embeddingOutputFile));){
                HashMap centroidsForStorage = new HashMap(centroidDb.asMap());
                os.writeObject(centroidsForStorage);
                os.writeObject(seenDocumentsDb);
            }
            catch (IOException e) {
                log.error("Could not write centroids", (Throwable)e);
                throw new AnalysisEngineProcessException((Throwable)e);
            }
        }
    }

    public void process(JCas jCas) {
        String docId = JCoReTools.getDocId((JCas)jCas);
        if (StringUtils.isBlank((CharSequence)docId)) {
            throw new IllegalArgumentException("The 'docId' feature of the Header CAS annotation is blank or there is no header, cannot tell if the genes in the document have already been added to the store.");
        }
        Boolean documentAlreadySeen = seenDocumentsDb.get(docId);
        if (documentAlreadySeen != null && documentAlreadySeen.booleanValue()) {
            log.debug("Document with ID {} was marked in the embedding database as already seen, skipping.", (Object)docId);
            return;
        }
        JCoReTreeMapAnnotationIndex genes = new JCoReTreeMapAnnotationIndex(Comparators.longOverlapComparator(), (IndexTermGenerator)TermGenerators.longOffsetTermGenerator(), (IndexTermGenerator)TermGenerators.longOffsetTermGenerator(), jCas, Gene.type);
        if (genes.getIndex().isEmpty()) {
            return;
        }
        JCoReTreeMapAnnotationIndex tokens = new JCoReTreeMapAnnotationIndex(Comparators.longOverlapComparator(), (IndexTermGenerator)TermGenerators.longOffsetTermGenerator(), (IndexTermGenerator)TermGenerators.longOffsetTermGenerator(), jCas, Token.type);
        if (tokens.getIndex().isEmpty()) {
            log.warn("No token annotations found in document {}", (Object)docId);
            if (jCas.getAnnotationIndex(EmbeddingVector.type).iterator().hasNext()) {
                log.warn("However, EmbeddingVector annotations were found. Use the EmbeddingsWithTokensConnectionAnnotator component to set the embeddings to the tokens.");
            }
        }
        for (Annotation gazetteerGeneAnnotation : jCas.getAnnotationIndex(EntityMention.type)) {
            int i;
            Optional overlappingGene;
            int synPrio;
            if (gazetteerGeneAnnotation instanceof Gene) continue;
            EntityMention gazetteerGene = (EntityMention)gazetteerGeneAnnotation;
            String[] specificType = gazetteerGene.getSpecificType().split("_");
            String geneId = specificType[0];
            if (specificType.length > 1 && (synPrio = Integer.parseInt(specificType[1])) > 3 || !(overlappingGene = genes.searchFuzzy((Annotation)gazetteerGene).findAny()).isPresent()) continue;
            List synonymTokens = tokens.searchFuzzy((Annotation)overlappingGene.get()).collect(Collectors.toList());
            if (synonymTokens.isEmpty()) {
                log.warn("No tokens found for the gene synonym " + gazetteerGene.getCoveredText() + " in document " + docId);
                continue;
            }
            double[] vectorMean = null;
            for (i = 0; i < synonymTokens.size(); ++i) {
                Token token = (Token)synonymTokens.get(i);
                if (token.getEmbeddingVectors(0).getVector() == null) {
                    throw new IllegalArgumentException("A token has an embedding vector that has a null-valued 'vector' feature. Meaning, the actual embedding vector is null. The token is: " + token);
                }
                if (token.getEmbeddingVectors() == null) continue;
                if (vectorMean == null) {
                    vectorMean = token.getEmbeddingVectors(0).getVector().toArray();
                    continue;
                }
                for (int j = 0; j < vectorMean.length; ++j) {
                    int n = j;
                    vectorMean[n] = vectorMean[n] + token.getEmbeddingVectors(0).getVector().get(j);
                }
            }
            if (vectorMean != null && synonymTokens.size() > 1) {
                i = 0;
                while (i < vectorMean.length) {
                    int n = i++;
                    vectorMean[n] = vectorMean[n] / (double)synonymTokens.size();
                }
            } else if (vectorMean == null) {
                log.warn("Did not find word vectors on the tokens between offsets {}-{} in document {}", new Object[]{gazetteerGene.getBegin(), gazetteerGene.getEntityString(), docId});
            }
            if (vectorMean == null) continue;
            if (StringUtils.isBlank((CharSequence)geneId)) {
                throw new IllegalArgumentException("The specific type of an EntityMention annotation is blank. It must be set to the NCBI Gene ID.");
            }
            GeneEmbeddingCentroid centroid = (GeneEmbeddingCentroid)centroidDb.getIfPresent((Object)geneId);
            if (centroid == null) {
                centroidDb.put((Object)geneId, (Object)new GeneEmbeddingCentroid(vectorMean, true));
                continue;
            }
            centroid.addVector(vectorMean);
            centroidDb.put((Object)geneId, (Object)centroid);
        }
        seenDocumentsDb.put(docId, true);
    }
}

