/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.contentanalysis;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import de.l3s.icrawl.contentanalysis.DocumentVector;
import de.l3s.icrawl.contentanalysis.LanguageModel;
import de.l3s.icrawl.contentanalysis.LanguageModels;
import de.l3s.icrawl.domain.specification.NamedEntity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.concurrent.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scores a text against per-language reference document vectors.
 *
 * <p>For every language there is one reference {@link DocumentVector}, one
 * {@link LanguageModel.KeywordMatcher} and one correction factor. The actual
 * similarity computation is delegated to {@link LanguageModels}; this class
 * selects the per-language inputs and divides the result by the correction
 * factor of the requested language.
 *
 * <p>NOTE(review): the class is annotated {@code @ThreadSafe}, but
 * {@link #setLanguageModels(LanguageModels)} writes a non-final, non-volatile
 * field. Confirm that it is only called before the instance is shared.
 */
@ThreadSafe
public class DocumentVectorSimilarity implements Serializable {
    static final String TOKEN_SEPARATOR = " ";
    static final int DEFAULT_NGRAM_SIZE = 2;
    private static final Logger logger = LoggerFactory.getLogger(DocumentVectorSimilarity.class);
    private static final long serialVersionUID = 3L;
    @JsonProperty
    private final Map<Locale, DocumentVector> referenceVectors;
    @JsonProperty
    private final Map<Locale, LanguageModel.KeywordMatcher> matchers;
    @JsonIgnore
    private LanguageModels languageModels;
    @JsonIgnore
    private final Locale defaultLanguage;
    private final Map<Locale, Double> correctionFactors;

    /**
     * Builds the reference vectors, keyword matchers and correction factors
     * from a set of reference documents.
     *
     * @param referenceDocumentsToLanguage reference document texts mapped to their language
     * @param keywords keywords that are used for every language
     * @param entities named entities; labels with a language are used as keywords for
     *        that language only, labels without a language are used for every language
     * @param maxTerms if greater than zero, each merged reference vector is cut down
     *        via {@link DocumentVector#topN}; otherwise it is kept as merged
     * @param useDF passed through to {@link DocumentVector#merge}
     * @param defaultLanguage language whose reference vector and matcher are used as
     *        fallback for languages without their own
     * @param languageModels the models used to build matchers and vectors and to
     *        compute similarities
     */
    public DocumentVectorSimilarity(Map<String, Locale> referenceDocumentsToLanguage, Set<String> keywords, Set<NamedEntity> entities, int maxTerms, boolean useDF, Locale defaultLanguage, LanguageModels languageModels) {
        this.defaultLanguage = defaultLanguage;
        this.languageModels = languageModels;

        // Invert "document -> language" into "language -> documents".
        Multimap<Locale, String> documents = Multimaps.invertFrom(Multimaps.forMap(referenceDocumentsToLanguage), ArrayListMultimap.<Locale, String>create());

        Set<String> allLanguageKeywords = new HashSet<>(keywords);
        Multimap<Locale, String> keywordsByLanguage = HashMultimap.create();
        for (NamedEntity entity : entities) {
            for (NamedEntity.Label label : entity.getLabels()) {
                if (label.getLanguage() != null) {
                    keywordsByLanguage.put(label.getLanguage(), label.getName());
                } else {
                    allLanguageKeywords.add(label.getName());
                }
            }
        }

        ImmutableMap.Builder<Locale, DocumentVector> vectors = ImmutableMap.builder();
        ImmutableMap.Builder<Locale, LanguageModel.KeywordMatcher> matchersBuilder = ImmutableMap.builder();
        for (Map.Entry<Locale, Collection<String>> entry : documents.asMap().entrySet()) {
            Locale lang = entry.getKey();
            Collection<String> languageDocuments = entry.getValue();
            LanguageModel.KeywordMatcher keywordMatcher = languageModels.buildMatcher(lang, Iterables.concat(keywordsByLanguage.get(lang), allLanguageKeywords), DEFAULT_NGRAM_SIZE);
            ArrayList<DocumentVector> languageVectors = Lists.newArrayListWithExpectedSize(languageDocuments.size());
            for (String document : languageDocuments) {
                languageVectors.add(languageModels.buildDocumentVector(lang, document, keywordMatcher));
            }
            logger.debug("Got doc vectors for language {}: {}", lang.getLanguage(), languageVectors);
            DocumentVector vector = DocumentVector.merge(languageVectors, useDF);
            if (maxTerms > 0) {
                vector = vector.topN(maxTerms);
            }
            vectors.put(lang, vector);
            matchersBuilder.put(lang, keywordMatcher);
        }
        this.referenceVectors = vectors.build();
        this.matchers = matchersBuilder.build();
        if (logger.isDebugEnabled()) {
            for (Map.Entry<Locale, DocumentVector> entry : this.referenceVectors.entrySet()) {
                logger.debug("Reference vector for language '{}': {}...", entry.getKey(), entry.getValue().topComponents(10));
            }
        }

        // The correction factor of a language is the average similarity of its own
        // reference documents to the merged reference vector (1.0 if there are none).
        ImmutableMap.Builder<Locale, Double> factors = ImmutableMap.builder();
        for (Locale language : documents.keySet()) {
            DocumentVector reference = this.referenceVectors.get(language);
            LanguageModel.KeywordMatcher matcher = this.matchers.get(language);
            double averageSimilarity = documents.get(language).stream()
                    .mapToDouble(document -> languageModels.getSimilarity(language, document, reference, matcher))
                    .average()
                    .orElse(1.0);
            factors.put(language, averageSimilarity);
        }
        this.correctionFactors = factors.build();
    }

    /**
     * Creates an instance from already computed reference vectors.
     *
     * <p>A matcher is built for every language that is a key of {@code keywords};
     * each matcher is built from the union of the keywords of all languages.
     *
     * @param referenceVectors reference vector per language
     * @param keywords keywords per language
     * @param defaultLanguage fallback language for lookups
     * @param languageModels the models used to build matchers and compute similarities
     * @param correctionFactors correction factor per language
     * @return the new instance, using {@code languageModels}
     */
    public static DocumentVectorSimilarity fromVectors(Map<Locale, DocumentVector> referenceVectors, Map<Locale, Set<String>> keywords, Locale defaultLanguage, LanguageModels languageModels, Map<Locale, Double> correctionFactors) {
        Set<String> allLanguageKeywords = keywords.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
        ImmutableMap.Builder<Locale, LanguageModel.KeywordMatcher> matchersBuilder = ImmutableMap.builder();
        for (Locale lang : keywords.keySet()) {
            matchersBuilder.put(lang, languageModels.buildMatcher(lang, allLanguageKeywords, DEFAULT_NGRAM_SIZE));
        }
        DocumentVectorSimilarity dvs = new DocumentVectorSimilarity(referenceVectors, matchersBuilder.build(), defaultLanguage, correctionFactors);
        dvs.setLanguageModels(languageModels);
        return dvs;
    }

    /**
     * Replaces the language models used by {@link #getSimilarity(Locale, String)}.
     *
     * @param languageModels the models to use from now on
     */
    public void setLanguageModels(LanguageModels languageModels) {
        this.languageModels = languageModels;
    }

    /**
     * Creator used for JSON deserialization and by
     * {@link #fromVectors(Map, Map, Locale, LanguageModels, Map)}.
     *
     * <p>A fresh {@link LanguageModels} is created from {@code defaultLanguage} and an
     * empty map; callers can swap it with {@link #setLanguageModels(LanguageModels)}.
     *
     * @param referenceVectors reference vector per language
     * @param matchers keyword matcher per language
     * @param defaultLanguage fallback language for lookups
     * @param correctionFactors correction factor per language; {@code null} (for
     *        example when the JSON property is absent) is treated as an empty map
     */
    @JsonCreator
    protected DocumentVectorSimilarity(@JsonProperty(value="referenceVectors") Map<Locale, DocumentVector> referenceVectors, @JsonProperty(value="matchers") Map<Locale, LanguageModel.KeywordMatcher> matchers, @JsonProperty(value="defaultLanguage") Locale defaultLanguage, @JsonProperty(value="correctionFactors") Map<Locale, Double> correctionFactors) {
        this.referenceVectors = referenceVectors;
        this.matchers = matchers;
        this.defaultLanguage = defaultLanguage;
        // Without this default, getSimilarity would fail with a NullPointerException.
        this.correctionFactors = correctionFactors != null ? correctionFactors : ImmutableMap.<Locale, Double>of();
        this.languageModels = new LanguageModels(defaultLanguage, new HashMap<String, Double>(), defaultLanguage);
    }

    /**
     * Lists, for every language, the top 10 components of its reference vector.
     */
    @Override
    public String toString() {
        return this.referenceVectors.entrySet().stream()
                .map(e -> String.format("%s => %s", e.getKey(), e.getValue().topComponents(10)))
                .collect(Collectors.joining(", ", "DocumentVectorSimilarity[", "]"));
    }

    /**
     * Returns the matcher for {@code language}, or the one of the default language
     * if there is none; may be {@code null} if neither exists.
     */
    private LanguageModel.KeywordMatcher getMatcher(Locale language) {
        LanguageModel.KeywordMatcher keywordMatcher = this.matchers.get(language);
        if (keywordMatcher == null) {
            logger.debug("No keyword matcher for language '{}', falling back to default", language);
            keywordMatcher = this.matchers.get(this.defaultLanguage);
        }
        return keywordMatcher;
    }

    /**
     * Returns the reference vector for {@code language}, or the one of the default
     * language if there is none; may be {@code null} if neither exists.
     */
    private DocumentVector getReferenceVector(Locale language) {
        DocumentVector reference = this.referenceVectors.get(language);
        if (reference == null) {
            logger.debug("No reference vector for language '{}', falling back to default", language);
            reference = this.referenceVectors.get(this.defaultLanguage);
        }
        return reference;
    }

    /** Returns the reference vector per language (the internal map, not a copy). */
    public Map<Locale, DocumentVector> getReferenceVectors() {
        return this.referenceVectors;
    }

    /** Returns the keyword matcher per language (the internal map, not a copy). */
    public Map<Locale, LanguageModel.KeywordMatcher> getMatchers() {
        return this.matchers;
    }

    /** Returns the correction factor per language (the internal map, not a copy). */
    public Map<Locale, Double> getCorrectionFactors() {
        return this.correctionFactors;
    }

    /**
     * Computes the corrected similarity of {@code text} to the reference vector.
     *
     * <p>The reference vector and matcher of {@code language} are used, falling back
     * to those of the default language. The correction factor is looked up for
     * {@code language} itself and defaults to 1.0.
     *
     * @param language the language of {@code text}
     * @param text the text to score
     * @return the similarity divided by the correction factor, or 0.0 if no reference
     *         vector is available; the similarity is returned undivided when the
     *         correction factor is not a positive number
     */
    public double getSimilarity(Locale language, String text) {
        DocumentVector reference = this.getReferenceVector(language);
        if (reference == null) {
            logger.info("No reference vector for language {}", language);
            return 0.0;
        }
        LanguageModel.KeywordMatcher keywordMatcher = this.getMatcher(language);
        if (keywordMatcher == null) {
            // The similarity is still computed; the null matcher is passed on as-is.
            logger.debug("Available keywords matchers: {}", this.matchers.keySet());
        }
        double correction = this.correctionFactors.getOrDefault(language, 1.0);
        double similarity = this.languageModels.getSimilarity(language, text, reference, keywordMatcher);
        // A zero, negative or NaN factor would turn the result into NaN or Infinity.
        return correction > 0.0 ? similarity / correction : similarity;
    }
}

