/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.contentanalysis;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import de.l3s.icrawl.contentanalysis.DocumentVector;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LanguageModel {
    private static final int EXPECTED_DOCUMENT_VOCABULARY_SIZE = 1024;
    private static final double MIN_NUMBER_OCCURRENCES = 0.005;
    private static final Logger logger = LoggerFactory.getLogger(LanguageModel.class);
    private final ImmutableMap<String, Double> idfDictionary;
    private final double maxIdfValue;
    private final Analyzer analyzer;

    public LanguageModel(Analyzer analyzer, Map<String, Double> idfValues) {
        this.analyzer = analyzer;
        HashMap builder = Maps.newHashMapWithExpectedSize((int)idfValues.size());
        for (Map.Entry<String, Double> entry : idfValues.entrySet()) {
            String analyzed = this.analyzeToken(entry.getKey());
            Double oldValue = (Double)builder.get(analyzed);
            if (oldValue != null && !(oldValue > entry.getValue())) continue;
            builder.put(analyzed, entry.getValue());
        }
        this.idfDictionary = ImmutableMap.copyOf((Map)builder);
        this.maxIdfValue = this.idfDictionary.isEmpty() ? 1.0 : (Double)Ordering.natural().max((Iterable)this.idfDictionary.values());
    }

    public DocumentVector buildDocumentVector(String document, KeywordMatcher keywordMatcher) {
        Multiset tokens = (Multiset)LanguageModel.analyzeDocument(document, this.analyzer, HashMultiset.create((int)1024));
        HashMap dv = Maps.newHashMapWithExpectedSize((int)tokens.elementSet().size());
        double size = tokens.size();
        Splitter tokenizer = Splitter.on((String)" ");
        for (Multiset.Entry entry : tokens.entrySet()) {
            double weight;
            String multiToken = (String)entry.getElement();
            if (CharMatcher.DIGIT.matchesAnyOf((CharSequence)multiToken) && ((double)entry.getCount() / size < 0.005 || multiToken.length() == 1)) continue;
            List tokenList = tokenizer.splitToList((CharSequence)multiToken);
            if (keywordMatcher != null) {
                switch (keywordMatcher.match(tokenList)) {
                    case MATCHES_FULL: {
                        weight = 2.0;
                        break;
                    }
                    case MATCHES_PARTIAL: {
                        weight = 1.5;
                        break;
                    }
                    default: {
                        weight = 1.0;
                        break;
                    }
                }
            } else {
                weight = 1.0;
            }
            dv.put(multiToken, weight * LanguageModel.tf(entry.getCount()) * this.idf((String)entry.getElement()));
        }
        return new DocumentVector(dv);
    }

    private static double tf(int occurrences) {
        return occurrences <= 0 ? 0.0 : 1.0 + Math.log(occurrences);
    }

    private double idf(String token) {
        Double idf = (Double)this.idfDictionary.get((Object)token);
        return idf != null ? idf : this.maxIdfValue;
    }

    static <T extends Collection<String>> T analyzeDocument(String document, Analyzer analyzer, T tokenConsumer) {
        try (TokenStream ts = analyzer.tokenStream("text", document);){
            ts.reset();
            CharTermAttribute textAttribute = (CharTermAttribute)ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                tokenConsumer.add((String)textAttribute.toString());
            }
            ts.end();
        }
        catch (IOException e) {
            throw new AssertionError("Unexpected exception while analysing string", e);
        }
        return tokenConsumer;
    }

    /*
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     */
    String analyzeToken(String token) {
        try (TokenStream ts = this.analyzer.tokenStream("text", token);){
            ts.reset();
            CharTermAttribute textAttribute = (CharTermAttribute)ts.addAttribute(CharTermAttribute.class);
            ts.incrementToken();
            String analyzed = textAttribute.toString();
            ts.end();
            String string = analyzed;
            return string;
        }
        catch (IOException e) {
            throw new AssertionError("Unexpected exception while analysing string", e);
        }
    }

    public KeywordMatcher buildMatcher(Iterable<String> keywords, int ngramSize) {
        return new KeywordMatcher(keywords, this.analyzer, ngramSize);
    }

    static class KeywordMatcher
    implements Serializable {
        private static final long serialVersionUID = 1L;
        public static final double FULL_MATCH_WEIGHT = 2.0;
        public static final double PARTIAL_MATCH_WEIGHT = 1.5;
        public static final double NO_MATCH_WEIGHT = 1.0;
        private static final Joiner TOKEN_JOINER = Joiner.on((String)" ");
        @JsonProperty
        private final Set<String> singleTokenKeywords;
        @JsonProperty
        private final Set<String> multiTokenKeywords;
        @JsonProperty
        private final int ngramSize;

        @JsonCreator
        protected KeywordMatcher(@JsonProperty(value="singleTokenKeywords") Set<String> singleTokenKeywords, @JsonProperty(value="multiTokenKeywords") Set<String> multiTokenKeywords, @JsonProperty(value="ngramSize") int ngramSize) {
            this.singleTokenKeywords = singleTokenKeywords;
            this.multiTokenKeywords = multiTokenKeywords;
            this.ngramSize = ngramSize;
        }

        public KeywordMatcher(Iterable<String> keywords, Analyzer analyzer, int ngramSize) {
            this.ngramSize = ngramSize;
            ImmutableSet.Builder singleTokenKeywordsBuilder = ImmutableSet.builder();
            ImmutableSet.Builder multiTokenKeywordsBuilder = ImmutableSet.builder();
            for (String keyword : keywords) {
                List tokens = LanguageModel.analyzeDocument(keyword, analyzer, new ArrayList());
                if (tokens.isEmpty()) {
                    logger.debug("empty tokens list for keywords '{}'", (Object)keyword);
                    continue;
                }
                if (tokens.size() == 1) {
                    singleTokenKeywordsBuilder.add(tokens.get(0));
                    continue;
                }
                multiTokenKeywordsBuilder.addAll(this.ngrams(tokens, ngramSize));
            }
            this.singleTokenKeywords = singleTokenKeywordsBuilder.build();
            this.multiTokenKeywords = multiTokenKeywordsBuilder.build();
        }

        private Set<String> ngrams(List<String> tokens, int ngramSize) {
            HashSet ngrams = Sets.newHashSetWithExpectedSize((int)tokens.size());
            for (int i = 0; i < tokens.size() - ngramSize + 1; ++i) {
                ngrams.add(TOKEN_JOINER.join(tokens.subList(i, i + ngramSize)));
            }
            return ngrams;
        }

        public Match match(List<String> tokens) {
            for (String ngram : this.ngrams(tokens, this.ngramSize)) {
                if (!this.multiTokenKeywords.contains(ngram)) continue;
                return Match.MATCHES_FULL;
            }
            for (String token : tokens) {
                if (!this.singleTokenKeywords.contains(token)) continue;
                return Match.MATCHES_PARTIAL;
            }
            return Match.NO_MATCH;
        }

        public static KeywordMatcher matchNone() {
            return new KeywordMatcher(Collections.emptySet(), Collections.emptySet(), 1);
        }

        static enum Match {
            MATCHES_FULL,
            MATCHES_PARTIAL,
            NO_MATCH;

        }
    }
}

