/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.contentanalysis;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import de.l3s.icrawl.contentanalysis.Labeler;
import de.l3s.icrawl.contentanalysis.LabelerFactory;
import de.l3s.icrawl.contentanalysis.TextRankWrapper;
import de.l3s.icrawl.domain.specification.NamedEntity;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import javax.annotation.Nullable;
import org.openimaj.text.nlp.language.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Analyses the textual content of a document given as a list of paragraphs.
 *
 * <p>For each document the analyser detects the language, asks
 * {@link TextRankWrapper} for ranked keywords, collects named entities through
 * an optional {@link Labeler}, counts occurrences of caller-specified keywords
 * and counts the number of tokens. The results are returned as a
 * {@link Counts} value.
 */
public class ContentAnalyser {
    /** Characters that separate tokens: any whitespace plus a fixed set of punctuation marks. */
    private static final CharMatcher SEPARATOR_MATCHER = CharMatcher.WHITESPACE.or(CharMatcher.anyOf("<>|\u201c\u201d\u201e\u201a\u2018\u2019,;.:-_'+*`'()$%!\"?"));
    /** Splits text on {@link #SEPARATOR_MATCHER} and drops empty tokens. */
    private static final Splitter TEXT_SPLITTER = Splitter.on(SEPARATOR_MATCHER).omitEmptyStrings();
    private static final Logger logger = LoggerFactory.getLogger(ContentAnalyser.class);
    /** Maximum number of characters of the analysed text that is included in log messages. */
    private static final int LOG_SNIPPET_LENGTH = 50;

    private final LanguageDetector languageDetector;
    /** May be {@code null}, in which case no named entities are extracted. */
    private final LabelerFactory labelerFactory;
    private final TextRankWrapper textRank;

    /**
     * Creates an analyser.
     *
     * @param languageDetector detector used to classify the language of each analysed text
     * @param labelerFactory source of language-specific {@link Labeler}s; may be
     *        {@code null} to disable entity extraction
     */
    public ContentAnalyser(LanguageDetector languageDetector, @Nullable LabelerFactory labelerFactory) {
        this.languageDetector = languageDetector;
        this.labelerFactory = labelerFactory;
        this.textRank = new TextRankWrapper();
    }

    /**
     * Analyses a document.
     *
     * <p>The language is detected on all paragraphs joined with newlines; the same
     * joined text is passed to TextRank. Entities, keyword occurrences and the
     * token count are gathered paragraph by paragraph. If TextRank throws a
     * {@link RuntimeException}, the failure is logged at debug level and the list
     * of extracted keywords is empty.
     *
     * <p>Tokens are lower-cased (locale-independently) and trimmed before being
     * looked up in {@code keywords}, so with an ordinary case-sensitive set only
     * lower-case entries can match.
     *
     * @param paragraphs the paragraphs of the document
     * @param keywords the keywords whose occurrences should be counted; when empty,
     *        only the tokens are counted
     * @return the collected counts together with the detected language
     */
    public Counts analyze(List<String> paragraphs, Set<String> keywords) {
        Multiset<String> detectedKeywords = HashMultiset.create(keywords.size());
        Multiset<NamedEntity> detectedEntities = HashMultiset.create();
        String text = ContentAnalyser.joinParagraphs(paragraphs);
        LanguageDetector.WeightedLocale wl = this.languageDetector.classify(text);
        Labeler labeler = this.labelerFactory != null ? this.labelerFactory.get(wl) : null;

        List<String> extractedKeywords;
        try {
            // NOTE(review): 10 is presumably the maximum number of ranked keywords - confirm against TextRankWrapper.
            extractedKeywords = this.textRank.rank(text, wl.getLocale(), 10);
        }
        catch (RuntimeException e) {
            String snippet = text.length() > LOG_SNIPPET_LENGTH ? text.substring(0, LOG_SNIPPET_LENGTH) + "..." : text;
            // The exception is the last array element without a matching placeholder,
            // which SLF4J logs as the throwable (stack trace) of the message.
            logger.debug("Exception while running TextRank on '{}'@{}: ", new Object[]{snippet, wl.getLocale(), e});
            extractedKeywords = Collections.emptyList();
        }

        long words = 0L;
        boolean matchKeywords = !keywords.isEmpty();
        for (String paragraph : paragraphs) {
            detectedEntities.addAll(this.extractEntities(labeler, paragraph));
            // Skip the per-token normalisation and lookup when there is nothing to match.
            words += matchKeywords
                    ? this.extractSpecifiedKeywords(paragraph, detectedKeywords, keywords)
                    : this.countWords(paragraph);
        }
        return new Counts(detectedKeywords, detectedEntities, extractedKeywords, words, wl.getLocale());
    }

    /**
     * Tokenises a paragraph, records every token that is one of the given keywords
     * and returns the number of tokens.
     *
     * @param paragraph the text to tokenise
     * @param detectedKeywords receives one occurrence per matching token
     * @param keywords the keywords to look for
     * @return the number of tokens in {@code paragraph}
     */
    private long extractSpecifiedKeywords(String paragraph, Multiset<String> detectedKeywords, Set<String> keywords) {
        long tokens = 0L;
        // The splitter never yields empty tokens or tokens containing separator
        // characters, so every token it produces counts as a word.
        for (String token : TEXT_SPLITTER.split(paragraph)) {
            ++tokens;
            // Locale.ROOT keeps the case mapping independent of the JVM default locale
            // (e.g. under a Turkish locale "I" would otherwise become a dotless i).
            String actualToken = token.toLowerCase(Locale.ROOT).trim();
            if (keywords.contains(actualToken)) {
                detectedKeywords.add(actualToken);
            }
        }
        return tokens;
    }

    /**
     * Counts the tokens of a paragraph.
     *
     * @param paragraph the text to tokenise
     * @return the number of non-empty tokens produced by {@link #TEXT_SPLITTER}
     */
    private long countWords(String paragraph) {
        return Iterables.size(TEXT_SPLITTER.split(paragraph));
    }

    /**
     * Extracts the named entities of a paragraph.
     *
     * @param labeler the labeler to use, or {@code null} if none is available
     * @param paragraph the text to analyse
     * @return the entities reported by the labeler, or an empty collection when
     *         {@code labeler} is {@code null}
     */
    protected Collection<NamedEntity> extractEntities(Labeler labeler, String paragraph) {
        if (labeler == null) {
            return Collections.emptySet();
        }
        return labeler.extractEntities(paragraph);
    }

    /** Joins the paragraphs into a single text, separated by newline characters. */
    private static String joinParagraphs(List<String> paragraphs) {
        return Joiner.on('\n').join(paragraphs);
    }

    /**
     * Result of {@link ContentAnalyser#analyze(List, Set)}.
     *
     * <p>The collections are stored and returned as given, without copying.
     */
    public static class Counts {
        /** Occurrences of the caller-specified keywords. */
        private final Multiset<String> keywords;
        /** Occurrences of the extracted named entities. */
        private final Multiset<NamedEntity> entities;
        /** Keywords produced by TextRank (empty if TextRank failed). */
        private final List<String> detectedKeywords;
        /** Number of tokens in the document. */
        private final long documentLength;
        /** Locale reported by the language detector. */
        private final Locale language;

        Counts(Multiset<String> keywords, Multiset<NamedEntity> entities, List<String> detectedKeywords, long documentLength, Locale language) {
            this.keywords = keywords;
            this.entities = entities;
            this.detectedKeywords = detectedKeywords;
            this.documentLength = documentLength;
            this.language = language;
        }

        /** @return occurrences of the caller-specified keywords found in the document */
        public Multiset<String> getKeywords() {
            return this.keywords;
        }

        /** @return occurrences of the named entities found in the document */
        public Multiset<NamedEntity> getEntities() {
            return this.entities;
        }

        /** @return the number of tokens in the document */
        public long getDocumentLength() {
            return this.documentLength;
        }

        /** @return the keywords produced by TextRank for the document */
        public List<String> getDetectedKeywords() {
            return this.detectedKeywords;
        }

        /** @return the locale detected for the document */
        public Locale getLanguage() {
            return this.language;
        }

        /**
         * Returns the distinct elements of a multiset with the highest counts.
         *
         * @param set the multiset to inspect
         * @param k the maximum number of elements to return
         * @param <T> the element type
         * @return an immutable list of at most {@code k} distinct elements, ordered
         *         by descending count
         */
        public static <T> List<T> topK(Multiset<T> set, int k) {
            ImmutableList.Builder<T> results = ImmutableList.builder();
            ImmutableMultiset<T> highestCounts = Multisets.copyHighestCountFirst(set);
            for (Multiset.Entry<T> entry : Iterables.limit(highestCounts.entrySet(), k)) {
                results.add(entry.getElement());
            }
            return results.build();
        }
    }
}

