/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.crawler.analysis;

import com.codahale.metrics.Counter;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.Resources;
import de.l3s.icrawl.contentanalysis.DocumentVectorSimilarity;
import de.l3s.icrawl.contentanalysis.LanguageModels;
import de.l3s.icrawl.contentanalysis.WebPageDateExtractor;
import de.l3s.icrawl.crawler.ArchiveCrawlSpecification;
import de.l3s.icrawl.crawler.CrawlUrl;
import de.l3s.icrawl.crawler.TimeSpecification;
import de.l3s.icrawl.crawler.analysis.ResourceAnalyserFactory;
import de.l3s.icrawl.crawler.urls.RegexUrlNormalizer;
import de.l3s.icrawl.crawler.urls.UrlCanonicalizerNormalizer;
import de.l3s.icrawl.crawler.urls.UrlFilter;
import de.l3s.icrawl.crawler.urls.UrlNormalizer;
import de.l3s.icrawl.crawler.urls.UrlNormalizers;
import de.l3s.icrawl.snapshots.Snapshot;
import de.l3s.icrawl.util.TextExtractor;
import java.io.IOException;
import java.time.ZonedDateTime;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openimaj.text.nlp.language.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ResourceAnalyser {
    private static final Logger logger = LoggerFactory.getLogger(ResourceAnalyser.class);
    private final UrlFilter urlFilter;
    private final UrlNormalizer urlNormalizer;
    private final Histogram outlinkCount;
    private final Counter unknowns;
    private final Counter empty;
    private final DocumentVectorSimilarity similarity;
    private final LanguageDetector languageDetector = new LanguageDetector();
    private final Timer parseTime;
    private final Timer textExtractTime;
    private final Timer analysisTime;
    private final WeightingMethod method;
    private final TimeSpecification referenceTime;
    private final Timer dateExtractionTime;
    private final float timeRelevanceThreshold;
    private final float docSimilarityWeight;

    public ResourceAnalyser(ArchiveCrawlSpecification spec, WeightingMethod method, MetricRegistry metrics, float timeRelevanceThreshold, float docSimilarityWeight) throws IOException {
        Map<String, Double> dictionary;
        Preconditions.checkArgument((0.0f <= docSimilarityWeight && (double)docSimilarityWeight <= 1.0 ? 1 : 0) != 0, (Object)"docSimilarityWeight");
        this.method = method;
        this.docSimilarityWeight = docSimilarityWeight;
        this.timeRelevanceThreshold = timeRelevanceThreshold;
        try (GZIPInputStream is = new GZIPInputStream(Resources.getResource((String)"dictionary-DE.tsv.gz").openStream());){
            dictionary = LanguageModels.readIdfDictionary(is);
        }
        LanguageModels models = new LanguageModels(Locale.GERMAN, dictionary, spec.getDefaultLanguage());
        this.similarity = DocumentVectorSimilarity.fromVectors(spec.getReferenceVectors(), spec.getKeywords(), spec.getDefaultLanguage(), models, spec.getCorrectionFactors());
        this.referenceTime = spec.getReferenceTime();
        this.urlFilter = UrlFilter.ONLY_HTTP;
        this.urlNormalizer = new UrlNormalizers(new UrlCanonicalizerNormalizer(), new RegexUrlNormalizer(Resources.getResource((String)"default-regex-normalizers.xml")));
        this.outlinkCount = metrics.histogram(MetricRegistry.name(this.getClass(), (String[])new String[]{"numOutlinks"}));
        this.unknowns = metrics.counter(MetricRegistry.name(this.getClass(), (String[])new String[]{"unknownType"}));
        this.empty = metrics.counter(MetricRegistry.name(this.getClass(), (String[])new String[]{"empty"}));
        this.parseTime = metrics.timer(MetricRegistry.name(this.getClass(), (String[])new String[]{"parseTime"}));
        this.textExtractTime = metrics.timer(MetricRegistry.name(this.getClass(), (String[])new String[]{"textExtractTime"}));
        this.analysisTime = metrics.timer(MetricRegistry.name(this.getClass(), (String[])new String[]{"analysisTime"}));
        this.dateExtractionTime = metrics.timer(MetricRegistry.name(this.getClass(), (String[])new String[]{"dateExtractionTime"}));
    }

    public Result analyse(Snapshot resource, CrawlUrl url) {
        Object content = resource.getContent();
        if (content instanceof String) {
            WebPageDateExtractor.WebPageDate modifiedDate;
            Timer.Context timer = this.parseTime.time();
            Document doc = Jsoup.parse((String)((String)content), (String)url.getUrl());
            timer.stop();
            timer = this.textExtractTime.time();
            String text = TextExtractor.extractText(doc);
            timer.stop();
            if (text.trim().isEmpty()) {
                logger.debug("No content for URL '{}", (Object)url);
                this.empty.inc();
                return Result.EMPTY;
            }
            timer = this.analysisTime.time();
            Locale language = this.languageDetector.classify(text).getLocale();
            float docSimilarity = (float)this.similarity.getSimilarity(language, text);
            timer.stop();
            float timeRelevance = 1.0f;
            try (Timer.Context t = this.dateExtractionTime.time();){
                long crawlTimeMs = resource.getCrawlTime().toInstant().toEpochMilli();
                modifiedDate = WebPageDateExtractor.getModifiedDate(resource.getOriginalUrl(), doc, crawlTimeMs, null);
                if (modifiedDate != null && modifiedDate.getDate() != null) {
                    timeRelevance = this.method == WeightingMethod.TIME || this.method == WeightingMethod.CONTENT_AND_TIME ? (float)this.referenceTime.getRelevance(modifiedDate.getDate()) : (float)this.referenceTime.getRelevanceExp(modifiedDate.getDate());
                }
            }
            catch (InterruptedException e) {
                logger.info("Interrupted while extracting date", (Throwable)e);
                return Result.EMPTY;
            }
            float outlinkScore = this.outlinkScore(docSimilarity, timeRelevance);
            ImmutableMultiset.Builder outlinks = ImmutableMultiset.builder();
            for (Element link : doc.select("a[href]")) {
                String docUrl = link.absUrl("href");
                if (docUrl.trim().isEmpty() || !docUrl.startsWith("http")) {
                    logger.trace("Skipping URL '{}'", (Object)docUrl);
                    continue;
                }
                String outUrl = this.urlNormalizer.normalize(docUrl);
                if (!this.urlFilter.apply(outUrl)) continue;
                outlinks.add((Object)url.outlink(outUrl, outlinkScore, resource.getCrawlTime()));
            }
            Set outUrls = outlinks.build().elementSet();
            this.outlinkCount.update(outUrls.size());
            logger.debug("Extracted outlinks for URL {}, got {}", (Object)url, (Object)outUrls.size());
            ZonedDateTime modifiedDateDate = modifiedDate != null ? modifiedDate.getDate() : null;
            return new Result(outUrls, docSimilarity, modifiedDateDate);
        }
        logger.debug("Unhandled content type '{}' for URL '{}'", (Object)resource.getMimeType(), (Object)url);
        this.unknowns.inc();
        return Result.EMPTY;
    }

    private float outlinkScore(float docSimilarity, float timeRelevance) {
        switch (this.method) {
            case CONTENT: {
                return docSimilarity;
            }
            case CONTENT_AND_TIME: 
            case CONTENT_AND_TIME_EXP: {
                if (docSimilarity > this.timeRelevanceThreshold) {
                    return this.docSimilarityWeight * docSimilarity + (1.0f - this.docSimilarityWeight) * timeRelevance;
                }
                return docSimilarity;
            }
            case TIME: 
            case TIME_EXP: {
                return timeRelevance;
            }
            case UNFOCUSED: {
                return 1.0f;
            }
        }
        throw new IllegalStateException("Unhandled weighting method " + (Object)((Object)this.method));
    }

    public static class Factory
    implements ResourceAnalyserFactory {
        private final MetricRegistry metrics;
        private final float timeRelevanceThreshold;
        private final float docSimilarityWeight;

        public Factory(MetricRegistry metrics, float timeRelevanceTreshold, float docSimilarityWeight) {
            this.metrics = metrics;
            this.timeRelevanceThreshold = timeRelevanceTreshold;
            this.docSimilarityWeight = docSimilarityWeight;
        }

        @Override
        public ResourceAnalyser get(ArchiveCrawlSpecification spec, WeightingMethod method) throws IOException {
            return new ResourceAnalyser(spec, method, this.metrics, this.timeRelevanceThreshold, this.docSimilarityWeight);
        }
    }

    public static enum WeightingMethod {
        CONTENT(false),
        TIME(true),
        TIME_EXP(true),
        CONTENT_AND_TIME(true),
        CONTENT_AND_TIME_EXP(true),
        UNFOCUSED(false);

        private final boolean timeSensitive;

        private WeightingMethod(boolean timeSensitive) {
            this.timeSensitive = timeSensitive;
        }

        public boolean isTimeSensitive() {
            return this.timeSensitive;
        }
    }

    public static class Result {
        public static final Result EMPTY = new Result((Collection<CrawlUrl>)ImmutableSet.of(), -1.0, null);
        private final Collection<CrawlUrl> outlinks;
        private final double relevance;
        private final ZonedDateTime modifiedDate;

        Result(Collection<CrawlUrl> outlinks, double relevance, ZonedDateTime modifiedDate) {
            this.outlinks = outlinks;
            this.relevance = relevance;
            this.modifiedDate = modifiedDate;
        }

        public Collection<CrawlUrl> getOutlinks() {
            return this.outlinks;
        }

        public double getRelevance() {
            return this.relevance;
        }

        public ZonedDateTime getModifiedDate() {
            return this.modifiedDate;
        }
    }
}

