/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.crawler.tools;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.common.io.Files;
import com.google.common.io.Resources;
import de.l3s.icrawl.contentanalysis.DocumentVectorSimilarity;
import de.l3s.icrawl.contentanalysis.LanguageModels;
import de.l3s.icrawl.crawler.ArchiveCrawlSpecification;
import de.l3s.icrawl.crawler.TimeSpecification;
import de.l3s.icrawl.domain.specification.NamedEntity;
import de.l3s.icrawl.util.TextExtractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.Period;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.sourceforge.jwbf.core.actions.ContentProcessable;
import net.sourceforge.jwbf.core.actions.HttpActionClient;
import net.sourceforge.jwbf.core.contentRep.ParsedPage;
import net.sourceforge.jwbf.mediawiki.MediaWiki;
import net.sourceforge.jwbf.mediawiki.actions.misc.ParsePage;
import net.sourceforge.jwbf.mediawiki.actions.queries.CategoryMembersSimple;
import net.sourceforge.jwbf.mediawiki.bots.MediaWikiBot;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CrawlSpecCreator {
    /** URL the Wikipedia bot is configured with. */
    private static final String WIKIPEDIA_API_URL = "https://de.wikipedia.org/w/";
    /** Prefix prepended to a page title to form the reference document URL of a Wikipedia page. */
    private static final String WIKIPEDIA_BASE_URL = "https://de.wikipedia.org/wiki/";
    /** URL the WikiNews bot is configured with. */
    private static final String WIKINEWS_API_URL = "https://de.wikinews.org/w/";
    private static final Logger logger = LoggerFactory.getLogger(CrawlSpecCreator.class);
    /** Language passed as default to the language models and the created specifications. */
    private static final Locale DEFAULT_LANGUAGE = Locale.GERMAN;
    /**
     * Regex/replacement pairs used by {@link #cleanUrl(String)} to strip archive and redirect
     * wrappers from a URL. The pairs are applied one after another in the order listed here
     * ({@link ImmutableMap} iterates in insertion order).
     *
     * <p>The explicit type witness on {@code builder()} is required: without it the chained call
     * infers {@code ImmutableMap<Object, Object>}, which is not assignable to this field.
     */
    private static final Map<Pattern, String> URL_REPLACEMENTS = ImmutableMap.<Pattern, String>builder()
            .put(Pattern.compile("https://web.archive.org/web/\\d+/(.*)"), "$1")
            .put(Pattern.compile("https://archive.is/\\d+/(.*)"), "$1")
            .put(Pattern.compile("https://archive.is/(.*)\\*$"), "$1")
            .put(Pattern.compile("http://www.webcitation.org/[a-zA-Z0-9]+\\?url=(.*)"), "$1")
            .put(Pattern.compile("http://derefer.unbubble.eu/?\\?u=(.*)"), "$1")
            .put(Pattern.compile("http://deadurl.invalid/(.*)"), "$1")
            .build();
    /** A URL is accepted by {@link #isAllowedUrl(String)} if any of these patterns is found in it. */
    private static final Set<Pattern> URL_PATTERNS_WHITELIST = ImmutableSet.of(Pattern.compile("^https?://[a-z0-9.-]*?\\.de/"));
    /** Bot used for Wikipedia page requests. */
    private final MediaWikiBot wpBot;
    /** Bot used for WikiNews category and page requests. */
    private final MediaWikiBot wnBot;
    /** Parse properties requested for Wikipedia pages. */
    private final EnumSet<ParsePage.ParseProp> props;
    /** Matches a trailing {@code " (...)"} suffix, which is stripped from link names before they become keywords. */
    private final Pattern parentheses;
    /** IDF dictionary loaded from the bundled {@code dictionary-DE.tsv.gz} resource. */
    private final Map<String, Double> idfDictionary;
    /** Version reported by the Wikipedia bot; also passed along with WikiNews parse requests. */
    private final MediaWiki.Version version;

    /**
     * Command-line entry point: reads a tab-separated topics file and writes the crawl
     * specifications for every topic row via {@link #extract}.
     *
     * <p>The first line of the file is treated as a header and skipped, as are blank lines. Every
     * other row must provide at least seven columns: code, start date, end date (ISO dates),
     * fuzziness before and after (ISO-8601 periods), description and a comma-separated list of
     * page titles.
     *
     * @param args {@code args[0]} is the topics file; the optional {@code args[1]} is the output
     *             directory, defaulting to the current working directory
     * @throws IOException if the topics file cannot be read, the output directory cannot be
     *                     created, or {@link #extract} fails
     * @throws IllegalArgumentException if a non-blank row has fewer than seven columns
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.out.println("Usage: java " + CrawlSpecCreator.class.getName() + " topicsFile.tsv [outputDirectory]");
            System.exit(1);
        }
        CrawlSpecCreator creator = new CrawlSpecCreator();
        DateTimeFormatter dateFormat = DateTimeFormatter.ISO_DATE;
        // An empty parent path makes new File(parent, child) resolve against the system default
        // directory ("/" on Unix) instead of the working directory, so fall back to "." explicitly.
        String outputPath = args.length > 1 && !args[1].isEmpty() ? args[1] : ".";
        File baseDirectory = new File(outputPath);
        if (!baseDirectory.isDirectory() && !baseDirectory.mkdirs()) {
            throw new IOException("Could not create output directory " + baseDirectory);
        }
        try (BufferedReader reader = Files.newReader(new File(args[0]), StandardCharsets.UTF_8)) {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (lineNumber == 1) {
                    // header row
                    continue;
                }
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] parts = line.split("\t", 8);
                if (parts.length < 7) {
                    throw new IllegalArgumentException("Line " + lineNumber + " of " + args[0]
                            + " has " + parts.length + " columns, expected at least 7");
                }
                String code = parts[0];
                LocalDate from = LocalDate.parse(parts[1], dateFormat);
                LocalDate until = LocalDate.parse(parts[2], dateFormat);
                Period before = Period.parse(parts[3]);
                Period after = Period.parse(parts[4]);
                String description = parts[5];
                List<String> wikipedia = Arrays.asList(parts[6].split(",\\s*"));
                creator.extract(code, wikipedia, from, until, before, after, description, baseDirectory);
                logger.info("Created crawl spec for topic {}", code);
            }
        }
    }

    /**
     * Creates the Wikipedia and WikiNews bots on top of one shared HTTP client, asks the
     * Wikipedia bot for its version and loads the IDF dictionary from the bundled
     * {@code dictionary-DE.tsv.gz} resource.
     *
     * @throws IOException if the Wikipedia URL is malformed or the dictionary cannot be read
     */
    public CrawlSpecCreator() throws IOException {
        URL wikipediaApi = new URL(WIKIPEDIA_API_URL);
        RequestConfig requestConfig = RequestConfig.custom().setCookieSpec("standard").build();
        CloseableHttpClient sharedClient = HttpClientBuilder.create()
                .setDefaultRequestConfig(requestConfig)
                .setUserAgent("L3SSpecBuilder <gossen@l3s.de>")
                .build();

        HttpActionClient wikipediaClient = HttpActionClient.builder()
                .withClient((HttpClient) sharedClient)
                .withUrl(wikipediaApi)
                .withRequestsPerUnit(10.0, TimeUnit.MINUTES)
                .build();
        this.wpBot = new MediaWikiBot(wikipediaClient);

        HttpActionClient wikinewsClient = HttpActionClient.builder()
                .withClient((HttpClient) sharedClient)
                .withUrl(WIKINEWS_API_URL)
                .withRequestsPerUnit(1.0, TimeUnit.SECONDS)
                .build();
        this.wnBot = new MediaWikiBot(wikinewsClient);

        this.version = this.wpBot.getVersion();
        this.props = EnumSet.of(ParsePage.ParseProp.externallinks, ParsePage.ParseProp.links, ParsePage.ParseProp.text);
        this.parentheses = Pattern.compile(" (\\([^)]+\\))$");

        URL dictionaryResource = Resources.getResource("dictionary-DE.tsv.gz");
        try (GZIPInputStream dictionaryStream = new GZIPInputStream(dictionaryResource.openStream())) {
            this.idfDictionary = LanguageModels.readIdfDictionary(dictionaryStream);
        }
    }

    /**
     * Builds the crawl specification for one topic and writes it twice into
     * {@code baseDirectory}: {@code <name>.json} with keywords and {@code <name>-noKW.json}
     * without.
     *
     * <p>Page titles starting with {@code "news:"} are treated as WikiNews category names (the
     * prefix is removed); all other titles are treated as Wikipedia page titles.
     *
     * @param name            topic name, also used as the output file name stem
     * @param pages           page titles to collect URLs, keywords and documents from
     * @param from            start of the topic's time interval
     * @param until           end of the topic's time interval
     * @param beforeFuzziness period passed as the "before" fuzziness of the interval
     * @param afterFuzziness  period passed as the "after" fuzziness of the interval
     * @param description     description stored in the specification
     * @param baseDirectory   directory the two JSON files are written to
     * @throws IOException declared for failures while writing the specification files
     */
    public void extract(String name, Collection<String> pages, LocalDate from, LocalDate until, Period beforeFuzziness, Period afterFuzziness, String description, File baseDirectory) throws IOException {
        final String newsPrefix = "news:";
        TimeSpecification interval = TimeSpecification.interval(from, until, beforeFuzziness, afterFuzziness);
        CrawlSpecBuilder specBuilder = new CrawlSpecBuilder(name, description, interval);
        for (String title : pages) {
            if (title.startsWith(newsPrefix)) {
                this.extractWikiNewsCategory(title.substring(newsPrefix.length()), specBuilder);
            } else {
                this.extractWikipediaPage(title, specBuilder);
            }
        }
        File withKeywords = new File(baseDirectory, name + ".json");
        File withoutKeywords = new File(baseDirectory, name + "-noKW.json");
        specBuilder.createSpec(true).writeFile(withKeywords);
        specBuilder.createSpec(false).writeFile(withoutKeywords);
    }

    /**
     * Fetches one Wikipedia page and feeds it into the builder: the page URL becomes a reference
     * document, external links become candidate URLs, internal link names become keywords and
     * the paragraph text becomes a German-language document.
     *
     * @param pageTitle title of the Wikipedia page to parse
     * @param builder   builder that collects the extracted data
     */
    private void extractWikipediaPage(String pageTitle, CrawlSpecBuilder builder) {
        builder.addReferenceDocument(WIKIPEDIA_BASE_URL + pageTitle);
        ParsePage request = new ParsePage(pageTitle, this.props, true, this.version);
        ParsedPage page = ((ParsePage) this.wpBot.getPerformedAction((ContentProcessable) request)).getResult();
        for (String url : page.getExternalLinks()) {
            // protocol-relative links are skipped
            if (url.startsWith("//")) {
                continue;
            }
            builder.addUrl(url);
        }
        for (ParsedPage.Link link : page.getLinks()) {
            String linkName = link.getName();
            if (linkName.startsWith("Liste ") || linkName.startsWith("Vorlage:")) {
                continue;
            }
            // drop a trailing " (...)" suffix from the link name
            builder.addKeyword(this.parentheses.matcher(linkName).replaceFirst(""));
        }
        // Check the text before handing it to the parser: a missing or blank text yields an
        // empty document instead of being parsed (or failing inside the parser on null).
        String html = page.getText();
        String cleanedText = "";
        if (html != null && !html.trim().isEmpty()) {
            Document fragment = this.parseHtmlFragment(html);
            // parseHtmlFragment is overridable, so keep tolerating a null result
            if (fragment != null) {
                cleanedText = this.cleanWikipediaHtml(fragment);
            }
        }
        builder.addDocument(cleanedText, Locale.GERMAN);
    }

    /**
     * Iterates over the members of a WikiNews category (namespace 0) and adds the external links
     * of every member page to the builder.
     *
     * @param categoryName name of the WikiNews category
     * @param builder      builder that collects the extracted URLs
     */
    private void extractWikiNewsCategory(String categoryName, CrawlSpecBuilder builder) {
        logger.debug("Retrieving WikiNews category {}", categoryName);
        CategoryMembersSimple members = new CategoryMembersSimple(this.wnBot, categoryName, new int[]{0});
        for (String articleTitle : members) {
            ParsePage request = new ParsePage(articleTitle, EnumSet.of(ParsePage.ParseProp.externallinks), false, this.version);
            ParsePage performed = (ParsePage) this.wnBot.getPerformedAction((ContentProcessable) request);
            List<String> links = performed.getResult().getExternalLinks();
            logger.debug("Got {} links for '{}': {}", links.size(), articleTitle, links);
            builder.addUrls(links);
        }
    }

    /**
     * Tells whether a URL is accepted by the whitelist.
     *
     * @param url URL to check
     * @return {@code true} if at least one whitelist pattern is found in {@code url}
     */
    @VisibleForTesting
    static boolean isAllowedUrl(String url) {
        return URL_PATTERNS_WHITELIST.stream().anyMatch(allowed -> allowed.matcher(url).find());
    }

    /**
     * Applies every configured URL replacement, one after another, to the given URL.
     *
     * @param url URL to clean
     * @return the URL after all replacements; equal to {@code url} if no pattern matched
     */
    @VisibleForTesting
    static String cleanUrl(String url) {
        String result = url;
        for (Map.Entry<Pattern, String> rule : URL_REPLACEMENTS.entrySet()) {
            Pattern wrapper = rule.getKey();
            String substitution = rule.getValue();
            result = wrapper.matcher(result).replaceAll(substitution);
        }
        boolean changed = !result.equals(url);
        if (changed && logger.isDebugEnabled()) {
            logger.debug("Replaced URL {} with {}", url, result);
        }
        return result;
    }

    /**
     * Removes every direct child of the document body that is not a {@code <p>} element and
     * returns the text extracted from what remains.
     *
     * @param fragment parsed page HTML; modified in place
     * @return the text extracted from the remaining paragraphs
     */
    private String cleanWikipediaHtml(Document fragment) {
        for (Element child : fragment.body().children()) {
            if (!"p".equals(child.tagName())) {
                child.remove();
            }
        }
        return this.domFragmentToString(fragment);
    }

    /**
     * Parses an HTML body fragment into a document.
     *
     * <p>NOTE(review): package-private rather than private, presumably so tests can override
     * it; confirm before narrowing the visibility.
     *
     * @param wikipediaHtml HTML fragment to parse
     * @return the parsed document
     */
    Document parseHtmlFragment(String wikipediaHtml) {
        Document parsed = Jsoup.parseBodyFragment(wikipediaHtml);
        return parsed;
    }

    /**
     * Converts a parsed document to plain text by delegating to {@link TextExtractor}.
     *
     * @param doc document to extract the text from
     * @return the extracted text
     */
    private String domFragmentToString(Document doc) {
        String plainText = TextExtractor.extractText(doc);
        return plainText;
    }

    /**
     * Mutable accumulator for everything that goes into an {@link ArchiveCrawlSpecification}:
     * keywords, documents, seed URLs and reference documents of one topic.
     *
     * <p>Deliberately an inner (non-static) class: {@link #createSpec(boolean)} reads the
     * enclosing instance's IDF dictionary.
     */
    private class CrawlSpecBuilder {
        private final Multiset<String> keywords = HashMultiset.create();
        /** Document text mapped to the locale it was added with. */
        private final Map<String, Locale> documents = new HashMap<>();
        private final Set<String> urls = new HashSet<>();
        private final List<String> referenceDocuments = new ArrayList<>();
        private final TimeSpecification timeSpecification;
        private final String name;
        private final String description;

        /**
         * @param name              name of the specification
         * @param description       description of the specification
         * @param timeSpecification time specification of the topic
         */
        public CrawlSpecBuilder(String name, String description, TimeSpecification timeSpecification) {
            this.name = name;
            this.description = description;
            this.timeSpecification = timeSpecification;
        }

        /**
         * Adds every given URL through {@link #addUrl(String)}, i.e. each one is cleaned and
         * only kept if it is allowed.
         *
         * @param urls URLs to add
         */
        public void addUrls(Collection<String> urls) {
            urls.forEach(this::addUrl);
        }

        /**
         * Records the URL of a reference document.
         *
         * @param url URL of the reference document; stored as given
         */
        public void addReferenceDocument(String url) {
            this.referenceDocuments.add(url);
        }

        /**
         * Cleans the URL with {@link CrawlSpecCreator#cleanUrl(String)} and keeps the result if
         * {@link CrawlSpecCreator#isAllowedUrl(String)} accepts it.
         *
         * @param url URL to add
         */
        public void addUrl(String url) {
            String cleanedUrl = CrawlSpecCreator.cleanUrl(url);
            if (CrawlSpecCreator.isAllowedUrl(cleanedUrl)) {
                this.urls.add(cleanedUrl);
            }
        }

        /**
         * Adds one occurrence of a keyword.
         *
         * @param keyword keyword to add
         */
        public void addKeyword(String keyword) {
            this.keywords.add(keyword);
        }

        /**
         * Registers a document text together with its locale; adding the same text again
         * replaces the earlier entry.
         *
         * @param text   document text
         * @param locale locale of the text
         */
        public void addDocument(String text, Locale locale) {
            this.documents.put(text, locale);
        }

        /**
         * Creates a specification from the data collected so far.
         *
         * @param includeKeywords whether the distinct collected keywords are passed on; if
         *                        {@code false} an empty keyword set is used instead
         * @return the new specification
         */
        public ArchiveCrawlSpecification createSpec(boolean includeKeywords) {
            Set<String> usedKeywords = includeKeywords ? this.keywords.elementSet() : Collections.<String>emptySet();
            LanguageModels models = new LanguageModels(Locale.GERMAN, CrawlSpecCreator.this.idfDictionary, DEFAULT_LANGUAGE);
            DocumentVectorSimilarity dvs = new DocumentVectorSimilarity(this.documents, usedKeywords, new HashSet<NamedEntity>(), 100, false, DEFAULT_LANGUAGE, models);
            Map<Locale, Set<String>> keywordsByLanguage = ImmutableMap.of(Locale.GERMAN, usedKeywords);
            // Hand out copies of the URL and reference document collections so that the specs
            // created from this builder do not share its mutable state.
            List<String> seedUrls = new ArrayList<>(this.urls);
            List<String> references = new ArrayList<>(this.referenceDocuments);
            return new ArchiveCrawlSpecification(this.name, seedUrls, references, this.timeSpecification, dvs.getReferenceVectors(), keywordsByLanguage, this.description, DEFAULT_LANGUAGE, dvs.getCorrectionFactors());
        }
    }
}

