/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.contentanalysis;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;
import de.l3s.icrawl.contentanalysis.TreeWalker;
import de.l3s.icrawl.util.DateUtils;
import de.l3s.icrawl.util.WebPageUtils;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.hadoop.mapreduce.Mapper;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WebPageDateExtractor {
    /** Keeps only candidates whose date is accepted by {@code DateUtils.isValidDate}. */
    private static final Predicate<WebPageDate> VALID_DATE_PREDICATE = wpd -> DateUtils.isValidDate(wpd.getDate());

    // Declared before NAMES_TO_MONTH on purpose: namesMap() logs through it during static initialization.
    private static final Logger logger = LoggerFactory.getLogger(WebPageDateExtractor.class);

    /** Trigger words (matched case-insensitively) that mark a text node as likely describing a creation or modification date. */
    @VisibleForTesting
    static final Pattern DATE_TRIGGERS = Pattern.compile("created?|updated?|modified|last modifi|letzte? (ge|ver)?\u00e4nder|publi(z|sh)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);

    /** Tag names handed to the {@code TreeWalker} when scanning the DOM for text nodes. */
    private static final Set<String> SKIPPED_ELEMENTS = ImmutableSet.of("script", "style", "pre");

    /** Month name to month number lookup, loaded once from the bundled TSV resource. */
    private static final Map<String, Integer> NAMES_TO_MONTH = namesMap();

    /** Date patterns in the order in which {@link #findDateMatch(String)} tries them. */
    static final List<Pattern> DATE_PATTERNS = buildDatePattern();

    /**
     * Builds the ordered list of date patterns.
     *
     * <p>Every pattern exposes the named groups {@code year}, {@code month} and {@code day};
     * one pattern additionally exposes {@code hour} and {@code minute}. All patterns are
     * compiled with {@link Pattern#UNICODE_CHARACTER_CLASS}. The order matters because the
     * patterns are tried first to last.
     *
     * @return an immutable list of compiled patterns
     */
    private static List<Pattern> buildDatePattern() {
        final int flags = Pattern.UNICODE_CHARACTER_CLASS;
        return ImmutableList.<Pattern>builder()
            // year-month-day with dashes, e.g. 2014-03-25
            .add(Pattern.compile("(?<year>\\d{4})-(?<month>\\d{2})-(?<day>\\d{2})", flags))
            // day.month.year with dots, e.g. 25.03.2014 or 25. 3. 14
            .add(Pattern.compile("(?<day>\\d{1,2})\\.\\s*(?<month>\\d{1,2})\\.\\s*(?<year>\\d{2,4})", flags))
            // day, month word, year and an optional ", hh:mm" time with optional " Uhr" suffix
            .add(Pattern.compile("(?<day>\\d{1,2})\\.?\\s*(?<month>\\w+)\\.?\\s+(?<year>\\d{2,4})(,\\s*(?<hour>\\d{1,2}):(?<minute>\\d{1,2})( Uhr)?)?", flags))
            // same as above without the time part; still useful when the time of the previous pattern is invalid
            .add(Pattern.compile("(?<day>\\d{1,2})\\.?\\s*(?<month>\\w+)\\.?\\s+(?<year>\\d{2,4})", flags))
            // month word, day, four-digit year, e.g. March 25, 2014
            .add(Pattern.compile("(?<month>\\w+)\\s+(?<day>\\d{1,2})[\\.,]\\s*(?<year>\\d{4})", flags))
            // month/day/year with slashes
            .add(Pattern.compile("(?<month>\\d{1,2})/(?<day>\\d{1,2})/(?<year>\\d{4})", flags))
            // day/month/year with slashes; reached when the previous interpretation is not a valid date
            .add(Pattern.compile("(?<day>\\d{1,2})/(?<month>\\d{1,2})/(?<year>\\d{4})", flags))
            // year-day-month with dashes. The year group used to be written with a doubled
            // escape, which matched a literal backslash followed by the letter d instead of digits.
            // NOTE(review): the trailing '/' is kept from the original pattern; confirm whether it is intended.
            .add(Pattern.compile("(?<year>\\d{4})-(?<day>\\d{1,2})-(?<month>\\d{1,2})/", flags))
            .build();
    }

    /**
     * Loads the month-name lookup table from the classpath resource
     * {@code de/l3s/icrawl/month_mappings.tsv}.
     *
     * <p>Each line is expected to hold a name and an integer separated by a tab. Blank lines
     * are ignored, and lines without a tab or with a non-numeric value are skipped with a
     * warning, so one bad line cannot abort class initialization.
     *
     * @return an immutable map from name to number; an empty map if the resource is missing,
     *         cannot be read, or cannot be turned into a map (for example duplicate keys)
     */
    private static Map<String, Integer> namesMap() {
        try {
            URL mappingsResource = Resources.getResource("de/l3s/icrawl/month_mappings.tsv");
            return Resources.readLines(mappingsResource, StandardCharsets.UTF_8, new LineProcessor<Map<String, Integer>>() {
                private final ImmutableMap.Builder<String, Integer> namesBuilder = ImmutableMap.builder();

                @Override
                public boolean processLine(String line) throws IOException {
                    String[] split = line.split("\t", 2);
                    if (split.length < 2) {
                        if (!line.trim().isEmpty()) {
                            logger.warn("Ignoring month mapping line without tab separator: '{}'", line);
                        }
                        return true;
                    }
                    try {
                        this.namesBuilder.put(split[0], Integer.valueOf(split[1].trim()));
                    } catch (NumberFormatException e) {
                        logger.warn("Ignoring month mapping line with non-numeric value: '{}'", line);
                    }
                    // Always continue with the next line.
                    return true;
                }

                @Override
                public Map<String, Integer> getResult() {
                    return this.namesBuilder.build();
                }
            });
        } catch (IOException | IllegalArgumentException e) {
            // Resources.getResource reports a missing resource with IllegalArgumentException;
            // treat it like an unreadable resource and fall back to an empty lookup table.
            logger.warn("Cannot initialize date extractor: ", e);
            return Collections.emptyMap();
        }
    }

    /**
     * Extracts the most plausible modification date from the content of a document.
     *
     * <p>Candidates from {@code <time>} elements, {@code <meta>} elements and text near
     * trigger words are considered first. Only if none of them carries a valid date, every
     * text node containing a date is used as a fallback.
     *
     * @param dom the parsed document to inspect
     * @return the best matching date, or {@code null} if no valid date was found
     * @throws InterruptedException if the current thread was interrupted while scanning the document
     */
    public static WebPageDate extractModifiedDate(Document dom) throws InterruptedException {
        Map<Element, WebPageDate> candidates = findCandidateElements(dom);
        logger.trace("Found {} candidates: {}", candidates.size(), candidates);
        Map<Element, WebPageDate> validCandidates = Maps.filterValues(candidates, VALID_DATE_PREDICATE);
        if (validCandidates.isEmpty()) {
            validCandidates = Maps.filterValues(findElementsWithDate(dom), VALID_DATE_PREDICATE);
        }
        return getBestDateMatch(validCandidates);
    }

    /**
     * Picks the best date among the given candidates.
     *
     * <p>Ranking, highest priority first:
     * <ol>
     * <li>the date source declared earlier in {@link DateSource} wins;</li>
     * <li>a date with a time of day other than midnight wins over a date at midnight;</li>
     * <li>otherwise the later date wins.</li>
     * </ol>
     *
     * @param candidates candidate dates keyed by the element they were found in
     * @return the best candidate, or {@code null} if there are no candidates
     */
    private static WebPageDate getBestDateMatch(Map<Element, WebPageDate> candidates) {
        Comparator<WebPageDate> comparator = (wpd1, wpd2) -> {
            int bySource = wpd1.getDateSource().compareTo(wpd2.getDateSource());
            if (bySource != 0) {
                // Inverted so that earlier enum constants rank higher when taking the maximum.
                return -bySource;
            }
            boolean lt1Empty = wpd1.getDate().toLocalTime().equals(LocalTime.MIDNIGHT);
            boolean lt2Empty = wpd2.getDate().toLocalTime().equals(LocalTime.MIDNIGHT);
            if (lt1Empty == lt2Empty) {
                return wpd1.getDate().compareTo(wpd2.getDate());
            }
            // Exactly one of the two is at midnight; that one ranks lower.
            return lt1Empty ? -1 : 1;
        };
        return candidates.values().stream().max(comparator).orElse(null);
    }

    /**
     * Collects every text node whose text contains a recognizable date.
     *
     * @param dom the document to scan
     * @return insertion-ordered map from the paragraph parent of each matching text node to
     *         the date found in it, tagged as {@link DateSource#TEXT_DATE}
     */
    private static Map<Element, WebPageDate> findElementsWithDate(Document dom) {
        LinkedHashMap<Element, WebPageDate> datedElements = new LinkedHashMap<>();
        for (Node node : new TreeWalker(findDomRoot(dom), SKIPPED_ELEMENTS)) {
            if (node instanceof TextNode) {
                ZonedDateTime parsed = findDateMatch(((TextNode) node).text());
                if (parsed != null) {
                    Element paragraph = WebPageUtils.findParagraphParent(node, -1);
                    datedElements.put(paragraph, new WebPageDate(parsed, DateSource.TEXT_DATE));
                }
            }
        }
        return datedElements;
    }

    /**
     * Collects date candidates from structured markup and from text near trigger words.
     *
     * <p>Three passes are made, in this order: {@code <time>} elements, {@code <meta>}
     * elements, and text nodes matching {@link #DATE_TRIGGERS}.
     *
     * @param dom the document to scan
     * @return insertion-ordered map from element to the date found for it
     * @throws InterruptedException if the current thread is found interrupted during the text node pass
     */
    private static Map<Element, WebPageDate> findCandidateElements(Document dom) throws InterruptedException {
        LinkedHashMap<Element, WebPageDate> found = new LinkedHashMap<>();

        for (Element timeElement : dom.getElementsByTag("time")) {
            ZonedDateTime timeDate = getTimeElementDate(timeElement);
            if (timeDate != null) {
                found.put(timeElement, new WebPageDate(timeDate, DateSource.TIME));
            }
        }

        for (Element metaElement : dom.getElementsByTag("meta")) {
            ZonedDateTime metaDate = getMetaElementDate(metaElement);
            if (metaDate != null) {
                found.put(metaElement, new WebPageDate(metaDate, DateSource.META));
            }
        }

        for (Node node : new TreeWalker(findDomRoot(dom), SKIPPED_ELEMENTS)) {
            // Checked once per node so the walk can be aborted.
            if (Thread.interrupted()) {
                throw new InterruptedException();
            }
            if (node instanceof TextNode && DATE_TRIGGERS.matcher(((TextNode) node).text()).find()) {
                extractDateFromTextNode((TextNode) node, found);
            }
        }
        return found;
    }

    /**
     * Looks for a date in the text of the paragraph parent of a text node and, if one is
     * found, records it as a {@link DateSource#TRIGGER_WORD} candidate.
     *
     * @param n          the text node that matched a trigger word
     * @param candidates the map the candidate is added to, keyed by the paragraph parent
     */
    private static void extractDateFromTextNode(TextNode n, Map<Element, WebPageDate> candidates) {
        Element paragraph = WebPageUtils.findParagraphParent((Node) n, -1);
        // The whole text of the paragraph parent is searched, not only the text node itself.
        ZonedDateTime parsed = findDateMatch(paragraph.text());
        if (parsed == null) {
            return;
        }
        candidates.put(paragraph, new WebPageDate(parsed, DateSource.TRIGGER_WORD));
    }

    /**
     * Determines the node at which text scanning starts.
     *
     * @param dom the document to scan
     * @return the document body if there is one, otherwise the result of {@code dom.ownerDocument()}
     */
    static Node findDomRoot(Document dom) {
        Element body = dom.body();
        return body != null ? body : dom.ownerDocument();
    }

    /**
     * Tries to read a date from a {@code <meta>} element.
     *
     * <p>For each attribute name in {@code DateUtils.META_ATTRIBUTE_NAMES}, the attribute
     * value is tested with {@code DateUtils.dateMetaKey}. On the first hit whose
     * {@code content} attribute can be parsed, that date is returned.
     *
     * @param element the meta element to inspect
     * @return the parsed date, or {@code null} if the element does not carry a parsable date
     */
    private static ZonedDateTime getMetaElementDate(Element element) {
        for (String attributeName : DateUtils.META_ATTRIBUTE_NAMES) {
            String key = element.attr(attributeName);
            if (key != null && DateUtils.dateMetaKey(key)) {
                ZonedDateTime parsed = DateUtils.liberalParseDate(element.attr("content"));
                if (parsed != null) {
                    return parsed;
                }
            }
        }
        return null;
    }

    /**
     * Reads the date from the {@code datetime} attribute of a {@code <time>} element.
     *
     * @param element the time element to inspect
     * @return the result of {@code DateUtils.liberalParseDate} for the attribute value, or
     *         {@code null} if the element has no {@code datetime} attribute
     */
    private static ZonedDateTime getTimeElementDate(Element element) {
        if (!element.hasAttr("datetime")) {
            logger.trace("Expected attribute 'datetime' on element '{}'", element);
            return null;
        }
        return DateUtils.liberalParseDate(element.attr("datetime"));
    }

    /**
     * Determines the modification date of a web page from several sources.
     *
     * <p>The sources are tried in this order and the first valid date wins: a date embedded
     * in the URL, a date extracted from the document content, and finally the HTTP modified
     * time. Whenever a date is accepted, the counter for its {@link DateSource} is incremented
     * on the given context.
     *
     * @param url              the URL of the page
     * @param document         the parsed page content
     * @param httpModifiedTime modified time in milliseconds since the epoch, or {@code null} if unknown
     * @param context          context whose counters are updated; may be {@code null}
     * @return the modification date with its source, or {@code null} if no valid date was found
     * @throws InterruptedException if the thread was interrupted while scanning the document
     */
    public static WebPageDate getModifiedDate(String url, Document document, Long httpModifiedTime, Mapper.Context context) throws InterruptedException {
        LocalDate urlDate = DateUtils.extractDateFromUrl(url);
        if (urlDate != null) {
            ZonedDateTime urlDateTime = urlDate.atStartOfDay().atZone(ZoneOffset.UTC);
            if (DateUtils.isValidDate(urlDateTime)) {
                incrementCount(context, DateSource.URL);
                return new WebPageDate(urlDateTime, DateSource.URL);
            }
        }

        WebPageDate contentDate = extractModifiedDate(document);
        if (contentDate != null && DateUtils.isValidDate(contentDate.getDate())) {
            incrementCount(context, contentDate.getDateSource());
            return contentDate;
        }

        if (httpModifiedTime != null) {
            ZonedDateTime httpDateTime = Instant.ofEpochMilli(httpModifiedTime).atZone(ZoneOffset.UTC);
            if (DateUtils.isValidDate(httpDateTime)) {
                incrementCount(context, DateSource.HEADER);
                return new WebPageDate(httpDateTime, DateSource.HEADER);
            }
        }

        logger.debug("No date found for URL {}", (Object) url);
        return null;
    }

    /**
     * Increments the given counter by one, if a context is available.
     *
     * @param context the context providing the counters; nothing happens if it is {@code null}
     * @param counter the counter to increment
     */
    private static void incrementCount(Mapper.Context context, Enum<?> counter) {
        if (context == null) {
            return;
        }
        context.getCounter(counter).increment(1L);
    }

    /**
     * Finds the first usable date in a string.
     *
     * <p>The patterns in {@link #DATE_PATTERNS} are tried in order. For each pattern, every
     * match in the string is examined, not just the first one, so a match that turns out not
     * to be a date (unknown month word, impossible day) does not hide a real date later in
     * the same string.
     *
     * @param s the text to search; {@code null} or empty yields {@code null}
     * @return the date at UTC (midnight unless the pattern captured a time), or {@code null}
     *         if no pattern produced a valid date
     */
    @VisibleForTesting
    static ZonedDateTime findDateMatch(String s) {
        if (s == null || s.isEmpty()) {
            return null;
        }
        for (Pattern pattern : DATE_PATTERNS) {
            Matcher matcher = pattern.matcher(s);
            while (matcher.find()) {
                ZonedDateTime parsed = toDateTime(matcher);
                if (parsed != null) {
                    return parsed;
                }
            }
        }
        return null;
    }

    /** Converts the named groups of the current match into a UTC date-time, or returns {@code null} if they do not form a valid date. */
    private static ZonedDateTime toDateTime(Matcher matcher) {
        try {
            int year = Integer.parseInt(matcher.group("year"));
            // NOTE(review): the two-digit-year pivot is hard-coded: 0-14 map to 20xx and
            // 15-99 map to 19xx. Confirm whether it should follow the current year.
            if (year < 15) {
                year += 2000;
            } else if (year < 100) {
                year += 1900;
            }

            String rawMonth = matcher.group("month");
            int month;
            Integer monthLookup = NAMES_TO_MONTH.get(rawMonth.toLowerCase(Locale.ENGLISH));
            if (monthLookup != null) {
                month = monthLookup;
            } else if (rawMonth.matches("\\d{1,2}")) {
                month = Integer.parseInt(rawMonth);
            } else {
                // Neither a known month name nor a one/two digit number.
                return null;
            }

            int day = Integer.parseInt(matcher.group("day"));
            int hour = 0;
            int minute = 0;
            // Only patterns with more than the three date groups define hour/minute;
            // asking for an undefined group name would throw.
            if (matcher.groupCount() > 3 && matcher.group("hour") != null && matcher.group("minute") != null) {
                hour = Integer.parseInt(matcher.group("hour"));
                minute = Integer.parseInt(matcher.group("minute"));
            }

            try {
                return ZonedDateTime.of(year, month, day, hour, minute, 0, 0, ZoneOffset.UTC);
            } catch (DateTimeException e) {
                logger.trace("Could not use as a date: {}-{}-{}: ", year, month, day, e);
                return null;
            }
        } catch (NumberFormatException e) {
            // A captured group was not a parsable integer; treat the match as not being a date.
            logger.trace("Could not parse date components of '{}': ", matcher.group(), e);
            return null;
        }
    }

    /**
     * Unchecked exception type declared by the date extractor.
     */
    public static class ExtractionException extends RuntimeException {
        private static final long serialVersionUID = 1L;

        /**
         * @param message description of the failure
         */
        public ExtractionException(String message) {
            super(message);
        }

        /**
         * @param cause the underlying failure
         */
        public ExtractionException(Throwable cause) {
            super(cause);
        }

        /**
         * @param message description of the failure
         * @param cause   the underlying failure
         */
        public ExtractionException(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Immutable pair of a date and the source it was extracted from.
     */
    public static final class WebPageDate {
        private final ZonedDateTime date;
        private final DateSource dateSource;

        /**
         * @param date       the extracted date
         * @param dateSource where the date was found
         */
        public WebPageDate(ZonedDateTime date, DateSource dateSource) {
            this.date = date;
            this.dateSource = dateSource;
        }

        /** @return the extracted date */
        public ZonedDateTime getDate() {
            return date;
        }

        /** @return where the date was found */
        public DateSource getDateSource() {
            return dateSource;
        }

        /** @return the date followed by the source in square brackets */
        @Override
        public String toString() {
            return String.format("%s [%s]", date, dateSource);
        }
    }

    /**
     * Where a page date was found.
     *
     * <p>The declaration order is significant: candidates are ranked by comparing their
     * source, and a constant declared earlier is preferred over one declared later. Do not
     * reorder the constants without checking the ranking logic.
     */
    public enum DateSource {
        /** Date embedded in the page URL. */
        URL,
        /** {@code datetime} attribute of a {@code <time>} element. */
        TIME,
        /** {@code content} attribute of a date-related {@code <meta>} element. */
        META,
        /** Date in the paragraph around a text node that matched a trigger word. */
        TRIGGER_WORD,
        /** Date found in an arbitrary text node. */
        TEXT_DATE,
        /** HTTP modified time supplied by the caller. */
        HEADER,
        /** NOTE(review): not assigned anywhere in this file; presumably a marker for "no date found" used by callers. */
        NOT_FOUND;
    }
}

