/*
 * Decompiled with CFR 0.152.
 */
package de.l3s.icrawl.util;

import com.google.common.collect.ImmutableSet;
import de.l3s.icrawl.util.HtmlParseException;
import de.l3s.icrawl.util.TextExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.lang.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

/**
 * Static helpers for locating paragraph-like elements in a parsed HTML page,
 * harvesting human-readable text from inline JavaScript, and parsing or
 * sniffing HTML content.
 *
 * <p>This class is not instantiable.
 */
public final class WebPageUtils {
    /** Lower-case tag names that are treated as paragraph-level containers. */
    private static final Set<String> PARAGRAPH_ELEMENTS = ImmutableSet.of(
            "p", "div", "li", "dd", "dt", "blockquote", "pre", "caption", "th", "td");

    /** Default token threshold used by {@link #findParagraphParent(Element)}. */
    private static final int MIN_PARAGRAPH_TOKENS = 50;

    /** A JavaScript string literal is kept only if it has more tokens than this. */
    private static final int MIN_JS_TOKENS = 3;

    /** Number of leading characters inspected by {@link #hasHtmlContent(String)}. */
    private static final int HTML_SNIFF_LENGTH = 1024;

    private WebPageUtils() {
        // static utility class
    }

    /**
     * Counts the whitespace-separated tokens of the text extracted from an element.
     *
     * <p>Note that the count comes straight from {@code String.split("\\s+")}, so
     * an empty text still counts as one token.
     */
    private static int tokenCount(Element n) {
        return TextExtractor.extractText(n).split("\\s+").length;
    }

    /**
     * Walks up from {@code startNode} until a suitable paragraph container is found.
     *
     * <p>The walk stops at the first element that either has one of the
     * paragraph tag names, or already contains at least {@code minParagraphTokens}
     * tokens, or has no parent that is an {@link Element}.
     *
     * @param startNode element to start from; may be {@code null}
     * @param minParagraphTokens token count that is considered sufficient; a negative
     *        value disables the token check so that only the tag name (or the top of
     *        the tree) ends the walk
     * @return the element at which the walk stopped, or {@code null} if
     *         {@code startNode} was {@code null}
     */
    public static Element findParagraphParent(Element startNode, int minParagraphTokens) {
        Element elem = startNode;
        while (elem != null
                && !isParagraphElement(elem)
                && needsMoreTokens(elem, minParagraphTokens)) {
            Node parent = elem.parentNode();
            if (!(parent instanceof Element)) {
                // reached the top of the element tree; settle for what we have
                break;
            }
            elem = (Element) parent;
        }
        return elem;
    }

    /**
     * Returns the closest {@link Element} at or above the given node.
     *
     * @param n node to start from; may be {@code null}
     * @return {@code n} itself if it is an element, otherwise its nearest element
     *         ancestor, or {@code null} if there is none
     */
    public static Element containingElement(Node n) {
        Node currentN = n;
        // The test must look at the node currently visited, not at the start node;
        // otherwise a non-element start node could never resolve to its ancestor.
        while (currentN != null && !(currentN instanceof Element)) {
            currentN = currentN.parentNode();
        }
        return (Element) currentN;
    }

    /**
     * Tells whether the element is still below the token threshold. A negative
     * threshold means "always keep going".
     */
    private static boolean needsMoreTokens(Element elem, int minParagraphTokens) {
        return minParagraphTokens < 0 || tokenCount(elem) < minParagraphTokens;
    }

    /** Tells whether the element's tag name is one of {@link #PARAGRAPH_ELEMENTS}. */
    private static boolean isParagraphElement(Element elem) {
        return PARAGRAPH_ELEMENTS.contains(elem.tagName().toLowerCase(Locale.ENGLISH));
    }

    /**
     * Same as {@link #findParagraphParent(Element, int)} with the default
     * threshold of {@value #MIN_PARAGRAPH_TOKENS} tokens.
     *
     * @param startElement element to start from; may be {@code null}
     * @return the selected container, or {@code null} if {@code startElement} was {@code null}
     */
    public static Element findParagraphParent(Element startElement) {
        return findParagraphParent(startElement, MIN_PARAGRAPH_TOKENS);
    }

    /**
     * Resolves the nearest element at or above {@code node} and then applies
     * {@link #findParagraphParent(Element, int)} to it.
     *
     * @param node node to start from; must not be {@code null}
     * @param minParagraphTokens see {@link #findParagraphParent(Element, int)}
     * @return the selected container, or {@code null} if {@code node} has no
     *         enclosing element
     * @throws NullPointerException if {@code node} is {@code null}
     */
    public static Element findParagraphParent(Node node, int minParagraphTokens) {
        Element start = containingElement(Objects.requireNonNull(node));
        if (start == null) {
            return null;
        }
        return findParagraphParent(start, minParagraphTokens);
    }

    /**
     * Collects text-like string literals from every {@code script} element of the document.
     *
     * <p>Only double-quoted literals with more than {@value #MIN_JS_TOKENS}
     * whitespace-separated tokens are kept. The literals are separated by blank
     * lines, the result is trimmed, and JavaScript escape sequences are resolved.
     *
     * @param dom parsed document
     * @return the collected text; empty if nothing qualified
     */
    public static String extractTextFromJavascript(Document dom) {
        StringBuilder sb = new StringBuilder();
        for (Element element : dom.getElementsByTag("script")) {
            // Script bodies are kept in data nodes, which text() deliberately
            // leaves out; data() is the accessor that returns them.
            extractTextFromJavascript(element.data(), sb);
        }
        return StringEscapeUtils.unescapeJavaScript(sb.toString().trim());
    }

    /**
     * Scans {@code script} for double-quoted string literals and appends every
     * literal with more than {@value #MIN_JS_TOKENS} tokens to {@code sb},
     * each preceded by a blank line.
     */
    private static void extractTextFromJavascript(String script, StringBuilder sb) {
        int startOfString = -1;
        int pos = script.indexOf('"');
        while (pos >= 0) {
            // A backslash-escaped quote belongs to the literal; it is not a delimiter.
            if (!isEscaped(script, pos)) {
                if (startOfString < 0) {
                    startOfString = pos;
                } else {
                    String s = script.substring(startOfString + 1, pos);
                    if (s.split("\\s+").length > MIN_JS_TOKENS) {
                        sb.append("\n\n").append(s);
                    }
                    startOfString = -1;
                }
            }
            pos = script.indexOf('"', pos + 1);
        }
    }

    /**
     * Tells whether the character at {@code pos} is preceded by an odd number of
     * backslashes, i.e. whether it is escaped.
     */
    private static boolean isEscaped(String script, int pos) {
        int backslashes = 0;
        for (int i = pos - 1; i >= 0 && script.charAt(i) == '\\'; i--) {
            backslashes++;
        }
        return (backslashes & 1) == 1;
    }

    /**
     * Parses HTML from a stream, decoding it as UTF-8.
     *
     * <p>The stream is not closed by this method.
     *
     * @param is stream with the HTML content
     * @param url base URL passed to the parser
     * @return the parsed document
     * @throws HtmlParseException if reading the stream fails
     */
    public static Document parseHtml(InputStream is, String url) {
        try {
            return Jsoup.parse(is, "UTF-8", url);
        } catch (IOException e) {
            throw new HtmlParseException(url, e);
        }
    }

    /**
     * Parses HTML from a string.
     *
     * @param content HTML source
     * @param url base URL passed to the parser
     * @return the parsed document
     */
    public static Document parseHtml(String content, String url) {
        return Jsoup.parse(content, url);
    }

    /**
     * Cheap check whether a string looks like an HTML document: the first
     * {@value #HTML_SNIFF_LENGTH} characters must contain {@code <html},
     * compared case-insensitively.
     *
     * @param content text to inspect; may be {@code null}
     * @return {@code true} if the marker was found, {@code false} otherwise
     *         (including for {@code null} or empty input)
     */
    public static boolean hasHtmlContent(String content) {
        if (content == null || content.isEmpty()) {
            return false;
        }
        String head = content.substring(0, Math.min(content.length(), HTML_SNIFF_LENGTH));
        return head.toUpperCase(Locale.ROOT).contains("<HTML");
    }
}

