package org.apache.any23.extractor.html;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import javax.xml.transform.TransformerException;
import org.apache.any23.validator.DefaultValidator;
import org.apache.any23.validator.ValidatorException;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:WEB-INF/lib/apache-any23-core-2.0.jar:org/apache/any23/extractor/html/TagSoupParser.class */
/**
 * Parses an HTML input stream into a DOM {@link Document} using the NekoHTML
 * {@link DOMParser}, and caches the parsed document so that repeated calls to
 * {@link #getDOM()} parse the stream only once.
 *
 * <p>While parsing, elements may be tagged (via DOM user data, key
 * {@link #ELEMENT_LOCATION}) with an {@link ElementLocation} derived from the
 * parser's location augmentation.
 */
public class TagSoupParser {
    /** User-data key under which an {@link ElementLocation} is attached to parsed elements. */
    public static final String ELEMENT_LOCATION = "Element-Location";
    private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
    private static final String NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces";
    private static final String DEFAULT_ENCODING_PROPERTY = "http://cyberneko.org/html/properties/default-encoding";
    private static final Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
    private final InputStream input;
    private final String documentIRI;
    /** Default encoding handed to the parser, or {@code null} to leave the parser's default untouched. */
    private final String encoding;
    /** Cached parse result; {@code null} until the first successful {@link #getDOM()} call. */
    private Document result;

    /**
     * Immutable begin/end line and column position of an element, as read from
     * the parser's location augmentation.
     */
    public static class ElementLocation {
        private final int beginLineNumber;
        private final int beginColumnNumber;
        private final int endLineNumber;
        private final int endColumnNumber;

        private ElementLocation(int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber) {
            this.beginLineNumber = beginLineNumber;
            this.beginColumnNumber = beginColumnNumber;
            this.endLineNumber = endLineNumber;
            this.endColumnNumber = endColumnNumber;
        }

        /** @return the line on which the element begins */
        public int getBeginLineNumber() {
            return this.beginLineNumber;
        }

        /** @return the column at which the element begins */
        public int getBeginColumnNumber() {
            return this.beginColumnNumber;
        }

        /** @return the line on which the element ends */
        public int getEndLineNumber() {
            return this.endLineNumber;
        }

        /** @return the column at which the element ends */
        public int getEndColumnNumber() {
            return this.endColumnNumber;
        }
    }

    /**
     * Creates a parser that does not override the parser's default encoding.
     *
     * @param input       stream providing the HTML content
     * @param documentIRI IRI set as the document URI of the parsed document
     */
    public TagSoupParser(InputStream input, String documentIRI) {
        this(input, documentIRI, null);
    }

    /**
     * Creates a parser with an explicit default encoding.
     *
     * @param input       stream providing the HTML content
     * @param documentIRI IRI set as the document URI of the parsed document
     * @param encoding    charset name passed to the parser as its default encoding;
     *                    {@code null} means no default encoding is configured
     * @throws UnsupportedCharsetException if {@code encoding} is non-null and not
     *                                     supported by this JVM
     */
    public TagSoupParser(InputStream input, String documentIRI, String encoding) {
        if (encoding != null && !Charset.isSupported(encoding)) {
            throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding));
        }
        this.input = input;
        this.documentIRI = documentIRI;
        this.encoding = encoding;
        this.result = null;
    }

    /**
     * Returns the parsed DOM, parsing the input stream on the first call and
     * reusing the cached document afterwards. The document URI is (re)set to the
     * document IRI on every call.
     *
     * @return the parsed document
     * @throws IOException      if reading the input fails
     * @throws RuntimeException if the underlying parser reports a
     *                          {@link TransformerException} or {@link SAXException},
     *                          or fails with the NullPointerException originating in
     *                          {@code java.io.Reader}
     */
    public Document getDOM() throws IOException {
        if (this.result == null) {
            final long start = System.currentTimeMillis();
            try {
                this.result = parse();
            } catch (NullPointerException e) {
                // The stack trace may legally be empty (see Throwable.getStackTrace),
                // so guard the index to avoid masking the original exception.
                final StackTraceElement[] trace = e.getStackTrace();
                if (trace.length > 0 && "java.io.Reader".equals(trace[0].getClassName())) {
                    throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", e);
                }
                throw e;
            } catch (TransformerException | SAXException e) {
                throw new RuntimeException("Shouldn not happen, it's a tag soup parser", e);
            } finally {
                // Timing is reported whether parsing succeeded or failed.
                logger.debug("Parsed {} with NekoHTML, {}ms", this.documentIRI, System.currentTimeMillis() - start);
            }
        }
        this.result.setDocumentURI(this.documentIRI);
        return this.result;
    }

    /**
     * Parses the document (see {@link #getDOM()}) and runs a
     * {@link DefaultValidator} over it.
     *
     * @param applyFix forwarded unchanged as the third argument of
     *                 {@code DefaultValidator.validate}; presumably controls whether
     *                 fixes are applied - confirm against the validator API
     * @return a report bundling the validation result and the document
     * @throws IOException        if reading the input fails
     * @throws ValidatorException if the document IRI is not a valid URI, or an
     *                            {@link IllegalArgumentException} is raised while
     *                            validating
     */
    public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
        try {
            final URI documentUri = new URI(this.documentIRI);
            final DefaultValidator validator = new DefaultValidator();
            final Document document = getDOM();
            return new DocumentReport(validator.validate(documentUri, document, applyFix), document);
        } catch (IllegalArgumentException | URISyntaxException e) {
            throw new ValidatorException("Error while performing validation, invalid document IRI.", e);
        }
    }

    /**
     * Configures a NekoHTML {@link DOMParser} (namespaces off, CDATA delimiters
     * stripped from scripts, location augmentations on, optional default
     * encoding) and parses the input wrapped in a {@code SpanCloserInputStream}.
     *
     * @return the document produced by the parser
     */
    private Document parse() throws IOException, SAXException, TransformerException {
        final DOMParser parser = new DOMParser() {
            // State recorded by the most recent startElement call.
            private QName currentQName;
            private Augmentations currentAugmentations;

            /**
             * Creates the element and, when its name matches the recorded QName and
             * augmentations are available, attaches the element location as user data.
             */
            @Override
            protected Element createElementNode(QName qName) {
                final Element created = super.createElementNode(qName);
                if (qName.equals(this.currentQName) && this.currentAugmentations != null) {
                    final ElementLocation location =
                            createElementLocation(this.currentAugmentations.getItem(TagSoupParser.AUGMENTATIONS_FEATURE));
                    created.setUserData(TagSoupParser.ELEMENT_LOCATION, location, null);
                }
                return created;
            }

            /**
             * Delegates to the superclass, then records the element name and augmentations.
             */
            @Override
            public void startElement(QName qName, XMLAttributes attributes, Augmentations augmentations) throws XNIException {
                // NOTE(review): the state is recorded only AFTER super.startElement returns.
                // If the superclass calls createElementNode during startElement, the check
                // there sees what the previous call recorded - confirm this order is intended
                // before changing it.
                super.startElement(qName, attributes, augmentations);
                this.currentQName = qName;
                this.currentAugmentations = augmentations;
            }

            /**
             * Converts the augmentation item into an {@link ElementLocation}.
             * The item's string form is split on ':' and fields 0, 1, 3 and 4 are
             * read as begin line, begin column, end line and end column.
             *
             * @return the location, or {@code null} if the item is null, equals
             *         "synthesized", or cannot be parsed (a warning is logged)
             */
            private ElementLocation createElementLocation(Object item) {
                if (item == null) {
                    return null;
                }
                String text = null;
                try {
                    text = item.toString();
                    if ("synthesized".equals(text)) {
                        return null;
                    }
                    final String[] fields = text.split(":");
                    final int beginLine = Integer.parseInt(fields[0]);
                    final int beginColumn = Integer.parseInt(fields[1]);
                    final int endLine = Integer.parseInt(fields[3]);
                    final int endColumn = Integer.parseInt(fields[4]);
                    return new ElementLocation(beginLine, beginColumn, endLine, endColumn);
                } catch (Exception e) {
                    // Best effort: a malformed augmentation must not abort parsing.
                    TagSoupParser.logger.warn(String.format("Unexpected string format for given augmentation: [%s]", text), e);
                    return null;
                }
            }
        };
        parser.setFeature(NAMESPACES_FEATURE, false);
        parser.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
        parser.setFeature(AUGMENTATIONS_FEATURE, true);
        if (this.encoding != null) {
            parser.setProperty(DEFAULT_ENCODING_PROPERTY, this.encoding);
        }
        parser.parse(new InputSource(new SpanCloserInputStream(this.input)));
        return parser.getDocument();
    }
}
