package org.apache.tika.parser.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.james.mime4j.dom.field.ContentTypeField;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.utils.CharsetUtils;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Parser;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:WEB-INF/lib/tika-parsers-0.10.jar:org/apache/tika/parser/html/HtmlParser.class */
/**
 * Parser for HTML-like documents, implemented on top of the TagSoup SAX parser.
 *
 * <p>Before handing the stream to TagSoup the parser picks a character
 * encoding: first from a {@code <meta http-equiv="Content-Type">} tag found in
 * the leading bytes of the document, then from a {@link CharsetDetector} run
 * (optionally hinted by metadata), and finally from a fixed default.
 * The chosen encoding is stored in the metadata under
 * {@link HttpHeaders#CONTENT_ENCODING}.
 */
public class HtmlParser extends AbstractParser {

    /** Encoding used when no supported charset is declared or detected. */
    private static final String DEFAULT_CHARSET = "windows-1252";

    /** Number of leading bytes scanned for a Content-Type meta tag. */
    private static final int META_TAG_BUFFER_SIZE = 8192;

    /** Media types reported by {@link #getSupportedTypes(ParseContext)}. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.text("html"),
                    MediaType.application("xhtml+xml"),
                    MediaType.application("vnd.wap.xhtml+xml"),
                    MediaType.application("x-asp"))));

    /** Matches a Content-Type http-equiv meta tag and captures its {@code content} value. */
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile("(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]([^'\\\"]+)['\\\"]");

    /** TagSoup schema instance shared by all parse calls. */
    private static final Schema HTML_SCHEMA = new HTMLSchema();

    /**
     * Default {@link HtmlMapper} that forwards every call to the overridable
     * mapping methods of the enclosing parser instance.
     */
    private class HtmlParserMapper implements HtmlMapper {
        private HtmlParserMapper() {
        }

        @Override
        public String mapSafeElement(String str) {
            return HtmlParser.this.mapSafeElement(str);
        }

        @Override
        public boolean isDiscardElement(String str) {
            return HtmlParser.this.isDiscardElement(str);
        }

        @Override
        public String mapSafeAttribute(String str, String str2) {
            return HtmlParser.this.mapSafeAttribute(str, str2);
        }
    }

    /**
     * Returns the fixed, unmodifiable set of media types handled by this parser.
     *
     * @param parseContext parse context (not consulted)
     * @return the supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Tells whether the JVM supports the named charset, treating {@code null}
     * and syntactically illegal names as unsupported instead of letting
     * {@link Charset#isSupported(String)} throw.
     */
    private static boolean isSupportedCharset(String name) {
        if (name == null) {
            return false;
        }
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException: the name came from document or
            // header data and is not a legal charset name.
            return false;
        }
    }

    /**
     * Reads up to {@link #META_TAG_BUFFER_SIZE} leading bytes of the stream,
     * rewinds the stream, and returns those bytes decoded as US-ASCII.
     *
     * <p>The bytes are read directly rather than through a reader so that no
     * more than the marked number of bytes is ever consumed; a reader may read
     * ahead past the mark limit and invalidate the mark.
     */
    private String readHead(InputStream inputStream) throws IOException {
        inputStream.mark(META_TAG_BUFFER_SIZE);
        byte[] head = new byte[META_TAG_BUFFER_SIZE];
        int total = 0;
        try {
            // A single read may return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            while (total < head.length) {
                int count = inputStream.read(head, total, head.length - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
        } finally {
            inputStream.reset();
        }
        return new String(head, 0, total, "us-ascii");
    }

    /**
     * Looks for a charset parameter inside a Content-Type meta tag located in
     * the head of the stream.
     *
     * @return the cleaned charset name if one is declared and supported,
     *         otherwise {@code null}
     */
    private String findMetaCharset(InputStream inputStream) throws IOException {
        Matcher matcher = HTTP_EQUIV_PATTERN.matcher(readHead(inputStream));
        if (!matcher.find()) {
            return null;
        }
        for (String parameter : matcher.group(1).split(";")) {
            String[] keyValue = parameter.trim().split("=");
            if (keyValue.length == 2
                    && keyValue[0].equalsIgnoreCase(ContentTypeField.PARAM_CHARSET)) {
                String charset = CharsetUtils.clean(keyValue[1]);
                if (CharsetUtils.isSupported(charset)) {
                    return charset;
                }
            }
        }
        return null;
    }

    /**
     * Determines the character encoding of the document and records it in the
     * metadata under {@link HttpHeaders#CONTENT_ENCODING}.
     *
     * <p>Order of precedence: a supported charset declared in a Content-Type
     * meta tag; the first supported match from {@link CharsetDetector} (hinted
     * with the metadata's content encoding, or failing that the charset
     * parameter of its Content-Type); any content encoding already present in
     * the metadata; and finally {@code windows-1252}, or the platform default
     * charset if that is unavailable.
     *
     * @param inputStream stream positioned at the start of the document; it is
     *                    marked and reset here, so it must support marking
     * @param metadata    document metadata, read for hints and updated with the result
     * @return the name of the selected encoding
     * @throws IOException if the stream cannot be read or reset
     */
    private String getEncoding(InputStream inputStream, Metadata metadata) throws IOException {
        String declared = findMetaCharset(inputStream);
        if (declared != null) {
            metadata.set(HttpHeaders.CONTENT_ENCODING, declared);
            return declared;
        }

        CharsetDetector charsetDetector = new CharsetDetector();
        String hint = metadata.get(HttpHeaders.CONTENT_ENCODING);
        String contentType = metadata.get("Content-Type");
        if (hint == null && contentType != null) {
            MediaType mediaType = MediaType.parse(contentType);
            if (mediaType != null) {
                String charset = mediaType.getParameters().get(ContentTypeField.PARAM_CHARSET);
                if (isSupportedCharset(charset)) {
                    hint = charset;
                }
            }
        }
        if (hint != null) {
            charsetDetector.setDeclaredEncoding(hint);
        }
        charsetDetector.enableInputFilter(true);
        charsetDetector.setText(inputStream);

        // Take the first detector match that this JVM can actually decode.
        for (CharsetMatch match : charsetDetector.detectAll()) {
            if (isSupportedCharset(match.getName())) {
                metadata.set(HttpHeaders.CONTENT_ENCODING, match.getName());
                break;
            }
        }

        String encoding = metadata.get(HttpHeaders.CONTENT_ENCODING);
        if (encoding == null) {
            encoding = Charset.isSupported(DEFAULT_CHARSET)
                    ? DEFAULT_CHARSET
                    : Charset.defaultCharset().name();
            metadata.set(HttpHeaders.CONTENT_ENCODING, encoding);
        }
        return encoding;
    }

    /**
     * Parses the given HTML stream with TagSoup and forwards the resulting SAX
     * events to the content handler through {@code XHTMLDowngradeHandler} and
     * {@code HtmlHandler}.
     *
     * <p>The stream is wrapped in a {@link BufferedInputStream} when it does
     * not support marking, and shielded with {@link CloseShieldInputStream}
     * before being given to the SAX parser. The {@link HtmlMapper} is taken
     * from the parse context, falling back to one that delegates to this
     * parser's own mapping methods.
     *
     * @param inputStream    the document stream
     * @param contentHandler receiver of the SAX events
     * @param metadata       document metadata; updated with the selected encoding
     * @param parseContext   parse context, consulted for an {@link HtmlMapper}
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the SAX parser or a handler reports an error
     * @throws TikaException declared by the parser interface
     */
    @Override
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        if (!inputStream.markSupported()) {
            inputStream = new BufferedInputStream(inputStream);
        }
        CloseShieldInputStream shielded = new CloseShieldInputStream(inputStream);
        InputSource inputSource = new InputSource(shielded);
        inputSource.setEncoding(getEncoding(shielded, metadata));

        HtmlMapper htmlMapper = (HtmlMapper) parseContext.get(HtmlMapper.class, new HtmlParserMapper());

        Parser parser = new Parser();
        parser.setProperty(Parser.schemaProperty, HTML_SCHEMA);
        parser.setFeature(Parser.ignoreBogonsFeature, true);
        parser.setContentHandler(new XHTMLDowngradeHandler(new HtmlHandler(htmlMapper, contentHandler, metadata)));
        parser.parse(inputSource);
    }

    /**
     * Maps an element name by delegating to {@code DefaultHtmlMapper.INSTANCE}.
     * Subclasses may override this to change the mapping.
     *
     * @param str the element name
     * @return the result of the default mapper for that name
     */
    protected String mapSafeElement(String str) {
        return DefaultHtmlMapper.INSTANCE.mapSafeElement(str);
    }

    /**
     * Decides whether an element is discarded by delegating to
     * {@code DefaultHtmlMapper.INSTANCE}. Subclasses may override this.
     *
     * @param str the element name
     * @return the result of the default mapper for that name
     */
    protected boolean isDiscardElement(String str) {
        return DefaultHtmlMapper.INSTANCE.isDiscardElement(str);
    }

    /**
     * Maps an attribute name by delegating to {@code DefaultHtmlMapper.INSTANCE}.
     *
     * @param str  first argument passed through to the default mapper
     * @param str2 second argument passed through to the default mapper
     * @return the result of the default mapper
     */
    public String mapSafeAttribute(String str, String str2) {
        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(str, str2);
    }
}
