package org.apache.any23.extractor;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.openrdf.model.BNode;
import org.openrdf.model.URI;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/apache-any23-core-1.0.jar:org/apache/any23/extractor/SingleDocumentExtraction.class */
public class SingleDocumentExtraction {
    // Vocabulary used for the metadata triples (domain, date, size, nesting) emitted by this class.
    private static final SINDICE vSINDICE = SINDICE.getInstance();
    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
    // Configuration used to build default extraction parameters.
    private final Configuration configuration;
    // Document source handed to the constructor; may be remote.
    private final DocumentSource in;
    // URI of the document being processed; assigned by run(ExtractionParameters), null before that.
    private URI documentURI;
    // All candidate extractor factories supplied at construction time.
    private final ExtractorGroup extractors;
    // Composite of the caller's handler and a CountingTripleHandler (see the main constructor).
    private final TripleHandler output;
    // Detector used by detectEncoding() when no parser encoding has been set explicitly.
    private final EncodingDetector encoderDetector;
    // Factory for local copies of non-local sources; defaults lazily to MemCopyFactory.
    private LocalCopyFactory copyFactory;
    // Local view of 'in'; populated lazily by ensureHasLocalCopy().
    private DocumentSource localDocumentSource;
    // Optional MIME type detector; when null no MIME-based filtering is performed.
    private MIMETypeDetector detector;
    // Extractors selected by filterExtractorsByMIMEType(); null until that method has run.
    private ExtractorGroup matchingExtractors;
    // MIME type guessed by the detector; stays null when detection is skipped.
    private MIMEType detectedMIMEType;
    // Cached result of the last TagSoup parse; reset when the parser encoding is changed.
    private DocumentReport documentReport;
    // Extraction parameters that produced the cached documentReport.
    private ExtractionParameters tagSoupDOMRelatedParameters;
    // Encoding passed to the TagSoup parser; detected lazily when not set explicitly.
    private String parserEncoding;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:WEB-INF/lib/apache-any23-core-1.0.jar:org/apache/any23/extractor/SingleDocumentExtraction$SingleExtractionReport.class */
    /**
     * Immutable holder for what a single extractor run produced: the issues it
     * reported plus the resource roots and property paths it collected.
     */
    public class SingleExtractionReport {
        /** Issues reported by the extractor. */
        private final Collection<IssueReport.Issue> issues;
        /** Resource roots collected during the run. */
        private final List<TagSoupExtractionResult.ResourceRoot> resourceRoots;
        /** Property paths collected during the run. */
        private final List<TagSoupExtractionResult.PropertyPath> propertyPaths;

        /**
         * Stores the given references as they are; no defensive copies are made.
         */
        public SingleExtractionReport(
                Collection<IssueReport.Issue> reportedIssues,
                List<TagSoupExtractionResult.ResourceRoot> roots,
                List<TagSoupExtractionResult.PropertyPath> paths) {
            issues = reportedIssues;
            resourceRoots = roots;
            propertyPaths = paths;
        }
    }

    /**
     * Builds an extraction over the given source using a group of extractors.
     * The supplied handler is wrapped, together with a
     * {@link CountingTripleHandler}, in a {@link CompositeTripleHandler}.
     *
     * @param configuration  configuration to use; must not be {@code null}
     * @param documentSource document to process; must not be {@code null}
     * @param extractorGroup candidate extractors
     * @param tripleHandler  receiver of the extracted triples
     * @throws NullPointerException if {@code configuration} or {@code documentSource} is {@code null}
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource documentSource, ExtractorGroup extractorGroup, TripleHandler tripleHandler) {
        // The lazily populated fields (copyFactory, localDocumentSource, detector, ...)
        // keep their default null value; no explicit assignment is needed.
        if (configuration == null) {
            throw new NullPointerException("configuration cannot be null.");
        }
        if (documentSource == null) {
            throw new NullPointerException("in cannot be null.");
        }
        this.configuration = configuration;
        this.in = documentSource;
        this.extractors = extractorGroup;

        final List<TripleHandler> handlers = new ArrayList<TripleHandler>();
        handlers.add(tripleHandler);
        handlers.add(new CountingTripleHandler());
        this.output = new CompositeTripleHandler(handlers);

        this.encoderDetector = new TikaEncodingDetector();
    }

    /**
     * Builds an extraction that runs a single extractor, wrapped in a
     * one-element {@link ExtractorGroup}. The MIME type detector is explicitly
     * set to {@code null}.
     *
     * @param configuration    configuration to use; must not be {@code null}
     * @param documentSource   document to process; must not be {@code null}
     * @param extractorFactory factory of the only extractor to run
     * @param tripleHandler    receiver of the extracted triples
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource documentSource, ExtractorFactory<?> extractorFactory, TripleHandler tripleHandler) {
        this(
                configuration,
                documentSource,
                new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(extractorFactory)),
                tripleHandler
        );
        setMIMETypeDetector(null);
    }

    /**
     * Builds a single-extractor extraction using
     * {@link DefaultConfiguration#singleton()} as configuration.
     *
     * @param documentSource   document to process; must not be {@code null}
     * @param extractorFactory factory of the only extractor to run
     * @param tripleHandler    receiver of the extracted triples
     */
    public SingleDocumentExtraction(DocumentSource documentSource, ExtractorFactory<?> extractorFactory, TripleHandler tripleHandler) {
        // Delegates to the (configuration, source, factory, handler) constructor, which
        // wraps the factory in a one-element group and resets the MIME type detector.
        this(DefaultConfiguration.singleton(), documentSource, extractorFactory, tripleHandler);
    }

    /**
     * Sets the factory used to create a local copy of a non-local document
     * source. When none is set, a {@link MemCopyFactory} is created on demand.
     *
     * @param localCopyFactory factory to use, or {@code null} for the default
     */
    public void setLocalCopyFactory(LocalCopyFactory localCopyFactory) {
        copyFactory = localCopyFactory;
    }

    /**
     * Sets the detector used to guess the document MIME type. With a
     * {@code null} detector all configured extractors are considered matching.
     *
     * @param mIMETypeDetector detector to use, or {@code null} to disable MIME filtering
     */
    public void setMIMETypeDetector(MIMETypeDetector mIMETypeDetector) {
        detector = mIMETypeDetector;
    }

    /**
     * Runs every matching extractor on the document and streams the produced
     * triples to the output handler.
     *
     * <p>Steps: resolve the document URI, select the extractors, start the
     * document on the handler, run each extractor, emit the consolidation
     * triples (domain / nesting / time-size metadata, depending on the flags in
     * {@code extractionParameters}) and end the document.</p>
     *
     * @param extractionParameters parameters for this run; when {@code null} the
     *                             defaults derived from the configuration are used
     * @return report holding the validation report and the issues of each extractor
     * @throws ExtractionException      if the handler fails, validation fails or an extractor fails
     * @throws IOException              if the document content cannot be read
     * @throws IllegalArgumentException if the document URI cannot be created
     */
    public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters) throws ExtractionException, IOException {
        if (extractionParameters == null) {
            extractionParameters = ExtractionParameters.newDefault(this.configuration);
        }
        final String contextURI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_URI_PROPERTY);
        ensureHasLocalCopy();

        // Only the URI construction is guarded here. A wider catch would turn every
        // ExtractionException / IOException raised below into a misleading
        // "Invalid URI" IllegalArgumentException.
        try {
            // "?" means: no explicit context URI, use the one of the document source.
            this.documentURI = new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance())
                    .createURI("?".equals(contextURI) ? this.in.getDocumentURI() : contextURI);
        } catch (Exception e) {
            throw new IllegalArgumentException("Invalid URI: " + this.in.getDocumentURI(), e);
        }

        if (log.isInfoEnabled()) {
            log.info("Processing " + this.documentURI);
        }
        filterExtractorsByMIMEType();
        if (log.isDebugEnabled()) {
            final StringBuilder message = new StringBuilder("Extractors ");
            final Iterator<ExtractorFactory<?>> names = this.matchingExtractors.iterator();
            while (names.hasNext()) {
                message.append(names.next().getExtractorName());
                message.append(' ');
            }
            message.append("match ").append(this.documentURI);
            log.debug(message.toString());
        }

        try {
            this.output.startDocument(this.documentURI);
            this.output.setContentLength(this.in.getContentLength());
        } catch (TripleHandlerException e) {
            log.error(String.format("Error starting document with URI %s", this.documentURI));
            throw new ExtractionException(String.format("Error starting document with URI %s", this.documentURI), e);
        }

        final List<TagSoupExtractionResult.ResourceRoot> resourceRoots = new ArrayList<TagSoupExtractionResult.ResourceRoot>();
        final List<TagSoupExtractionResult.PropertyPath> propertyPaths = new ArrayList<TagSoupExtractionResult.PropertyPath>();
        final HashMap<String, Collection<IssueReport.Issue>> extractorToIssues = new HashMap<String, Collection<IssueReport.Issue>>();
        try {
            final String documentLanguage = extractDocumentLanguage(extractionParameters);
            final Iterator<ExtractorFactory<?>> factories = this.matchingExtractors.iterator();
            while (factories.hasNext()) {
                final ExtractorFactory<?> factory = factories.next();
                final SingleExtractionReport report = runExtractor(extractionParameters, documentLanguage, factory.createExtractor());
                resourceRoots.addAll(report.resourceRoots);
                propertyPaths.addAll(report.propertyPaths);
                extractorToIssues.put(factory.getExtractorName(), report.issues);
            }
        } catch (ValidatorException e) {
            throw new ExtractionException("An error occurred during the validation phase.", e);
        }

        // Consolidation: optional domain triples and, when enabled, nesting relationships.
        final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
        final ExtractionContext consolidationContext;
        if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
            consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, this.output);
        } else {
            consolidationContext = consolidateResources(resourceRoots, addDomainTriples, this.output);
        }

        if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
            try {
                addExtractionTimeSizeMetaTriples(consolidationContext);
            } catch (TripleHandlerException e) {
                throw new ExtractionException(String.format("Error while adding extraction metadata triples document with URI %s", this.documentURI), e);
            }
        }

        try {
            this.output.endDocument(this.documentURI);
        } catch (TripleHandlerException e) {
            log.error(String.format("Error ending document with URI %s", this.documentURI));
            throw new ExtractionException(String.format("Error ending document with URI %s", this.documentURI), e);
        }

        return new SingleDocumentExtractionReport(
                this.documentReport == null ? EmptyValidationReport.getInstance() : this.documentReport.getReport(),
                extractorToIssues
        );
    }

    /**
     * Runs the extraction with the default parameters derived from the
     * configuration of this instance.
     *
     * @return the extraction report
     * @throws IOException         if the document content cannot be read
     * @throws ExtractionException if the extraction fails
     */
    public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
        final ExtractionParameters defaults = ExtractionParameters.newDefault(configuration);
        return run(defaults);
    }

    /**
     * Returns the string form of the detected MIME type, triggering extractor
     * filtering (and therefore detection) if it has not happened yet.
     *
     * @return the detected MIME type, or {@code null} when none was detected
     * @throws IOException if the document content cannot be read
     */
    public String getDetectedMIMEType() throws IOException {
        filterExtractorsByMIMEType();
        return detectedMIMEType != null ? detectedMIMEType.toString() : null;
    }

    /**
     * Tells whether at least one extractor matches the document, triggering
     * extractor filtering if it has not happened yet.
     *
     * @return {@code true} when the set of matching extractors is not empty
     * @throws IOException if the document content cannot be read
     */
    public boolean hasMatchingExtractors() throws IOException {
        filterExtractorsByMIMEType();
        final boolean noneMatch = matchingExtractors.isEmpty();
        return !noneMatch;
    }

    /**
     * Creates one fresh extractor instance per matching factory.
     *
     * <p>This method does not trigger extractor filtering itself; it reads the
     * matching group as currently stored.</p>
     *
     * @return newly created extractors, in the iteration order of the matching group
     */
    public List<Extractor> getMatchingExtractors() {
        final List<Extractor> created = new ArrayList<Extractor>();
        for (Iterator<ExtractorFactory<?>> factories = matchingExtractors.iterator(); factories.hasNext();) {
            created.add(factories.next().createExtractor());
        }
        return created;
    }

    /**
     * Returns the encoding handed to the TagSoup parser. When no encoding has
     * been set it is detected from the document content and cached.
     *
     * @return the parser encoding
     */
    public String getParserEncoding() {
        if (parserEncoding != null) {
            return parserEncoding;
        }
        parserEncoding = detectEncoding();
        return parserEncoding;
    }

    /**
     * Overrides the encoding used by the TagSoup parser and drops the cached
     * document report so that the next DOM request parses the document again.
     *
     * @param str encoding name to use
     */
    public void setParserEncoding(String str) {
        parserEncoding = str;
        // The cached DOM was built with the previous encoding.
        documentReport = null;
    }

    /**
     * Tells whether the document is treated as HTML, i.e. whether at least one
     * matching extractor is retained when filtering by {@code text/html}.
     *
     * @return {@code true} if some matching extractor handles {@code text/html}
     * @throws IOException if the document content cannot be read
     */
    private boolean isHTMLDocument() throws IOException {
        filterExtractorsByMIMEType();
        final ExtractorGroup htmlExtractors = matchingExtractors.filterByMIMEType(MIMEType.parse("text/html"));
        return !htmlExtractors.isEmpty();
    }

    /**
     * Extracts the default language declared by an HTML document.
     *
     * @param extractionParameters parameters used to obtain the TagSoup DOM
     * @return the default language, or {@code null} when the document is not
     *         HTML or the DOM could not be read
     * @throws IOException        if the HTML check fails to read the document
     * @throws ValidatorException if DOM validation fails
     */
    private String extractDocumentLanguage(ExtractionParameters extractionParameters) throws IOException, ValidatorException {
        if (isHTMLDocument()) {
            try {
                final DocumentReport report = getTagSoupDOM(extractionParameters);
                final HTMLDocument html = new HTMLDocument(report.getDocument());
                return html.getDefaultLanguage();
            } catch (IOException e) {
                // Best effort: a missing language must not abort the extraction.
                log.debug("Cannot extract language from document.", e);
            }
        }
        return null;
    }

    /**
     * Selects, once, the extractors applicable to the document.
     *
     * <p>When no detector is configured, or when every extractor accepts any
     * content type, all extractors match and no detection is performed.
     * Otherwise the MIME type is guessed from the path of the document URI, the
     * document content and the declared content type, and the extractor group
     * is filtered accordingly.</p>
     *
     * @throws IOException if the document content cannot be read
     */
    private void filterExtractorsByMIMEType() throws IOException {
        if (this.matchingExtractors != null) {
            // Filtering already performed.
            return;
        }
        if (this.detector == null || this.extractors.allExtractorsSupportAllContentTypes()) {
            this.matchingExtractors = this.extractors;
            return;
        }
        ensureHasLocalCopy();
        // documentURI is only assigned by run(); public callers such as
        // getDetectedMIMEType() may get here earlier, so fall back to the source URI.
        final String uriString = this.documentURI != null ? this.documentURI.stringValue() : this.in.getDocumentURI();
        final InputStream content = this.localDocumentSource.openInputStream();
        try {
            this.detectedMIMEType = this.detector.guessMIMEType(
                    java.net.URI.create(uriString).getPath(),
                    content,
                    MIMEType.parse(this.localDocumentSource.getContentType())
            );
        } finally {
            // The stream was opened only for detection; release it in every case.
            content.close();
        }
        log.debug("detected media type: " + this.detectedMIMEType);
        this.matchingExtractors = this.extractors.filterByMIMEType(this.detectedMIMEType);
    }

    /**
     * Runs a single extractor on the document and collects its outcome.
     *
     * <p>The input handed to the extractor depends on its kind: the document
     * URI for blind extractors, a fresh content stream for content extractors,
     * the TagSoup DOM for DOM extractors. Whatever the outcome, the issue
     * report is logged (at debug level), the extraction result is closed and
     * the elapsed time is logged.</p>
     *
     * @param extractionParameters parameters of the current run
     * @param str                  default language of the document, may be {@code null}
     * @param extractor            extractor to execute
     * @return issues, resource roots and property paths produced by the extractor
     * @throws ExtractionException   if the extractor fails
     * @throws IOException           if the document content cannot be read
     * @throws ValidatorException    if DOM validation fails
     * @throws IllegalStateException if the extractor kind is not supported
     */
    private SingleExtractionReport runExtractor(ExtractionParameters extractionParameters, String str, Extractor<?> extractor) throws ExtractionException, IOException, ValidatorException {
        if (log.isDebugEnabled()) {
            log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + this.documentURI);
        }
        final long startTime = System.currentTimeMillis();
        final ExtractionContext extractionContext = new ExtractionContext(extractor.getDescription().getExtractorName(), this.documentURI, str);
        final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, this.output);
        try {
            if (extractor instanceof Extractor.BlindExtractor) {
                ((Extractor.BlindExtractor) extractor).run(extractionParameters, extractionContext, this.documentURI, extractionResult);
            } else if (extractor instanceof Extractor.ContentExtractor) {
                ensureHasLocalCopy();
                final InputStream content = this.localDocumentSource.openInputStream();
                try {
                    ((Extractor.ContentExtractor) extractor).run(extractionParameters, extractionContext, content, extractionResult);
                } finally {
                    // Opened for this extractor only; do not leak it.
                    content.close();
                }
            } else if (extractor instanceof Extractor.TagSoupDOMExtractor) {
                ((Extractor.TagSoupDOMExtractor) extractor).run(extractionParameters, extractionContext, getTagSoupDOM(extractionParameters).getDocument(), extractionResult);
            } else {
                throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
            }
            // Copy the collections before the result is closed in the finally block.
            return new SingleExtractionReport(
                    extractionResult.getIssues(),
                    new ArrayList<TagSoupExtractionResult.ResourceRoot>(extractionResult.getResourceRoots()),
                    new ArrayList<TagSoupExtractionResult.PropertyPath>(extractionResult.getPropertyPaths())
            );
        } catch (ExtractionException e) {
            if (log.isDebugEnabled()) {
                log.debug(extractor.getDescription().getExtractorName() + ": " + e.getMessage());
            }
            throw e;
        } finally {
            // Single cleanup path shared by success and failure.
            if (log.isDebugEnabled() && extractionResult.hasIssues()) {
                final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                extractionResult.printReport(new PrintStream(buffer));
                log.debug(buffer.toString());
            }
            extractionResult.close();
            final long elapsed = System.currentTimeMillis() - startTime;
            if (log.isDebugEnabled()) {
                log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
            }
        }
    }

    /**
     * Makes sure {@code localDocumentSource} is populated. A source that is
     * already local is used directly; otherwise a local copy is created through
     * the copy factory, which defaults to a {@link MemCopyFactory}.
     *
     * @throws IOException if the local copy cannot be created
     */
    private void ensureHasLocalCopy() throws IOException {
        if (localDocumentSource == null) {
            if (in.isLocal()) {
                localDocumentSource = in;
            } else {
                if (copyFactory == null) {
                    copyFactory = new MemCopyFactory();
                }
                localDocumentSource = copyFactory.createLocalCopy(in);
            }
        }
    }

    /**
     * Returns the TagSoup DOM of the document together with its validation
     * report. The result is cached and reused as long as the same extraction
     * parameters are supplied.
     *
     * @param extractionParameters parameters deciding whether the DOM is validated and fixed
     * @return the (possibly cached) document report
     * @throws IOException        if the document content cannot be read
     * @throws ValidatorException if validation fails
     */
    private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters) throws IOException, ValidatorException {
        if (this.documentReport == null || !extractionParameters.equals(this.tagSoupDOMRelatedParameters)) {
            ensureHasLocalCopy();
            // Encoding detection reads from its own stream, so resolve it before opening
            // the parser input: nothing is left open if detection fails.
            final String encoding = getParserEncoding();
            final BufferedInputStream input = new BufferedInputStream(this.localDocumentSource.openInputStream());
            try {
                final TagSoupParser tagSoupParser = new TagSoupParser(input, this.documentURI.stringValue(), encoding);
                if (extractionParameters.isValidate()) {
                    this.documentReport = tagSoupParser.getValidatedDOM(extractionParameters.isFix());
                } else {
                    this.documentReport = new DocumentReport(EmptyValidationReport.getInstance(), tagSoupParser.getDOM());
                }
            } finally {
                // NOTE(review): assumes the DOM is fully built by getDOM()/getValidatedDOM()
                // and no longer reads from the stream — confirm against TagSoupParser.
                input.close();
            }
            this.tagSoupDOMRelatedParameters = extractionParameters;
        }
        return this.documentReport;
    }

    /**
     * Guesses the encoding of the document content using the encoding detector.
     *
     * @return the guessed encoding
     * @throws RuntimeException if the content cannot be read or detection fails
     */
    private String detectEncoding() {
        try {
            ensureHasLocalCopy();
            final BufferedInputStream input = new BufferedInputStream(this.localDocumentSource.openInputStream());
            try {
                return this.encoderDetector.guessEncoding(input);
            } finally {
                // Close even when detection throws, so the stream is never leaked.
                input.close();
            }
        } catch (Exception e) {
            throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
        }
    }

    /**
     * Tells whether {@code strArr2} is a prefix of {@code strArr}, comparing
     * the elements position by position with {@code equals}.
     *
     * @param strArr  the full path
     * @param strArr2 the candidate prefix
     * @return {@code true} if every element of {@code strArr2} equals the
     *         element of {@code strArr} at the same index
     */
    private boolean subPath(String[] strArr, String[] strArr2) {
        final int prefixLength = strArr2.length;
        if (prefixLength > strArr.length) {
            return false;
        }
        int index = 0;
        while (index < prefixLength && strArr2[index].equals(strArr[index])) {
            index++;
        }
        // The scan reached the end only if no element differed.
        return index == prefixLength;
    }

    /**
     * Emits, for every resource root, a {@code domain} triple whose object is
     * the host of the document URI, then closes the given context.
     *
     * <p>Nothing is emitted when the document URI has no host. The context is
     * closed exactly once, whether or not writing the triples succeeded.</p>
     *
     * @param list              resource roots to annotate
     * @param extractionContext context the triples are written in; closed by this method
     * @throws ExtractionException      if a triple cannot be written or the context cannot be closed
     * @throws IllegalArgumentException if the document URI is not a valid URI
     */
    private void addDomainTriplesPerResourceRoots(List<TagSoupExtractionResult.ResourceRoot> list, ExtractionContext extractionContext) throws ExtractionException {
        try {
            final String host;
            try {
                host = new java.net.URI(this.in.getDocumentURI()).getHost();
            } catch (URISyntaxException e) {
                throw new IllegalArgumentException("An error occurred while extracting the host from the document URI.", e);
            }
            if (host != null) {
                for (TagSoupExtractionResult.ResourceRoot resourceRoot : list) {
                    this.output.receiveTriple(resourceRoot.getRoot(), vSINDICE.getProperty("domain"), ValueFactoryImpl.getInstance().createLiteral(host), null, extractionContext);
                }
            }
        } catch (TripleHandlerException e) {
            throw new ExtractionException("Error while writing triple triple.", e);
        } finally {
            // A finally block guarantees a single close attempt; a failing close is
            // reported once instead of being retried from an outer handler.
            try {
                this.output.closeContext(extractionContext);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }
    }

    /**
     * Creates the context used for the consolidation triples, named
     * {@code consolidation-extractor} and bound to the document URI.
     *
     * @return a new extraction context
     */
    private ExtractionContext createExtractionContext() {
        // NOTE(review): runExtractor() passes the document language as third constructor
        // argument, here a random UUID is passed — confirm which meaning is intended.
        final String uniqueToken = UUID.randomUUID().toString();
        return new ExtractionContext("consolidation-extractor", documentURI, uniqueToken);
    }

    /**
     * Emits a nesting relationship for every (resource root, property path)
     * pair where the two come from different extractors, the path extractor
     * does not include the root extractor, and the property path is a prefix of
     * the root path.
     *
     * @param list              resource roots
     * @param list2             property paths
     * @param extractionContext context the triples are written in
     * @throws TripleHandlerException if a triple cannot be written
     */
    private void addNestingRelationship(List<TagSoupExtractionResult.ResourceRoot> list, List<TagSoupExtractionResult.PropertyPath> list2, ExtractionContext extractionContext) throws TripleHandlerException {
        for (TagSoupExtractionResult.ResourceRoot root : list) {
            for (TagSoupExtractionResult.PropertyPath path : list2) {
                final Class<? extends MicroformatExtractor> rootExtractor = root.getExtractor();
                final Class<? extends MicroformatExtractor> pathExtractor = path.getExtractor();
                if (rootExtractor.equals(pathExtractor)) {
                    continue;
                }
                if (MicroformatExtractor.includes(pathExtractor, rootExtractor)) {
                    continue;
                }
                if (!subPath(root.getPath(), path.getPath())) {
                    continue;
                }
                createNestingRelationship(path, root, output, extractionContext);
            }
        }
    }

    /**
     * Opens a consolidation context, optionally writes the domain triples,
     * writes the nesting relationships and closes the context.
     *
     * @param list          resource roots collected from the extractors
     * @param list2         property paths collected from the extractors
     * @param z             whether domain triples must be added for each resource root
     * @param tripleHandler handler on which the context is opened and closed
     * @return the consolidation context that was used
     * @throws ExtractionException if the context cannot be opened or closed, or a triple cannot be written
     */
    private ExtractionContext consolidateResources(List<TagSoupExtractionResult.ResourceRoot> list, List<TagSoupExtractionResult.PropertyPath> list2, boolean z, TripleHandler tripleHandler) throws ExtractionException {
        final ExtractionContext context = createExtractionContext();
        try {
            tripleHandler.openContext(context);
        } catch (TripleHandlerException e) {
            throw new ExtractionException(String.format("Error starting document with URI %s", this.documentURI), e);
        }

        try {
            if (z) {
                // NOTE(review): this helper also closes the context itself, so the
                // context is closed again in the finally block below — confirm that
                // the handlers tolerate a repeated close.
                addDomainTriplesPerResourceRoots(list, context);
            }
            addNestingRelationship(list, list2, context);
        } catch (TripleHandlerException e) {
            // A failed write is reported as such, not as a document-start failure.
            throw new ExtractionException("Error while writing triple triple.", e);
        } finally {
            // The context is closed on every path, including nesting failures.
            try {
                tripleHandler.closeContext(context);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }
        return context;
    }

    /**
     * Opens a consolidation context, optionally writes the domain triples and
     * closes the context. No nesting relationship is written by this variant.
     *
     * @param list          resource roots collected from the extractors
     * @param z             whether domain triples must be added for each resource root
     * @param tripleHandler handler on which the context is opened and closed
     * @return the consolidation context that was used
     * @throws ExtractionException if the context cannot be opened or closed, or a triple cannot be written
     */
    private ExtractionContext consolidateResources(List<TagSoupExtractionResult.ResourceRoot> list, boolean z, TripleHandler tripleHandler) throws ExtractionException {
        final ExtractionContext context = createExtractionContext();
        try {
            tripleHandler.openContext(context);
        } catch (TripleHandlerException openFailure) {
            throw new ExtractionException(String.format("Error starting document with URI %s", this.documentURI), openFailure);
        }

        try {
            if (z) {
                addDomainTriplesPerResourceRoots(list, context);
            }
        } finally {
            // Close on both the normal and the failing path.
            try {
                tripleHandler.closeContext(context);
            } catch (TripleHandlerException closeFailure) {
                throw new ExtractionException("Error while closing context.", closeFailure);
            }
        }
        return context;
    }

    /**
     * Writes two metadata triples about the document: a {@code date} triple
     * carrying the current time and a {@code size} triple carrying the count
     * reported by the {@link CountingTripleHandler} child plus one.
     *
     * @param extractionContext context the triples are written in
     * @throws TripleHandlerException if a triple cannot be written
     */
    private void addExtractionTimeSizeMetaTriples(ExtractionContext extractionContext) throws TripleHandlerException {
        final String timestamp = RDFUtils.toXSDDateTime(new Date());
        output.receiveTriple(
                new URIImpl(documentURI.toString()),
                vSINDICE.getProperty("date"),
                ValueFactoryImpl.getInstance().createLiteral(timestamp),
                null,
                extractionContext
        );

        // Read the count from the counting child; if several exist the last one wins.
        final CompositeTripleHandler composite = (CompositeTripleHandler) output;
        int tripleCount = 0;
        for (TripleHandler child : composite.getChilds()) {
            if (child instanceof CountingTripleHandler) {
                tripleCount = ((CountingTripleHandler) child).getCount();
            }
        }

        final int reportedSize = tripleCount + 1;
        output.receiveTriple(
                new URIImpl(documentURI.toString()),
                vSINDICE.getProperty("size"),
                ValueFactoryImpl.getInstance().createLiteral(reportedSize),
                null,
                extractionContext
        );
    }

    /**
     * Writes the three triples describing a nesting relationship through an
     * intermediate blank node: the original property, the structured value
     * (the path object, or the resource root when the path has no object) and
     * the link from the path subject to the blank node.
     *
     * @param propertyPath      property path under which the resource is nested
     * @param resourceRoot      nested resource root
     * @param tripleHandler     handler receiving the triples
     * @param extractionContext context the triples are written in
     * @throws TripleHandlerException if a triple cannot be written
     */
    private void createNestingRelationship(TagSoupExtractionResult.PropertyPath propertyPath, TagSoupExtractionResult.ResourceRoot resourceRoot, TripleHandler tripleHandler, ExtractionContext extractionContext) throws TripleHandlerException {
        final BNode pathObject = propertyPath.getObject();
        // The blank node id is derived from the property and, when present, the object id.
        final String propertyName = propertyPath.getProperty().stringValue();
        final String objectId = pathObject == null ? "" : pathObject.getID();
        final BNode nestingNode = RDFUtils.getBNode(propertyName + objectId);

        tripleHandler.receiveTriple(nestingNode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), propertyPath.getProperty(), null, extractionContext);
        tripleHandler.receiveTriple(
                nestingNode,
                vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
                propertyPath.getObject() == null ? resourceRoot.getRoot() : propertyPath.getObject(),
                null,
                extractionContext
        );
        tripleHandler.receiveTriple(propertyPath.getSubject(), vSINDICE.getProperty(SINDICE.NESTING), nestingNode, null, extractionContext);
    }
}
