package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeoutException;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.guava.common.collect.Lists;
import org.apache.jackrabbit.guava.common.io.CountingInputStream;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditor;
import org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditorContext;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.jackrabbit.oak.stats.StatisticsProvider;
import org.apache.jackrabbit.oak.stats.StatsOptions;
import org.apache.jackrabbit.oak.stats.TimerStats;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

/* JADX WARN: Classes with same name are omitted:
  input_file:oak-search-1.62.0.jar:org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.class
 */
/* loaded from: input_file:org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.class */
public class FulltextBinaryTextExtractor {
    /** Name of the timer metric under which each text extraction run is timed. */
    private static final String TEXT_EXTRACTION_TIMER_METRIC_NAME = "TEXT_EXTRACTION_TIME";

    private static final Logger log = LoggerFactory.getLogger(FulltextBinaryTextExtractor.class);

    // Keep this below 'log': createDefaultParser() writes to the logger while this initializer runs.
    private static final Parser defaultParser = createDefaultParser();

    /**
     * Size threshold in bytes, read from the system property "oak.search.smallBinary"
     * (default 16384). Binaries longer than this are parsed through
     * ExtractedTextCache.process(...); all others are parsed inline by the caller.
     */
    private static final long SMALL_BINARY = Long.getLong("oak.search.smallBinary", 16L * 1024L);

    /** Accumulates per-extraction statistics until {@link #done(boolean)} is called. */
    private final TextExtractionStats textExtractionStats = new TextExtractionStats();

    private final ExtractedTextCache extractedTextCache;

    private final IndexDefinition definition;

    /** Forwarded to every ExtractedTextCache.get(...) lookup. */
    private final boolean reindex;

    // The four fields below are initialised lazily on first use; they are not synchronised.
    private Parser parser;

    private TikaConfigHolder tikaConfig;

    private Set<MediaType> supportedMediaTypes;

    private Set<MediaType> nonIndexedMediaType;

    /* JADX INFO: Access modifiers changed from: private */
    /* JADX WARN: Classes with same name are omitted:
      input_file:oak-search-1.62.0.jar:org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor$TikaConfigHolder.class
     */
    /* loaded from: input_file:org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor$TikaConfigHolder.class */
    /**
     * Immutable pairing of a Tika configuration with a description of where it was loaded from.
     * The description is also the holder's string form, so the holder can be logged directly.
     */
    public static final class TikaConfigHolder {
        /** The Tika configuration carried by this holder. */
        final TikaConfig config;

        /** Description of the configuration's origin; returned by {@link #toString()}. */
        final String sourceInfo;

        /**
         * @param tikaConfig the configuration to hold
         * @param str        description of where the configuration came from
         */
        public TikaConfigHolder(TikaConfig tikaConfig, String str) {
            config = tikaConfig;
            sourceInfo = str;
        }

        /** @return the source description supplied at construction time */
        @Override
        public String toString() {
            return sourceInfo;
        }
    }

    /**
     * Creates an extractor bound to one index definition.
     *
     * @param extractedTextCache cache consulted before, and updated after, every extraction
     * @param indexDefinition    supplies the Tika configuration, MIME type mapping and extract limit
     * @param z                  stored as the reindex flag and passed to each cache lookup
     */
    public FulltextBinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition indexDefinition, boolean z) {
        this.reindex = z;
        this.definition = indexDefinition;
        this.extractedTextCache = extractedTextCache;
    }

    /**
     * Logs the accumulated text extraction statistics and then hands them to the
     * extracted-text cache.
     *
     * @param z forwarded unchanged to {@code TextExtractionStats.log(boolean)}
     */
    public void done(boolean z) {
        final TextExtractionStats stats = textExtractionStats;
        stats.log(z);
        stats.collectStats(extractedTextCache);
    }

    /**
     * Extracts text from every binary value of the given property.
     *
     * <p>The node's {@code jcr:mimeType} is first mapped through the index definition. If the
     * mapped type is {@code null} or not supported, nothing is extracted and an empty list is
     * returned. For a {@code jcr:data} property the node's {@code jcr:encoding}, when present, is
     * passed to the parser as the content encoding.
     *
     * @param propertyState the binary property to extract text from
     * @param nodeState     the node holding the property; read for its MIME type and encoding
     * @param str           identifier of the node, used in log messages and cache lookups
     * @return a mutable list with one entry per binary value that yielded a non-null result
     */
    public List<String> newBinary(PropertyState propertyState, NodeState nodeState, String str) {
        List<String> extractedTexts = new ArrayList<>();
        String mimeType = definition.getTikaMappedMimeType(nodeState.getString("jcr:mimeType"));
        if (mimeType == null || !isSupportedMediaType(mimeType)) {
            log.trace("[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]",
                    getIndexName(), str, mimeType);
            return extractedTexts;
        }

        Metadata metadata = new Metadata();
        metadata.set("Content-Type", mimeType);
        String propertyName = propertyState.getName();
        if ("jcr:data".equals(propertyName)) {
            String encoding = nodeState.getString("jcr:encoding");
            if (encoding != null) {
                metadata.set("Content-Encoding", encoding);
            }
        }

        // The same Metadata instance is shared by all values of the property, as before.
        for (Blob blob : propertyState.getValue(Type.BINARIES)) {
            String text = parseStringValue(blob, metadata, str, propertyName);
            if (text != null) {
                extractedTexts.add(text);
            }
        }
        return extractedTexts;
    }

    /**
     * Returns the text for a single binary value, taking it from the cache when available and
     * running the extraction otherwise.
     *
     * <p>When the cache exposes a statistics provider, the extraction is timed under
     * {@link #TEXT_EXTRACTION_TIMER_METRIC_NAME}. The timer is stopped in a {@code finally} block
     * so that it is also stopped when the extraction throws.
     *
     * @param blob         the binary value
     * @param metadata     Tika metadata passed to the parser
     * @param nodePath     identifier of the node, used for the cache lookup and in log messages
     * @param propertyName name of the property holding the binary, used for the cache lookup
     * @return the cached or freshly extracted text
     */
    private String parseStringValue(Blob blob, Metadata metadata, String nodePath, String propertyName) {
        String cached = extractedTextCache.get(nodePath, propertyName, blob, reindex);
        if (cached != null) {
            return cached;
        }

        StatisticsProvider statisticsProvider = extractedTextCache.getStatisticsProvider();
        if (statisticsProvider == null) {
            return parseStringValue0(blob, metadata, nodePath);
        }

        TimerStats.Context timing =
                statisticsProvider.getTimer(TEXT_EXTRACTION_TIMER_METRIC_NAME, StatsOptions.METRICS_ONLY).time();
        try {
            return parseStringValue0(blob, metadata, nodePath);
        } finally {
            timing.stop();
        }
    }

    /**
     * Runs the Tika parser over one binary and stores the outcome in the extracted-text cache.
     *
     * <p>Binaries longer than {@link #SMALL_BINARY} bytes are parsed through
     * {@code ExtractedTextCache.process(...)}; smaller ones are parsed inline. Outcomes:
     * <ul>
     *   <li>parse completes, or fails only because the write limit was reached: the text
     *       collected so far is cached as a success and returned;</li>
     *   <li>{@link LinkageError}, {@link TimeoutException} or any other failure: an error marker
     *       is cached and {@code FulltextIndexEditor.TEXT_EXTRACTION_ERROR} is returned. A timeout
     *       is additionally recorded through {@code putTimeout}.</li>
     * </ul>
     *
     * @param blob     the binary to parse
     * @param metadata Tika metadata passed to the parser
     * @param nodePath identifier of the node, used in log messages
     * @return the extracted text, or the error marker string
     */
    private String parseStringValue0(Blob blob, final Metadata metadata, String nodePath) {
        final WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
        long start = System.currentTimeMillis();
        long bytesRead = 0;
        long length = blob.length();
        if (log.isDebugEnabled()) {
            log.debug("Extracting {}, {} bytes, id {}", nodePath, length, blob.getContentIdentity());
        }
        try {
            final CountingInputStream stream =
                    new CountingInputStream(new LazyInputStream(() -> blob.getNewStream()));
            try {
                if (length > SMALL_BINARY) {
                    String taskName = "Extracting " + nodePath + ", " + length + " bytes";
                    extractedTextCache.process(taskName, new Callable<Void>() {
                        @Override
                        public Void call() throws Exception {
                            getParser().parse(stream, handler, metadata, new ParseContext());
                            return null;
                        }
                    });
                } else {
                    getParser().parse(stream, handler, metadata, new ParseContext());
                }
            } finally {
                // Capture the byte count on the failure path too: a parse aborted by the write
                // limit is still reported as a success below and must contribute to the stats.
                bytesRead = stream.getCount();
                stream.close();
            }
        } catch (LinkageError e) {
            String indexName = getIndexName();
            // Short message at INFO, full stack trace only at DEBUG.
            log.info("[{}] Failed to extract text from a binary property: {}. This often happens when the media types is disabled by configuration.", indexName, nodePath);
            log.debug("[{}] Failed to extract text from a binary property: {}. This often happens when the media types is disabled by configuration.", indexName, nodePath, e);
            extractedTextCache.put(blob, ExtractedText.ERROR);
            return FulltextIndexEditor.TEXT_EXTRACTION_ERROR;
        } catch (TimeoutException e) {
            log.warn("[{}] Failed to extract text from a binary property due to timeout: {}.", getIndexName(), nodePath);
            extractedTextCache.put(blob, ExtractedText.ERROR);
            extractedTextCache.putTimeout(blob, ExtractedText.ERROR);
            return FulltextIndexEditor.TEXT_EXTRACTION_ERROR;
        } catch (Throwable t) {
            if (!handler.isWriteLimitReached(t)) {
                String indexName = getIndexName();
                log.info("[{}] Failed to extract text from a binary property: {}. This is quite common, and usually nothing to worry about.", indexName, nodePath);
                log.debug("[{}] Failed to extract text from a binary property: {}. This is quite common, and usually nothing to worry about.", indexName, nodePath, t);
                extractedTextCache.put(blob, ExtractedText.ERROR);
                return FulltextIndexEditor.TEXT_EXTRACTION_ERROR;
            }
            // Hitting the write limit is not an error: keep the text collected so far.
            log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
        }

        String text = handler.toString();
        if (bytesRead > 0) {
            long elapsedMillis = System.currentTimeMillis() - start;
            int textLength = text.length();
            recordTextExtractionStats(elapsedMillis, bytesRead, textLength);
            if (log.isDebugEnabled()) {
                log.debug("Extracting {} took {} ms, {} bytes read, {} text size", nodePath, elapsedMillis, bytesRead, textLength);
            }
        }
        extractedTextCache.put(blob, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, text));
        return text;
    }

    /**
     * Adds one extraction's figures to the running statistics.
     *
     * @param timeInMillis  elapsed extraction time in milliseconds
     * @param bytesRead     number of bytes read from the binary
     * @param extractedSize length of the extracted text, in characters
     */
    private void recordTextExtractionStats(long timeInMillis, long bytesRead, int extractedSize) {
        textExtractionStats.addStats(timeInMillis, bytesRead, extractedSize);
    }

    /** @return the name of the index this extractor works for, as reported by its definition */
    private String getIndexName() {
        final IndexDefinition indexDefinition = definition;
        return indexDefinition.getIndexName();
    }

    /**
     * Returns the Tika configuration for this index, loading it on first use and reusing the
     * loaded holder afterwards. The lazy initialisation is not synchronised.
     *
     * @return the Tika configuration
     */
    public TikaConfig getTikaConfig() {
        TikaConfigHolder holder = tikaConfig;
        if (holder == null) {
            holder = initializeTikaConfig(definition);
            tikaConfig = holder;
        }
        return holder.config;
    }

    /**
     * Returns the parser for this index, creating it on first use and reusing it afterwards.
     * The lazy initialisation is not synchronised.
     */
    private Parser getParser() {
        Parser current = parser;
        if (current == null) {
            current = initializeTikaParser(definition);
            parser = current;
        }
        return current;
    }

    /**
     * Tells whether binaries of the given media type should have their text extracted: the
     * parser must support the type and the Tika configuration must not list it as non-indexed.
     * Both type sets are computed on the first call and reused afterwards.
     *
     * @param type the media type string to check
     * @return {@code true} if the type is supported and not excluded
     */
    private boolean isSupportedMediaType(String type) {
        if (supportedMediaTypes == null) {
            supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
            nonIndexedMediaType = getNonIndexedMediaTypes();
        }
        final MediaType mediaType = MediaType.parse(type);
        if (!supportedMediaTypes.contains(mediaType)) {
            return false;
        }
        return !nonIndexedMediaType.contains(mediaType);
    }

    /**
     * Reads the media types that the Tika configuration marks as non-indexed.
     *
     * <p>The configuration is taken from the index definition when it has a custom one, otherwise
     * from the bundled {@code tika-config.xml} resource. If no configuration stream is available,
     * or reading or parsing it fails, a warning is logged (on failure) and an empty set is
     * returned. The stream is always closed before returning.
     *
     * @return the non-indexed media types, or an empty set when none could be determined
     */
    private Set<MediaType> getNonIndexedMediaTypes() {
        InputStream configStream = null;
        String configSource = null;
        try {
            if (definition.hasCustomTikaConfig()) {
                configSource = String.format("Custom config at %s", definition.getIndexPath());
                configStream = definition.getTikaConfig();
            } else {
                URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
                configSource = "Default : tika-config.xml";
                if (configUrl != null) {
                    configStream = configUrl.openStream();
                }
            }
            // Parse inside the try so that a failure is caught below and the (closed) stream
            // is never handed to the parser afterwards.
            if (configStream != null) {
                return TikaParserConfig.getNonIndexedMediaTypes(configStream);
            }
        } catch (TikaException | IOException | SAXException e) {
            log.warn("Tika configuration not available : " + configSource, e);
        } finally {
            IOUtils.closeQuietly(configStream);
        }
        return Collections.emptySet();
    }

    /**
     * Loads the Tika configuration for an index definition.
     *
     * <p>The custom configuration of the definition is used when it has one; otherwise the
     * bundled {@code tika-config.xml} resource is used. While loading, the thread's context class
     * loader is switched to the one of {@code FulltextIndexEditorContext} and restored afterwards.
     * If no configuration stream is available, or reading or parsing it fails (a warning is
     * logged), Tika's default configuration is returned instead.
     *
     * @param indexDefinition the index definition, or {@code null} to load the bundled default
     * @return a holder with the loaded configuration and a description of its source
     */
    private static TikaConfigHolder initializeTikaConfig(@Nullable IndexDefinition indexDefinition) {
        ClassLoader originalLoader = Thread.currentThread().getContextClassLoader();
        InputStream configStream = null;
        String configSource = null;
        try {
            Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
            if (indexDefinition != null && indexDefinition.hasCustomTikaConfig()) {
                log.debug("[{}] Using custom tika config", indexDefinition.getIndexName());
                configSource = "Custom config at " + indexDefinition.getIndexPath();
                configStream = indexDefinition.getTikaConfig();
            } else {
                URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
                if (configUrl != null) {
                    configSource = configUrl.toString();
                    configStream = configUrl.openStream();
                }
            }
            // Build the config inside the try so that a failure is caught below and the
            // (closed) stream is never parsed afterwards.
            if (configStream != null) {
                return new TikaConfigHolder(new TikaConfig(configStream), configSource);
            }
        } catch (TikaException | IOException | SAXException e) {
            log.warn("Tika configuration not available : " + configSource, e);
        } finally {
            IOUtils.closeQuietly(configStream);
            Thread.currentThread().setContextClassLoader(originalLoader);
        }
        // Reached when no stream was found or loading failed; runs with the original class loader.
        return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
    }

    /**
     * Chooses the parser for an index definition: the shared default parser when the definition
     * has no custom Tika configuration, otherwise a new parser built from that configuration.
     * The custom parser is created with the context class loader of
     * {@code FulltextIndexEditorContext}; the caller's class loader is restored in all cases.
     *
     * @param indexDefinition the index definition to build a parser for
     * @return the parser to use for this index
     */
    private Parser initializeTikaParser(IndexDefinition indexDefinition) {
        final Thread thread = Thread.currentThread();
        final ClassLoader originalLoader = thread.getContextClassLoader();
        try {
            if (indexDefinition.hasCustomTikaConfig()) {
                thread.setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
                return new AutoDetectParser(getTikaConfig());
            }
            return defaultParser;
        } finally {
            thread.setContextClassLoader(originalLoader);
        }
    }

    /**
     * Builds the shared default parser from the bundled Tika configuration.
     *
     * <p>The parser is created with the context class loader of
     * {@code FulltextIndexEditorContext}. If anything fails, a warning is logged and a parser with
     * Tika's built-in defaults is created instead, after the caller's class loader has been
     * restored.
     *
     * @return the default parser
     */
    private static AutoDetectParser createDefaultParser() {
        final Thread thread = Thread.currentThread();
        final ClassLoader originalLoader = thread.getContextClassLoader();
        TikaConfigHolder holder = null;
        try {
            holder = initializeTikaConfig(null);
            thread.setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
            log.info("Loaded default Tika Config from classpath {}", holder);
            return new AutoDetectParser(holder.config);
        } catch (Exception e) {
            log.warn("Tika configuration not available : " + holder, e);
        } finally {
            thread.setContextClassLoader(originalLoader);
        }
        // Deliberately outside the try: the fallback is built with the original class loader.
        return new AutoDetectParser();
    }
}
