package org.apache.jackrabbit.oak.plugins.tika;

import com.google.common.io.ByteSource;
import com.google.common.io.CountingInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: input_file:org/apache/jackrabbit/oak/plugins/tika/TextExtractor.class */
/**
 * Extracts text from binaries using a pool of worker threads and hands the result to a
 * {@link TextWriter}.
 *
 * <p>Binaries passed to {@link #extract(Iterable)} are placed on a bounded queue which is drained
 * by {@link Extractor} workers. {@link #close()} enqueues a shutdown marker, waits for the workers
 * to finish and logs the accumulated statistics.
 */
public class TextExtractor implements Closeable {
    private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
    private static final Logger parserError = LoggerFactory.getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");

    /** Progress is logged every time the processed count reaches a multiple of this value. */
    private static final int PROGRESS_BATCH_SIZE = 1000;

    /** Default write limit passed to the content handler. */
    private static final int MAX_EXTRACT_LENGTH = 100000;

    /** Sentinel returned by {@link #parseStringValue} to signal that parsing failed. */
    private static final String ERROR_TEXT = "TextExtractionError";

    private final TextWriter textWriter;
    private BlockingQueue<WorkItem> inputQueue;
    private ExecutorService executorService;
    private File tikaConfig;
    private TikaHelper tika;
    private boolean initialized;
    private BinaryStats stats;
    private boolean closed;

    /** Marker item; a worker that takes it puts it back for the other workers and exits. */
    private final WorkItem SHUTDOWN_SIGNAL = new WorkItem(null);

    private int threadPoolSize = Runtime.getRuntime().availableProcessors();
    private int queueSize = 100;
    private final AtomicInteger errorCount = new AtomicInteger();
    private final AtomicLong timeTaken = new AtomicLong();
    private final AtomicInteger extractionCount = new AtomicInteger();
    private final AtomicInteger textWrittenCount = new AtomicInteger();
    private final AtomicInteger parserErrorCount = new AtomicInteger();
    private final AtomicInteger processedCount = new AtomicInteger();
    private final AtomicInteger emptyCount = new AtomicInteger();
    private final AtomicInteger notSupportedCount = new AtomicInteger();
    private final AtomicInteger alreadyExtractedCount = new AtomicInteger();
    private final AtomicLong extractedTextSize = new AtomicLong();
    private final AtomicLong nonEmptyExtractedTextSize = new AtomicLong();
    private final AtomicLong totalSizeRead = new AtomicLong();
    private int maxExtractedLength = MAX_EXTRACT_LENGTH;

    /**
     * Worker that repeatedly takes items from the input queue and extracts their text until it
     * sees the shutdown marker or is interrupted.
     */
    public class Extractor implements Runnable {
        private Extractor() {
        }

        @Override
        public void run() {
            while (true) {
                WorkItem workItem = null;
                try {
                    workItem = inputQueue.take();
                    if (workItem == SHUTDOWN_SIGNAL) {
                        // Put the marker back so the remaining workers also see it.
                        inputQueue.put(SHUTDOWN_SIGNAL);
                        return;
                    }
                    extractText(workItem.source);
                    dumpProgress(processedCount.incrementAndGet());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    // The whole per-item step is inside the try so that a failure on one
                    // binary is counted and logged instead of terminating this worker.
                    errorCount.incrementAndGet();
                    log.warn("Error occurred while processing {}", workItem, e);
                }
            }
        }
    }

    /** Queue entry wrapping the binary to process; the shutdown marker wraps {@code null}. */
    public static class WorkItem {
        final BinaryResource source;

        private WorkItem(BinaryResource binaryResource) {
            this.source = binaryResource;
        }

        @Override
        public String toString() {
            return this.source != null ? this.source.toString() : "<EMPTY>";
        }
    }

    /**
     * Creates an extractor that reports results to the given writer.
     *
     * @param textWriter receiver of extracted text and of the empty/error markers
     */
    public TextExtractor(TextWriter textWriter) {
        this.textWriter = textWriter;
    }

    /**
     * Queues every binary of {@code iterable} for extraction, initializing the queue, Tika helper
     * and worker pool on first use. Blocks while the queue is full.
     *
     * @param iterable binaries to process
     * @throws InterruptedException if interrupted while waiting for queue space
     * @throws IOException if initialization fails
     */
    public void extract(Iterable<BinaryResource> iterable) throws InterruptedException, IOException {
        initialize();
        for (BinaryResource binary : iterable) {
            this.inputQueue.put(new WorkItem(binary));
        }
    }

    /**
     * Signals the workers to stop, waits for them to terminate and logs the statistics.
     * Calling it again after it has completed is a no-op.
     */
    @Override
    public void close() {
        if (this.closed) {
            return;
        }
        // The queue is only created by initialize(); guard against close() on an unused instance.
        if (this.inputQueue != null && !this.inputQueue.isEmpty()) {
            log.info("Shutting down the extractor. Pending task count {}", this.inputQueue.size());
        }
        if (this.executorService != null) {
            try {
                this.inputQueue.put(this.SHUTDOWN_SIGNAL);
                this.executorService.shutdown();
                this.executorService.awaitTermination(10L, TimeUnit.DAYS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
        dumpStats();
        this.closed = true;
    }

    /**
     * Sets the Tika configuration file handed to {@link TikaHelper} during initialization.
     *
     * @param file the configuration file
     */
    public void setTikaConfig(File file) {
        this.tikaConfig = file;
    }

    /**
     * Sets the number of worker threads; read when the pool is created on first extraction.
     *
     * @param i number of threads
     */
    public void setThreadPoolSize(int i) {
        this.threadPoolSize = i;
    }

    /**
     * Sets the optional statistics used to enrich the progress log message.
     *
     * @param binaryStats the statistics, may be {@code null}
     */
    public void setStats(BinaryStats binaryStats) {
        this.stats = binaryStats;
    }

    /** Logs a summary of all counters collected so far. */
    private void dumpStats() {
        StringWriter buffer = new StringWriter();
        PrintWriter out = new PrintWriter(buffer);
        out.println("Text extraction stats");
        out.printf("\t Processed Count           : %d%n", this.processedCount.get());
        out.printf("\t   Extraction Count        : %d%n", this.extractionCount.get());
        out.printf("\t     Empty Count           : %d%n", this.emptyCount.get());
        out.printf("\t     Text Written Count    : %d%n", this.textWrittenCount.get());
        out.printf("\t   Parser Error Count      : %d%n", this.parserErrorCount.get());
        out.printf("\t   Error Count             : %d%n", this.errorCount.get());
        out.printf("\t   Not Supported Count     : %d%n", this.notSupportedCount.get());
        out.printf("\t   Already processed Count : %d%n", this.alreadyExtractedCount.get());
        out.printf("\t Total bytes read          : %s%n", IOUtils.humanReadableByteCount(this.totalSizeRead.get()));
        out.printf("\t Total text extracted      : %s%n", IOUtils.humanReadableByteCount(this.extractedTextSize.get()));
        out.printf("\t   Non empty text          : %s%n", IOUtils.humanReadableByteCount(this.nonEmptyExtractedTextSize.get()));
        out.printf("\t Time taken                : %d sec%n", this.timeTaken.get() / 1000);
        out.close();
        log.info(buffer.toString());
    }

    /**
     * Logs a progress line whenever {@code i} is a multiple of {@link #PROGRESS_BATCH_SIZE}.
     *
     * @param i number of binaries processed so far
     */
    public void dumpProgress(int i) {
        if (i % PROGRESS_BATCH_SIZE != 0) {
            return;
        }
        String statsSummary = "";
        if (this.stats != null) {
            int extracted = this.extractionCount.get();
            long indexedCount = this.stats.getIndexedCount();
            double processedPercent = ((i * 1.0d) / this.stats.getTotalCount()) * 100.0d;
            double extractedPercent = ((extracted * 1.0d) / indexedCount) * 100.0d;
            statsSummary = String.format("(%1.2f%%) (Extraction stats %d/%d %1.2f%%, Ignored count %d)",
                    processedPercent, extracted, indexedCount, extractedPercent, this.notSupportedCount.get());
        }
        log.info("Processed {} {} binaries so far ...", i, statsSummary);
    }

    /**
     * Creates the input queue, the Tika helper and the worker pool on the first call;
     * later calls return immediately.
     *
     * @throws IOException declared by the signature; presumably raised by {@link TikaHelper}
     *         construction, which is not visible here
     */
    private synchronized void initialize() throws IOException {
        if (this.initialized) {
            return;
        }
        this.inputQueue = new ArrayBlockingQueue<>(this.queueSize);
        this.tika = new TikaHelper(this.tikaConfig);
        initializeExecutorService();
        this.initialized = true;
    }

    /**
     * Extracts the text of a single binary and records the outcome with the text writer.
     *
     * <p>Binaries with a {@code null} or unsupported mime type, and blobs the writer reports as
     * already processed, are only counted. Otherwise the blob is marked as error, marked as
     * empty, or its trimmed text is written. Nothing is recorded when parsing yields
     * {@code null}.
     *
     * @param binaryResource the binary to process
     * @throws IOException declared by the signature; presumably propagated from the text writer
     */
    public void extractText(BinaryResource binaryResource) throws IOException {
        String mimeType = binaryResource.getMimeType();
        if (mimeType == null || !this.tika.isSupportedMediaType(mimeType)) {
            log.trace("Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]", binaryResource, mimeType);
            this.notSupportedCount.incrementAndGet();
            return;
        }
        String blobId = binaryResource.getBlobId();
        if (this.textWriter.isProcessed(blobId)) {
            this.alreadyExtractedCount.incrementAndGet();
            return;
        }
        Metadata metadata = new Metadata();
        metadata.set("Content-Type", mimeType);
        if (binaryResource.getEncoding() != null) {
            metadata.set("Content-Encoding", binaryResource.getEncoding());
        }
        String extracted = parseStringValue(binaryResource.getByteSource(), metadata, binaryResource.getPath());
        if (ERROR_TEXT.equals(extracted)) {
            this.textWriter.markError(blobId);
            return;
        }
        if (extracted == null) {
            return;
        }
        String trimmed = extracted.trim();
        if (trimmed.isEmpty()) {
            this.textWriter.markEmpty(blobId);
            this.emptyCount.incrementAndGet();
        } else {
            this.nonEmptyExtractedTextSize.addAndGet(trimmed.length());
            this.textWriter.write(blobId, trimmed);
            this.textWrittenCount.incrementAndGet();
        }
    }

    /** Creates the fixed-size pool and starts one {@link Extractor} per thread. */
    private void initializeExecutorService() {
        this.executorService = Executors.newFixedThreadPool(this.threadPoolSize);
        for (int i = 0; i < this.threadPoolSize; i++) {
            this.executorService.submit(new Extractor());
        }
        log.info("Initialized text extractor pool with {} threads", this.threadPoolSize);
    }

    /**
     * Parses the given bytes with the Tika parser and returns the extracted text.
     *
     * @param byteSource source of the binary content
     * @param metadata metadata passed to the parser
     * @param str path of the binary, used in log messages only
     * @return the extracted text, which is whatever the handler collected before the limit was
     *         hit when the write limit is exceeded; {@link #ERROR_TEXT} when parsing failed;
     *         {@code null} when no bytes were read from the source
     */
    private String parseStringValue(ByteSource byteSource, Metadata metadata, String str) {
        WriteOutContentHandler handler = new WriteOutContentHandler(this.maxExtractedLength);
        long start = System.currentTimeMillis();
        long bytesRead = 0;
        try (CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource))) {
            try {
                this.tika.getParser().parse(stream, handler, metadata, new ParseContext());
            } finally {
                // Record the count even when parsing aborts, so that text truncated by the
                // write limit is still returned instead of being treated as "nothing read".
                bytesRead = stream.getCount();
            }
        } catch (LinkageError e) {
            log.debug("Failed to extract text from a binary property: {}. This often happens when some media types are disabled by configuration. The stack trace is included to flag some 'unintended' failures", str, e);
            this.parserErrorCount.incrementAndGet();
            return ERROR_TEXT;
        } catch (Throwable t) {
            if (!handler.isWriteLimitReached(t)) {
                this.parserErrorCount.incrementAndGet();
                parserError.debug("Failed to extract text from a binary property: " + str + " This is a fairly common case, and nothing to worry about. The stack trace is included to help improve the text extraction feature.", t);
                return ERROR_TEXT;
            }
            parserError.debug("Extracted text size exceeded configured limit({})", this.maxExtractedLength);
        }
        String text = handler.toString();
        this.timeTaken.addAndGet(System.currentTimeMillis() - start);
        if (bytesRead <= 0) {
            return null;
        }
        this.extractedTextSize.addAndGet(text.length());
        this.extractionCount.incrementAndGet();
        this.totalSizeRead.addAndGet(bytesRead);
        return text;
    }
}
