package org.apache.jackrabbit.oak.plugins.tika;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.jackrabbit.guava.common.collect.FluentIterable;
import org.apache.jackrabbit.guava.common.collect.Lists;
import org.apache.jackrabbit.guava.common.collect.Sets;
import org.apache.jackrabbit.guava.common.io.ByteSource;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory;
import org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer;
import org.apache.jackrabbit.oak.plugins.tika.TextPopulator;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.jetbrains.annotations.NotNull;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

/* loaded from: input_file:org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.class */
public class TextPopulatorTest {

    /** JUnit rule used by {@link #setup()} to create the index folder and the CSV file. */
    @Rule
    public TemporaryFolder temporaryFolder = new TemporaryFolder();

    /** Folder that receives the Lucene index; assigned in {@link #setup()}. */
    private File indexDir;

    /** CSV file listing the binaries under test; assigned in {@link #setup()}. */
    private File csv;

    /** In-memory writer that records what the populator hands over. */
    private final FakeTextWriter textWriter = new FakeTextWriter();

    /** Counters inspected by the assertions; attached to the populator in {@link #setup()}. */
    private final TextPopulator.PopulatorStats stats = new TextPopulator.PopulatorStats();

    /** Object under test, wired to the fake writer above. */
    private final TextPopulator textPopulator = new TextPopulator(textWriter);

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest$FakeBinaryResourceProvider.class */
    /**
     * Test double for {@link BinaryResourceProvider} that serves a fixed list of
     * binaries, one per string handed to the constructor.
     */
    public static class FakeBinaryResourceProvider implements BinaryResourceProvider {
        /** Binaries built by the constructor, in the order the strings were given. */
        private final List<BinaryResource> binaries = Lists.newArrayList();

        /**
         * Creates one binary with empty content for every given string. The string
         * itself and the id derived by {@link #getBlobId(String)} are passed as the
         * last two constructor arguments of {@link BinaryResource}.
         */
        FakeBinaryResourceProvider(String... strArr) {
            for (int i = 0; i < strArr.length; i++) {
                String path = strArr[i];
                String blobId = getBlobId(path);
                StringByteSource emptyContent = new StringByteSource("");
                BinaryResource resource = new BinaryResource(emptyContent, (String) null, (String) null, path, blobId);
                this.binaries.add(resource);
            }
        }

        /**
         * Derives the fake blob id for a string: the string, a colon, and the string again.
         */
        static String getBlobId(String str) {
            return str + ":" + str;
        }

        /**
         * Returns an iterable over all registered binaries; the argument is ignored.
         */
        public FluentIterable<BinaryResource> getBinaries(String str) {
            final List<BinaryResource> source = this.binaries;
            return new FluentIterable<BinaryResource>() {
                @NotNull
                public Iterator<BinaryResource> iterator() {
                    // A fresh iterator per call, so the iterable can be traversed repeatedly.
                    return source.iterator();
                }
            };
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest$FakeTextWriter.class */
    /**
     * In-memory {@link TextWriter} used by the tests. It remembers every id it was
     * asked to write, mark empty or mark as error, and keeps the text of written ids.
     */
    public static class FakeTextWriter implements TextWriter {
        /** Ids seen by {@code write}, {@code markEmpty} or {@code markError}. */
        final Set<String> processed = Sets.newHashSet();

        /** Text stored per id; only {@code write} adds entries here. */
        final Map<String, String> data = new HashMap<>();

        private FakeTextWriter() {
        }

        /** Records the id as processed and stores its text. */
        public void write(@NotNull String str, @NotNull String str2) {
            this.processed.add(str);
            this.data.put(str, str2);
        }

        /** Records the id as processed without storing any text. */
        public void markEmpty(String str) {
            this.processed.add(str);
        }

        /** Records the id as processed without storing any text. */
        public void markError(String str) {
            this.processed.add(str);
        }

        /** Tells whether the id was seen by any of the three recording methods. */
        public boolean isProcessed(String str) {
            return this.processed.contains(str);
        }
    }

    /* loaded from: input_file:org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest$StringByteSource.class */
    /** {@link ByteSource} whose content is the UTF-8 encoding of a fixed string. */
    private static class StringByteSource extends ByteSource {
        /** Text served by {@link #openStream()}. */
        private final String data;

        StringByteSource(String str) {
            this.data = str;
        }

        /** Opens a new in-memory stream over the UTF-8 bytes of the text. */
        public InputStream openStream() {
            byte[] utf8 = this.data.getBytes(StandardCharsets.UTF_8);
            return new ByteArrayInputStream(utf8);
        }
    }

    /**
     * Prepares the fixtures for a test: creates the index folder and the CSV file
     * inside the temporary folder, attaches the stats object to the populator and
     * fills the index with the sample documents.
     */
    @Before
    public void setup() throws Exception {
        indexDir = temporaryFolder.newFolder("index-dump");
        csv = temporaryFolder.newFile("blobs.csv");

        textPopulator.setStats(stats);

        setupIndexData();
    }

    /**
     * Writes the sample Lucene index into {@code indexDir}: one document per entry of
     * the path-to-text table below, plus a {@code /multi} document that carries two
     * full-text values.
     *
     * <p>The directory and the writer are both managed by try-with-resources, so
     * both are closed even when adding a document fails.
     *
     * @throws Exception if the index cannot be opened or written
     */
    private void setupIndexData() throws Exception {
        // HashMap is required here: "/null" deliberately maps to a null value.
        Map<String, String> textByPath = new HashMap<>();
        textByPath.put("/sentence", "some sentence.");
        textByPath.put("/para", "some sentence.\nAnd more sentence after a new line");
        textByPath.put("/error", "TextExtractionError");
        textByPath.put("/null", null);
        textByPath.put("/empty", "");
        textByPath.put("/untrimmed-empty", " ");
        textByPath.put("/untrimmed", " untrimmed ");

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new OakAnalyzer(Version.LUCENE_47));
        // Resources close in reverse order: the writer first, then the directory.
        try (FSDirectory directory = FSDirectory.open(this.indexDir);
             IndexWriter writer = new IndexWriter(directory, config)) {
            for (Map.Entry<String, String> entry : textByPath.entrySet()) {
                // A null value becomes a one-element varargs array and is skipped by createLuceneDocument.
                writer.addDocument(createLuceneDocument(entry.getKey(), entry.getValue()));
            }
            writer.addDocument(createLuceneDocument("/multi", "value1", "value2"));
        }
    }

    /**
     * Writes the CSV file for the given strings by feeding the binaries of a
     * {@link FakeBinaryResourceProvider} to a {@link CSVFileGenerator}.
     *
     * @throws IOException if generating the CSV fails
     */
    private void setupCSV(String... strArr) throws IOException {
        CSVFileGenerator generator = new CSVFileGenerator(csv);
        FakeBinaryResourceProvider provider = new FakeBinaryResourceProvider(strArr);
        generator.generate(provider.getBinaries("/"));
    }

    /**
     * Builds the field list of one Lucene document: a full-text field for every
     * non-null value, followed by the path field.
     *
     * @param str    value for the path field
     * @param strArr full-text values; {@code null} elements are skipped
     * @return the fields, full-text fields first and the path field last
     */
    private List<Field> createLuceneDocument(@NotNull String str, String... strArr) {
        List<Field> fields = Lists.newArrayList();
        for (String text : strArr) {
            if (text != null) {
                fields.add(FieldFactory.newFulltextField(text, true));
            }
        }
        fields.add(FieldFactory.newPathField(str));
        return fields;
    }

    /**
     * Two binaries with indexed text: the first run must process both, and a
     * repeated run must ignore both.
     */
    @Test
    public void simpleTest() throws Exception {
        setupCSV("/sentence", "/para");

        // First pass: everything is new.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Incorrect binaries processed", 2L, stats.processed);

        // Second pass: nothing is left to do.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated call for already processed stuff shouldn't process anything more", 2L, stats.ignored);

        assertConsistentStatsAndWriter();
        assertStatsInvariants();
    }

    /**
     * Indexed text with surrounding blanks must reach the writer trimmed.
     */
    @Test
    public void untrimmedText() throws Exception {
        setupCSV("/untrimmed");

        textPopulator.populate(csv, indexDir);

        String blobId = FakeBinaryResourceProvider.getBlobId("/untrimmed");
        String storedText = textWriter.data.get(blobId);
        Assert.assertEquals("Store generation didn't trim data", "untrimmed", storedText);

        assertConsistentStatsAndWriter();
        assertStatsInvariants();
    }

    /**
     * A document whose indexed text is the extraction-error marker must be counted
     * as errored once, and be ignored on a repeated run.
     */
    @Test
    public void indexedError() throws Exception {
        setupCSV("/error");

        // First pass: reported as an error.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Indexed data reporting errored extraction not marked as error", 1L, stats.errored);

        // Second pass: already handled, so ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for indexed error shouldn't get processed again", 1L, stats.ignored);

        assertConsistentStatsAndWriter();
        assertStatsInvariants();
    }

    /**
     * A document indexed with an empty string must be counted as empty once, and be
     * ignored on a repeated run.
     */
    @Test
    public void indexedEmpty() throws Exception {
        setupCSV("/empty");

        // First pass: reported as empty.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Indexed data for empty extraction not marked as empty", 1L, stats.empty);

        // Second pass: already handled, so ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for empty extraction shouldn't get processed again", 1L, stats.ignored);

        assertConsistentStatsAndWriter();
        assertStatsInvariants();
    }

    /**
     * A document indexed with a single blank must be counted as empty once, and be
     * ignored on a repeated run.
     */
    @Test
    public void indexedUntrimmedEmpty() throws Exception {
        setupCSV("/untrimmed-empty");

        // First pass: reported as empty.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Indexed data for untrimmed empty extraction not marked as empty", 1L, stats.empty);

        // Second pass: already handled, so ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for untrimmed empty extraction shouldn't get processed again", 1L, stats.ignored);

        assertConsistentStatsAndWriter();
        assertStatsInvariants();
    }

    /**
     * A document with two full-text values must be counted as errored, and a
     * repeated run must not ignore it.
     */
    @Test
    public void multiFTField() throws Exception {
        setupCSV("/multi");

        // First pass: reported as an error.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Multi FT field in a doc not marked as error", 1L, stats.errored);

        // Second pass: tried again rather than ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for multi FT error should get processed again", 0L, stats.ignored);

        assertStatsInvariants();
    }

    /**
     * A document that has a path but no full-text field must be counted as errored,
     * and a repeated run must not ignore it.
     */
    @Test
    public void indexHasDocumentButNotData() throws Exception {
        setupCSV("/null");

        // First pass: reported as an error.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("No FT field in a doc not marked as error", 1L, stats.errored);

        // Second pass: tried again rather than ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for no FT error should get processed again", 0L, stats.ignored);

        assertStatsInvariants();
    }

    /**
     * A CSV entry with no matching document in the index must be counted as errored,
     * and a repeated run must not ignore it.
     */
    @Test
    public void indexDoesNotHaveDocument() throws Exception {
        setupCSV("/somethingRandom");

        // First pass: reported as an error.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("No indexed doc not marked as error", 1L, stats.errored);

        // Second pass: tried again rather than ignored.
        textPopulator.populate(csv, indexDir);
        Assert.assertEquals("Repeated run for no indexed doc error should get processed again", 0L, stats.ignored);

        assertStatsInvariants();
    }

    /**
     * Asserts that the number of ids recorded by the fake text writer equals the
     * {@code processed} counter of the stats.
     */
    private void assertConsistentStatsAndWriter() {
        Assert.assertEquals("Number of blobs processed by text writer is not the same as reported in stats", this.textWriter.processed.size(), this.stats.processed);
    }

    /**
     * Asserts the two bookkeeping identities of the stats:
     * {@code read == processed + ignored} and
     * {@code processed == empty + errored + parsed}.
     */
    private void assertStatsInvariants() {
        // Every entry read is either processed or ignored.
        String readMessage = String.format("Read (%s) != Processed (%s) + Ignored (%s)",
                stats.read, stats.processed, stats.ignored);
        boolean readBalanced = stats.read == stats.processed + stats.ignored;
        Assert.assertTrue(readMessage, readBalanced);

        // Every processed entry ends up empty, errored or parsed.
        String processedMessage = String.format("Processed (%s) != Empty (%s) + Errored (%s) + Parsed (%s)",
                stats.processed, stats.empty, stats.errored, stats.parsed);
        boolean processedBalanced = stats.processed == (stats.empty + stats.errored) + stats.parsed;
        Assert.assertTrue(processedMessage, processedBalanced);
    }
}
