package org.apache.tika.eval.tokens;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.FastMath;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.util.BytesRef;

/* loaded from: input_file:org/apache/tika/eval/tokens/LuceneTokenCounter.class */
/**
 * Computes per-field token statistics by indexing text into an in-memory
 * Lucene {@link MemoryIndex} and reading the resulting term statistics back.
 *
 * <p>Each call to {@link #add(String, String)} analyzes the supplied text with
 * the analyzer given at construction time and then recomputes the
 * {@link TokenStatistics} for that field. Results are retrieved with
 * {@link #getTokenStatistics(String)}; {@link #clear()} resets both the index
 * and the cached statistics.
 */
public class LuceneTokenCounter {
    private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";
    private final Analyzer generalAnalyzer;
    /** Maximum number of most-frequent tokens retained per field. */
    private int topN = 10;
    /** Most recently computed statistics, keyed by field name. */
    Map<String, TokenStatistics> fieldStats = new HashMap<>();
    private final MemoryIndex memoryIndex = new MemoryIndex();
    // IndexSearcher.getIndexReader() is declared to return IndexReader, so an
    // explicit cast is needed to use the LeafReader term-statistics API.
    private final LeafReader leafReader = (LeafReader) this.memoryIndex.createSearcher().getIndexReader();

    /**
     * Creates a counter that tokenizes all added text with the given analyzer.
     *
     * @param analyzer analyzer passed to the memory index for every added field
     * @throws IOException declared for callers; kept for interface compatibility
     */
    public LuceneTokenCounter(Analyzer analyzer) throws IOException {
        this.generalAnalyzer = analyzer;
    }

    /**
     * Indexes text under a field and recomputes that field's statistics.
     *
     * @param str  field name
     * @param str2 text to analyze and add to the field
     * @throws IOException if reading term statistics from the index fails
     */
    public void add(String str, String str2) throws IOException {
        this.memoryIndex.addField(str, str2, this.generalAnalyzer);
        count(str);
    }

    /**
     * Recomputes the statistics for one field from the current index contents
     * and stores them in {@link #fieldStats}.
     *
     * @param str field name
     * @throws IOException              if the index cannot be read
     * @throws IllegalArgumentException if the field's total token count, or
     *                                  any single term's frequency, exceeds
     *                                  {@link Integer#MAX_VALUE}
     */
    void count(String str) throws IOException {
        long totalTokensLong = this.leafReader.getSumTotalTermFreq(str);
        if (totalTokensLong > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("can't handle longs");
        }
        int totalTokens = (int) totalTokensLong;
        int uniqueTokens = 0;
        SummaryStatistics lengthStats = new SummaryStatistics();
        double entropy = 0.0d;

        Terms terms = this.leafReader.terms(str);
        if (terms == null) {
            // The field has no terms: record empty statistics.
            this.fieldStats.put(str,
                    new TokenStatistics(uniqueTokens, totalTokens, new TokenIntPair[0], entropy, lengthStats));
            return;
        }

        TermsEnum termsEnum = terms.iterator();
        TokenCountPriorityQueue topTokens = new TokenCountPriorityQueue(this.topN);
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            long termFreqLong = termsEnum.totalTermFreq();
            if (termFreqLong > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("Sorry can't handle longs yet");
            }
            int termFreq = (int) termFreqLong;
            String token = term.utf8ToString();

            // Token length is measured in code points, and is added once per
            // occurrence so the length statistics are weighted by frequency.
            int tokenLength = token.codePointCount(0, token.length());
            for (int n = 0; n < termFreq; n++) {
                lengthStats.addValue(tokenLength);
            }

            // Must be a floating-point division: an integer division would
            // truncate every proportion below 1 to 0 and yield NaN below.
            double proportion = (double) termFreq / (double) totalTokensLong;
            entropy += proportion * FastMath.log(2.0d, proportion);

            // Only offer candidates that can make it into the top-N queue.
            if (topTokens.top() == null
                    || topTokens.size() < this.topN
                    || termFreq >= ((TokenIntPair) topTokens.top()).getValue()) {
                topTokens.insertWithOverflow(new TokenIntPair(token, termFreq));
            }
            uniqueTokens++;
        }

        if (totalTokens > 0) {
            // Negate the accumulated sum and scale it by the total token count.
            entropy = ((-1.0d) / totalTokens) * entropy;
        }
        this.fieldStats.put(str,
                new TokenStatistics(uniqueTokens, totalTokens, topTokens.getArray(), entropy, lengthStats));
    }

    /**
     * Sets how many of the most frequent tokens are kept per field. Applies to
     * statistics computed after this call.
     *
     * @param i number of top tokens to retain
     */
    public void setTopN(int i) {
        this.topN = i;
    }

    /**
     * Returns the statistics last computed for a field.
     *
     * @param str field name
     * @return the stored statistics, or {@code null} if none were computed for
     *         the field since construction or the last {@link #clear()}
     */
    public TokenStatistics getTokenStatistics(String str) {
        return this.fieldStats.get(str);
    }

    /**
     * Returns the raw Lucene terms for a field from the underlying index.
     *
     * @param str field name
     * @return the terms as reported by the index reader; {@link #count(String)}
     *         treats a {@code null} result as a field with no terms
     * @throws IOException if the index cannot be read
     */
    public Terms getTerms(String str) throws IOException {
        return this.leafReader.terms(str);
    }

    /** Resets the in-memory index and discards all cached field statistics. */
    public void clear() {
        this.memoryIndex.reset();
        this.fieldStats.clear();
    }
}
