package org.apache.tika.parser.txt;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.poi.util.IOUtils;
import org.apache.tika.parser.txt.CharsetRecog_2022;
import org.apache.tika.parser.txt.CharsetRecog_Unicode;
import org.apache.tika.parser.txt.CharsetRecog_mbcs;
import org.apache.tika.parser.txt.CharsetRecog_sbcs;

/* loaded from: input_file:tika-parsers-1.28.2.jar:org/apache/tika/parser/txt/CharsetDetector.class */
public class CharsetDetector {
    /** Maximum confidence value; detectAll() clamps reported confidences to this same value (100). */
    private static final int MAX_CONFIDENCE = 100;
    /** Buffer size, in bytes, used by the no-argument constructor. */
    static final int DEFAULT_MARK_LIMIT = 12000;
    /** Every known recognizer, in the order fixed by the static initializer; wrapped as an unmodifiable list. */
    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
    /** Working buffer of kBufSize bytes that MungeInput() fills from the raw input. */
    final byte[] fInputBytes;
    /** Number of valid bytes in fInputBytes, as set by MungeInput(). */
    int fInputLen;
    /** Occurrence count for each byte value (index 0-255) among the first fInputLen bytes of fInputBytes. */
    short[] fByteStats;
    /** True when at least one byte value in the range 0x80-0x9F occurs in the working buffer. */
    boolean fC1Bytes;
    /** Optional encoding hint; detectAll() boosts a recognizer whose name equals it ignoring case. May be null. */
    String fDeclaredEncoding;
    /** Byte array most recently handed to setText. */
    byte[] fRawInput;
    /** Number of bytes of fRawInput that MungeInput() is allowed to read. */
    int fRawLength;
    /** Stream most recently handed to setText(InputStream). */
    InputStream fInputStream;
    /** When true, MungeInput() drops bytes located between '<' and '>' before analysis. */
    private boolean fStripTags;
    /** Per-recognizer enabled flags indexed like ALL_CS_RECOGNIZERS; while null, each recognizer's default applies. */
    private boolean[] fEnabledRecognizers;
    /** Size of the working buffer; also the mark limit requested on input streams. */
    private final int kBufSize;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:tika-parsers-1.28.2.jar:org/apache/tika/parser/txt/CharsetDetector$CSRecognizerInfo.class */
    /**
     * Pairs a {@link CharsetRecognizer} with a flag telling whether that
     * recognizer is enabled by default.
     */
    public static class CSRecognizerInfo {
        /** The recognizer described by this entry. */
        CharsetRecognizer recognizer;
        /** Whether the recognizer is enabled when no explicit override exists. */
        boolean isDefaultEnabled;

        /**
         * @param recognizer       the recognizer to register
         * @param isDefaultEnabled whether it is enabled by default
         */
        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
            this.recognizer = recognizer;
            this.isDefaultEnabled = isDefaultEnabled;
        }
    }

    /**
     * Creates a detector whose working buffer holds {@link #DEFAULT_MARK_LIMIT} bytes.
     */
    public CharsetDetector() {
        this(DEFAULT_MARK_LIMIT);
    }

    /**
     * Creates a detector with a caller-chosen buffer size.
     *
     * @param i size in bytes of the working buffer; the same value is used as
     *          the mark limit when text is supplied as a stream
     */
    public CharsetDetector(int i) {
        kBufSize = i;
        fInputBytes = new byte[i];
        // One counter per possible byte value.
        fByteStats = new short[256];
        fStripTags = false;
        fC1Bytes = false;
    }

    /**
     * Lists the name reported by every registered recognizer, in registration order.
     *
     * @return a new array with one entry per recognizer
     */
    public static String[] getAllDetectableCharsets() {
        final String[] names = new String[ALL_CS_RECOGNIZERS.size()];
        int next = 0;
        for (CSRecognizerInfo info : ALL_CS_RECOGNIZERS) {
            names[next++] = info.recognizer.getName();
        }
        return names;
    }

    /**
     * Records an encoding hint that {@link #detectAll()} uses to favour the
     * matching recognizer.
     *
     * <p>The name is resolved through {@link Charset#forName(String)}, so an
     * illegal or unsupported name makes that call throw
     * {@code IllegalCharsetNameException} or {@code UnsupportedCharsetException}.
     * A {@code null} or empty name leaves the current hint untouched.
     *
     * @param str the declared encoding name
     * @return this detector, for call chaining
     */
    public CharsetDetector setDeclaredEncoding(String str) {
        setCanonicalDeclaredEncoding(str);
        return this;
    }

    /**
     * Supplies the text to analyse as a byte array; the whole array is considered.
     *
     * @param bArr the raw bytes; must not be {@code null}
     * @return this detector, for call chaining
     */
    public CharsetDetector setText(byte[] bArr) {
        final int length = bArr.length;
        return setText(bArr, length);
    }

    /**
     * Stores the raw input and its usable length, then rebuilds the working
     * buffer and byte statistics via {@link #MungeInput()}.
     *
     * @param rawBytes  the raw input bytes
     * @param rawLength how many leading bytes of {@code rawBytes} to use
     * @return this detector, for call chaining
     */
    private CharsetDetector setText(byte[] rawBytes, int rawLength) {
        fRawInput = rawBytes;
        fRawLength = rawLength;
        MungeInput();
        return this;
    }

    /**
     * Supplies the text to analyse as a stream.
     *
     * <p>The stream is marked with the buffer size as limit, up to one buffer of
     * bytes is read from it, and the stream is reset in a {@code finally}
     * block, whether or not reading succeeded.
     *
     * @param inputStream the stream to sample; {@code mark} and {@code reset} are called on it
     * @return this detector, for call chaining
     * @throws IOException if reading or resetting the stream fails
     */
    public CharsetDetector setText(InputStream inputStream) throws IOException {
        fInputStream = inputStream;
        fInputStream.mark(kBufSize);
        // A new buffer is allocated on every call.
        final byte[] buffer = new byte[kBufSize];
        try {
            final long bytesRead = IOUtils.readFully(fInputStream, buffer);
            if (bytesRead >= Integer.MAX_VALUE) {
                throw new IOException("Can't have read > Integer.MAX_VALUE bytes");
            }
            if (bytesRead < 1) {
                // Nothing was read: analyse an empty input.
                return setText(new byte[0]);
            }
            if (bytesRead < kBufSize) {
                // Partially filled buffer: only the bytes actually read count.
                return setText(buffer, (int) bytesRead);
            }
            return setText(buffer);
        } finally {
            fInputStream.reset();
        }
    }

    /**
     * Returns the first element of {@link #detectAll()}, which is ordered best match first.
     *
     * @return the best match, or {@code null} when no recognizer matched
     */
    public CharsetMatch detect() {
        final CharsetMatch[] matches = detectAll();
        final boolean hasMatch = matches != null && matches.length != 0;
        return hasMatch ? matches[0] : null;
    }

    /**
     * Runs every registered recognizer against the current input and collects
     * those reporting a confidence above zero.
     *
     * <p>Each confidence is masked to its low 8 bits and capped at
     * {@code MAX_CONFIDENCE}. When the declared encoding equals the
     * recognizer's name (ignoring case), half of the remaining distance to
     * {@code MAX_CONFIDENCE} is added.
     *
     * @return the matches, sorted with {@code Collections.sort} and then reversed;
     *         never {@code null}, possibly empty
     */
    public CharsetMatch[] detectAll() {
        final List<CharsetMatch> matches = new ArrayList<CharsetMatch>();
        for (CSRecognizerInfo info : ALL_CS_RECOGNIZERS) {
            final CharsetRecognizer recognizer = info.recognizer;
            final CharsetMatch candidate = recognizer.match(this);
            if (candidate == null) {
                continue;
            }
            int confidence = candidate.getConfidence() & 0xFF;
            if (confidence <= 0) {
                continue;
            }
            confidence = Math.min(confidence, MAX_CONFIDENCE);
            if (fDeclaredEncoding != null && fDeclaredEncoding.equalsIgnoreCase(recognizer.getName())) {
                confidence += (MAX_CONFIDENCE - confidence) / 2;
            }
            matches.add(new CharsetMatch(this, recognizer, confidence, candidate.getName(), candidate.getLanguage()));
        }
        // Sort-then-reverse is kept deliberately: it also reverses the relative
        // order of equally ranked matches, which a single descending sort would not.
        Collections.sort(matches);
        Collections.reverse(matches);
        return matches.toArray(new CharsetMatch[matches.size()]);
    }

    /**
     * Detects the charset of a stream and returns a reader for it.
     *
     * <p>The declared encoding is stored exactly as given, without being
     * canonicalised.
     *
     * @param inputStream the stream to analyse
     * @param str         the declared encoding hint, may be {@code null}
     * @return the reader supplied by the best match, or {@code null} when
     *         nothing matched or an {@link IOException} occurred
     */
    public Reader getReader(InputStream inputStream, String str) {
        fDeclaredEncoding = str;
        try {
            setText(inputStream);
            final CharsetMatch best = detect();
            return best != null ? best.getReader() : null;
        } catch (IOException ignored) {
            // I/O failures are reported to the caller as a null result.
            return null;
        }
    }

    /**
     * Detects the charset of a byte array and returns the decoded text.
     *
     * <p>The declared encoding is stored exactly as given, without being
     * canonicalised.
     *
     * @param bArr the bytes to analyse
     * @param str  the declared encoding hint, may be {@code null}
     * @return the string obtained from the best match via {@code getString(-1)},
     *         or {@code null} when nothing matched or an {@link IOException} occurred
     */
    public String getString(byte[] bArr, String str) {
        fDeclaredEncoding = str;
        try {
            setText(bArr);
            final CharsetMatch best = detect();
            return best != null ? best.getString(-1) : null;
        } catch (IOException ignored) {
            // I/O failures are reported to the caller as a null result.
            return null;
        }
    }

    /**
     * Reports whether markup stripping is switched on.
     *
     * @return the current value of the tag-stripping flag
     */
    public boolean inputFilterEnabled() {
        return fStripTags;
    }

    /**
     * Switches markup stripping on or off.
     *
     * <p>Only the flag is changed here; it is read by {@code MungeInput()},
     * which the {@code setText} methods invoke.
     *
     * @param z the new value of the tag-stripping flag
     * @return the value the flag had before this call
     */
    public boolean enableInputFilter(boolean z) {
        final boolean previous = fStripTags;
        fStripTags = z;
        return previous;
    }

    /**
     * Stores the canonical name of the given encoding as the declared-encoding hint.
     *
     * <p>A {@code null} or empty name is ignored. Any other name is resolved
     * with {@link Charset#forName(String)}; exceptions thrown by that call for
     * illegal or unsupported names propagate to the caller.
     *
     * @param encoding the encoding name as supplied by the caller
     */
    private void setCanonicalDeclaredEncoding(String encoding) {
        if (encoding == null || encoding.isEmpty()) {
            return;
        }
        final Charset charset = Charset.forName(encoding);
        // Defensive check retained from the original code.
        if (charset != null) {
            fDeclaredEncoding = charset.name();
        }
    }

    /**
     * Rebuilds the working buffer, the per-byte statistics and the C1 flag
     * from the raw input.
     *
     * <p>Steps:
     * <ol>
     *   <li>If tag stripping is on, copy the raw bytes into the working buffer
     *       while skipping everything from a {@code '<'} up to and including
     *       the next {@code '>'}. Every {@code '<'} is counted, as is every
     *       {@code '<'} met while already inside markup.</li>
     *   <li>Fall back to a plain copy of the raw input, limited to the buffer
     *       size, when fewer than five {@code '<'} were seen, when a fifth of
     *       the {@code '<'} count (integer division) is smaller than the
     *       nested-{@code '<'} count, or when fewer than 100 bytes remain
     *       although the raw input is longer than 600 bytes. With stripping
     *       off, no {@code '<'} is counted, so this copy always runs.</li>
     *   <li>Count the occurrences of each byte value in the working buffer.</li>
     *   <li>Set the C1 flag when any value in 0x80-0x9F occurred.</li>
     * </ol>
     */
    private void MungeInput() {
        int openTags = 0;
        int nestedOpens = 0;

        if (fStripTags) {
            int written = 0;
            boolean insideMarkup = false;
            for (int read = 0; read < fRawLength && written < fInputBytes.length; read++) {
                final byte current = fRawInput[read];
                if (current == '<') {
                    if (insideMarkup) {
                        nestedOpens++;
                    }
                    insideMarkup = true;
                    openTags++;
                }
                if (!insideMarkup) {
                    fInputBytes[written++] = current;
                }
                if (current == '>') {
                    insideMarkup = false;
                }
            }
            fInputLen = written;
        }

        final boolean useRawInput = openTags < 5
                || openTags / 5 < nestedOpens
                || (fInputLen < 100 && fRawLength > 600);
        if (useRawInput) {
            final int limit = fRawLength > kBufSize ? kBufSize : fRawLength;
            int copied = 0;
            for (; copied < limit; copied++) {
                fInputBytes[copied] = fRawInput[copied];
            }
            fInputLen = copied;
        }

        Arrays.fill(fByteStats, (short) 0);
        for (int pos = 0; pos < fInputLen; pos++) {
            fByteStats[fInputBytes[pos] & 0xFF]++;
        }

        fC1Bytes = false;
        for (int value = 0x80; value <= 0x9F; value++) {
            if (fByteStats[value] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }

    /**
     * Lists the names of the recognizers that are currently enabled.
     *
     * <p>While no per-instance override table exists, each recognizer's default
     * flag decides; otherwise the override table does.
     *
     * @return a new array with the names of the enabled recognizers, in registration order
     */
    @Deprecated
    public String[] getDetectableCharsets() {
        final List<String> enabledNames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
        for (int idx = 0; idx < ALL_CS_RECOGNIZERS.size(); idx++) {
            final CSRecognizerInfo info = ALL_CS_RECOGNIZERS.get(idx);
            final boolean enabled;
            if (fEnabledRecognizers == null) {
                enabled = info.isDefaultEnabled;
            } else {
                enabled = fEnabledRecognizers[idx];
            }
            if (enabled) {
                enabledNames.add(info.recognizer.getName());
            }
        }
        return enabledNames.toArray(new String[enabledNames.size()]);
    }

    /**
     * Enables or disables the recognizer with the given name for this detector instance.
     *
     * <p>The per-instance override table is created lazily: it is allocated,
     * and seeded with every recognizer's default flag, only when the table does
     * not exist yet and the requested state differs from the recognizer's default.
     *
     * @param str the recognizer name, compared with {@code equals} against each
     *            recognizer's {@code getName()}; the first match is used
     * @param z   {@code true} to enable the recognizer, {@code false} to disable it
     * @return this detector, for call chaining
     * @throws IllegalArgumentException if no recognizer has the given name
     */
    @Deprecated
    public CharsetDetector setDetectableCharset(String str, boolean z) {
        int matchIndex = -1;
        boolean matchesDefault = false;
        for (int idx = 0; idx < ALL_CS_RECOGNIZERS.size(); idx++) {
            final CSRecognizerInfo info = ALL_CS_RECOGNIZERS.get(idx);
            if (info.recognizer.getName().equals(str)) {
                matchIndex = idx;
                matchesDefault = info.isDefaultEnabled == z;
                // Stop at the first match. The previous loop neither advanced nor
                // exited once a name matched, so it never terminated for valid names.
                break;
            }
        }
        if (matchIndex < 0) {
            // The closing quote is written as a literal; it is the same one-character
            // string the unrelated PDFBox operator constant used before.
            throw new IllegalArgumentException("Invalid encoding: \"" + str + "\"");
        }
        if (fEnabledRecognizers == null && !matchesDefault) {
            // First deviation from the defaults: materialise the override table.
            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
            for (int idx = 0; idx < ALL_CS_RECOGNIZERS.size(); idx++) {
                fEnabledRecognizers[idx] = ALL_CS_RECOGNIZERS.get(idx).isDefaultEnabled;
            }
        }
        if (fEnabledRecognizers != null) {
            fEnabledRecognizers[matchIndex] = z;
        }
        return this;
    }

    static {
        ArrayList arrayList = new ArrayList();
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_gb_18030(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), true));
        arrayList.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru(), true));
        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(arrayList);
    }
}
