package com.microsoft.semantickernel;

import com.microsoft.semantickernel.settings.GPT3Settings;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import reactor.util.function.Tuple2;
import reactor.util.function.Tuples;

/* loaded from: input_file:com/microsoft/semantickernel/GPT3Tokenizer.class */
/**
 * Byte-pair-encoding tokenizer that turns text into the integer token ids defined by
 * {@link GPT3Settings#encoder}.
 *
 * <p>Encoding happens in four steps:
 * <ol>
 *   <li>the input is split into chunks by {@link #encodingRegex};</li>
 *   <li>each chunk is converted to UTF-8 and every byte is replaced by the character found at
 *       that index in {@link #bytesToUnicode};</li>
 *   <li>adjacent symbols of the chunk are merged repeatedly, always picking the pair with the
 *       lowest rank in {@link GPT3Settings#bpeRanks}, until no ranked pair is left;</li>
 *   <li>every resulting symbol is looked up in {@link GPT3Settings#encoder}.</li>
 * </ol>
 *
 * <p>Merge results are memoised per chunk in a bounded cache. The cache is wrapped in
 * {@link Collections#synchronizedMap(Map)}, so the static {@code encode} methods may be called
 * from several threads.
 */
public class GPT3Tokenizer {
    /** System property that overrides the maximum number of cached chunks. */
    private static final String MAX_TOKENIZER_CACHE_SIZE_KEY = "MAX_TOKENIZER_CACHE_SIZE";

    /** Cache bound used when {@link #MAX_TOKENIZER_CACHE_SIZE_KEY} is not set. */
    private static final String MAX_TOKENIZER_CACHE_SIZE_DEFAULT = "100000";

    /** Rank assigned to a symbol pair that has no entry in {@link GPT3Settings#bpeRanks}. */
    private static final long UNRANKED_PAIR = 100000000000L;

    /** Maps an unsigned byte value (the array index, 0-255) to its stand-in character. */
    private static final char[] bytesToUnicode = buildBytesToUnicode();

    /**
     * Splits text into chunks: the suffixes {@code 's 't 're 've 'm 'll 'd}, runs of letters,
     * runs of digits and runs of other non-whitespace characters (each optionally preceded by
     * one space), and runs of whitespace.
     *
     * <p>NOTE(review): the pattern is compiled without {@link Pattern#UNICODE_CHARACTER_CLASS},
     * so {@code \s} and {@code \S} only recognise ASCII whitespace. Confirm that this matches
     * the reference tokenizer for inputs containing e.g. U+00A0 or U+3000.
     */
    private static final Pattern encodingRegex = Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+");

    /** Memoised merge results, keyed by the byte-mapped chunk. Values are unmodifiable. */
    private static final Map<String, List<String>> bpeCache = createBpeCache();

    /**
     * Encodes a string into token ids.
     *
     * @param str the text to encode; may be {@code null}
     * @return a new mutable list of token ids in input order; empty when {@code str} is
     *     {@code null} or empty. An element is {@code null} if a merged symbol has no entry in
     *     {@link GPT3Settings#encoder} (presumably this cannot happen with a complete
     *     vocabulary - verify against the settings data).
     */
    public static List<Integer> encode(String str) {
        List<Integer> tokenIds = new ArrayList<>();
        if (str == null || str.isEmpty()) {
            return tokenIds;
        }
        Matcher matcher = encodingRegex.matcher(str);
        while (matcher.find()) {
            for (String symbol : bytePairEncoding(toByteLevelString(matcher.group()))) {
                tokenIds.add(GPT3Settings.encoder.get(symbol));
            }
        }
        return tokenIds;
    }

    /**
     * Encodes the current content of a {@link StringBuilder}.
     *
     * @param sb the text to encode; may be {@code null}
     * @return the token ids, or an immutable empty list when {@code sb} is {@code null}
     */
    public static List<Integer> encode(StringBuilder sb) {
        if (sb == null) {
            return Collections.emptyList();
        }
        return encode(sb.toString());
    }

    /**
     * Encodes a character array.
     *
     * @param cArr the text to encode; may be {@code null}
     * @return the token ids, or an immutable empty list when {@code cArr} is {@code null}
     */
    public static List<Integer> encode(char[] cArr) {
        if (cArr == null) {
            return Collections.emptyList();
        }
        return encode(new String(cArr));
    }

    /**
     * Encodes a sequence of characters.
     *
     * @param iterable the characters to encode, in iteration order; may be {@code null}
     * @return the token ids, or an immutable empty list when {@code iterable} is {@code null}
     */
    public static List<Integer> encode(Iterable<Character> iterable) {
        if (iterable == null) {
            return Collections.emptyList();
        }
        StringBuilder text = new StringBuilder();
        // Deliberately iterates boxed values: StringBuilder.append(Object) is what the
        // method has always used, so a null element is appended as the text "null".
        for (Character ch : iterable) {
            text.append(ch);
        }
        return encode(text.toString());
    }

    /**
     * Converts a chunk to UTF-8 and replaces every byte with its {@link #bytesToUnicode}
     * character, producing a string with exactly one char per encoded byte.
     */
    private static String toByteLevelString(String chunk) {
        byte[] utf8 = chunk.getBytes(StandardCharsets.UTF_8);
        char[] mapped = new char[utf8.length];
        for (int i = 0; i < utf8.length; i++) {
            // Mask to read the signed Java byte as an unsigned table index.
            mapped[i] = bytesToUnicode[utf8[i] & 0xFF];
        }
        return new String(mapped);
    }

    /**
     * Returns the merged symbols for one byte-mapped chunk, consulting and filling the cache.
     *
     * @param str the byte-mapped chunk
     * @return an unmodifiable list of symbols whose concatenation equals {@code str}
     */
    private static List<String> bytePairEncoding(String str) {
        List<String> cached = bpeCache.get(str);
        if (cached != null) {
            return cached;
        }
        List<String> symbols = str.length() <= 1
                ? Collections.singletonList(str)
                : Collections.unmodifiableList(mergeRankedPairs(str));
        // Two threads may race to compute the same chunk; both store an equal result.
        bpeCache.put(str, symbols);
        return symbols;
    }

    /**
     * Starts from one symbol per char and repeatedly merges the adjacent pair with the lowest
     * rank in {@link GPT3Settings#bpeRanks} until fewer than two symbols remain or no adjacent
     * pair is ranked.
     */
    private static List<String> mergeRankedPairs(String chunk) {
        List<String> symbols = new ArrayList<>(chunk.length());
        for (char c : chunk.toCharArray()) {
            symbols.add(String.valueOf(c));
        }

        while (symbols.size() >= 2) {
            Tuple2<String, String> bestPair = null;
            long bestRank = Long.MAX_VALUE;
            for (int i = 0; i < symbols.size() - 1; i++) {
                Tuple2<String, String> pair = Tuples.of(symbols.get(i), symbols.get(i + 1));
                Integer rank = GPT3Settings.bpeRanks.get(pair);
                long effectiveRank = rank != null ? rank : UNRANKED_PAIR;
                // "<=" keeps the last of several equally ranked candidates.
                if (effectiveRank <= bestRank) {
                    bestRank = effectiveRank;
                    bestPair = pair;
                }
            }
            if (bestPair == null || bestRank == UNRANKED_PAIR) {
                // None of the adjacent pairs is ranked: nothing left to merge.
                break;
            }

            String first = bestPair.getT1();
            String second = bestPair.getT2();
            List<String> merged = new ArrayList<>(symbols.size());
            int i = 0;
            while (i < symbols.size()) {
                boolean startsBestPair = i < symbols.size() - 1
                        && symbols.get(i).equals(first)
                        && symbols.get(i + 1).equals(second);
                if (startsBestPair) {
                    merged.add(first + second);
                    i += 2;
                } else {
                    merged.add(symbols.get(i));
                    i++;
                }
            }
            symbols = merged;
        }
        return symbols;
    }

    /**
     * Builds the byte-to-character table. Bytes {@code '!'..'~'}, {@code 0xA1..0xAC} and
     * {@code 0xAE..0xFF} map to the character with the same code; every other byte, in
     * ascending order, gets the next unused code starting at 256.
     */
    private static char[] buildBytesToUnicode() {
        char[] table = new char[256];
        int nextSubstitute = 256;
        for (int b = 0; b < table.length; b++) {
            boolean keepsOwnCode = (b >= '!' && b <= '~')
                    || (b >= 0xA1 && b <= 0xAC)
                    || (b >= 0xAE && b <= 0xFF);
            table[b] = (char) (keepsOwnCode ? b : nextSubstitute++);
        }
        return table;
    }

    /**
     * Creates the merge cache: an insertion-ordered map that drops its eldest entry whenever
     * its size exceeds the configured maximum, wrapped for synchronized access.
     *
     * <p>The maximum is read once from the {@value #MAX_TOKENIZER_CACHE_SIZE_KEY} system
     * property (default {@value #MAX_TOKENIZER_CACHE_SIZE_DEFAULT}). A value that is not a
     * valid integer makes {@link Integer#parseInt(String)} throw during class initialisation.
     */
    private static Map<String, List<String>> createBpeCache() {
        final int maxEntries = Integer.parseInt(System.getProperty(MAX_TOKENIZER_CACHE_SIZE_KEY, MAX_TOKENIZER_CACHE_SIZE_DEFAULT));
        return Collections.synchronizedMap(new LinkedHashMap<String, List<String>>() {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, List<String>> eldest) {
                return size() > maxEntries;
            }
        });
    }
}
