package opennlp.tools.tokenize;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.tokenize.DummyTokenizerFactory;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/* loaded from: input_file:opennlp/tools/tokenize/TokenizerFactoryTest.class */
/**
 * Tests for {@link TokenizerFactory}.
 *
 * <p>Each test trains a {@link TokenizerModel} from the classpath resource
 * {@code /opennlp/tools/tokenize/token.train} and then checks either the
 * factory settings carried by the model (before and after a serialization
 * round trip) or the tokens produced by a {@link TokenizerME} built on it.
 */
public class TokenizerFactoryTest {
    // Locale.forLanguageTag yields locales equal to new Locale("xx") for these
    // codes, without using the constructor deprecated since Java 19.
    private static final Locale LOCALE_DUTCH = Locale.forLanguageTag("nl");
    private static final Locale LOCALE_POLISH = Locale.forLanguageTag("pl");
    private static final Locale LOCALE_PORTUGUESE = Locale.forLanguageTag("pt");
    private static final Locale LOCALE_SPANISH = Locale.forLanguageTag("es");

    /**
     * Opens the shared training data as a stream of token samples.
     *
     * @return a sample stream over the UTF-8 encoded training resource; the
     *         caller is responsible for closing it
     * @throws IOException if the stream cannot be created
     */
    private static ObjectStream<TokenSample> createSampleStream() throws IOException {
        ResourceAsStreamFactory trainingData =
                new ResourceAsStreamFactory(TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
        return new TokenSampleStream(new PlainTextByLineStream(trainingData, StandardCharsets.UTF_8));
    }

    /**
     * Trains a tokenizer model with default training parameters.
     *
     * @param tokenizerFactory the factory configuration to train with
     * @return the trained model
     * @throws IOException if reading the training data fails
     */
    private static TokenizerModel train(TokenizerFactory tokenizerFactory) throws IOException {
        // Close the sample stream once training has consumed it.
        try (ObjectStream<TokenSample> samples = createSampleStream()) {
            return TokenizerME.train(samples, tokenizerFactory, TrainingParameters.defaultParams());
        }
    }

    /**
     * Selects the abbreviation dictionary resource for a locale.
     *
     * @param locale the locale to look up
     * @return the classpath location of the matching dictionary; the English
     *         dictionary for any locale without a dedicated entry
     */
    private static String abbreviationResourceFor(Locale locale) {
        if (locale.equals(LOCALE_DUTCH)) {
            return "opennlp/tools/lang/abb_NL.xml";
        }
        if (locale.equals(Locale.GERMAN)) {
            return "opennlp/tools/lang/abb_DE.xml";
        }
        if (locale.equals(Locale.FRENCH)) {
            return "opennlp/tools/lang/abb_FR.xml";
        }
        if (locale.equals(Locale.ITALIAN)) {
            return "opennlp/tools/lang/abb_IT.xml";
        }
        if (locale.equals(LOCALE_POLISH)) {
            return "opennlp/tools/lang/abb_PL.xml";
        }
        if (locale.equals(LOCALE_PORTUGUESE)) {
            return "opennlp/tools/lang/abb_PT.xml";
        }
        if (locale.equals(LOCALE_SPANISH)) {
            return "opennlp/tools/lang/abb_ES.xml";
        }
        return "opennlp/tools/lang/abb_EN.xml";
    }

    /**
     * Loads the abbreviation dictionary that belongs to a locale.
     *
     * @param locale the locale whose dictionary should be loaded
     * @return the parsed dictionary
     * @throws IOException if the resource is missing or cannot be read
     */
    private static Dictionary loadAbbDictionary(Locale locale) throws IOException {
        String resource = abbreviationResourceFor(locale);
        try (InputStream in = TokenizerFactoryTest.class.getClassLoader().getResourceAsStream(resource)) {
            if (in == null) {
                // Fail with the resource name instead of an obscure error inside the parser.
                throw new IOException("Abbreviation dictionary not found on classpath: " + resource);
            }
            return new Dictionary(in);
        }
    }

    /**
     * Maps a three-letter language code used by the tests to a locale.
     *
     * @param lang the language code, e.g. {@code "deu"}
     * @return the matching locale; {@link Locale#ENGLISH} for any code without
     *         a dedicated mapping (such as {@code "cat"})
     */
    private static Locale localeFor(String lang) {
        if ("dut".equals(lang) || "nld".equals(lang)) {
            return LOCALE_DUTCH;
        }
        if ("deu".equals(lang)) {
            return Locale.GERMAN;
        }
        if ("fra".equals(lang)) {
            return Locale.FRENCH;
        }
        if ("ita".equals(lang)) {
            return Locale.ITALIAN;
        }
        if ("pol".equals(lang)) {
            return LOCALE_POLISH;
        }
        if ("por".equals(lang)) {
            return LOCALE_PORTUGUESE;
        }
        if ("spa".equals(lang)) {
            return LOCALE_SPANISH;
        }
        return Locale.ENGLISH;
    }

    /**
     * Serializes a model to memory and reads it back.
     *
     * @param model the model to serialize
     * @return a new model instance created from the serialized bytes
     * @throws IOException if serialization or deserialization fails
     */
    private static TokenizerModel roundTrip(TokenizerModel model) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        model.serialize(buffer);
        return new TokenizerModel(new ByteArrayInputStream(buffer.toByteArray()));
    }

    /** Default factory with an abbreviation dictionary and no custom pattern. */
    @Test
    void testDefault() throws IOException {
        String lang = "eng";
        TokenizerModel model = train(new TokenizerFactory(lang, loadAbbDictionary(Locale.ENGLISH), false, (Pattern) null));
        String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();

        TokenizerFactory factory = model.getFactory();
        Assertions.assertNotNull(factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertEquals(lang, model.getLanguage());
        Assertions.assertFalse(factory.isUseAlphaNumericOptimization());

        TokenizerModel reloaded = roundTrip(model);
        TokenizerFactory reloadedFactory = reloaded.getFactory();
        Assertions.assertNotNull(reloadedFactory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, reloadedFactory.getContextGenerator());
        Assertions.assertEquals(defaultPattern, reloadedFactory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, reloadedFactory.getLanguageCode());
        // Check the deserialized model, not the original one again.
        Assertions.assertEquals(lang, reloaded.getLanguage());
        Assertions.assertFalse(reloadedFactory.isUseAlphaNumericOptimization());
    }

    /** Factory without an abbreviation dictionary and no custom pattern. */
    @Test
    void testNullDict() throws IOException {
        String lang = "eng";
        TokenizerModel model = train(new TokenizerFactory(lang, (Dictionary) null, false, (Pattern) null));
        String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();

        TokenizerFactory factory = model.getFactory();
        Assertions.assertNull(factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertEquals(lang, model.getLanguage());
        Assertions.assertFalse(factory.isUseAlphaNumericOptimization());

        TokenizerModel reloaded = roundTrip(model);
        TokenizerFactory reloadedFactory = reloaded.getFactory();
        Assertions.assertNull(reloadedFactory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, reloadedFactory.getContextGenerator());
        Assertions.assertEquals(defaultPattern, reloadedFactory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, reloadedFactory.getLanguageCode());
        // Check the deserialized model, not the original one again.
        Assertions.assertEquals(lang, reloaded.getLanguage());
        Assertions.assertFalse(reloadedFactory.isUseAlphaNumericOptimization());
    }

    /** Factory with a custom alphanumeric pattern and the optimization enabled. */
    @Test
    void testCustomPatternAndAlphaOpt() throws IOException {
        String lang = "spa";
        String customPattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$";
        TokenizerModel model = train(new TokenizerFactory(lang, (Dictionary) null, true, Pattern.compile(customPattern)));

        TokenizerFactory factory = model.getFactory();
        Assertions.assertNull(factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(customPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertEquals(lang, model.getLanguage());
        Assertions.assertTrue(factory.isUseAlphaNumericOptimization());

        TokenizerModel reloaded = roundTrip(model);
        TokenizerFactory reloadedFactory = reloaded.getFactory();
        Assertions.assertNull(reloadedFactory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, reloadedFactory.getContextGenerator());
        Assertions.assertEquals(customPattern, reloadedFactory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, reloadedFactory.getLanguageCode());
        // Check the deserialized model, not the original one again.
        Assertions.assertEquals(lang, reloaded.getLanguage());
        Assertions.assertTrue(reloadedFactory.isUseAlphaNumericOptimization());
    }

    /**
     * Trains a tokenizer with a custom alphanumeric pattern and the locale's
     * abbreviation dictionary, tokenizes a sentence and verifies the result.
     *
     * <p>The expected tokens are derived from the sentence itself: apostrophes
     * and commas are detached from the preceding text, the sentence is split on
     * single spaces, and every period is removed from the last chunk. Only as
     * many leading tokens as there are chunks are compared.
     *
     * @param lang the language code passed to the factory and used to pick the dictionary
     * @param langPattern the alphanumeric pattern to compile for the factory
     * @param sentence the text to tokenize
     * @param expectedNumTokens the exact number of tokens the tokenizer must return
     * @throws IOException if training or dictionary loading fails
     */
    void checkCustomPatternForTokenizerME(String lang, String langPattern, String sentence, int expectedNumTokens)
            throws IOException {
        Dictionary abbreviations = loadAbbDictionary(localeFor(lang));
        TokenizerModel model = train(new TokenizerFactory(lang, abbreviations, true, Pattern.compile(langPattern)));
        String[] tokens = new TokenizerME(model).tokenize(sentence);
        Assertions.assertEquals(expectedNumTokens, tokens.length);

        String[] expected = sentence.replace("'", " '").replace(",", " ,").split(" ");
        int last = expected.length - 1;
        for (int idx = 0; idx < expected.length; idx++) {
            // The sentence-final period is a token of its own, so strip it from the last word.
            String want = idx == last ? expected[idx].replace(".", "") : expected[idx];
            Assertions.assertEquals(want, tokens[idx]);
        }
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsDeu() throws IOException {
        checkCustomPatternForTokenizerME("deu", "^[A-Za-z0-9äéöüÄÉÖÜß]+$", "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.", 14);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsDut() throws IOException {
        checkCustomPatternForTokenizerME("dut", "^[A-Za-z0-9äöüëèéïĳÄÖÜËÉÈÏĲ]+$", "Ik kies voor de droom van de botanische monografie die op p. 183 en volgende wordt beschreven.", 18);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsFra() throws IOException {
        checkCustomPatternForTokenizerME("fra", "^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$", "Je choisis le rêve de la monographie botanique communiqué à la p. 205.", 14);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsPol() throws IOException {
        checkCustomPatternForTokenizerME("pol", "^[A-Za-z0-9żźćńółęąśŻŹĆĄŚĘŁÓŃ]+$", "W szkicu autobiograficznym pt. moje życie i psychoanaliza Freud pisze, że jego przodkowie żyli przez wiele lat w Kolonii.", 21);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsPor() throws IOException {
        checkCustomPatternForTokenizerME("por", "^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$", "O povo pernambucano, tradicionalmente inimigo dos imperadores, lembrava-se do tempo em que o Sr. D. Pedro de Alcantara dava-se ao luxo de visitar o norte.", 28);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsSpa() throws IOException {
        checkCustomPatternForTokenizerME("spa", "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$", "Elegiremos el de la monografía botánica expuesto antes del capítulo V en pág. 448 del presente volumen.", 18);
    }

    @Test
    void testCustomPatternForTokenizerMEPor() throws IOException {
        checkCustomPatternForTokenizerME("por", "^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$", "Na floresta mágica a raposa dança com unicórnios felizes.", 10);
    }

    @Test
    void testCustomPatternForTokenizerMESpa() throws IOException {
        checkCustomPatternForTokenizerME("spa", "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$", "En el verano los niños juegan en el parque y sus risas crean alegría.", 15);
    }

    @Test
    void testCustomPatternForTokenizerMECat() throws IOException {
        checkCustomPatternForTokenizerME("cat", "^[0-9a-zàèéíïòóúüçA-ZÀÈÉÍÏÒÓÚÜÇ]+$", "Als xiuxiuejants avets l'os blau neda amb cignes i s'ho passen bé.", 15);
    }

    @Test
    void testCustomPatternForTokenizerMEIta() throws IOException {
        checkCustomPatternForTokenizerME("ita", "^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$", "Cosa fare di domenica per migliorare il tuo lunedì.", 10);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsIta() throws IOException {
        checkCustomPatternForTokenizerME("ita", "^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$", "La chiesa fu costruita fra il 1258 ed il 1308 ca. come chiesa del convento degli Agostiniani.", 18);
    }

    /** Italian contraction and quoted text, tokenized without an abbreviation dictionary. */
    @Test
    void testContractionsIta() throws IOException {
        String sentence = "La contrazione di \"dove è\" è \"dov'è\".";
        Pattern alphaNumeric = Pattern.compile("^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$");
        TokenizerModel model = train(new TokenizerFactory("ita", (Dictionary) null, true, alphaNumeric));
        String[] tokens = new TokenizerME(model).tokenize(sentence);
        Assertions.assertEquals(11, tokens.length);

        // Detach periods and apostrophes, then put a space before every quote
        // that directly follows a non-space character.
        String[] expected = sentence
                .replace(".", " .")
                .replace("'", " '")
                .replaceAll("([^ ])\"", "$1 \"")
                .split(" ");
        for (int idx = 0; idx < expected.length; idx++) {
            Assertions.assertEquals(expected[idx], tokens[idx]);
        }
    }

    /** English contractions, tokenized without an abbreviation dictionary. */
    @Test
    void testContractionsEng() throws IOException {
        String sentence = "The cat wasn't in the house and the dog wasn't either.";
        Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");
        TokenizerModel model = train(new TokenizerFactory("eng", (Dictionary) null, true, alphaNumeric));
        String[] tokens = new TokenizerME(model).tokenize(sentence);
        Assertions.assertEquals(14, tokens.length);

        String[] expected = sentence.replace(".", " .").replace("'", " '").split(" ");
        for (int idx = 0; idx < expected.length; idx++) {
            Assertions.assertEquals(expected[idx], tokens[idx]);
        }
    }

    /** Custom factory subclass, checked before and after a serialization round trip. */
    @Test
    void testDummyFactory() throws IOException {
        String lang = "eng";
        String customPattern = "^[0-9A-Za-z]+$";
        TokenizerModel model = train(new DummyTokenizerFactory(lang, loadAbbDictionary(Locale.ENGLISH), true, Pattern.compile(customPattern)));

        TokenizerFactory factory = model.getFactory();
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyDictionary.class, factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(customPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertEquals(lang, model.getLanguage());
        Assertions.assertTrue(factory.isUseAlphaNumericOptimization());

        TokenizerModel reloaded = roundTrip(model);
        TokenizerFactory reloadedFactory = reloaded.getFactory();
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyDictionary.class, reloadedFactory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyContextGenerator.class, reloadedFactory.getContextGenerator());
        Assertions.assertEquals(customPattern, reloadedFactory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, reloadedFactory.getLanguageCode());
        // Check the deserialized model, not the original one again.
        Assertions.assertEquals(lang, reloaded.getLanguage());
        Assertions.assertTrue(reloadedFactory.isUseAlphaNumericOptimization());
    }

    /** Custom factory subclass created by class name through {@code TokenizerFactory.create}. */
    @Test
    void testCreateDummyFactory() throws IOException {
        String lang = "eng";
        String customPattern = "^[0-9A-Za-z]+$";
        TokenizerFactory factory = TokenizerFactory.create(
                DummyTokenizerFactory.class.getCanonicalName(),
                lang,
                loadAbbDictionary(Locale.ENGLISH),
                true,
                Pattern.compile(customPattern));

        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyDictionary.class, factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(customPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
    }
}
