/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.tokenize.DefaultTokenContextGenerator;
import opennlp.tools.tokenize.DummyTokenizerFactory;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link TokenizerFactory}.
 *
 * <p>The tests train a {@link TokenizerModel} from the {@code token.train} classpath resource
 * and verify that the factory settings (abbreviation dictionary, context generator,
 * alphanumeric pattern, language code and the alphanumeric-optimization flag) are exposed by
 * the trained model and are still present after the model has been serialized and read back.
 * Further tests tokenize sample sentences with language-specific alphanumeric patterns and
 * abbreviation dictionaries.
 */
public class TokenizerFactoryTest {
    private static final Locale LOCALE_DUTCH = Locale.forLanguageTag("nl");
    private static final Locale LOCALE_POLISH = Locale.forLanguageTag("pl");
    private static final Locale LOCALE_PORTUGUESE = Locale.forLanguageTag("pt");
    private static final Locale LOCALE_SPANISH = Locale.forLanguageTag("es");

    // Alphanumeric patterns that are shared by more than one test.
    private static final String PATTERN_DEU = "^[A-Za-z0-9\u00e4\u00e9\u00f6\u00fc\u00c4\u00c9\u00d6\u00dc\u00df]+$";
    private static final String PATTERN_DUT = "^[A-Za-z0-9\u00e4\u00f6\u00fc\u00eb\u00e8\u00e9\u00ef\u0133\u00c4\u00d6\u00dc\u00cb\u00c9\u00c8\u00cf\u0132]+$";
    private static final String PATTERN_ITA = "^[0-9a-z\u00e0\u00e8\u00e9\u00ec\u00ee\u00ed\u00f2\u00f3\u00f9\u00fcA-Z\u00c0\u00c8\u00c9\u00cc\u00ce\u00cd\u00d2\u00d3\u00d9\u00dc]+$";
    private static final String PATTERN_POR = "^[0-9a-z\u00e1\u00e3\u00e2\u00e0\u00e9\u00ea\u00ed\u00f3\u00f5\u00f4\u00fa\u00fc\u00e7A-Z\u00c1\u00c3\u00c2\u00c0\u00c9\u00ca\u00cd\u00d3\u00d5\u00d4\u00da\u00dc\u00c7]+$";
    private static final String PATTERN_SPA = "^[0-9a-z\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00fd\u00f1A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00dd\u00d1]+$";

    /**
     * Opens the {@code /opennlp/tools/tokenize/token.train} classpath resource as a stream of
     * {@link TokenSample}s, reading it line by line as UTF-8.
     */
    private static ObjectStream<TokenSample> createSampleStream() throws IOException {
        InputStreamFactory in = new ResourceAsStreamFactory(TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
        return new TokenSampleStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
    }

    /**
     * Trains a tokenizer model on the sample stream with the given factory and the default
     * training parameters.
     */
    private static TokenizerModel train(TokenizerFactory factory) throws IOException {
        return TokenizerME.train(createSampleStream(), factory, TrainingParameters.defaultParams());
    }

    /**
     * Returns the classpath location of the abbreviation dictionary for {@code loc}.
     * Any locale without a dedicated dictionary falls back to the English one.
     */
    private static String abbreviationResourceFor(Locale loc) {
        if (loc.equals(LOCALE_DUTCH)) {
            return "opennlp/tools/lang/abb_NL.xml";
        }
        if (loc.equals(Locale.GERMAN)) {
            return "opennlp/tools/lang/abb_DE.xml";
        }
        if (loc.equals(Locale.FRENCH)) {
            return "opennlp/tools/lang/abb_FR.xml";
        }
        if (loc.equals(Locale.ITALIAN)) {
            return "opennlp/tools/lang/abb_IT.xml";
        }
        if (loc.equals(LOCALE_POLISH)) {
            return "opennlp/tools/lang/abb_PL.xml";
        }
        if (loc.equals(LOCALE_PORTUGUESE)) {
            return "opennlp/tools/lang/abb_PT.xml";
        }
        if (loc.equals(LOCALE_SPANISH)) {
            return "opennlp/tools/lang/abb_ES.xml";
        }
        return "opennlp/tools/lang/abb_EN.xml";
    }

    /**
     * Loads the abbreviation dictionary for {@code loc} from the classpath.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    private static Dictionary loadAbbDictionary(Locale loc) throws IOException {
        String abbrevDict = abbreviationResourceFor(loc);
        // The stream is closed here; previously it was handed to Dictionary and never closed.
        try (InputStream in = TokenizerFactoryTest.class.getClassLoader().getResourceAsStream(abbrevDict)) {
            if (in == null) {
                throw new IOException("Abbreviation dictionary not found on classpath: " + abbrevDict);
            }
            return new Dictionary(in);
        }
    }

    /**
     * Maps the language codes used by the tests to the locale whose abbreviation dictionary
     * should be loaded. Unknown (or {@code null}) codes map to {@link Locale#ENGLISH}.
     */
    private static Locale localeFor(String lang) {
        if (lang == null) {
            return Locale.ENGLISH;
        }
        switch (lang) {
            case "dut":
            case "nld":
                return LOCALE_DUTCH;
            case "deu":
                return Locale.GERMAN;
            case "fra":
                return Locale.FRENCH;
            case "ita":
                return Locale.ITALIAN;
            case "pol":
                return LOCALE_POLISH;
            case "por":
                return LOCALE_PORTUGUESE;
            case "spa":
                return LOCALE_SPANISH;
            default:
                return Locale.ENGLISH;
        }
    }

    /**
     * Serializes {@code model} into memory and reads it back as a new model instance.
     */
    private static TokenizerModel roundTrip(TokenizerModel model) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        model.serialize(out);
        return new TokenizerModel(new ByteArrayInputStream(out.toByteArray()));
    }

    /**
     * Asserts the state of a model trained with a plain {@link TokenizerFactory}.
     *
     * @param model            the model under inspection (freshly trained or deserialized)
     * @param lang             expected language code of both the factory and the model
     * @param expectDictionary whether the factory must expose an abbreviation dictionary
     * @param expectedPattern  expected alphanumeric pattern string
     * @param expectAlphaOpt   expected value of the alphanumeric-optimization flag
     */
    private static void assertStandardFactory(TokenizerModel model, String lang, boolean expectDictionary,
                                              String expectedPattern, boolean expectAlphaOpt) {
        TokenizerFactory factory = model.getFactory();
        if (expectDictionary) {
            Assertions.assertNotNull(factory.getAbbreviationDictionary());
        } else {
            Assertions.assertNull(factory.getAbbreviationDictionary());
        }
        Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(expectedPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertEquals(lang, model.getLanguage());
        if (expectAlphaOpt) {
            Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
        } else {
            Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
        }
    }

    /**
     * Asserts that {@code factory} exposes the dummy dictionary and context generator of
     * {@link DummyTokenizerFactory}, the given pattern and language, and has the
     * alphanumeric optimization enabled.
     */
    private static void assertDummyFactory(TokenizerFactory factory, String lang, String expectedPattern) {
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyDictionary.class, factory.getAbbreviationDictionary());
        Assertions.assertInstanceOf(DummyTokenizerFactory.DummyContextGenerator.class, factory.getContextGenerator());
        Assertions.assertEquals(expectedPattern, factory.getAlphaNumericPattern().pattern());
        Assertions.assertEquals(lang, factory.getLanguageCode());
        Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
    }

    @Test
    void testDefault() throws IOException {
        String lang = "eng";
        Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
        TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
        String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();

        assertStandardFactory(model, lang, true, defaultPattern, false);
        // The deserialized model itself is checked, including its language.
        assertStandardFactory(roundTrip(model), lang, true, defaultPattern, false);
    }

    @Test
    void testNullDict() throws IOException {
        String lang = "eng";
        Dictionary dic = null;
        TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
        String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();

        assertStandardFactory(model, lang, false, defaultPattern, false);
        assertStandardFactory(roundTrip(model), lang, false, defaultPattern, false);
    }

    @Test
    void testCustomPatternAndAlphaOpt() throws IOException {
        String lang = "spa";
        Dictionary dic = null;
        TokenizerModel model = train(new TokenizerFactory(lang, dic, true, Pattern.compile(PATTERN_SPA)));

        assertStandardFactory(model, lang, false, PATTERN_SPA, true);
        assertStandardFactory(roundTrip(model), lang, false, PATTERN_SPA, true);
    }

    /**
     * Trains a tokenizer for {@code lang} with the matching abbreviation dictionary (English
     * for languages without a dedicated one), the alphanumeric optimization enabled and the
     * given pattern, then tokenizes {@code sentence} and checks the result.
     *
     * <p>The expected tokens are derived from the sentence itself: a space is inserted before
     * every apostrophe and comma, the text is split on single spaces, and any period is
     * removed from the last element. Only as many tokens as there are split elements are
     * compared, so a separate final token is covered by the count check alone.
     *
     * @param lang              language code passed to the factory
     * @param pattern           alphanumeric pattern passed to the factory
     * @param sentence          text to tokenize
     * @param expectedNumTokens expected number of tokens produced by the tokenizer
     */
    void checkCustomPatternForTokenizerME(String lang, String pattern, String sentence, int expectedNumTokens) throws IOException {
        Dictionary abbreviations = loadAbbDictionary(localeFor(lang));
        TokenizerModel model = train(new TokenizerFactory(lang, abbreviations, true, Pattern.compile(pattern)));
        TokenizerME tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(sentence);
        Assertions.assertEquals(expectedNumTokens, tokens.length);

        String[] sentSplit = sentence.replaceAll("'", " '").replaceAll(",", " ,").split(" ");
        int last = sentSplit.length - 1;
        for (int i = 0; i < sentSplit.length; ++i) {
            String expected = sentSplit[i];
            if (i == last) {
                // The sentence-final period is not part of the last word token.
                expected = expected.replace(".", "");
            }
            Assertions.assertEquals(expected, tokens[i]);
        }
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsDeu() throws IOException {
        String sentence = "Ich w\u00e4hle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
        checkCustomPatternForTokenizerME("deu", PATTERN_DEU, sentence, 14);
    }

    @Test
    void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDeu() throws IOException {
        String sentence = "Ich w\u00e4hle z.B. den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
        checkCustomPatternForTokenizerME("deu", PATTERN_DEU, sentence, 15);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsDut() throws IOException {
        String sentence = "Ik kies voor de droom van de botanische monografie die op p. 183 en volgende wordt beschreven.";
        checkCustomPatternForTokenizerME("dut", PATTERN_DUT, sentence, 18);
    }

    @Test
    void testCustomPatternForTokenizerMEWithMultiDotAbbreviationsDut() throws IOException {
        String sentence = "Ik kies voor de droom van de botanische monografie die op p. 183 e.v. wordt beschreven.";
        checkCustomPatternForTokenizerME("dut", PATTERN_DUT, sentence, 17);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsFra() throws IOException {
        String pattern = "^[a-zA-Z0-9\u00e0\u00e2\u00e4\u00e8\u00e9\u00ea\u00eb\u00ee\u00ef\u00f4\u0153\u00f9\u00fb\u00fc\u00ff\u00e7\u00c0\u00c2\u00c4\u00c8\u00c9\u00ca\u00cb\u00ce\u00cf\u00d4\u0152\u00d9\u00db\u00dc\u0178\u00c7]+$";
        String sentence = "Je choisis le r\u00eave de la monographie botanique communiqu\u00e9 \u00e0 la p. 205.";
        checkCustomPatternForTokenizerME("fra", pattern, sentence, 14);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsPol() throws IOException {
        String pattern = "^[A-Za-z0-9\u017c\u017a\u0107\u0144\u00f3\u0142\u0119\u0105\u015b\u017b\u0179\u0106\u0104\u015a\u0118\u0141\u00d3\u0143]+$";
        String sentence = "W szkicu autobiograficznym pt. moje \u017cycie i psychoanaliza Freud pisze, \u017ce jego przodkowie \u017cyli przez wiele lat w Kolonii.";
        checkCustomPatternForTokenizerME("pol", pattern, sentence, 21);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsPor() throws IOException {
        String sentence = "O povo pernambucano, tradicionalmente inimigo dos imperadores, lembrava-se do tempo em que o Sr. D. Pedro de Alcantara dava-se ao luxo de visitar o norte.";
        checkCustomPatternForTokenizerME("por", PATTERN_POR, sentence, 28);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsSpa() throws IOException {
        String sentence = "Elegiremos el de la monograf\u00eda bot\u00e1nica expuesto antes del cap\u00edtulo V en p\u00e1g. 448 del presente volumen.";
        checkCustomPatternForTokenizerME("spa", PATTERN_SPA, sentence, 18);
    }

    @Test
    void testCustomPatternForTokenizerMEPor() throws IOException {
        String sentence = "Na floresta m\u00e1gica a raposa dan\u00e7a com unic\u00f3rnios felizes.";
        checkCustomPatternForTokenizerME("por", PATTERN_POR, sentence, 10);
    }

    @Test
    void testCustomPatternForTokenizerMESpa() throws IOException {
        String sentence = "En el verano los ni\u00f1os juegan en el parque y sus risas crean alegr\u00eda.";
        checkCustomPatternForTokenizerME("spa", PATTERN_SPA, sentence, 15);
    }

    @Test
    void testCustomPatternForTokenizerMECat() throws IOException {
        String pattern = "^[0-9a-z\u00e0\u00e8\u00e9\u00ed\u00ef\u00f2\u00f3\u00fa\u00fc\u00e7A-Z\u00c0\u00c8\u00c9\u00cd\u00cf\u00d2\u00d3\u00da\u00dc\u00c7]+$";
        String sentence = "Als xiuxiuejants avets l'os blau neda amb cignes i s'ho passen b\u00e9.";
        checkCustomPatternForTokenizerME("cat", pattern, sentence, 15);
    }

    @Test
    void testCustomPatternForTokenizerMEIta() throws IOException {
        String sentence = "Cosa fare di domenica per migliorare il tuo luned\u00ec.";
        checkCustomPatternForTokenizerME("ita", PATTERN_ITA, sentence, 10);
    }

    @Test
    void testCustomPatternForTokenizerMEWithAbbreviationsIta() throws IOException {
        String sentence = "La chiesa fu costruita fra il 1258 ed il 1308 ca. come chiesa del convento degli Agostiniani.";
        checkCustomPatternForTokenizerME("ita", PATTERN_ITA, sentence, 18);
    }

    @Test
    void testContractionsIta() throws IOException {
        String lang = "ita";
        Dictionary dic = null;
        TokenizerModel model = train(new TokenizerFactory(lang, dic, true, Pattern.compile(PATTERN_ITA)));
        TokenizerME tokenizer = new TokenizerME(model);
        String sentence = "La contrazione di \"dove \u00e8\" \u00e8 \"dov'\u00e8\".";
        String[] tokens = tokenizer.tokenize(sentence);
        Assertions.assertEquals(11, tokens.length);

        // Expected tokens: detach periods, apostrophes and closing quotes, then split on spaces.
        String[] sentSplit = sentence.replaceAll("\\.", " .").replaceAll("'", " '").replaceAll("([^ ])\"", "$1 \"").split(" ");
        for (int i = 0; i < sentSplit.length; ++i) {
            Assertions.assertEquals(sentSplit[i], tokens[i]);
        }
    }

    @Test
    void testContractionsEng() throws IOException {
        String lang = "eng";
        Dictionary dic = null;
        String pattern = "^[A-Za-z0-9]+$";
        TokenizerModel model = train(new TokenizerFactory(lang, dic, true, Pattern.compile(pattern)));
        TokenizerME tokenizer = new TokenizerME(model);
        String sentence = "The cat wasn't in the house and the dog wasn't either.";
        String[] tokens = tokenizer.tokenize(sentence);
        Assertions.assertEquals(14, tokens.length);

        // Expected tokens: detach periods and apostrophes, then split on spaces.
        String[] sentSplit = sentence.replaceAll("\\.", " .").replaceAll("'", " '").split(" ");
        for (int i = 0; i < sentSplit.length; ++i) {
            Assertions.assertEquals(sentSplit[i], tokens[i]);
        }
    }

    @Test
    void testDummyFactory() throws IOException {
        String lang = "eng";
        String pattern = "^[0-9A-Za-z]+$";
        Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
        TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true, Pattern.compile(pattern)));

        assertDummyFactory(model.getFactory(), lang, pattern);
        Assertions.assertEquals(lang, model.getLanguage());

        // The deserialized model itself is checked, including its language.
        TokenizerModel fromSerialized = roundTrip(model);
        assertDummyFactory(fromSerialized.getFactory(), lang, pattern);
        Assertions.assertEquals(lang, fromSerialized.getLanguage());
    }

    @Test
    void testCreateDummyFactory() throws IOException {
        String lang = "eng";
        String pattern = "^[0-9A-Za-z]+$";
        Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
        TokenizerFactory factory = TokenizerFactory.create(DummyTokenizerFactory.class.getCanonicalName(), lang, dic, true, Pattern.compile(pattern));

        assertDummyFactory(factory, lang, pattern);
    }
}

