package opennlp.tools.tokenize;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.tokenize.DummyTokenizerFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import org.junit.Assert;
import org.junit.Test;

/* loaded from: input_file:opennlp/tools/tokenize/TokenizerFactoryTest.class */
public class TokenizerFactoryTest {
    private static ObjectStream<TokenSample> createSampleStream() throws IOException {
        return new TokenSampleStream(new PlainTextByLineStream(new ResourceAsStreamFactory(TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train"), StandardCharsets.UTF_8));
    }

    private static TokenizerModel train(TokenizerFactory tokenizerFactory) throws IOException {
        return TokenizerME.train(createSampleStream(), tokenizerFactory, TrainingParameters.defaultParams());
    }

    private static Dictionary loadAbbDictionary() throws IOException {
        return new Dictionary(TokenizerFactoryTest.class.getClassLoader().getResourceAsStream("opennlp/tools/sentdetect/abb.xml"));
    }

    @Test
    public void testDefault() throws IOException {
        TokenizerModel train = train(new TokenizerFactory("spa", loadAbbDictionary(), false, (Pattern) null));
        TokenizerFactory factory = train.getFactory();
        Assert.assertTrue(factory.getAbbreviationDictionary() != null);
        Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[A-Za-z0-9]+$", factory.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertFalse(factory.isUseAlphaNumericOptmization());
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        train.serialize(byteArrayOutputStream);
        TokenizerFactory factory2 = new TokenizerModel(new ByteArrayInputStream(byteArrayOutputStream.toByteArray())).getFactory();
        Assert.assertTrue(factory2.getAbbreviationDictionary() != null);
        Assert.assertTrue(factory2.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[A-Za-z0-9]+$", factory2.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory2.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertFalse(factory2.isUseAlphaNumericOptmization());
    }

    @Test
    public void testNullDict() throws IOException {
        TokenizerModel train = train(new TokenizerFactory("spa", (Dictionary) null, false, (Pattern) null));
        TokenizerFactory factory = train.getFactory();
        Assert.assertNull(factory.getAbbreviationDictionary());
        Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[A-Za-z0-9]+$", factory.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertFalse(factory.isUseAlphaNumericOptmization());
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        train.serialize(byteArrayOutputStream);
        TokenizerFactory factory2 = new TokenizerModel(new ByteArrayInputStream(byteArrayOutputStream.toByteArray())).getFactory();
        Assert.assertNull(factory2.getAbbreviationDictionary());
        Assert.assertTrue(factory2.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[A-Za-z0-9]+$", factory2.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory2.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertFalse(factory2.isUseAlphaNumericOptmization());
    }

    @Test
    public void testCustomPatternAndAlphaOpt() throws IOException {
        TokenizerModel train = train(new TokenizerFactory("spa", (Dictionary) null, true, Pattern.compile("^[0-9A-Za-z]+$")));
        TokenizerFactory factory = train.getFactory();
        Assert.assertNull(factory.getAbbreviationDictionary());
        Assert.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[0-9A-Za-z]+$", factory.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertTrue(factory.isUseAlphaNumericOptmization());
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        train.serialize(byteArrayOutputStream);
        TokenizerFactory factory2 = new TokenizerModel(new ByteArrayInputStream(byteArrayOutputStream.toByteArray())).getFactory();
        Assert.assertNull(factory2.getAbbreviationDictionary());
        Assert.assertTrue(factory2.getContextGenerator() instanceof DefaultTokenContextGenerator);
        Assert.assertEquals("^[0-9A-Za-z]+$", factory2.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory2.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertTrue(factory2.isUseAlphaNumericOptmization());
    }

    @Test
    public void testDummyFactory() throws IOException {
        TokenizerModel train = train(new DummyTokenizerFactory("spa", loadAbbDictionary(), true, Pattern.compile("^[0-9A-Za-z]+$")));
        TokenizerFactory factory = train.getFactory();
        Assert.assertTrue(factory.getAbbreviationDictionary() instanceof DummyTokenizerFactory.DummyDictionary);
        Assert.assertTrue(factory.getContextGenerator() instanceof DummyTokenizerFactory.DummyContextGenerator);
        Assert.assertEquals("^[0-9A-Za-z]+$", factory.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertTrue(factory.isUseAlphaNumericOptmization());
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        train.serialize(byteArrayOutputStream);
        TokenizerFactory factory2 = new TokenizerModel(new ByteArrayInputStream(byteArrayOutputStream.toByteArray())).getFactory();
        Assert.assertTrue(factory2.getAbbreviationDictionary() instanceof DummyTokenizerFactory.DummyDictionary);
        Assert.assertTrue(factory2.getContextGenerator() instanceof DummyTokenizerFactory.DummyContextGenerator);
        Assert.assertEquals("^[0-9A-Za-z]+$", factory2.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", factory2.getLanguageCode());
        Assert.assertEquals("spa", train.getLanguage());
        Assert.assertTrue(factory2.isUseAlphaNumericOptmization());
    }

    @Test
    public void testCreateDummyFactory() throws IOException {
        TokenizerFactory create = TokenizerFactory.create(DummyTokenizerFactory.class.getCanonicalName(), "spa", loadAbbDictionary(), true, Pattern.compile("^[0-9A-Za-z]+$"));
        Assert.assertTrue(create.getAbbreviationDictionary() instanceof DummyTokenizerFactory.DummyDictionary);
        Assert.assertTrue(create.getContextGenerator() instanceof DummyTokenizerFactory.DummyContextGenerator);
        Assert.assertEquals("^[0-9A-Za-z]+$", create.getAlphaNumericPattern().pattern());
        Assert.assertEquals("spa", create.getLanguageCode());
        Assert.assertTrue(create.isUseAlphaNumericOptmization());
    }
}
