package opennlp.tools.tokenize;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.util.InsufficientTrainingDataException;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/* loaded from: input_file:opennlp/tools/tokenize/TokenizerMETest.class */
/**
 * Unit tests for {@link TokenizerME}.
 *
 * <p>The tests cover tokenization with the models supplied by
 * {@link TokenizerTestUtil}, the failure raised when training on the
 * {@code token-insufficient.train} resource, and the tokens produced for
 * line-break characters once {@code setKeepNewLines(true)} has been called.
 */
public class TokenizerMETest {

    /**
     * Checks the tokenization of {@code text} against {@code expected}.
     *
     * <p>The input is tokenized once per check: first the number of tokens is
     * compared, then the token arrays themselves.
     *
     * @param tokenizer the tokenizer under test
     * @param text the input to tokenize
     * @param expected the tokens the tokenizer is expected to return, in order
     */
    private static void assertTokenization(TokenizerME tokenizer, String text, String... expected) {
        Assertions.assertEquals(expected.length, tokenizer.tokenize(text).length);
        Assertions.assertArrayEquals(expected, tokenizer.tokenize(text));
    }

    /** The simple maxent model must split a trailing comma off the word before it. */
    @Test
    void testTokenizerSimpleModel() throws IOException {
        TokenizerME tokenizer = new TokenizerME(TokenizerTestUtil.createSimpleMaxentTokenModel());

        String[] tokens = tokenizer.tokenize("test,");

        Assertions.assertEquals(2, tokens.length);
        Assertions.assertEquals("test", tokens[0]);
        Assertions.assertEquals(",", tokens[1]);
    }

    /** The maxent model must split the clitic {@code 's} and the final {@code !} into own tokens. */
    @Test
    void testTokenizer() throws IOException {
        TokenizerME tokenizer = new TokenizerME(TokenizerTestUtil.createMaxentTokenModel());
        String[] expected = {
            "Sounds", "like", "it", "'s", "not", "properly", "thought", "through", "!"
        };

        String[] tokens = tokenizer.tokenize("Sounds like it's not properly thought through!");

        Assertions.assertEquals(9, tokens.length);
        // Compare token by token so that a failure reports the offending token.
        for (int i = 0; i < expected.length; i++) {
            Assertions.assertEquals(expected[i], tokens[i]);
        }
    }

    /** Training on the {@code token-insufficient.train} resource must be rejected. */
    @Test
    void testInsufficientData() {
        Assertions.assertThrows(InsufficientTrainingDataException.class, () -> {
            ResourceAsStreamFactory trainingData = new ResourceAsStreamFactory(
                TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train");
            PlainTextByLineStream lines = new PlainTextByLineStream(trainingData, StandardCharsets.UTF_8);
            TokenSampleStream samples = new TokenSampleStream(lines);

            TrainingParameters params = new TrainingParameters();
            params.put("Iterations", 100);
            params.put("Cutoff", 5);

            // Typed null locals keep the static argument types of the factory call explicit.
            String subclassName = null;
            Dictionary abbreviationDictionary = null;
            Pattern alphaNumericPattern = null;
            TokenizerFactory factory = TokenizerFactory.create(
                subclassName, "eng", abbreviationDictionary, true, alphaNumericPattern);

            TokenizerME.train(samples, factory, params);
        });
    }

    /**
     * With keep-new-lines switched on, each {@code \n} is expected as a token
     * of its own, while the blank before {@code b} and {@code c} yields no token.
     */
    @Test
    void testNewLineAwareTokenization() throws IOException {
        TokenizerME tokenizer = new TokenizerME(TokenizerTestUtil.createMaxentTokenModel());
        tokenizer.setKeepNewLines(true);

        assertTokenization(tokenizer, "a\n", "a", "\n");
        assertTokenization(tokenizer, "a\nb", "a", "\n", "b");
        assertTokenization(tokenizer, "a\n\n b", "a", "\n", "\n", "b");
        assertTokenization(tokenizer, "a\n\n b\n\n c", "a", "\n", "\n", "b", "\n", "\n", "c");
    }

    /**
     * With keep-new-lines switched on, a Windows line break is expected as two
     * separate tokens, {@code \r} followed by {@code \n}.
     */
    @Test
    void testTokenizationOfStringWithWindowsNewLineTokens() throws IOException {
        TokenizerME tokenizer = new TokenizerME(TokenizerTestUtil.createMaxentTokenModel());
        tokenizer.setKeepNewLines(true);

        assertTokenization(tokenizer, "a\r\n", "a", "\r", "\n");
        assertTokenization(tokenizer, "a\r\nb", "a", "\r", "\n", "b");
        assertTokenization(tokenizer, "a\r\n\r\n b", "a", "\r", "\n", "\r", "\n", "b");
        assertTokenization(tokenizer, "a\r\n\r\n b\r\n\r\n c",
            "a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c");
    }
}
