/*
 * Decompiled with CFR 0.152.
 */
package net.sf.okapi.steps.tokenization;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.FileLocation;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.LocaleFilter;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Range;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.steps.tokenization.Parameters;
import net.sf.okapi.steps.tokenization.TokenizationStep;
import net.sf.okapi.steps.tokenization.Tokenizer;
import net.sf.okapi.steps.tokenization.common.TokensAnnotation;
import net.sf.okapi.steps.tokenization.engine.RbbiLexer;
import net.sf.okapi.steps.tokenization.engine.javacc.ParseException;
import net.sf.okapi.steps.tokenization.engine.javacc.SimpleCharStream;
import net.sf.okapi.steps.tokenization.engine.javacc.Token;
import net.sf.okapi.steps.tokenization.engine.javacc.WordTokenizer;
import net.sf.okapi.steps.tokenization.engine.javacc.WordTokenizerTokenManager;
import net.sf.okapi.steps.tokenization.locale.LocaleUtil;
import net.sf.okapi.steps.tokenization.tokens.Tokens;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * JUnit 4 tests for the tokenization step: parameter filters, lexer configuration,
 * the {@link Tokenizer} convenience API, the JavaCC word tokenizer and RBBI rule formatting.
 */
@RunWith(JUnit4.class)
public class TokenizationTest {
    /** Shared logger; replaces creating a logger on every {@link #listTokens(Tokens)} call. */
    private static final Logger LOGGER = LoggerFactory.getLogger(TokenizationTest.class);

    /** Sample sentence mixing abbreviations, markup, e-mail/internet addresses, numbers and emoticons. */
    private final String text = "Jaguar will sell its new XJ-6 model in the U.S. for a small fortune :-). Expect to pay around USD 120ks ($120,000.00 on 05/30/2007 at 12.30PM). Custom options can set you back another few 10,000 dollars. For details, go to <a href=\"http://www.jaguar.com/sales\" alt=\"Click here\">Jaguar Sales</a> or contact xj-6@jaguar.com. See http://www.jaguar.com/sales, www.jaguar.com, AT&T, P&G, Johnson&Johnson, 192.168.0.5 for info 3.5pct.";
    private final LocaleId locENUS = LocaleId.fromString("en-us");
    private final LocaleId locENGB = LocaleId.fromString("en-gb");
    private final LocaleId locDEDE = LocaleId.fromString("de-de");
    private final LocaleId locDECH = LocaleId.fromString("de-ch");
    private final LocaleId locFR = LocaleId.fromString("fr");
    /** Step under test; re-created before every test by {@link #setUp()}. */
    private TokenizationStep ts;
    private Tokens tokens;

    /**
     * Reads the whole stream as UTF-8 text.
     *
     * <p>The stream is closed when reading finishes (or fails), so callers must pass a
     * stream they no longer need afterwards.
     *
     * @param input the stream to drain
     * @return the decoded content of the stream
     * @throws IOException if the stream cannot be read
     */
    private String streamAsString(InputStream input) throws IOException {
        // try-with-resources closes the reader and, through it, the wrapped input stream.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, "UTF-8"))) {
            StringBuilder content = new StringBuilder();
            char[] buf = new char[2048];
            int count;
            while ((count = reader.read(buf)) != -1) {
                content.append(buf, 0, count);
            }
            return content.toString();
        }
    }

    /**
     * Runs the sample {@link #text} through the step as a one-document batch
     * (START_BATCH, START_DOCUMENT with locale en-us, TEXT_UNIT, END_BATCH).
     *
     * @return the tokens found in the text unit's source {@link TokensAnnotation};
     *         empty if the step attached no annotation
     */
    private Tokens tokenizeText() {
        Tokens res = new Tokens();
        ts.handleEvent(new Event(EventType.START_BATCH));

        StartDocument startDoc = new StartDocument("tokenization");
        startDoc.setLocale(locENUS);
        startDoc.setMultilingual(false);
        ts.handleEvent(new Event(EventType.START_DOCUMENT, startDoc));

        ITextUnit tu = TextUnitUtil.buildTU(text);
        ts.handleEvent(new Event(EventType.TEXT_UNIT, tu));

        TokensAnnotation ta = (TokensAnnotation)TextUnitUtil.getSourceAnnotation(tu, TokensAnnotation.class);
        if (ta != null) {
            res.addAll((Collection)ta.getTokens());
        }

        ts.handleEvent(new Event(EventType.END_BATCH));
        return res;
    }

    /** Creates a fresh step for every test. */
    @Before
    public void setUp() {
        ts = new TokenizationStep();
    }

    /** The default ICU word iterator must be a {@link RuleBasedBreakIterator}; the cast fails otherwise. */
    @Test
    public void testDefRules() {
        RuleBasedBreakIterator iterator = (RuleBasedBreakIterator)BreakIterator.getWordInstance();
        Assert.assertNotNull(iterator);
    }

    /** Checks the Okapi and ICU normalizations of a language code. */
    @Test
    public void testLocaleUtil() {
        Assert.assertEquals("en-us", LocaleUtil.normalizeLanguageCode_Okapi("en_US"));
        Assert.assertEquals("en_US", LocaleUtil.normalizeLanguageCode_ICU("EN-US"));
    }

    /** Smoke test: a text unit sent between START_BATCH and END_BATCH must not throw. */
    @Test
    public void testTS() {
        // The step itself comes from setUp().
        ITextUnit tu = TextUnitUtil.buildTU(text);
        Event event = new Event(EventType.TEXT_UNIT, tu);
        ts.handleEvent(new Event(EventType.START_BATCH));
        ts.handleEvent(event);
        ts.handleEvent(new Event(EventType.END_BATCH));
    }

    /**
     * Writes one line per token to the debug log.
     *
     * @param tokens the tokens to list; {@code null} is ignored
     */
    private void listTokens(Tokens tokens) {
        if (tokens == null) {
            return;
        }
        StringBuilder sb = new StringBuilder();
        for (net.sf.okapi.steps.tokenization.common.Token token : tokens) {
            sb.append(token.toString()).append("\n");
        }
        LOGGER.debug(sb.toString());
    }

    /** Tokenizes a short sentence and lists the result in the debug log (no assertions). */
    @Test
    public void listTokenizerOutput() {
        Tokens tokens = Tokenizer.tokenize("NASDAQ is a U.S. stock exchange.", locENUS, new String[0]);
        listTokens(tokens);
    }

    /**
     * Exercises the locale and token-name filters of {@link Parameters}, first directly and
     * then again through the parameters returned by the step after START_BATCH.
     */
    @Test
    public void testFilters() {
        Parameters params = new Parameters();
        ts.setParameters(params);
        LocaleFilter languageFilter = params.getLocaleFilter();
        Assert.assertNotNull(languageFilter);

        // Fresh parameters accept the checked locale and tokens.
        Assert.assertTrue(params.supportsLanguage(locENUS));
        Assert.assertTrue(params.supportsToken("FAKE_TOKEN"));
        Assert.assertTrue(params.supportsToken(Integer.MAX_VALUE));

        // An empty locale filter accepts every locale.
        params.setLocaleFilter("");
        Assert.assertTrue(params.supportsLanguage(locENUS));
        Assert.assertTrue(params.supportsLanguage(locENGB));
        Assert.assertTrue(params.supportsLanguage(locDEDE));
        Assert.assertTrue(params.supportsLanguage(locDECH));

        // Includes with explicit exclusions.
        params.setLocaleFilter("en !en-gb de-*-* !de-ch");
        Assert.assertTrue(params.supportsLanguage(locENUS));
        Assert.assertFalse(params.supportsLanguage(locENGB));
        Assert.assertTrue(params.supportsLanguage(locDEDE));
        Assert.assertFalse(params.supportsLanguage(locDECH));

        // A null name list accepts any token (cast kept: selects the array form of the call).
        params.setTokenNames((String[])null);
        Assert.assertTrue(params.supportsToken("FAKE_TOKEN"));
        Assert.assertTrue(params.supportsToken(Integer.MAX_VALUE));

        // An explicit name list restricts the supported tokens.
        params.setTokenNames(new String[]{"WORD", "PUNKTUATION"});
        Assert.assertFalse(params.supportsToken("FAKE_TOKEN"));
        Assert.assertFalse(params.supportsToken(Integer.MAX_VALUE));
        Assert.assertTrue(params.supportsToken("WORD"));

        // The same expectations must hold for the step's parameters once a batch has started.
        params = (Parameters)ts.getParameters();
        ts.handleEvent(new Event(EventType.START_BATCH));
        Assert.assertTrue(params.supportsLanguage(locENUS));
        Assert.assertFalse(params.supportsLanguage(locENGB));
        Assert.assertTrue(params.supportsLanguage(locDEDE));
        Assert.assertFalse(params.supportsLanguage(locDECH));
        Assert.assertFalse(params.supportsToken("FAKE_TOKEN"));
        Assert.assertFalse(params.supportsToken(Integer.MAX_VALUE));
        Assert.assertTrue(params.supportsToken("WORD"));
        ts.handleEvent(new Event(EventType.END_BATCH));
    }

    /** With {@code test_config1.tprm} the step has a single lexer and finds 183 tokens in the sample text. */
    @Test
    public void testTokenizer1() {
        ts.setConfiguration(getClass(), "test_config1.tprm");
        Parameters params = (Parameters)ts.getParameters();
        Assert.assertTrue(params.supportsToken("WORD"));
        List<?> lexers = ts.getLexers();
        Assert.assertEquals(1L, lexers.size());
        tokens = tokenizeText();
        Assert.assertEquals(183L, tokens.size());
    }

    /** Restricting the tokenizer to WORD tokens yields exactly the three words, in order. */
    @Test
    public void testTokenizer2() {
        Tokens tokens = Tokenizer.tokenize("word1 word2 word3", locENUS, new String[]{"WORD"});
        Assert.assertEquals(3L, tokens.size());
        Assert.assertEquals("word1", ((net.sf.okapi.steps.tokenization.common.Token)tokens.get(0)).getValue());
        Assert.assertEquals("word2", ((net.sf.okapi.steps.tokenization.common.Token)tokens.get(1)).getValue());
        Assert.assertEquals("word3", ((net.sf.okapi.steps.tokenization.common.Token)tokens.get(2)).getValue());
    }

    /**
     * Drains the JavaCC word tokenizer over a sample string until it returns {@code null}
     * or raises an exception.
     */
    @Test
    public void testJavaCC() {
        StringReader sr = new StringReader("This is a 1248-th test. U.S.A.F. read-through\n didn't AT&T, P&G, Johnson&Johnson \n\nadmin@yahoo.com 192.168.0.7");
        SimpleCharStream stream = new SimpleCharStream((Reader)sr);
        WordTokenizer tokenizer = new WordTokenizer(new WordTokenizerTokenManager(stream));
        Token token;
        do {
            try {
                token = tokenizer.nextToken();
            }
            catch (IOException | ParseException e) {
                // NOTE(review): the test has always tolerated an exception here and just stopped;
                // confirm whether the tokenizer may legitimately throw at end of input before
                // turning this into a test failure.
                LOGGER.warn("JavaCC tokenizer stopped with an exception", e);
                break;
            }
        } while (token != null);
    }

    /** Sanity check of {@code retainAll} (white list) followed by {@code removeAll} (black list). */
    @Test
    public void testRetainRemove() {
        List<String> list = new ArrayList<>();
        list.add("A");
        list.add("B");
        list.add("C");
        List<String> whiteList = new ArrayList<>();
        whiteList.add("A");
        whiteList.add("B");
        List<String> blackList = new ArrayList<>();
        blackList.add("B");

        Assert.assertEquals(3L, list.size());
        Assert.assertEquals("A", list.get(0));
        Assert.assertEquals("B", list.get(1));
        Assert.assertEquals("C", list.get(2));

        list.retainAll(whiteList);
        Assert.assertEquals(2L, list.size());
        Assert.assertEquals("A", list.get(0));
        Assert.assertEquals("B", list.get(1));

        list.removeAll(blackList);
        Assert.assertEquals(1L, list.size());
        Assert.assertEquals("A", list.get(0));
    }

    /**
     * Applies six custom rules to {@code rbbi_default.txt} with {@link RbbiLexer#formatRule}
     * and compares the result with the newline-normalized content of {@code rbbi_custom.txt}.
     *
     * @throws IOException if a rule file cannot be read
     */
    @Test
    public void testFormRbbiRules() throws IOException {
        FileLocation root = FileLocation.fromClass(getClass());
        String expected = Util.normalizeNewlines(streamAsString(root.in("rbbi_custom.txt").asInputStream()));
        String rules = streamAsString(root.in("rbbi_default.txt").asInputStream());
        rules = RbbiLexer.formatRule(rules, "Abbreviation", "Abbreviation: Uppercase alpha chars separated by period and optionally followed by a period", "[A-Z0-9](\\.[A-Z0-9])+(\\.)*", 500);
        rules = RbbiLexer.formatRule(rules, "HyphenatedWord", "Hyphenated Word : sequence of letter or digit, (punctuated by - or _, with following letter or digit sequence)+", "[A-Za-z0-9]+([\\-_][A-Za-z0-9]+)+", 501);
        rules = RbbiLexer.formatRule(rules, "EmailAddress", "Email address: sequence of letters, digits and punctuation followed by @ and followed by another sequence", "[A-Za-z0-9_\\-\\.]+\\@[A-Za-z][A-Za-z0-9_]+\\.[a-z]+", 502);
        rules = RbbiLexer.formatRule(rules, "InternetAddress", "Internet Addresses: http://www.foo.com(/bar)", "[a-z]+\\:\\/\\/[a-z0-9]+(\\.[a-z0-9]+)+(\\/[a-z0-9][a-z0-9\\.]+)", 503);
        rules = RbbiLexer.formatRule(rules, "XmlMarkup", "XML markup: A run begins with < and ends with the first matching >", "\\<[^\\>]+\\>", 504);
        rules = RbbiLexer.formatRule(rules, "Emoticon", "Emoticon: A run that starts with :;B8{[ and contains only one or more of the following -=/{})(", "[B8\\:\\;\\{\\[][-=\\/\\{\\}\\)\\(]+", 505);
        Assert.assertEquals(expected, rules);
    }

    /**
     * Pins the current behaviour of two {@link Range} instances with identical bounds: they are
     * distinct objects that compare unequal, report different hash codes and return distinct
     * string instances.
     */
    @Test
    public void testRange() {
        Range r1 = new Range(1, 5);
        Range r2 = new Range(1, 5);
        Assert.assertNotSame(r1, r2);
        Assert.assertFalse(r1.equals(r2));
        // NOTE(review): presumably Range inherits identity-based equals/hashCode; distinct
        // objects are not guaranteed to have different hash codes, so this check could in
        // principle fail spuriously - confirm it is still wanted.
        Assert.assertFalse(r1.hashCode() == r2.hashCode());
        // Identity, not content: each toString() call is expected to return a new String.
        Assert.assertNotSame(r1.toString(), r2.toString());
    }
}

