/*
 * Decompiled with CFR 0.152.
 */
package abner;

import abner.Input2TokenSequence;
import edu.umass.cs.mallet.base.fst.CRF4;
import edu.umass.cs.mallet.base.fst.MultiSegmentationEvaluator;
import edu.umass.cs.mallet.base.fst.TransducerEvaluator;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureVectorSequence;
import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;
import edu.umass.cs.mallet.base.pipe.iterator.PipeInputIterator;
import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;
import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;
import edu.umass.cs.mallet.base.pipe.tsf.TokenTextCharPrefix;
import edu.umass.cs.mallet.base.pipe.tsf.TokenTextCharSuffix;
import edu.umass.cs.mallet.base.types.InstanceList;
import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.util.regex.Pattern;

/**
 * Trains a {@link CRF4} sequence model from a training file and writes the
 * resulting model to disk.
 *
 * <p>Each training run builds a fresh feature pipe (orthographic regex
 * features, character prefixes/suffixes, neighbour-token conjunctions),
 * reads the training file through that pipe, trains the CRF and serialises
 * it to the requested model file.
 */
public class Trainer {
    // Package-private state not referenced inside this class; kept as-is because
    // other classes in the package may read or write it.
    int numEvaluations = 0;
    static int iterationsBetweenEvals = 16;

    // Character-class fragments. Only GREEK is referenced by the feature pipe
    // below; the others are currently unused within this class.
    private static final String CAPS = "[A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00dc]";
    private static final String LOW = "[a-z\u00e0\u00e8\u00ec\u00f2\u00f9\u00e1\u00e9\u00ed\u00f3\u00fa\u00e7\u00f1\u00ef\u00fc]";
    private static final String CAPSNUM = "[A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00dc0-9]";
    private static final String ALPHA = "[A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00dca-z\u00e0\u00e8\u00ec\u00f2\u00f9\u00e1\u00e9\u00ed\u00f3\u00fa\u00e7\u00f1\u00ef\u00fc]";
    private static final String ALPHANUM = "[A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00dca-z\u00e0\u00e8\u00ec\u00f2\u00f9\u00e1\u00e9\u00ed\u00f3\u00fa\u00e7\u00f1\u00ef\u00fc0-9]";
    private static final String PUNCTUATION = "[,\\.;:?!()]";
    private static final String QUOTE = "[\"`']";
    private static final String GREEK = "(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";

    /**
     * Trains a model without a segmentation evaluator.
     *
     * @param trainFile path of the training data file
     * @param modelFile path the trained model is written to
     */
    public void train(String trainFile, String modelFile) {
        this.train(trainFile, modelFile, null);
    }

    /**
     * Trains a CRF on {@code trainFile} and writes it to {@code modelFile}.
     *
     * <p>Any exception raised while building the pipe, reading the data,
     * training or writing the model is caught and printed to
     * {@code System.err}; nothing is propagated to the caller.
     *
     * @param trainFile path of the training data file
     * @param modelFile path the trained model is written to
     * @param tags      entity tag names; when non-null, a
     *                  {@link MultiSegmentationEvaluator} is created for the
     *                  labels {@code "B-" + tag} and {@code "I-" + tag} and
     *                  passed to training. When {@code null}, training runs
     *                  without an evaluator.
     */
    public void train(String trainFile, String modelFile, String[] tags) {
        try {
            SerialPipes p = Trainer.buildFeaturePipe();
            CRF4 crf = new CRF4((Pipe) p, null);

            System.out.println("Reading '" + trainFile + "' file...");
            InstanceList trainingData = new InstanceList((Pipe) p);
            Reader trainReader = new FileReader(new File(trainFile));
            try {
                // Every line matches "^.*$", so each line starts a new group.
                trainingData.add((PipeInputIterator) new LineGroupIterator(trainReader, Pattern.compile("^.*$"), false));
            } finally {
                // The reader is only needed while instances are being added;
                // release the file handle whether or not reading succeeded.
                trainReader.close();
            }

            System.out.println("Doing the deed...");
            System.out.println("Number of features = " + p.getDataAlphabet().size());
            System.out.println("Training on " + trainingData.size() + " training instances...");
            crf.addStatesForLabelsConnectedAsIn(trainingData);

            // The evaluator stays null when the caller supplied no tags.
            TransducerEvaluator eval = null;
            if (tags != null) {
                String[] bTags = new String[tags.length];
                String[] iTags = new String[tags.length];
                for (int i = 0; i < tags.length; ++i) {
                    bTags[i] = "B-" + tags[i];
                    iTags[i] = "I-" + tags[i];
                }
                eval = new MultiSegmentationEvaluator(bTags, iTags, false);
            }

            // No validation or test sets are supplied. The trailing arguments are
            // presumably (max iterations, iterations per proportion, training
            // proportions) -- confirm against the CRF4.train signature.
            crf.train(trainingData, (InstanceList) null, (InstanceList) null, eval, 99999, 10, new double[]{0.2, 0.5, 0.8});
            crf.write(new File(modelFile));
        }
        catch (Exception e) {
            System.err.println(e);
        }
    }

    /**
     * Builds the token-to-feature-vector pipe used for training. The order of
     * the pipes is significant and must not be changed: the regex and affix
     * features listed before {@link OffsetConjunctions} are the ones present
     * when the neighbour conjunctions are generated.
     */
    private static SerialPipes buildFeaturePipe() {
        return new SerialPipes(new Pipe[]{
            new Input2TokenSequence(),
            new RegexMatches("INITCAPS", Pattern.compile("[A-Z].*")),
            new RegexMatches("INITCAPSALPHA", Pattern.compile("[A-Z][a-z].*")),
            new RegexMatches("ALLCAPS", Pattern.compile("[A-Z]+")),
            new RegexMatches("CAPSMIX", Pattern.compile("[A-Za-z]+")),
            new RegexMatches("HASDIGIT", Pattern.compile(".*[0-9].*")),
            new RegexMatches("SINGLEDIGIT", Pattern.compile("[0-9]")),
            new RegexMatches("DOUBLEDIGIT", Pattern.compile("[0-9][0-9]")),
            new RegexMatches("NATURALNUMBER", Pattern.compile("[0-9]+")),
            new RegexMatches("REALNUMBER", Pattern.compile("[-0-9]+[.,]+[0-9.,]+")),
            new RegexMatches("HASDASH", Pattern.compile(".*-.*")),
            new RegexMatches("INITDASH", Pattern.compile("-.*")),
            new RegexMatches("ENDDASH", Pattern.compile(".*-")),
            new TokenTextCharPrefix("PREFIX=", 3),
            new TokenTextCharPrefix("PREFIX=", 4),
            new TokenTextCharSuffix("SUFFIX=", 3),
            new TokenTextCharSuffix("SUFFIX=", 4),
            // Offsets -1 and +1: previous and next token.
            new OffsetConjunctions(new int[][]{{-1}, {1}}),
            // Both orderings (letter..digit, digit..letter) share one feature name.
            new RegexMatches("ALPHANUMERIC", Pattern.compile(".*[A-Za-z].*[0-9].*")),
            new RegexMatches("ALPHANUMERIC", Pattern.compile(".*[0-9].*[A-Za-z].*")),
            new RegexMatches("ROMAN", Pattern.compile("[IVXDLCM]+")),
            new RegexMatches("HASROMAN", Pattern.compile(".*\\b[IVXDLCM]+\\b.*")),
            new RegexMatches("GREEK", Pattern.compile(GREEK)),
            new RegexMatches("HASGREEK", Pattern.compile(".*\\b" + GREEK + "\\b.*")),
            // NOTE(review): inside the character class, "!-+" is a range from
            // '!' to '+' (so it also covers " # $ % & ' ( ) *) and does NOT match
            // a literal '-'. Left unchanged so the generated feature set stays
            // identical; confirm whether a literal dash was intended.
            new RegexMatches("PUNCTUATION", Pattern.compile("[,.;:?!-+]")),
            new TokenSequence2FeatureVectorSequence(true, true)
        });
    }
}

