/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.ner.exec;

import de.datexis.common.CommandLineParser;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.encoder.Encoder;
import de.datexis.encoder.impl.PositionEncoder;
import de.datexis.encoder.impl.SurfaceEncoder;
import de.datexis.encoder.impl.TrigramEncoder;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.ner.MatchingAnnotator;
import de.datexis.ner.MentionAnnotator;
import de.datexis.reader.RawTextDatasetReader;
import java.io.IOException;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TrainMentionAnnotatorSeedList {
    protected static final Logger log = LoggerFactory.getLogger(TrainMentionAnnotatorSeedList.class);

    public static void main(String[] args) throws IOException {
        ExecParams params = new ExecParams();
        CommandLineParser parser = new CommandLineParser((CommandLineParser.Options)params);
        try {
            parser.parse(args);
            new TrainMentionAnnotatorSeedList().runTraining(params);
            System.exit(0);
        }
        catch (ParseException e) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("texoo-train-ner-seed", "TeXoo: train MentionAnnotator with seed list", params.setUpCliOptions(), "", true);
            System.exit(1);
        }
    }

    protected void runTraining(ExecParams params) throws IOException {
        Resource inputPath = Resource.fromDirectory((String)params.inputFiles);
        Resource outputPath = Resource.fromDirectory((String)params.outputPath);
        Resource seedPath = Resource.fromDirectory((String)params.seedList);
        WordHelpers.Language lang = WordHelpers.getLanguage((String)params.language);
        Dataset train = new RawTextDatasetReader().read(inputPath);
        MatchingAnnotator match = new MatchingAnnotator(MatchingAnnotator.MatchingStrategy.LOWERCASE);
        match.loadTermsToMatch(seedPath);
        MentionAnnotator ner = new MentionAnnotator.Builder().withEncoders("tri", new Encoder[]{new PositionEncoder(), new SurfaceEncoder(), new TrigramEncoder()}).enableTrainingUI(params.trainingUI).withTrainingParams(1.0E-4, 16, 1).withModelParams(512, 256).withWorkspaceParams(1).pretrain(train).build();
        ner.trainModel(train, Annotation.Source.SILVER, lang, 5000, false, true);
        System.out.println("saving model to path: " + outputPath);
        ner.writeModel(outputPath);
    }

    protected static class ExecParams
    implements CommandLineParser.Options {
        protected String inputFiles;
        protected String seedList;
        protected String language;
        protected String outputPath = null;
        protected boolean trainingUI = false;

        protected ExecParams() {
        }

        public void setParams(CommandLine parse) {
            this.inputFiles = parse.getOptionValue("i");
            this.seedList = parse.getOptionValue("s");
            this.outputPath = parse.getOptionValue("o");
            this.trainingUI = parse.hasOption("u");
            this.language = parse.getOptionValue("l", "en");
        }

        public Options setUpCliOptions() {
            Options op = new Options();
            op.addRequiredOption("i", "input", true, "path or file name for raw input text");
            op.addRequiredOption("s", "seed", true, "path to seed list text file");
            op.addRequiredOption("o", "output", true, "path to create and store the model");
            op.addOption("l", "language", true, "language to use for sentence splitting and stopwords (EN or DE)");
            op.addOption("u", "ui", false, "enable training UI (http://127.0.0.1:9000)");
            return op;
        }
    }
}

