/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.ae.jtbd;

import cc.mallet.fst.CRF;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelSequence;
import de.julielab.jcore.ae.jtbd.EOSSymbols;
import de.julielab.jcore.ae.jtbd.Tokenizer;
import de.julielab.jcore.ae.jtbd.Unit;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FileUtils;

public class TokenizerApplication {
    /**
     * Evaluates the tokenizer on a single 90/10 split of the given corpus.
     *
     * <p>Both files are read with {@link #readFile(File)}, shuffled with the same fixed
     * seed, and split so that the first 90% of the lines are used for training and the
     * remaining 10% for prediction. The process exits with status -1 when the two files
     * do not provide the same number of usable lines or when the test portion is empty.
     *
     * @param orgSentencesFile file with the original sentences, one per line
     * @param tokSentencesFile file with the tokenized sentences, one per line
     * @param errors first result list; forwarded to {@code doEvaluation} as its LAST
     *        argument (see the note on the final call)
     * @param predictions second result list; forwarded to {@code doEvaluation} as its
     *        second-to-last argument
     * @return the evaluation result computed on the held-out 10%
     */
    private static EvalResult do9010Evaluation(File orgSentencesFile, File tokSentencesFile, ArrayList<String> errors, ArrayList<String> predictions) {
        ArrayList<String> orgSentences = TokenizerApplication.readFile(orgSentencesFile);
        ArrayList<String> tokSentences = TokenizerApplication.readFile(tokSentencesFile);
        // Shuffling both lists with the same seed only yields the same permutation when
        // the lists have the same length; otherwise the sentence pairs get misaligned.
        if (orgSentences.size() != tokSentences.size()) {
            System.err.println("Error: sentence file and tokenized file contain a different number of usable lines (" + orgSentences.size() + " vs. " + tokSentences.size() + ").");
            System.exit(-1);
        }
        long seed = 1L;
        Collections.shuffle(orgSentences, new Random(seed));
        Collections.shuffle(tokSentences, new Random(seed));
        int sizeAll = orgSentences.size();
        int sizeTest = (int)((double)sizeAll * 0.1);
        int sizeTrain = sizeAll - sizeTest;
        if (sizeTest == 0) {
            System.err.println("Error: no test files for this split.");
            System.exit(-1);
        }
        System.out.println("all: " + sizeAll + "\ttrain: " + sizeTrain + "\ttest: " + sizeTest);
        ArrayList<String> trainOrgSentences = new ArrayList<String>(orgSentences.subList(0, sizeTrain));
        ArrayList<String> trainTokSentences = new ArrayList<String>(tokSentences.subList(0, sizeTrain));
        ArrayList<String> predictOrgSentences = new ArrayList<String>(orgSentences.subList(sizeTrain, sizeAll));
        ArrayList<String> predictTokSentences = new ArrayList<String>(tokSentences.subList(sizeTrain, sizeAll));
        // The argument order (predictions, errors) is intentional and unchanged: the
        // caller in this file hands its lists to this method in the mirrored order, so
        // the two swaps cancel out.
        return TokenizerApplication.doEvaluation(trainOrgSentences, trainTokSentences, predictOrgSentences, predictTokSentences, predictions, errors);
    }

    /**
     * Builds the training data for the given file pair and reports how many features
     * the resulting pipe's data alphabet contains. No model is trained.
     *
     * @param orgSentencesFile file with the original sentences
     * @param tokSentencesFile file with the tokenized sentences
     */
    private static void doCheck(File orgSentencesFile, File tokSentencesFile) {
        Tokenizer checker = new Tokenizer();
        String orgPath = orgSentencesFile.getPath();
        String tokPath = tokSentencesFile.getPath();
        System.out.println("checking on files: \n * " + orgPath + "\n * " + tokPath + "\n");
        ArrayList<String> original = TokenizerApplication.readFile(orgSentencesFile);
        ArrayList<String> tokenized = TokenizerApplication.readFile(tokSentencesFile);
        InstanceList instances = checker.makeTrainingData(original, tokenized);
        int featureCount = instances.getPipe().getDataAlphabet().size();
        System.out.println("\n\n\n# Features resulting from training data: " + featureCount);
        System.out.println("(critical sentences were omitted for feature generation)");
        System.out.println("Done.");
    }

    /**
     * Runs an n-fold cross validation on the given corpus.
     *
     * <p>Both files are read with {@link #readFile(File)} and shuffled with the same
     * fixed seed. The data is then cut into {@code n} consecutive folds of
     * {@code size / n} sentences each; the last fold additionally receives the
     * remainder. Every fold is used once for prediction while all other sentences are
     * used for training. The process exits with status -1 when {@code n} is smaller
     * than 1, when the two files do not provide the same number of usable lines, or
     * when there are fewer sentences than rounds.
     *
     * @param n number of cross-validation rounds
     * @param orgSentencesFile file with the original sentences, one per line
     * @param tokSentencesFile file with the tokenized sentences, one per line
     * @param errors first result list; forwarded to {@code doEvaluation} as its LAST
     *        argument (see the note on that call)
     * @param predictions second result list; forwarded to {@code doEvaluation} as its
     *        second-to-last argument
     * @return the accuracy averaged over all rounds
     */
    private static double doCrossEvaluation(int n, File orgSentencesFile, File tokSentencesFile, ArrayList<String> errors, ArrayList<String> predictions) {
        if (n < 1) {
            // Previously n == 0 failed with an ArithmeticException on the division below.
            System.err.println("Error: the number of cross-validation rounds must be at least 1.");
            System.exit(-1);
        }
        ArrayList<String> orgSentences = TokenizerApplication.readFile(orgSentencesFile);
        ArrayList<String> tokSentences = TokenizerApplication.readFile(tokSentencesFile);
        // Shuffling both lists with the same seed only yields the same permutation when
        // the lists have the same length; otherwise the sentence pairs get misaligned.
        if (orgSentences.size() != tokSentences.size()) {
            System.err.println("Error: sentence file and tokenized file contain a different number of usable lines (" + orgSentences.size() + " vs. " + tokSentences.size() + ").");
            System.exit(-1);
        }
        long seed = 1L;
        Collections.shuffle(orgSentences, new Random(seed));
        Collections.shuffle(tokSentences, new Random(seed));
        int sizeAll = orgSentences.size();
        int sizeRound = sizeAll / n;
        int sizeLastRound = sizeRound + sizeAll % n;
        if (sizeRound == 0) {
            // With empty folds the accuracy of those rounds would be 0/0.
            System.err.println("Error: fewer sentences (" + sizeAll + ") than cross-validation rounds (" + n + ").");
            System.exit(-1);
        }
        System.out.println("number of files in directory: " + sizeAll);
        System.out.println("size of each/last round: " + sizeRound + "/" + sizeLastRound);
        System.out.println();
        EvalResult[] er = new EvalResult[n];
        for (int i = 0; i < n; ++i) {
            // Fold i covers [from, to); the last fold extends to the end of the data.
            int from = i * sizeRound;
            int to = i == n - 1 ? sizeAll : from + sizeRound;
            ArrayList<String> predictOrgSentences = new ArrayList<String>(orgSentences.subList(from, to));
            ArrayList<String> predictTokSentences = new ArrayList<String>(tokSentences.subList(from, to));
            ArrayList<String> trainOrgSentences = new ArrayList<String>(orgSentences.subList(0, from));
            trainOrgSentences.addAll(orgSentences.subList(to, sizeAll));
            ArrayList<String> trainTokSentences = new ArrayList<String>(tokSentences.subList(0, from));
            trainTokSentences.addAll(tokSentences.subList(to, sizeAll));
            System.out.println("training size: " + trainOrgSentences.size());
            System.out.println("prediction size: " + predictOrgSentences.size());
            // The argument order (predictions, errors) is intentional and unchanged: the
            // caller in this file hands its lists to this method in the mirrored order,
            // so the two swaps cancel out.
            er[i] = TokenizerApplication.doEvaluation(trainOrgSentences, trainTokSentences, predictOrgSentences, predictTokSentences, predictions, errors);
        }
        DecimalFormat df = new DecimalFormat("0.000");
        double avgAcc = 0.0;
        double avgF = 0.0;
        for (int i = 0; i < er.length; ++i) {
            avgAcc += er[i].ACC;
            avgF += er[i].getF();
            System.out.println("ACC in round " + i + ": " + df.format(er[i].ACC));
        }
        avgAcc /= (double)n;
        avgF /= (double)n;
        System.out.println("\n\n------------------------------------");
        System.out.println("avg accuracy: " + df.format(avgAcc));
        System.out.println("avg F-score: " + df.format(avgF));
        System.out.println("------------------------------------");
        return avgAcc;
    }

    /**
     * Trains a fresh tokenizer on the training sentences and evaluates the resulting
     * model on the prediction sentences.
     *
     * @param trainOrgSentences original sentences used for training
     * @param trainTokSentences tokenized counterparts of the training sentences
     * @param predictOrgSentences original sentences to evaluate on
     * @param predictTokSentences tokenized counterparts of the evaluation sentences
     * @param errors list passed on unchanged as the {@code errors} argument of the
     *        model-based evaluation
     * @param predictions list passed on unchanged as the {@code predictions} argument
     *        of the model-based evaluation
     * @return the result of the model-based evaluation
     */
    public static EvalResult doEvaluation(ArrayList<String> trainOrgSentences, ArrayList<String> trainTokSentences, ArrayList<String> predictOrgSentences, ArrayList<String> predictTokSentences, ArrayList<String> errors, ArrayList<String> predictions) {
        Tokenizer trainer = new Tokenizer();
        InstanceList trainingInstances = trainer.makeTrainingData(trainOrgSentences, trainTokSentences);
        Pipe featurePipe = trainingInstances.getPipe();
        System.out.println("training model...");
        trainer.train(trainingInstances, featurePipe);
        CRF trainedModel = trainer.getModel();
        return TokenizerApplication.doEvaluation(trainedModel, predictOrgSentences, predictTokSentences, errors, predictions);
    }

    /**
     * Evaluates a given CRF model on the prediction sentences.
     *
     * <p>For every instance the predicted unit labels are compared with the gold labels.
     * Only positions whose source entry is not {@code "WS"} and which are not the last
     * unit of the sentence are counted as decisions. The reconstructed sentence is
     * appended to {@code predictions}; for every wrong decision an error marker and the
     * error context are appended to {@code errors}, followed by the predicted sentence,
     * the gold tokenization and a blank entry when the sentence had at least one error.
     * A summary is printed to standard output.
     *
     * @param crf the model to evaluate
     * @param predictOrgSentences original sentences to tokenize
     * @param predictTokSentences gold tokenized sentences
     * @param errors receives the error report entries
     * @param predictions receives one predicted sentence per instance
     * @return accuracy, false positive / false negative counts and number of correct
     *         decisions; the accuracy is NaN when no decision was counted
     */
    private static EvalResult doEvaluation(CRF crf, ArrayList<String> predictOrgSentences, ArrayList<String> predictTokSentences, ArrayList<String> errors, ArrayList<String> predictions) {
        Tokenizer tokenizer = new Tokenizer();
        tokenizer.setModel(crf);
        InstanceList predData = tokenizer.makePredictionData(predictOrgSentences, predictTokSentences);
        int nrDecisions = 0;
        int corrDecisions = 0;
        int fp = 0;
        int fn = 0;
        // NOTE(review): instance i is paired with input sentence i, which assumes that
        // makePredictionData keeps one instance per sentence in input order - confirm.
        for (int i = 0; i < predData.size(); ++i) {
            String orgSentence = predictOrgSentences.get(i);
            String tokSentence = predictTokSentences.get(i);
            // Last character of the original sentence; guarded so that an empty
            // sentence no longer raises a StringIndexOutOfBoundsException.
            String sentenceBoundary = orgSentence.isEmpty() ? "" : orgSentence.substring(orgSentence.length() - 1);
            Instance inst = (Instance)predData.get(i);
            ArrayList<Unit> units = tokenizer.predict(inst);
            ArrayList<String> orgLabels = tokenizer.getLabelsFromLabelSequence((LabelSequence)inst.getTarget());
            ArrayList<?> wSpaces = (ArrayList<?>)inst.getSource();
            StringBuilder sentence = new StringBuilder();
            boolean hasError = false;
            int lastIndex = units.size() - 1;
            for (int j = 0; j < units.size(); ++j) {
                Unit unit = units.get(j);
                sentence.append(unit.rep);
                if (unit.label.equals("P")) {
                    sentence.append(' ');
                }
                // The last unit and "WS" positions are not counted as decisions.
                if (j >= lastIndex || "WS".equals(wSpaces.get(j))) continue;
                ++nrDecisions;
                String goldLabel = orgLabels.get(j);
                if (goldLabel.equals(unit.label)) {
                    ++corrDecisions;
                    continue;
                }
                hasError = true;
                if (goldLabel.equals("P") && unit.label.equals("N")) {
                    ++fn;
                }
                if (goldLabel.equals("N") && unit.label.equals("P")) {
                    ++fp;
                }
                errors.add("@" + goldLabel + "->" + unit.label);
                errors.add(tokenizer.showErrorContext(j, units, orgLabels));
            }
            String predicted = sentence.toString();
            // endsWith is also safe when no units were predicted (empty sentence),
            // where the former substring(length - 1) call threw.
            if (!predicted.endsWith(" ")) {
                sentenceBoundary = " " + sentenceBoundary;
            }
            predicted = predicted + sentenceBoundary;
            predictions.add(predicted);
            if (!hasError) continue;
            errors.add(predicted);
            errors.add(tokSentence);
            errors.add("\n");
        }
        double ACC = (double)corrDecisions / (double)nrDecisions;
        EvalResult er = new EvalResult();
        er.ACC = ACC;
        er.fn = fn;
        er.fp = fp;
        er.corrDecisions = corrDecisions;
        System.out.println("\n* ------------------------------------");
        System.out.println("* critical decisions: " + nrDecisions);
        System.out.println("* correct decisions: " + corrDecisions);
        System.out.println("* fp: " + fp);
        System.out.println("* fn: " + fn);
        System.out.println("* R: " + er.getR());
        System.out.println("* P: " + er.getP());
        System.out.println("* F: " + er.getF());
        System.out.println("* ACC = " + ACC);
        System.out.println("* ------------------------------------\n");
        return er;
    }

    /**
     * Tokenizes every regular file in {@code inDir} with the model stored in
     * {@code modelFilename} and writes the result to a file of the same name in
     * {@code outDir}.
     *
     * <p>Each input file is read as UTF-8, one sentence per line. Empty lines produce
     * empty output lines. For non-empty lines the predicted units are joined, inserting
     * a blank after every unit labelled {@code "P"}; if the last character of the
     * original line is contained in {@link EOSSymbols} it is appended after a blank.
     * Runs of blanks are collapsed to a single blank. Sub-directories of {@code inDir}
     * are skipped.
     *
     * @param inDir directory containing the files to tokenize
     * @param outDir directory the tokenized files are written to
     * @param modelFilename path of the tokenizer model to load
     * @throws IOException if the model cannot be read, the input directory cannot be
     *         listed, or an input file cannot be read
     */
    public static void doPrediction(File inDir, File outDir, String modelFilename) throws IOException {
        Tokenizer tokenizer = new Tokenizer();
        try {
            tokenizer.readModel(new File(modelFilename));
        }
        catch (Exception e) {
            // Continuing without a model cannot produce predictions, so fail here with
            // the cause attached instead of only printing the stack trace.
            throw new IOException("Could not read tokenizer model from " + modelFilename, e);
        }
        File[] predictOrgFiles = inDir.listFiles();
        if (predictOrgFiles == null) {
            // listFiles() returns null when inDir is not a directory or cannot be read.
            throw new IOException("Cannot list files of input directory: " + inDir);
        }
        for (File predictOrgFile : predictOrgFiles) {
            if (!predictOrgFile.isFile()) continue;
            long start = System.currentTimeMillis();
            List<String> orgSentences = FileUtils.readLines(predictOrgFile, "utf-8");
            // No gold tokenization exists in prediction mode; pass one empty string
            // per sentence instead.
            ArrayList<String> tokSentences = new ArrayList<String>(Collections.nCopies(orgSentences.size(), ""));
            ArrayList<String> predictions = new ArrayList<String>();
            InstanceList predData = tokenizer.makePredictionData(orgSentences, tokSentences);
            // NOTE(review): instance i is paired with input line i, which assumes that
            // makePredictionData keeps one instance per line in input order - confirm.
            for (int i = 0; i < predData.size(); ++i) {
                String sentence = "";
                String orgSentence = orgSentences.get(i);
                if (!orgSentence.isEmpty()) {
                    char lastChar = orgSentence.charAt(orgSentence.length() - 1);
                    Instance inst = (Instance)predData.get(i);
                    ArrayList<Unit> units = tokenizer.predict(inst);
                    StringBuilder joined = new StringBuilder();
                    for (Unit unit : units) {
                        joined.append(unit.rep);
                        if (unit.label.equals("P")) {
                            joined.append(' ');
                        }
                    }
                    if (EOSSymbols.contains(Character.valueOf(lastChar))) {
                        joined.append(' ').append(lastChar);
                    }
                    sentence = joined.toString().replaceAll(" +", " ");
                }
                predictions.add(sentence);
            }
            // File.getName() is separator-independent, unlike searching for "/".
            File fNew = new File(outDir, predictOrgFile.getName());
            TokenizerApplication.writeFile(predictions, fNew);
            long stop = System.currentTimeMillis();
            System.out.println("took: " + (stop - start));
        }
        System.out.println("Tokenized texts written to: " + outDir.toString());
    }

    /**
     * Trains a tokenizer on the given file pair and writes the model to
     * {@code modelFilename}.
     *
     * @param orgSentencesFile file with the original sentences
     * @param tokSentencesFile file with the tokenized sentences
     * @param modelFilename path the trained model is written to
     */
    public static void doTraining(File orgSentencesFile, File tokSentencesFile, String modelFilename) {
        Tokenizer trainer = new Tokenizer();
        // The tokenized file is read first, then the original one.
        ArrayList<String> tokenized = TokenizerApplication.readFile(tokSentencesFile);
        ArrayList<String> original = TokenizerApplication.readFile(orgSentencesFile);
        InstanceList instances = trainer.makeTrainingData(original, tokenized);
        Pipe featurePipe = instances.getPipe();
        System.out.println("training model...");
        trainer.train(instances, featurePipe);
        trainer.writeModel(modelFilename);
        System.out.println("\nmodel written to: " + modelFilename);
    }

    /**
     * Command line entry point. The first argument selects the mode; the remaining
     * arguments are validated by the mode-specific start method.
     *
     * @param args mode letter followed by the mode-specific parameters
     * @throws IOException if prediction mode fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("usage: JTBD <mode> <mode-specific-parameters>");
            TokenizerApplication.showModes();
            System.exit(-1);
        }
        String mode = args[0];
        switch (mode) {
            case "c":
                TokenizerApplication.startCheckMode(args);
                break;
            case "s":
                TokenizerApplication.start9010ValidationMode(args);
                break;
            case "x":
                TokenizerApplication.startXValidationMode(args);
                break;
            case "t":
                TokenizerApplication.startTrainingMode(args);
                break;
            case "p":
                TokenizerApplication.startPredictionMode(args);
                break;
            case "e":
                TokenizerApplication.startCompareValidationMode(args);
                break;
            default:
                System.err.println("unknown mode");
                TokenizerApplication.showModes();
                break;
        }
    }

    /**
     * Reads a text file line by line as UTF-8.
     *
     * <p>In every line runs of blanks are collapsed to a single blank and leading and
     * trailing whitespace is trimmed. Lines that are at most one character long after
     * this normalization are skipped. On an I/O error a message and the stack trace are
     * printed to standard error and the process exits with status -1.
     *
     * @param myFile the file to read
     * @return the normalized, non-skipped lines in file order
     */
    static ArrayList<String> readFile(File myFile) {
        ArrayList<String> lines = new ArrayList<String>();
        // try-with-resources closes the reader even when reading fails; the charset is
        // fixed to UTF-8 to match the "utf-8" used when reading prediction input.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(myFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String normalized = line.replaceAll("[ ]+", " ").trim();
                if (normalized.length() <= 1) continue;
                lines.add(normalized);
            }
        }
        catch (IOException e) {
            System.err.println("ERR: error reading file: " + myFile.toString());
            e.printStackTrace();
            System.exit(-1);
        }
        return lines;
    }

    /**
     * Prints the list of available modes to standard error and terminates the
     * process with exit status -1.
     */
    private static void showModes() {
        String[] modeLines = {
            "\nAvailable modes:",
            "c: check data ",
            "s: 90-10 split evaluation",
            "x: cross validation ",
            "t: train a tokenizer ",
            "p: predict with tokenizer ",
            "e: evaluation on previously trained model"
        };
        for (String modeLine : modeLines) {
            System.err.println(modeLine);
        }
        System.exit(-1);
    }

    /**
     * Handles mode "s": validates the arguments, runs the 90/10 split evaluation and
     * writes the prediction and error lists to the given output files.
     *
     * @param args {@code s <sent-file> <tok-file> <predout-file> <errout-file>}
     */
    private static void start9010ValidationMode(String[] args) {
        if (args.length != 5) {
            System.err.println("usage: JTBD s <sent-file> <tok-file> <predout-file> <errout-file>");
            System.exit(-1);
        }
        File sentFile = new File(args[1]);
        File tokFile = new File(args[2]);
        File predictionTarget = new File(args[3]);
        File errorTarget = new File(args[4]);
        ArrayList<String> errorLines = new ArrayList<String>();
        ArrayList<String> predictedLines = new ArrayList<String>();
        // do9010Evaluation is called with (predictions, errors) in this order; it
        // forwards its two list parameters in swapped order itself.
        TokenizerApplication.do9010Evaluation(sentFile, tokFile, predictedLines, errorLines);
        TokenizerApplication.writeFile(predictedLines, predictionTarget);
        TokenizerApplication.writeFile(errorLines, errorTarget);
    }

    /**
     * Handles mode "c": validates the arguments and runs the data check.
     *
     * @param args {@code c <sent-file> <tok-file>}
     */
    private static void startCheckMode(String[] args) {
        boolean argCountOk = args.length == 3;
        if (!argCountOk) {
            System.err.println("usage: JTBD c <sent-file> <tok-file>");
            System.exit(-1);
        }
        TokenizerApplication.doCheck(new File(args[1]), new File(args[2]));
    }

    /**
     * Handles mode "e": loads a previously trained model, evaluates it on the given
     * file pair and writes the prediction and error lists to the given output files.
     *
     * <p>The process exits with status -1 when the argument count is wrong or the model
     * cannot be loaded.
     *
     * @param args {@code e <modelFile> <sent-file> <tok-file> <predout-file> <errout-file>}
     */
    private static void startCompareValidationMode(String[] args) {
        if (args.length != 6) {
            System.err.println("usage: JTBD e <modelFile> <sent-file> <tok-file> <predout-file> <errout-file>");
            System.exit(-1);
        }
        CRF crf = null;
        // NOTE(review): this uses Java native deserialization; only load model files
        // from trusted sources.
        try (ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(new FileInputStream(args[1])))) {
            crf = (CRF)in.readObject();
        }
        catch (IOException | ClassNotFoundException | ClassCastException e) {
            // Evaluating with a null model cannot work, so stop here instead of only
            // printing the stack trace and continuing.
            System.err.println("ERR: error reading model file: " + args[1]);
            e.printStackTrace();
            System.exit(-1);
        }
        File orgSentencesFile = new File(args[2]);
        File tokSentencesFile = new File(args[3]);
        ArrayList<String> orgSentences = TokenizerApplication.readFile(orgSentencesFile);
        ArrayList<String> tokSentences = TokenizerApplication.readFile(tokSentencesFile);
        File predOutFile = new File(args[4]);
        File errOutFile = new File(args[5]);
        ArrayList<String> errors = new ArrayList<String>();
        ArrayList<String> predictions = new ArrayList<String>();
        // The model-based doEvaluation takes (errors, predictions) in this order.
        // Passing them the other way round wrote the error report to the prediction
        // file and the predictions to the error file.
        TokenizerApplication.doEvaluation(crf, orgSentences, tokSentences, errors, predictions);
        TokenizerApplication.writeFile(predictions, predOutFile);
        TokenizerApplication.writeFile(errors, errOutFile);
    }

    /**
     * Handles mode "p": validates the arguments and both directories, then runs the
     * prediction.
     *
     * @param args {@code p <inDir> <outDir> <model-file>}
     * @throws IOException if the prediction fails with an I/O error
     */
    private static void startPredictionMode(String[] args) throws IOException {
        if (args.length != 4) {
            System.err.println("usage: JTBD p <inDir> <outDir> <model-file>");
            System.exit(-1);
        }
        File inputDirectory = new File(args[1]);
        if (!inputDirectory.isDirectory()) {
            System.err.println("Error: the specified input directory does not exist.");
            System.exit(-1);
        }
        File outputDirectory = new File(args[2]);
        boolean outputUsable = outputDirectory.isDirectory() && outputDirectory.canWrite();
        if (!outputUsable) {
            System.err.println("Error: the specified output directory does not exist or is not writable.");
            System.exit(-1);
        }
        TokenizerApplication.doPrediction(inputDirectory, outputDirectory, args[3]);
    }

    /**
     * Handles mode "t": validates the arguments and runs the training.
     *
     * @param args {@code t <sent-file> <tok-file> <model-file>}
     */
    private static void startTrainingMode(String[] args) {
        boolean argCountOk = args.length == 4;
        if (!argCountOk) {
            System.err.println("usage: JTBD t <sent-file> <tok-file> <model-file>");
            System.exit(-1);
        }
        File sentFile = new File(args[1]);
        File tokFile = new File(args[2]);
        TokenizerApplication.doTraining(sentFile, tokFile, args[3]);
    }

    /**
     * Handles mode "x": validates the arguments, runs the cross validation and writes
     * the prediction and error lists to the given output files.
     *
     * <p>The process exits with status -1 when the argument count is wrong or the
     * number of rounds is not a positive integer.
     *
     * @param args {@code x <sent-file> <tok-file> <cross-val-rounds> <predout-file> <errout-file>}
     */
    private static void startXValidationMode(String[] args) {
        if (args.length != 6) {
            System.err.println("usage: JTBD x <sent-file> <tok-file> <cross-val-rounds> <predout-file> <errout-file>");
            System.exit(-1);
        }
        File orgSentencesFile = new File(args[1]);
        File tokSentencesFile = new File(args[2]);
        int n = 0;
        try {
            // parseInt replaces the deprecated Integer(String) constructor.
            n = Integer.parseInt(args[3]);
        }
        catch (NumberFormatException e) {
            System.err.println("Error: <cross-val-rounds> must be an integer, but was: " + args[3]);
            System.exit(-1);
        }
        if (n < 1) {
            System.err.println("Error: <cross-val-rounds> must be at least 1, but was: " + n);
            System.exit(-1);
        }
        File predOutFile = new File(args[4]);
        File errOutFile = new File(args[5]);
        ArrayList<String> errors = new ArrayList<String>();
        ArrayList<String> predictions = new ArrayList<String>();
        // doCrossEvaluation is called with (predictions, errors) in this order; it
        // forwards its two list parameters in swapped order itself.
        TokenizerApplication.doCrossEvaluation(n, orgSentencesFile, tokSentencesFile, predictions, errors);
        TokenizerApplication.writeFile(predictions, predOutFile);
        TokenizerApplication.writeFile(errors, errOutFile);
    }

    /**
     * Writes the given lines to {@code outFile} as UTF-8, each followed by a single
     * {@code '\n'}. An existing file is overwritten. On an I/O error a message and the
     * stack trace are printed to standard error and the process exits with status -1.
     *
     * @param lines the lines to write
     * @param outFile the file to write to
     */
    static void writeFile(ArrayList<String> lines, File outFile) {
        // try-with-resources closes (and flushes) the writer even when writing fails;
        // the charset is fixed to UTF-8 instead of the platform default.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), StandardCharsets.UTF_8))) {
            for (String line : lines) {
                writer.write(line + "\n");
            }
        }
        catch (IOException e) {
            System.err.println("ERR: error writing file: " + outFile.toString());
            e.printStackTrace();
            System.exit(-1);
        }
    }

    /**
     * Plain holder for the figures of one evaluation run. The fields are filled in by
     * the evaluation code; the getters derive ratios from them. All ratios are NaN
     * when their denominator is zero.
     */
    private static class EvalResult {
        /** Accuracy as assigned by the evaluation code. */
        double ACC;
        /** Number of false positives. */
        double fp;
        /** Number of false negatives. */
        double fn;
        /** Number of correct decisions. */
        double corrDecisions;

        private EvalResult() {
        }

        /** Returns {@code 2 * R * P / (R + P)} with R and P as computed by the getters below. */
        double getF() {
            double recall = this.getR();
            double precision = this.getP();
            double numerator = 2.0 * recall * precision;
            return numerator / (recall + precision);
        }

        /** Returns {@code corrDecisions / (corrDecisions + fp)}. */
        double getP() {
            double denominator = this.corrDecisions + this.fp;
            return this.corrDecisions / denominator;
        }

        /** Returns {@code corrDecisions / (corrDecisions + fn)}. */
        double getR() {
            double denominator = this.corrDecisions + this.fn;
            return this.corrDecisions / denominator;
        }
    }
}

