/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.ae.opennlp.chunk.convert;

import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamReader;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;

/**
 * Converts XML documents made of {@code sentence}, {@code cons} and {@code tok} elements into
 * a line-based IOB chunk format: one {@code "token posTag chunkLabel"} line per token and an
 * empty line after each sentence.
 *
 * <p>NOTE(review): this file is decompiler output; the body of {@code tokenize} could not be
 * recovered, so the exact use of the two analysis engines and the JCas below is not visible here.
 */
public class ToIOBConverter {
    // Analysis engine created from the JTBD descriptor named below; presumably the token boundary
    // detector used by tokenize() - TODO confirm against the original source.
    private AnalysisEngine jtbd = AnalysisEngineFactory.createEngine((String)"de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english", (Object[])new Object[0]);
    // CAS instance created in the constructor; presumably the working document for tokenize() - TODO confirm.
    private JCas jCas;
    // Analysis engine created from the OpenNLP POS tagger descriptor named below; presumably used by
    // tokenize() to assign tags to the tokens it creates - TODO confirm.
    private AnalysisEngine pennbioIEPosTagger = AnalysisEngineFactory.createEngine((String)"de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp-postag-ae-biomedical-english", (Object[])new Object[0]);

    /**
     * Command line entry point.
     *
     * <p>Arguments: {@code <from file or dir> <to file or dir> [true: create single output file]}.
     * When the input is a directory, all contained files ending in {@code .xml} or
     * {@code .xml.gz} are converted. In single-file mode all inputs are appended to the one
     * output file; otherwise each input is written to a file in the output location whose name
     * is the input name with {@code ".xml"} replaced by {@code ".iob"}.
     *
     * <p>NOTE(review): in per-file mode the target is always built as
     * {@code <to>/<input name>.iob}, even when both arguments are plain files - confirm whether
     * a file-to-file conversion should write to {@code to} directly.
     *
     * @param args the command line arguments described above
     * @throws IllegalArgumentException if one path is an existing directory and the other an
     *                                  existing file while not in single-file mode
     * @throws Exception if listing, reading, converting or writing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: " + ToIOBConverter.class.getCanonicalName() + " <from file or dir> <to file or dir> [true: create single output file]");
            System.exit(1);
        }
        File from = new File(args[0]);
        File to = new File(args[1]);
        boolean singleFile = false;
        if (args.length == 3) {
            singleFile = Boolean.parseBoolean(args[2]);
        }
        if (!singleFile && (from.isFile() && to.exists() && to.isDirectory() || from.isDirectory() && to.exists() && to.isFile())) {
            throw new IllegalArgumentException("Both paths must be directories or both must be files.");
        }
        System.out.println("Input: " + from.getAbsolutePath());
        System.out.println("Output: " + to.getAbsolutePath());
        System.out.println("Output is written as a single file: " + singleFile);

        File[] inputFiles;
        if (from.isDirectory()) {
            inputFiles = from.listFiles((f, n) -> n.endsWith(".xml") || n.endsWith(".xml.gz"));
            if (inputFiles == null) {
                // listFiles signals an I/O error with null instead of an exception.
                throw new java.io.IOException("Could not list the files in " + from.getAbsolutePath());
            }
            if (!to.exists() && !singleFile) {
                System.out.println("Creating target directory " + to.getAbsolutePath());
                to.mkdirs();
            }
        } else {
            inputFiles = new File[]{from};
        }

        System.out.println("Converting");
        ToIOBConverter converter = new ToIOBConverter();
        if (singleFile) {
            try (BufferedWriter writerToFile = FileUtilities.getWriterToFile(to)) {
                for (File file : inputFiles) {
                    // Close every input as soon as it has been appended to the shared writer.
                    try (InputStream is = FileUtilities.getInputStreamFromFile(file)) {
                        converter.convert(is, writerToFile);
                    }
                }
            }
        } else {
            // Per-file output must only happen when no single output file was requested;
            // otherwise 'to' is a regular file and cannot serve as a parent directory.
            for (File file : inputFiles) {
                File targetFile = new File(to.getAbsolutePath() + File.separator + file.getName().replace(".xml", ".iob"));
                try (InputStream is = FileUtilities.getInputStreamFromFile(file);
                     BufferedWriter bw = FileUtilities.getWriterToFile(targetFile)) {
                    converter.convert(is, bw);
                }
            }
        }
        System.out.println("Done.");
    }

    /**
     * Converts the XML content of one file and writes the IOB lines to another file.
     * Both the input stream and the writer are closed when the conversion finishes or fails.
     *
     * @param from       the file to read the XML from
     * @param to         the file the IOB output is written to
     * @param singleFile not read by this method
     * @throws Exception if opening, converting or writing fails
     */
    public void convert(File from, File to, boolean singleFile) throws Exception {
        try (InputStream source = FileUtilities.getInputStreamFromFile(from);
             BufferedWriter target = FileUtilities.getWriterToFile(to)) {
            convert(source, target);
        }
    }

    /**
     * Streams the XML in {@code input} and writes one IOB line per token to {@code output}.
     *
     * <p>Handled elements:
     * <ul>
     * <li>{@code sentence}: start clears the buffered records; end writes the buffered records,
     * the last token and an empty line - unless a record labelled {@code S} carries a POS tag
     * that {@link #mapPosTagToPhraseType(String)} maps to a phrase type, in which case the whole
     * sentence is dropped.</li>
     * <li>{@code cons}: its {@code cat} attribute becomes the current chunk label and is pushed
     * onto a stack; on end the label is reset to {@code O} and the stack is popped.</li>
     * <li>{@code tok}: its {@code cat} attribute is the POS tag and its text the token. Text
     * containing a space is re-tokenized via {@code tokenize}, text containing {@code "("} is
     * passed through {@code balanceParenthesis}.</li>
     * </ul>
     *
     * <p>The most recent record is held back in {@code previousRecord} so that a punctuation
     * token which turns out not to be followed by a chunk continuation can still be relabelled
     * {@code O} before it is written.
     *
     * <p>Neither {@code input} nor {@code output} is closed by this method; only the XML reader
     * created here is.
     *
     * @param input  the XML to read
     * @param output the writer receiving the IOB lines
     * @throws Exception if XML parsing, tokenization or writing fails
     */
    public void convert(InputStream input, BufferedWriter output) throws Exception {
        XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(input);
        try {
            String consTag = "O";
            String iobState = "";
            String lastWrittenConsTag = "";
            List<ChunkRecord> sentenceRecords = new ArrayList<ChunkRecord>();
            ChunkRecord previousRecord = null;
            Stack<String> consTags = new Stack<String>();
            while (reader.hasNext()) {
                int eventType = reader.next();
                if (eventType == XMLStreamReader.START_ELEMENT) {
                    switch (reader.getLocalName()) {
                        case "sentence": {
                            sentenceRecords.clear();
                            break;
                        }
                        case "cons": {
                            consTag = reader.getAttributeValue("", "cat");
                            iobState = "";
                            consTags.push(consTag);
                            break;
                        }
                        case "tok": {
                            String tokTag = reader.getAttributeValue("", "cat");
                            // getElementText() also consumes the closing tok tag.
                            String tokText = reader.getElementText();
                            List<String[]> tokens = new ArrayList<String[]>();
                            tokens.add(new String[]{tokText, tokTag});
                            if (tokText.contains(" ")) {
                                tokens = this.tokenize(tokText);
                            }
                            if (tokText.contains("(")) {
                                tokens = this.balanceParenthesis(tokens);
                            }
                            for (String[] tokenAndTag : tokens) {
                                tokText = tokenAndTag[0];
                                tokTag = tokenAndTag[1];
                                if (iobState.isEmpty() && !consTag.equals("O")) {
                                    iobState = "B-";
                                } else if (iobState.equals("B-")) {
                                    iobState = "I-";
                                } else if (consTag.equals("O") && !consTags.isEmpty()) {
                                    // After a nested constituent closed, fall back to the enclosing
                                    // one, except for tokens tagged CC. Tokens outside of any
                                    // constituent (empty stack) simply stay "O".
                                    String outerConsTag = consTags.peek();
                                    if (!tokTag.equals("CC")) {
                                        consTag = outerConsTag;
                                        iobState = "B-";
                                    }
                                }
                                switch (tokTag) {
                                    case "LRB":
                                    case "RRB": {
                                        consTag = "O";
                                        iobState = "";
                                        break;
                                    }
                                    case "COMMA": {
                                        // A comma that would begin a chunk is put outside instead.
                                        if (iobState.equals("B-")) {
                                            consTag = "O";
                                            iobState = "";
                                        }
                                        break;
                                    }
                                    default:
                                        break;
                                }
                                if (lastWrittenConsTag.equals(consTag) && !consTag.equalsIgnoreCase("O")) {
                                    iobState = "I-";
                                }
                                // Punctuation that is not continued by the current token is moved
                                // out of its chunk.
                                if (previousRecord != null && !iobState.equals("I-") && this.isPunctuation(previousRecord.tokTag)) {
                                    previousRecord.consTag = "O";
                                    previousRecord.iobState = "";
                                }
                                tokTag = this.mapPennTreebankToPennBioIETag(tokTag);
                                tokTag = this.mapTokenToPennBioIETag(tokText, tokTag);
                                if (previousRecord != null) {
                                    sentenceRecords.add(previousRecord);
                                }
                                previousRecord = new ChunkRecord(tokText, tokTag, iobState, consTag);
                            }
                            lastWrittenConsTag = consTag;
                            break;
                        }
                        default:
                            break;
                    }
                } else if (eventType == XMLStreamReader.END_ELEMENT) {
                    switch (reader.getLocalName()) {
                        case "cons": {
                            consTag = "O";
                            iobState = "";
                            consTags.pop();
                            break;
                        }
                        case "sentence": {
                            boolean omitSentence = false;
                            for (ChunkRecord cr : sentenceRecords) {
                                if (!cr.consTag.equals("S")) {
                                    continue;
                                }
                                if (!this.mapPosTagToPhraseType(cr.tokTag).isEmpty()) {
                                    omitSentence = true;
                                    continue;
                                }
                                cr.consTag = "O";
                                cr.iobState = "";
                            }
                            if (omitSentence) {
                                previousRecord = null;
                                break;
                            }
                            if (previousRecord == null) {
                                // The sentence contained no tokens, so there is nothing to write.
                                break;
                            }
                            for (ChunkRecord cr : sentenceRecords) {
                                output.write(cr.getRecordLine());
                            }
                            // The held-back last token cannot be continued anymore.
                            if (this.isPunctuation(previousRecord.tokTag)) {
                                previousRecord.consTag = "O";
                                previousRecord.iobState = "";
                            }
                            output.write(previousRecord.getRecordLine());
                            previousRecord = null;
                            output.write("\n");
                            break;
                        }
                        default:
                            break;
                    }
                }
            }
        } finally {
            // Releases the parser resources; this does not close the underlying input stream.
            reader.close();
        }
    }

    /**
     * Recomputes the IOB prefix of the record at position {@code i} from its left neighbour and
     * adjusts the right neighbour if it continues the same chunk.
     *
     * <p>A record labelled {@code O} keeps its prefix, since {@code O} is never written with a
     * {@code B-}/{@code I-} prefix elsewhere in this class. Any other record gets {@code I-} when
     * the previous record has the same label and {@code B-} otherwise, including when it is the
     * first record of the sentence.
     *
     * @param sentenceRecords the records of the sentence
     * @param i               the index of {@code cr} within {@code sentenceRecords}
     * @param cr              the record to repair
     */
    public void repairEnvironment(List<ChunkRecord> sentenceRecords, int i, ChunkRecord cr) {
        if (!cr.consTag.equals("O")) {
            // The first record has no left neighbour and therefore always begins a chunk.
            boolean continuesPrevious = i > 0 && sentenceRecords.get(i - 1).consTag.equals(cr.consTag);
            cr.iobState = continuesPrevious ? "I-" : "B-";
        }
        if (i < sentenceRecords.size() - 1) {
            ChunkRecord nextCr = sentenceRecords.get(i + 1);
            if (nextCr.consTag.equals(cr.consTag) && !nextCr.consTag.equals("O")) {
                nextCr.iobState = "I-";
            }
        }
    }

    /**
     * Tells whether a POS tag is one of the punctuation tags
     * {@code .}, {@code ,}, {@code :}, {@code ``}, {@code ''}, {@code -LRB-} or {@code -RRB-}.
     *
     * @param tokTag the POS tag to test; must not be null
     * @return {@code true} if the tag is one of the listed punctuation tags
     */
    private boolean isPunctuation(String tokTag) {
        return tokTag.equals(".")
                || tokTag.equals(",")
                || tokTag.equals(":")
                || tokTag.equals("``")
                || tokTag.equals("''")
                || tokTag.equals("-LRB-")
                || tokTag.equals("-RRB-");
    }

    /**
     * Merges consecutive tokens so that opening and closing round brackets are balanced.
     *
     * <p>Bracket counts are accumulated over the whole list. While more {@code (} than
     * {@code )} have been seen, the following tokens are appended (without separator) to the
     * current one; the merged token keeps the tag of its first part. The list is returned
     * unchanged when it is empty or when its first token is tagged {@code -LRB-}.
     *
     * @param tokens pairs of {@code [token text, tag]}
     * @return the given list if no merging is attempted, otherwise a new list of
     *         {@code [token text, tag]} pairs
     */
    private List<String[]> balanceParenthesis(List<String[]> tokens) {
        if (tokens.isEmpty() || tokens.get(0)[1].equals("-LRB-")) {
            return tokens;
        }
        List<String[]> ret = new ArrayList<String[]>(tokens.size());
        int numOpen = 0;
        int numClose = 0;
        for (int i = 0; i < tokens.size(); ++i) {
            String[] tokPos = tokens.get(i);
            StringBuilder token = new StringBuilder(tokPos[0]);
            numOpen += this.countOccurrences(tokPos[0], '(');
            numClose += this.countOccurrences(tokPos[0], ')');
            // Swallow following tokens until every opened bracket has been closed again.
            while (numOpen > numClose && i < tokens.size() - 1) {
                String nextTok = tokens.get(++i)[0];
                numOpen += this.countOccurrences(nextTok, '(');
                numClose += this.countOccurrences(nextTok, ')');
                token.append(nextTok);
            }
            ret.add(new String[]{token.toString(), tokPos[1]});
        }
        return ret;
    }

    /** Counts how often the character {@code c} occurs in {@code text}. */
    private int countOccurrences(String text, char c) {
        int count = 0;
        for (int pos = text.indexOf(c); pos >= 0; pos = text.indexOf(c, pos + 1)) {
            ++count;
        }
        return count;
    }

    /**
     * Maps a POS tag to the phrase type it indicates.
     *
     * @param tag the POS tag; must not be null
     * @return {@code "VP"} for modal and verb tags, {@code "NP"} for noun and pronoun tags,
     *         {@code "PP"} for {@code IN}, and the empty string for every other tag
     */
    public String mapPosTagToPhraseType(String tag) {
        String phraseType;
        switch (tag) {
            case "IN":
                phraseType = "PP";
                break;
            case "NN":
            case "NNS":
            case "NNP":
            case "NNPS":
            case "PRP":
            case "PRP$":
            case "WP":
            case "WP$":
                phraseType = "NP";
                break;
            case "MD":
            case "VB":
            case "VBC":
            case "VBD":
            case "VBF":
            case "VBG":
            case "VBN":
            case "VBP":
            case "VBZ":
                phraseType = "VP";
                break;
            default:
                phraseType = "";
                break;
        }
        return phraseType;
    }

    /**
     * Replaces the spelled-out tag names used in the input by their symbolic counterparts,
     * e.g. {@code COMMA} becomes {@code ,} and {@code LRB} becomes {@code -LRB-}.
     *
     * @param tag the tag as found in the input; must not be null
     * @return the mapped tag, or {@code tag} itself when no mapping is defined for it
     */
    public String mapPennTreebankToPennBioIETag(String tag) {
        if (tag.equals("COLON")) {
            return ":";
        }
        if (tag.equals("COMMA")) {
            return ",";
        }
        if (tag.equals("PERIOD")) {
            return ".";
        }
        if (tag.equals("LQT")) {
            return "``";
        }
        if (tag.equals("RQT")) {
            return "''";
        }
        if (tag.equals("LRB")) {
            return "-LRB-";
        }
        if (tag.equals("RRB")) {
            return "-RRB-";
        }
        if (tag.equals("PRPP")) {
            return "PRP$";
        }
        if (tag.equals("WPP")) {
            return "WP$";
        }
        return tag;
    }

    /**
     * Overrides the tag of specific tokens: a token consisting of a single hyphen is tagged
     * {@code HYPH}.
     *
     * @param token       the token text; must not be null
     * @param originalTag the tag the token currently has
     * @return {@code "HYPH"} if {@code token} is {@code "-"}, otherwise {@code originalTag}
     */
    public String mapTokenToPennBioIETag(String token, String originalTag) {
        return token.equals("-") ? "HYPH" : originalTag;
    }

    /**
     * Creates a converter and its JCas, which is created from the descriptor name
     * {@code de.julielab.jcore.types.jcore-morpho-syntax-types}.
     *
     * @throws Exception if the JCas or one of the analysis engine fields cannot be created
     */
    public ToIOBConverter() throws Exception {
        String[] typeSystemDescriptorNames = {"de.julielab.jcore.types.jcore-morpho-syntax-types"};
        this.jCas = JCasFactory.createJCas(typeSystemDescriptorNames);
    }

    /**
     * Splits a token text into {@code [token text, tag]} pairs. It is called by the stream-based
     * {@code convert} method for every {@code tok} element whose text contains a space.
     *
     * <p>WARNING: the decompiler could not recover this method. The body below is a placeholder
     * that always throws, so converting any input containing a token with a space fails.
     *
     * <p>NOTE(review): presumably the original ran the {@code jtbd} and {@code pennbioIEPosTagger}
     * engines on {@code jCas} - those fields are not used anywhere else in this file. TODO restore
     * the implementation from the original source.
     *
     * @param text the token text to split
     * @return never returns in this decompiled form
     * @throws AnalysisEngineProcessException declared by the original method
     * @throws IllegalStateException always, in this decompiled form
     */
    private List<String[]> tokenize(String text) throws AnalysisEngineProcessException {
        /*
         * This method has failed to decompile.  When submitting a bug report, please provide this stack trace, and (if you hold appropriate legal rights) the relevant class file.
         * 
         * java.lang.UnsupportedOperationException
         *     at org.benf.cfr.reader.bytecode.analysis.parse.expression.NewAnonymousArray.getDimSize(NewAnonymousArray.java:142)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.isNewArrayLambda(LambdaRewriter.java:455)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewriteDynamicExpression(LambdaRewriter.java:409)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewriteDynamicExpression(LambdaRewriter.java:167)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewriteExpression(LambdaRewriter.java:105)
         *     at org.benf.cfr.reader.bytecode.analysis.parse.rewriters.ExpressionRewriterHelper.applyForwards(ExpressionRewriterHelper.java:12)
         *     at org.benf.cfr.reader.bytecode.analysis.parse.expression.AbstractMemberFunctionInvokation.applyExpressionRewriterToArgs(AbstractMemberFunctionInvokation.java:101)
         *     at org.benf.cfr.reader.bytecode.analysis.parse.expression.AbstractMemberFunctionInvokation.applyExpressionRewriter(AbstractMemberFunctionInvokation.java:88)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewriteExpression(LambdaRewriter.java:103)
         *     at org.benf.cfr.reader.bytecode.analysis.parse.expression.AbstractMemberFunctionInvokation.applyExpressionRewriter(AbstractMemberFunctionInvokation.java:87)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewriteExpression(LambdaRewriter.java:103)
         *     at org.benf.cfr.reader.bytecode.analysis.structured.statement.StructuredReturn.rewriteExpressions(StructuredReturn.java:99)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.op4rewriters.LambdaRewriter.rewrite(LambdaRewriter.java:88)
         *     at org.benf.cfr.reader.bytecode.analysis.opgraph.Op04StructuredStatement.rewriteLambdas(Op04StructuredStatement.java:1137)
         *     at org.benf.cfr.reader.bytecode.CodeAnalyser.getAnalysisInner(CodeAnalyser.java:912)
         *     at org.benf.cfr.reader.bytecode.CodeAnalyser.getAnalysisOrWrapFail(CodeAnalyser.java:278)
         *     at org.benf.cfr.reader.bytecode.CodeAnalyser.getAnalysis(CodeAnalyser.java:201)
         *     at org.benf.cfr.reader.entities.attributes.AttributeCode.analyse(AttributeCode.java:94)
         *     at org.benf.cfr.reader.entities.Method.analyse(Method.java:531)
         *     at org.benf.cfr.reader.entities.ClassFile.analyseMid(ClassFile.java:1055)
         *     at org.benf.cfr.reader.entities.ClassFile.analyseTop(ClassFile.java:942)
         *     at org.benf.cfr.reader.Driver.doJarVersionTypes(Driver.java:257)
         *     at org.benf.cfr.reader.Driver.doJar(Driver.java:139)
         *     at org.benf.cfr.reader.CfrDriverImpl.analyse(CfrDriverImpl.java:76)
         *     at org.benf.cfr.reader.Main.main(Main.java:54)
         */
        throw new IllegalStateException("Decompilation failed");
    }

    private class ChunkRecord {
        private String tokenPart;
        private String tokTag;
        private String iobState;
        private String consTag;

        public ChunkRecord(String tokenPart, String tokTag, String iobState, String consTag) {
            this.tokenPart = tokenPart;
            this.tokTag = tokTag;
            this.iobState = iobState;
            this.consTag = consTag;
        }

        public String getRecordLine() {
            return this.tokenPart + " " + this.tokTag + " " + this.iobState + this.consTag + "\n";
        }
    }
}

