/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jtbd;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import de.julielab.jtbd.TokenBoundarySymbols;
import de.julielab.jtbd.Unit;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Pipe that converts a sentence into a sequence of units with features and labels.
 *
 * <p>{@link #pipe(Instance)} reads the original sentence from the carrier's data and a
 * tokenized version of it from the carrier's source (both as {@code String}). The original
 * sentence is cut into units at whitespace and at every symbol contained in the set returned
 * by {@code TokenBoundarySymbols.getSymbols()}. Each unit becomes a {@link Token} carrying
 * unit-level ({@code U_*}) and super-unit-level ({@code SU_*}) features; a super-unit is a
 * maximal run of non-whitespace characters of the original sentence.
 *
 * <p>All regular expressions are compiled once as constants. The regex strings themselves are
 * unchanged so that the emitted feature names and values stay the same.
 */
class Sentence2TokenPipe extends Pipe {
    private static final long serialVersionUID = 1L;
    private static final Logger LOGGER = LoggerFactory.getLogger(Sentence2TokenPipe.class);

    /** Character-class content (without brackets) for upper-case letters incl. some accented ones. */
    private static final String CAPS = "A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00c4\u00d6\u00dc";
    /** Character-class content (without brackets) for lower-case letters incl. some accented ones. */
    private static final String LOW = "a-z\u00e0\u00e8\u00ec\u00f2\u00f9\u00e1\u00e9\u00ed\u00f3\u00fa\u00e7\u00f1\u00ef\u00e4\u00f6\u00fc";

    /** Label of a unit that is directly followed by whitespace in the tokenized sentence. */
    private static final String LABEL_WS_FOLLOWS = "P";
    /** Label of a unit that is not directly followed by whitespace in the tokenized sentence. */
    private static final String LABEL_NO_WS_FOLLOWS = "N";
    /** Marker for a unit that is followed by whitespace in the original sentence. */
    private static final String WS = "WS";
    /** Marker for a unit that is not followed by whitespace in the original sentence. */
    private static final String NO_WS = "noWS";

    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    // Unit-level patterns.
    private static final Pattern ASCII_UPPER_RUN = Pattern.compile("[A-Z]+");
    private static final Pattern ASCII_LOWER_RUN = Pattern.compile("[a-z]+");
    private static final Pattern DIGIT_RUN = Pattern.compile("[0-9]+");
    private static final Pattern ASCII_OTHER_RUN = Pattern.compile("[^A-Za-z0-9]+");
    private static final Pattern ABBR_ONE_CAP = Pattern.compile("[" + CAPS + "]\\.");
    private static final Pattern ABBR_DOTTED_LETTERS = Pattern.compile("([" + CAPS + LOW + "]\\.)+");
    private static final Pattern ABBR_LOWER = Pattern.compile("[" + LOW + "]+\\.");
    private static final Pattern INIT_CAPS = Pattern.compile("[" + CAPS + "].*");
    private static final Pattern ONE_CAPS = Pattern.compile("[" + CAPS + "]");
    private static final Pattern CAPS_RUN = Pattern.compile("[" + CAPS + "]+");
    private static final Pattern LOW_RUN = Pattern.compile("[" + LOW + "]+");
    private static final Pattern ALPHANUMERIC = Pattern.compile(
            "(.*[" + CAPS + LOW + "].*[0-9].*|.*[0-9].*[" + CAPS + LOW + "].*)");
    private static final Pattern ROMAN = Pattern.compile("[IVXDLCM]+");
    private static final Pattern HAS_ROMAN = Pattern.compile(".*\\b[IVXDLCM]+\\b.*");
    private static final Pattern REAL_NUMBER = Pattern.compile("[-0-9]+[.,]+[0-9.,]+");
    private static final Pattern HAS_DIGITS = Pattern.compile(".*[0-9]+.*");
    private static final Pattern BEGIN_BRACKETS = Pattern.compile("(\\(.*|\\[.*)");

    // Super-unit-level patterns.
    // NOTE(review): as written this requires a word character followed by one or more ']'.
    // It looks like a typo for ".*[\\w]+.*", but it is kept unchanged because altering it would
    // change the features seen by already-trained models. Confirm and fix together with retraining.
    private static final Pattern SU_ALPHANUMERIC = Pattern.compile(".*[\\w]]+.*");
    private static final Pattern SU_IN_BRACKETS = Pattern.compile("\\(.*\\)|\\[.*\\]");
    private static final Pattern SU_CLOSED_BRACKETS = Pattern.compile(".*\\(.*\\).*|.*\\[.*\\].*");
    private static final Pattern SU_LEFT_BRACKET = Pattern.compile(".*\\(.*|.*\\[.*");
    private static final Pattern SU_RIGHT_BRACKET = Pattern.compile(".*\\).*|.*\\].*");
    private static final Pattern SU_ARROW = Pattern.compile(".*-->.*");
    private static final Pattern SU_IS_DOUBLE_DASH = Pattern.compile("----");
    private static final Pattern SU_HAS_DOUBLE_DASH = Pattern.compile(".*----.*");
    private static final Pattern SU_IS_DASH = Pattern.compile("--");
    private static final Pattern SU_HAS_DASH = Pattern.compile(".*--.*");
    private static final Pattern SU_PLUS_MINUS = Pattern.compile(".*[+-]/[+-].*");
    private static final Pattern SU_PM_BRACKETS = Pattern.compile(".*\\([+-]\\).*");
    private static final Pattern SU_ENUMERATION = Pattern.compile("\\(([0-9]|[a-h]|i|ii|iii|iv|v)\\)");
    private static final Pattern SU_BRACKETED_PLURAL = Pattern.compile(".*\\(s\\)");
    private static final Pattern SU_GENITIVE = Pattern.compile(".*'s");
    private static final Pattern SU_CHEMICAL = Pattern.compile("(.*[\\W].*){5,}");
    private static final Pattern SU_OTHER_RUN = Pattern.compile("[^" + CAPS + LOW + "0-9]+");
    private static final Pattern SU_WWW_URL = Pattern.compile("\\(?www\\..*?\\)?");
    private static final Pattern SU_HTTP_URL = Pattern.compile("\\(?http:.*?\\)?");
    private static final Pattern SU_FTP_URL = Pattern.compile("\\(?ftp:.*?\\)?");

    // Instance fields are left exactly as they were so the serialized form does not change.
    private final Set<String> tbSymbols;
    private final Pattern splitPattern = Pattern.compile("[^\\s]+");

    /**
     * Creates the pipe with a fresh data {@link Alphabet} and target {@link LabelAlphabet} and
     * loads the token boundary symbols from {@code TokenBoundarySymbols.getSymbols()}.
     */
    public Sentence2TokenPipe() {
        super(new Alphabet(), new LabelAlphabet());
        this.tbSymbols = TokenBoundarySymbols.getSymbols();
    }

    /** Returns {@code true} if the whole of {@code text} matches {@code pattern}. */
    private static boolean matches(Pattern pattern, String text) {
        return pattern.matcher(text).matches();
    }

    /** Returns {@code true} if the one-character string {@code c} is matched by {@code \s}. */
    private static boolean isWhitespace(String c) {
        return WHITESPACE.matcher(c).matches();
    }

    /** Traces the current unit and whitespace lists; the strings are only built when TRACE is on. */
    private static void traceState(ArrayList<Unit> units, ArrayList<String> wSpaces) {
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("makeUnits() - " + units + " -- " + wSpaces);
        }
    }

    /**
     * Splits a line into its super-units, i.e. the maximal runs of non-whitespace characters.
     *
     * @param line the text to split
     * @return the super-units in order of occurrence (empty if the line has none)
     */
    private ArrayList<String> getSuperUnits(String line) {
        Matcher m = this.splitPattern.matcher(line);
        ArrayList<String> superUnits = new ArrayList<String>();
        while (m.find()) {
            superUnits.add(m.group());
        }
        return superUnits;
    }

    /**
     * Derives one label per unit from a tokenized sentence.
     *
     * <p>The sentence is scanned character by character. A run of ordinary characters forms one
     * unit; every token boundary symbol forms a unit of its own. A unit gets label {@code "P"}
     * when it is directly followed by whitespace and {@code "N"} otherwise (including at the end
     * of the sentence).
     *
     * @param tokSentence the tokenized sentence
     * @return the labels, one per unit, in sentence order
     */
    public ArrayList<String> makeLabels(String tokSentence) {
        LOGGER.trace("makeLabels()");
        ArrayList<String> labels = new ArrayList<String>();
        int length = tokSentence.length();
        // True while a run of ordinary (non-whitespace, non-boundary) characters is open.
        boolean unitOpen = false;
        for (int pos = 0; pos < length; ++pos) {
            String c = String.valueOf(tokSentence.charAt(pos));
            LOGGER.trace("makeLabels() - {}", c);
            if (isWhitespace(c)) {
                LOGGER.trace("makeLabels() - found WS");
                if (unitOpen) {
                    unitOpen = false;
                    LOGGER.trace("makeLabels() - adding label P");
                    labels.add(LABEL_WS_FOLLOWS);
                }
                continue;
            }
            if (this.tbSymbols.contains(c)) {
                LOGGER.trace("makeLabels() - found TB");
                if (unitOpen) {
                    // The boundary symbol closes the open unit without whitespace in between.
                    unitOpen = false;
                    LOGGER.trace("makeLabels() - adding label N");
                    labels.add(LABEL_NO_WS_FOLLOWS);
                }
                boolean wsFollows = pos + 1 < length
                        && isWhitespace(String.valueOf(tokSentence.charAt(pos + 1)));
                if (wsFollows) {
                    LOGGER.trace("makeLabels() - label P");
                    labels.add(LABEL_WS_FOLLOWS);
                } else {
                    LOGGER.trace("makeLabels() - label N");
                    labels.add(LABEL_NO_WS_FOLLOWS);
                }
                continue;
            }
            LOGGER.trace("makeLabels() - token");
            unitOpen = true;
        }
        LOGGER.trace("makeLabels() -  {}", tokSentence);
        if (unitOpen) {
            labels.add(LABEL_NO_WS_FOLLOWS);
        }
        LOGGER.trace("makeLabels() - {}", labels);
        return labels;
    }

    /**
     * Cuts the original sentence into units and records for each unit whether whitespace follows.
     *
     * <p>For every unit appended to {@code units} exactly one entry ({@code "WS"} or
     * {@code "noWS"}) is appended to {@code wSpaces}. Unit offsets are character offsets into
     * {@code orgSentence} with an exclusive end.
     *
     * @param orgSentence the original sentence
     * @param units output list the created units are appended to
     * @param wSpaces output list the whitespace markers are appended to
     */
    public void makeUnits(String orgSentence, ArrayList<Unit> units, ArrayList<String> wSpaces) {
        LOGGER.trace("makeUnits() - making units...");
        ArrayList<String> superUnits = this.getSuperUnits(orgSentence);
        // NOTE(review): the super-unit index is only advanced when whitespace terminates a pending
        // ordinary unit. When a boundary symbol directly precedes the whitespace (e.g. "Hello, world")
        // nothing is pending at the whitespace, the index is not advanced, and later units are paired
        // with an earlier super-unit. Kept as is because changing it alters the SU_* features of
        // already-trained models; confirm and fix together with retraining.
        int superUnitIndex = 0;
        int length = orgSentence.length();
        // Invariant: orgSentence[start, pos) is the pending ordinary unit; it is empty iff start == pos.
        int start = 0;
        for (int pos = 0; pos < length; ++pos) {
            String c = String.valueOf(orgSentence.charAt(pos));
            LOGGER.trace("makeUnits() - {}", c);
            if (isWhitespace(c)) {
                LOGGER.trace("makeUnits() - WS");
                if (start < pos) {
                    String text = orgSentence.substring(start, pos);
                    units.add(new Unit(start, pos, text, superUnits.get(superUnitIndex)));
                    LOGGER.trace("makeUnits() -adding unit:{}!", text);
                    wSpaces.add(WS);
                    ++superUnitIndex;
                }
                traceState(units, wSpaces);
                start = pos + 1;
                continue;
            }
            if (this.tbSymbols.contains(c)) {
                LOGGER.trace("makeUnits() - TB");
                if (start < pos) {
                    // The boundary symbol closes the pending unit without whitespace in between.
                    String text = orgSentence.substring(start, pos);
                    units.add(new Unit(start, pos, text, superUnits.get(superUnitIndex)));
                    LOGGER.trace("makeUnits() - Adding unit:{}!", text);
                    wSpaces.add(NO_WS);
                    start = pos;
                    LOGGER.trace("makeUnits() - SE:{}.{}", start, pos);
                }
                LOGGER.trace("makeUnits() - adding unit:{}!!", c);
                boolean wsFollows = pos + 1 < length
                        && isWhitespace(String.valueOf(orgSentence.charAt(pos + 1)));
                wSpaces.add(wsFollows ? WS : NO_WS);
                // The boundary symbol itself is a unit of length one.
                int end = pos + 1;
                LOGGER.trace("makeUnits() - SE:{}.{}", start, end);
                units.add(new Unit(start, end, c, superUnits.get(superUnitIndex)));
                start = end;
                traceState(units, wSpaces);
                continue;
            }
            LOGGER.trace("makeUnits() - token");
        }
        LOGGER.trace("makeUnits() - {}", orgSentence);
        if (start < length) {
            // The sentence ended inside an ordinary unit.
            units.add(new Unit(start, length, orgSentence.substring(start), superUnits.get(superUnitIndex)));
            wSpaces.add(NO_WS);
        }
        if (LOGGER.isTraceEnabled()) {
            // Rebuild the sentence from the units purely for diagnostic comparison.
            StringBuilder rebuilt = new StringBuilder();
            for (int j = 0; j < units.size(); ++j) {
                LOGGER.trace("makeUnits() - " + units.get(j) + "\t" + wSpaces.get(j));
                rebuilt.append(units.get(j).rep);
                if (wSpaces.get(j).equals(WS)) {
                    rebuilt.append(" ");
                }
            }
            LOGGER.trace("makeUnits() -org: " + orgSentence);
            LOGGER.trace("makeUnits() -new: " + rebuilt);
            LOGGER.trace("makeUnits() - " + units);
        }
    }

    /**
     * Converts the sentence held by {@code carrier} into tokens with features and labels.
     *
     * <p>Input: the carrier's data is the original sentence and its source is the tokenized
     * sentence (both {@code String}). If the tokenized sentence is empty, every unit is labeled
     * {@code "N"}.
     *
     * <p>Output on success: data is the {@link TokenSequence}, target the {@link LabelSequence},
     * name the list of units and source the list of whitespace markers. If the number of units,
     * labels and whitespace markers disagree, an error is logged and the carrier is returned with
     * an empty token sequence and label sequence, its name set to the units, and its source left
     * untouched.
     *
     * @param carrier the instance to transform
     * @return the same instance, modified in place
     */
    @Override
    public Instance pipe(Instance carrier) {
        String orgSentence = (String) carrier.getData();
        String tokSentence = (String) carrier.getSource();
        TokenSequence data = new TokenSequence();
        LabelSequence target = new LabelSequence(this.getTargetAlphabet());
        ArrayList<Unit> units = new ArrayList<Unit>();
        ArrayList<String> wSpaces = new ArrayList<String>();
        this.makeUnits(orgSentence, units, wSpaces);

        ArrayList<String> labels;
        if (tokSentence.length() > 0) {
            labels = this.makeLabels(tokSentence);
        } else {
            labels = new ArrayList<String>(units.size());
            for (int k = 0; k < units.size(); ++k) {
                labels.add(LABEL_NO_WS_FOLLOWS);
            }
        }

        if (units.size() != labels.size() || labels.size() != wSpaces.size()) {
            // Only an Integer name can be turned into a position; anything else is reported as
            // "unknown" instead of failing with a ClassCastException while reporting the error.
            Object name = carrier.getName();
            int pos = name instanceof Integer ? ((Integer) name).intValue() + 1 : -1;
            LOGGER.error("Something's wrong with unit creation. Number of units: {}; number of labels: {}; number of whitespaces: {}", units.size(), labels.size(), wSpaces.size());
            LOGGER.error("pipe() - Unit and label extraction produced failure (at position " + (pos == -1 ? "unknown" : Integer.valueOf(pos)) + "). Omitting sentences for feature generation...\n" + orgSentence + "\n" + tokSentence);
            carrier.setData(data);
            carrier.setTarget(target);
            carrier.setName(units);
            return carrier;
        }

        for (int i = 0; i < units.size(); ++i) {
            Unit unit = units.get(i);
            Token token = new Token(unit.rep);
            // Features are set in a fixed order (unit features first, then super-unit features).
            this.addUnitFeatures(token, unit.rep, wSpaces.get(i).equals(WS));
            addSuperUnitFeatures(token, unit.superUnitRep, unit.rep);
            data.add(token);
            target.add(labels.get(i));
        }
        carrier.setData(data);
        carrier.setTarget(target);
        carrier.setName(units);
        carrier.setSource(wSpaces);
        return carrier;
    }

    /**
     * Sets the {@code U_*} features, which depend only on the unit itself.
     *
     * @param token the token to add features to
     * @param unitRep the text of the unit
     * @param hasRightWhiteSpace whether the unit is followed by whitespace in the original sentence
     */
    private void addUnitFeatures(Token token, String unitRep, boolean hasRightWhiteSpace) {
        token.setFeatureValue("U_lex=" + unitRep, 1.0);
        if (hasRightWhiteSpace) {
            token.setFeatureValue("U_HasRightWhiteSpace", 1.0);
        }
        if (this.tbSymbols.contains(unitRep)) {
            token.setFeatureValue("U_isTokenBoundarySymbol", 1.0);
        }

        // Brief word class: collapse character runs to a single class symbol. Unlike the
        // super-unit variant this one uses the plain ASCII letter ranges.
        String bwc = ASCII_UPPER_RUN.matcher(unitRep).replaceAll("A");
        bwc = ASCII_LOWER_RUN.matcher(bwc).replaceAll("a");
        bwc = DIGIT_RUN.matcher(bwc).replaceAll("0");
        bwc = ASCII_OTHER_RUN.matcher(bwc).replaceAll("x");
        token.setFeatureValue("U_BWC=" + bwc, 1.0);

        if (unitRep.length() <= 3) {
            token.setFeatureValue("U_SIZE1", 1.0);
        } else if (unitRep.length() <= 6) {
            token.setFeatureValue("U_SIZE2", 1.0);
        } else {
            token.setFeatureValue("U_SIZE3", 1.0);
        }

        if (matches(ABBR_ONE_CAP, unitRep)) {
            token.setFeatureValue("U_ABBR1", 1.0);
        }
        if (matches(ABBR_DOTTED_LETTERS, unitRep)) {
            token.setFeatureValue("U_ABBR2", 1.0);
        }
        if (matches(ABBR_LOWER, unitRep)) {
            token.setFeatureValue("U_ABBR3", 1.0);
        }
        if (matches(INIT_CAPS, unitRep)) {
            token.setFeatureValue("U_INITCAPS", 1.0);
        }
        if (matches(ONE_CAPS, unitRep)) {
            token.setFeatureValue("U_ONECAPS", 1.0);
        }
        if (matches(CAPS_RUN, unitRep)) {
            token.setFeatureValue("U_ALLCAPS", 1.0);
        }
        if (matches(ALPHANUMERIC, unitRep)) {
            token.setFeatureValue("U_ALPHANUMERIC", 1.0);
        }
        if (matches(ROMAN, unitRep)) {
            token.setFeatureValue("U_ROMAN", 1.0);
        }
        if (matches(HAS_ROMAN, unitRep)) {
            token.setFeatureValue("U_HASROMAN", 1.0);
        }
        if (matches(DIGIT_RUN, unitRep)) {
            token.setFeatureValue("U_NATURALNUMBER", 1.0);
        }
        if (matches(REAL_NUMBER, unitRep)) {
            token.setFeatureValue("U_REALNUMBER", 1.0);
        }
        if (matches(HAS_DIGITS, unitRep)) {
            token.setFeatureValue("U_HASDIGITS", 1.0);
        }
        if (matches(BEGIN_BRACKETS, unitRep)) {
            token.setFeatureValue("U_BEGINBRACKETS", 1.0);
        }
    }

    /**
     * Sets the {@code SU_*} features, which describe the super-unit associated with the unit.
     *
     * @param token the token to add features to
     * @param superUnitRep the text of the super-unit associated with the unit
     * @param unitRep the text of the unit (only used for the arrow feature)
     */
    private static void addSuperUnitFeatures(Token token, String superUnitRep, String unitRep) {
        token.setFeatureValue("SU_lex=" + superUnitRep, 1.0);
        if (matches(SU_ALPHANUMERIC, superUnitRep)) {
            token.setFeatureValue("SU_isAlphanumeric", 1.0);
        }

        // Bracket features are mutually exclusive; the first matching one wins.
        if (matches(SU_IN_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_inBrackets", 1.0);
        } else if (matches(SU_CLOSED_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_hasClosedBrackets", 1.0);
        } else if (matches(SU_LEFT_BRACKET, superUnitRep)) {
            token.setFeatureValue("SU_hasLeftBracketOnly", 1.0);
        } else if (matches(SU_RIGHT_BRACKET, superUnitRep)) {
            token.setFeatureValue("SU_hasRightBracketOnly", 1.0);
        }

        if (matches(SU_ARROW, superUnitRep) && (unitRep.equals("-") || unitRep.equals(">"))) {
            token.setFeatureValue("SU_isPartOfArrow", 1.0);
        }

        // Dash features are mutually exclusive; the first matching one wins.
        if (matches(SU_IS_DOUBLE_DASH, superUnitRep)) {
            token.setFeatureValue("SU_isDoubleDash", 1.0);
        } else if (matches(SU_HAS_DOUBLE_DASH, superUnitRep)) {
            token.setFeatureValue("SU_hasDoubleDash", 1.0);
        } else if (matches(SU_IS_DASH, superUnitRep)) {
            token.setFeatureValue("SU_isDash", 1.0);
        } else if (matches(SU_HAS_DASH, superUnitRep)) {
            token.setFeatureValue("SU_hasDash", 1.0);
        }

        if (matches(SU_PLUS_MINUS, superUnitRep)) {
            token.setFeatureValue("SU_hasPlusMinus", 1.0);
        }
        if (matches(SU_PM_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_PMwithBrackets", 1.0);
        }
        if (matches(SU_ENUMERATION, superUnitRep)) {
            token.setFeatureValue("SU_isEnumeration", 1.0);
        }
        if (matches(SU_BRACKETED_PLURAL, superUnitRep)) {
            token.setFeatureValue("SU_hasBracketedPlural", 1.0);
        }
        if (matches(SU_GENITIVE, superUnitRep)) {
            token.setFeatureValue("SU_hasGenitive", 1.0);
        }

        if (superUnitRep.length() <= 4) {
            token.setFeatureValue("SU_SIZE1", 1.0);
        } else if (superUnitRep.length() <= 8) {
            token.setFeatureValue("SU_SIZE2", 1.0);
        } else {
            token.setFeatureValue("SU_SIZE3", 1.0);
        }

        // The cheap substring test runs before the backtracking-heavy regex; the result is the same.
        if (superUnitRep.length() > 6 && !superUnitRep.contains("-->") && matches(SU_CHEMICAL, superUnitRep)) {
            token.setFeatureValue("SU_isChemical", 1.0);
        }

        // Brief word class of the super-unit, using the accent-aware letter classes.
        String suBwc = CAPS_RUN.matcher(superUnitRep).replaceAll("A");
        suBwc = LOW_RUN.matcher(suBwc).replaceAll("a");
        suBwc = DIGIT_RUN.matcher(suBwc).replaceAll("0");
        suBwc = SU_OTHER_RUN.matcher(suBwc).replaceAll("x");
        token.setFeatureValue("SU_BWC=" + suBwc, 1.0);

        if (matches(SU_WWW_URL, superUnitRep)) {
            token.setFeatureValue("SU_wwwURL", 1.0);
        } else if (matches(SU_HTTP_URL, superUnitRep) || matches(SU_FTP_URL, superUnitRep)) {
            token.setFeatureValue("SU_httpURL", 1.0);
        }
    }
}

