/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jsbd;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import de.julielab.jsbd.Abbreviations;
import de.julielab.jsbd.EOSSymbols;
import de.julielab.jsbd.Unit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Mallet pipe that turns the lines of one input document into a sequence of
 * whitespace-delimited units, each represented as a {@link Token} carrying
 * orthographic features, together with a parallel {@link LabelSequence}.
 *
 * <p>On entry the instance's data is cast to an {@code ArrayList} of
 * {@code String} lines and its source to a {@code String}. On exit the data is
 * the {@link TokenSequence}, the target is the {@link LabelSequence}
 * (label {@code "EOS"} for the last unit of every line, {@code "IS"} for all
 * other units), the name is the list of {@link Unit} objects in the same order
 * as the tokens, and the source is set back to the original string.
 */
class Abstract2UnitPipe
extends Pipe {
    /** Character-class body (without brackets) for the upper-case letters recognised by the capitalisation features. */
    private static final String CAPS = "A-Z\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9\u00c7\u00d1\u00cf\u00c4\u00d6\u00dc";
    /** Character-class body (without brackets) for the lower-case counterpart of {@link #CAPS}. */
    private static final String LOW = "a-z\u00e0\u00e8\u00ec\u00f2\u00f9\u00e1\u00e9\u00ed\u00f3\u00fa\u00e7\u00f1\u00ef\u00e4\u00f6\u00fc";
    private static final long serialVersionUID = 1L;
    /** A unit is a maximal run of non-whitespace characters. */
    private static final Pattern splitPattern = Pattern.compile("[^\\s]+");

    // The feature regexes are compiled once here; String.matches / String.replaceAll
    // would recompile them for every single token.
    private static final Pattern INIT_CAPS = Pattern.compile("[" + CAPS + "].*");
    private static final Pattern ONE_CAPS = Pattern.compile("[" + CAPS + "]");
    private static final Pattern ALL_CAPS = Pattern.compile("[" + CAPS + "]+");
    private static final Pattern ALPHANUMERIC = Pattern.compile("(.*[" + CAPS + LOW + "].*[0-9].*|.*[0-9].*[" + CAPS + LOW + "].*)");
    private static final Pattern ROMAN = Pattern.compile("[IVXDLCM]+");
    private static final Pattern HAS_ROMAN = Pattern.compile(".*\\b[IVXDLCM]+\\b.*");
    private static final Pattern NATURAL_NUMBER = Pattern.compile("[0-9]+");
    private static final Pattern REAL_NUMBER = Pattern.compile("[-0-9]+[.,]+[0-9.,]+");
    private static final Pattern HAS_DIGITS = Pattern.compile(".*[0-9]+.*");
    private static final Pattern BEGIN_BRACKETS = Pattern.compile("(\\(.*|\\[.*)");
    private static final Pattern INSIDE_BRACKETS = Pattern.compile("(\\(.*\\)|\\[.*\\])");
    private static final Pattern BEGIN_QUOTES = Pattern.compile("(\".*|'.*)");
    private static final Pattern INSIDE_QUOTES = Pattern.compile("(\".*\"|'.*')");
    private static final Pattern ABBR1 = Pattern.compile("[A-Z]\\.");
    private static final Pattern ABBR2 = Pattern.compile("([A-Za-z]\\.)+");
    private static final Pattern ABBR3 = Pattern.compile("[abcdfghjklmnpqrstvwxyz]+\\.");
    private static final Pattern UPPER_RUN = Pattern.compile("[A-Z]+");
    private static final Pattern LOWER_RUN = Pattern.compile("[a-z]+");
    private static final Pattern DIGIT_RUN = Pattern.compile("[0-9]+");
    private static final Pattern OTHER_RUN = Pattern.compile("[^A-Za-z0-9]+");

    /** Single-character strings treated as end-of-sentence symbols, as supplied by {@link EOSSymbols#getSymbols()}. */
    TreeSet<String> eosSymbols = new EOSSymbols().getSymbols();
    /** Known abbreviations, as supplied by {@link Abbreviations#getSet()}. */
    TreeSet<String> abbrList = new Abbreviations().getSet();

    /** Creates the pipe with a fresh data alphabet and a fresh label alphabet. */
    Abstract2UnitPipe() {
        super(new Alphabet(), (Alphabet)new LabelAlphabet());
    }

    /**
     * Converts the lines stored in {@code carrier} into tokens and labels.
     *
     * @param carrier instance whose data is an {@code ArrayList} of line strings
     *                and whose source is a {@code String}
     * @return the same instance, with data, target, name and source replaced as
     *         described in the class comment
     * @throws ClassCastException if the data or source do not have the expected types
     */
    public Instance pipe(Instance carrier) {
        String abstractFileName = (String)carrier.getSource();
        // The carrier API is untyped; the lines are expected to be Strings.
        @SuppressWarnings("unchecked")
        ArrayList<String> lines = (ArrayList<String>)carrier.getData();
        HashMap<String, Integer> unitFreq = this.getUnitFrequency(lines);
        TokenSequence data = new TokenSequence();
        LabelSequence target = new LabelSequence((Alphabet)((LabelAlphabet)this.getTargetAlphabet()));
        ArrayList<Unit> unitInfo = new ArrayList<Unit>();
        for (String line : lines) {
            if (line.length() == 0) {
                continue;
            }
            ArrayList<Unit> units = this.getUnits(line);
            if (units.isEmpty()) {
                // Line consisted of whitespace only.
                continue;
            }
            int lastIndex = units.size() - 1;
            for (int j = 0; j <= lastIndex; ++j) {
                Token token = this.buildToken(units.get(j).rep, unitFreq);
                // Only the final unit of a line is labelled as a sentence end.
                String label = j == lastIndex ? "EOS" : "IS";
                data.add((Object)token);
                target.add((Object)label);
            }
            unitInfo.addAll(units);
        }
        carrier.setData((Object)data);
        carrier.setTarget((Object)target);
        carrier.setName(unitInfo);
        carrier.setSource((Object)abstractFileName);
        return carrier;
    }

    /**
     * Builds the token for one unit and attaches all of its features.
     *
     * <p>NOTE(review): features are added in a fixed order; downstream feature
     * indexing may depend on insertion order, so keep it stable unless that is
     * confirmed not to matter.
     *
     * @param currUnitRep the unit's surface string
     * @param unitFreq    number of occurrences of each unit string in the whole input
     * @return the token with every applicable feature set to {@code 1.0}
     */
    private Token buildToken(String currUnitRep, HashMap<String, Integer> unitFreq) {
        // The unit with a trailing end-of-sentence symbol (if any) stripped off.
        String plainUnitRep = this.getPlainUnit(currUnitRep);
        boolean endsWithEos = this.containsEOSSymbol(currUnitRep);
        Token token = new Token(currUnitRep);
        if (endsWithEos) {
            token.setFeatureValue("endwithEOSSymb=" + this.getEOSSymbol(currUnitRep), 1.0);
        }
        int count = this.nrEOSSymbolsContained(plainUnitRep);
        if (count > 0) {
            token.setFeatureValue("hasinnerEOSSymb=" + count, 1.0);
        }
        token.setFeatureValue("TOKEN=" + currUnitRep, 1.0);

        // Orthographic shape flags.
        this.flagIfMatches(token, INIT_CAPS, currUnitRep, "INITCAPS");
        this.flagIfMatches(token, ONE_CAPS, currUnitRep, "ONECAPS");
        this.flagIfMatches(token, ALL_CAPS, currUnitRep, "ALLCAPS");
        this.flagIfMatches(token, ALPHANUMERIC, currUnitRep, "ALPHANUMERIC");
        this.flagIfMatches(token, ROMAN, currUnitRep, "ROMAN");
        this.flagIfMatches(token, HAS_ROMAN, currUnitRep, "HASROMAN");
        this.flagIfMatches(token, NATURAL_NUMBER, currUnitRep, "NATURALNUMBER");
        this.flagIfMatches(token, REAL_NUMBER, currUnitRep, "REALNUMBER");
        this.flagIfMatches(token, HAS_DIGITS, currUnitRep, "HASDIGITS");
        this.flagIfMatches(token, BEGIN_BRACKETS, currUnitRep, "BEGINBRACKETS");
        this.flagIfMatches(token, INSIDE_BRACKETS, currUnitRep, "INSIDEBRACKETS");
        this.flagIfMatches(token, BEGIN_QUOTES, currUnitRep, "BEGINQUOTES");
        this.flagIfMatches(token, INSIDE_QUOTES, currUnitRep, "INSIDEBQUOTES");

        // Length bucket: up to 3 chars, up to 6 chars, longer.
        int length = currUnitRep.length();
        if (length <= 3) {
            token.setFeatureValue("SIZE1", 1.0);
        } else if (length <= 6) {
            token.setFeatureValue("SIZE2", 1.0);
        } else {
            token.setFeatureValue("SIZE3", 1.0);
        }

        // Abbreviation-like shapes.
        this.flagIfMatches(token, ABBR1, currUnitRep, "ABBR1");
        this.flagIfMatches(token, ABBR2, currUnitRep, "ABBR2");
        this.flagIfMatches(token, ABBR3, currUnitRep, "ABBR3");

        token.setFeatureValue("BWC=" + this.blockWordClass(plainUnitRep), 1.0);

        if (endsWithEos) {
            Integer freq = unitFreq.get(currUnitRep);
            // The frequency map is built from the same lines, so a missing entry
            // is not expected; the null check merely avoids an unboxing failure.
            if (freq != null && freq.intValue() > 1) {
                token.setFeatureValue("FreqTokenEOSSymbol", 1.0);
            }
        }
        if (this.abbrList.contains(currUnitRep)) {
            token.setFeatureValue("KNOWNABBR", 1.0);
        }
        return token;
    }

    /** Sets {@code feature} to {@code 1.0} on {@code token} when {@code pattern} matches all of {@code text}. */
    private void flagIfMatches(Token token, Pattern pattern, String text, String feature) {
        if (pattern.matcher(text).matches()) {
            token.setFeatureValue(feature, 1.0);
        }
    }

    /**
     * Collapses {@code text} into a brief word class: every run of ASCII
     * upper-case letters becomes {@code A}, then every run of ASCII lower-case
     * letters becomes {@code a}, then every run of digits becomes {@code 0},
     * and finally every run of any other characters becomes {@code x}.
     */
    private String blockWordClass(String text) {
        String bwc = UPPER_RUN.matcher(text).replaceAll("A");
        bwc = LOWER_RUN.matcher(bwc).replaceAll("a");
        bwc = DIGIT_RUN.matcher(bwc).replaceAll("0");
        return OTHER_RUN.matcher(bwc).replaceAll("x");
    }

    /** Counts the characters of {@code token} that are end-of-sentence symbols. */
    private int nrEOSSymbolsContained(String token) {
        int count = 0;
        for (int i = 0; i < token.length(); ++i) {
            if (this.eosSymbols.contains(String.valueOf(token.charAt(i)))) {
                ++count;
            }
        }
        return count;
    }

    /** Returns whether the last character of {@code token} is an end-of-sentence symbol; false for an empty string. */
    private boolean containsEOSSymbol(String token) {
        return token.length() > 0 && this.eosSymbols.contains(token.substring(token.length() - 1));
    }

    /** Returns the trailing end-of-sentence symbol of {@code token}, or the empty string if it has none. */
    private String getEOSSymbol(String token) {
        if (this.containsEOSSymbol(token)) {
            return token.substring(token.length() - 1);
        }
        return "";
    }

    /** Returns {@code unitRep} without its trailing end-of-sentence symbol, or unchanged if it has none. */
    private String getPlainUnit(String unitRep) {
        if (this.containsEOSSymbol(unitRep)) {
            return unitRep.substring(0, unitRep.length() - 1);
        }
        return unitRep;
    }

    /**
     * Counts how often each unit string occurs across all lines.
     *
     * @param lines the input lines
     * @return map from unit string to its number of occurrences
     */
    private HashMap<String, Integer> getUnitFrequency(ArrayList<String> lines) {
        HashMap<String, Integer> freq = new HashMap<String, Integer>();
        for (String line : lines) {
            for (Unit u : this.getUnits(line)) {
                Integer seen = freq.get(u.rep);
                freq.put(u.rep, seen == null ? 1 : seen + 1);
            }
        }
        return freq;
    }

    /**
     * Splits a line into its whitespace-delimited units.
     *
     * @param line the line to split
     * @return one {@link Unit} per non-whitespace run, built from the run's start
     *         offset, end offset (exclusive, as reported by the matcher) and text
     */
    private ArrayList<Unit> getUnits(String line) {
        Matcher m = splitPattern.matcher(line);
        ArrayList<Unit> units = new ArrayList<Unit>();
        while (m.find()) {
            units.add(new Unit(m.start(), m.end(), m.group()));
        }
        return units;
    }
}

