/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.ae.jsbd;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import de.julielab.jcore.ae.jsbd.AbbreviationsMedical;
import de.julielab.jcore.ae.jsbd.EOSSymbols;
import de.julielab.jcore.ae.jsbd.Unit;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MALLET pipe that turns the lines of one abstract into a sequence of feature-bearing
 * tokens ("units") plus a parallel label sequence.
 *
 * <p>Input instance: {@code source} is the abstract's file name, {@code data} is a
 * {@code List<String>} of lines. Output instance: {@code data} is a {@link TokenSequence}
 * with one token per unit, {@code target} is a {@link LabelSequence} with one label per
 * unit, {@code name} is the list of all {@link Unit}s (in the same order as the tokens)
 * and {@code source} is the unchanged file name.
 *
 * <p>A unit is a maximal run of non-whitespace characters. When the pipe was created with
 * {@code splitAfterPunctuation == true}, such a run is additionally cut after every
 * punctuation character ({@code \p{P}}).
 *
 * <p>The last unit of every line is labelled {@code "EOS"}; every other unit is labelled
 * {@code "IS"}.
 */
class Abstract2UnitPipe extends Pipe {
    private static final Logger log = LoggerFactory.getLogger(Abstract2UnitPipe.class);
    private static final long serialVersionUID = 1L;
    private static final Pattern splitPattern = Pattern.compile("[^\\s]+");
    private static final Pattern punctuationPattern = Pattern.compile("\\p{P}");

    // Feature regexes, compiled once instead of once per unit. Static fields are not part
    // of the serialized form, so previously serialized pipes stay readable.
    private static final Pattern initCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}].*");
    private static final Pattern oneCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}]");
    private static final Pattern allCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}]+");
    private static final Pattern alphanumericPattern = Pattern.compile("(.*[\\p{L}\\p{M}].*[0-9].*|.*[0-9].*[\\p{L}\\p{M}].*)");
    private static final Pattern romanPattern = Pattern.compile("[IVXDLCM]+");
    private static final Pattern hasRomanPattern = Pattern.compile(".*\\b[IVXDLCM]+\\b.*");
    private static final Pattern naturalNumberPattern = Pattern.compile("[0-9]+");
    private static final Pattern realNumberPattern = Pattern.compile("[-0-9]+[.,]+[0-9.,]+");
    private static final Pattern hasDigitsPattern = Pattern.compile(".*[0-9]+.*");
    private static final Pattern beginBracketsPattern = Pattern.compile("(\\(.*|\\[.*)");
    private static final Pattern insideBracketsPattern = Pattern.compile("(\\(.*\\)|\\[.*\\])");
    private static final Pattern beginQuotesPattern = Pattern.compile("(\".*|'.*)");
    private static final Pattern insideQuotesPattern = Pattern.compile("(\".*\"|'.*')");
    private static final Pattern abbr1Pattern = Pattern.compile("[A-Z]\\.");
    private static final Pattern abbr2Pattern = Pattern.compile("([A-Za-z]\\.)+");
    private static final Pattern abbr3Pattern = Pattern.compile("[abcdfghjklmnpqrstvwxyz]+\\.");

    // Patterns used to build the brief word class ("BWC=") feature.
    private static final Pattern bwcUpperPattern = Pattern.compile("[\\p{Lu}\\p{M}]+");
    private static final Pattern bwcLowerPattern = Pattern.compile("[\\p{Ll}\\p{M}]+");
    private static final Pattern bwcDigitsPattern = Pattern.compile("[0-9]+");
    private static final Pattern bwcOtherPattern = Pattern.compile("[^\\p{L}\\p{M}0-9]+");

    /** Single-character strings that count as end-of-sentence symbols. */
    TreeSet<String> eosSymbols;
    /** Unit strings that are looked up for the {@code KNOWNABBR} feature. */
    TreeSet<String> abbrList;
    private final boolean splitAfterPunctuation;

    /**
     * Creates the pipe with fresh data and target alphabets and loads the end-of-sentence
     * symbols and the abbreviation list.
     *
     * @param splitAfterPunctuation whether whitespace-delimited tokens are additionally
     *                              split after each punctuation character
     */
    Abstract2UnitPipe(boolean splitAfterPunctuation) {
        super(new Alphabet(), (Alphabet)new LabelAlphabet());
        this.splitAfterPunctuation = splitAfterPunctuation;
        this.eosSymbols = new EOSSymbols().getSymbols();
        this.abbrList = new AbbreviationsMedical().getSet();
    }

    /**
     * Converts the lines stored in {@code carrier} into tokens with features and labels.
     *
     * @param carrier instance whose source is a {@code String} and whose data is a
     *                {@code List<String>} of lines
     * @return the same instance, with data, target, name and source set as described in
     *         the class comment
     */
    public Instance pipe(Instance carrier) {
        String abstractFileName = (String)carrier.getSource();
        @SuppressWarnings("unchecked")
        List<String> lines = (List<String>)carrier.getData();
        Map<String, Integer> unitFreq = this.getUnitFrequency(lines);
        TokenSequence data = new TokenSequence();
        LabelSequence target = new LabelSequence((Alphabet)((LabelAlphabet)this.getTargetAlphabet()));
        List<Unit> unitInfo = new ArrayList<Unit>();
        for (String line : lines) {
            // An empty line yields no units, so it is skipped by the same check.
            List<Unit> units = this.getUnits(line);
            if (units.isEmpty()) {
                continue;
            }
            int lastIndex = units.size() - 1;
            for (int j = 0; j <= lastIndex; ++j) {
                Token token = this.createToken(units.get(j), unitFreq);
                // Only the last unit of a line is labelled as a sentence end.
                String label = j == lastIndex ? "EOS" : "IS";
                // The explicit casts keep the same overloads selected as before.
                data.add((Object)token);
                target.add((Object)label);
            }
            unitInfo.addAll(units);
        }
        carrier.setData((Object)data);
        carrier.setTarget((Object)target);
        carrier.setName(unitInfo);
        carrier.setSource((Object)abstractFileName);
        return carrier;
    }

    /**
     * Builds the token for one unit and attaches all of its features. Features are added
     * in a fixed order; feature names are part of the model and must not be altered.
     *
     * @param unit     the unit to describe
     * @param unitFreq number of occurrences of each unit string in the current abstract
     * @return the token carrying the unit's features
     */
    private Token createToken(Unit unit, Map<String, Integer> unitFreq) {
        String rep = unit.rep;
        String plainRep = this.getPlainUnit(rep);
        boolean endsWithEos = this.containsEOSSymbol(rep);
        Token token = new Token(rep);

        if (unit.isTokenInternal) {
            token.setFeatureValue("istokeninternal=", 1.0);
        }
        if (endsWithEos) {
            token.setFeatureValue("endwithEOSSymb=" + this.getEOSSymbol(rep), 1.0);
        }
        int innerEosCount = this.nrEOSSymbolsContained(plainRep);
        if (innerEosCount > 0) {
            token.setFeatureValue("hasinnerEOSSymb=" + innerEosCount, 1.0);
        }
        token.setFeatureValue("TOKEN=" + rep, 1.0);

        this.addOrthographicFeatures(token, rep);

        token.setFeatureValue("BWC=" + Abstract2UnitPipe.briefWordClass(plainRep), 1.0);

        // Marks units ending in an EOS symbol that occur more than once in the abstract.
        Integer freq = unitFreq.get(rep);
        if (endsWithEos && freq != null && freq > 1) {
            token.setFeatureValue("FreqTokenEOSSymbol", 1.0);
        }
        if (this.abbrList.contains(rep)) {
            token.setFeatureValue("KNOWNABBR", 1.0);
        }
        return token;
    }

    /** Adds the regex-based shape, size and abbreviation-shape features of {@code rep}. */
    private void addOrthographicFeatures(Token token, String rep) {
        Abstract2UnitPipe.flagIfMatches(token, initCapsPattern, rep, "INITCAPS");
        Abstract2UnitPipe.flagIfMatches(token, oneCapsPattern, rep, "ONECAPS");
        Abstract2UnitPipe.flagIfMatches(token, allCapsPattern, rep, "ALLCAPS");
        Abstract2UnitPipe.flagIfMatches(token, alphanumericPattern, rep, "ALPHANUMERIC");
        Abstract2UnitPipe.flagIfMatches(token, romanPattern, rep, "ROMAN");
        Abstract2UnitPipe.flagIfMatches(token, hasRomanPattern, rep, "HASROMAN");
        Abstract2UnitPipe.flagIfMatches(token, naturalNumberPattern, rep, "NATURALNUMBER");
        Abstract2UnitPipe.flagIfMatches(token, realNumberPattern, rep, "REALNUMBER");
        Abstract2UnitPipe.flagIfMatches(token, hasDigitsPattern, rep, "HASDIGITS");
        Abstract2UnitPipe.flagIfMatches(token, beginBracketsPattern, rep, "BEGINBRACKETS");
        Abstract2UnitPipe.flagIfMatches(token, insideBracketsPattern, rep, "INSIDEBRACKETS");
        Abstract2UnitPipe.flagIfMatches(token, beginQuotesPattern, rep, "BEGINQUOTES");
        Abstract2UnitPipe.flagIfMatches(token, insideQuotesPattern, rep, "INSIDEBQUOTES");

        // Exactly one of three length buckets: <= 3, 4..6, > 6 characters.
        if (rep.length() <= 3) {
            token.setFeatureValue("SIZE1", 1.0);
        } else if (rep.length() <= 6) {
            token.setFeatureValue("SIZE2", 1.0);
        } else {
            token.setFeatureValue("SIZE3", 1.0);
        }

        Abstract2UnitPipe.flagIfMatches(token, abbr1Pattern, rep, "ABBR1");
        Abstract2UnitPipe.flagIfMatches(token, abbr2Pattern, rep, "ABBR2");
        Abstract2UnitPipe.flagIfMatches(token, abbr3Pattern, rep, "ABBR3");
    }

    /** Sets {@code feature} to 1.0 on {@code token} if {@code pattern} matches all of {@code input}. */
    private static void flagIfMatches(Token token, Pattern pattern, String input, String feature) {
        if (pattern.matcher(input).matches()) {
            token.setFeatureValue(feature, 1.0);
        }
    }

    /**
     * Collapses a string to its brief word class: runs of upper-case letters become
     * {@code A}, runs of lower-case letters {@code a}, runs of digits {@code 0} and runs
     * of anything else {@code x}. The replacement order matters and is kept as is.
     */
    private static String briefWordClass(String text) {
        String bwc = bwcUpperPattern.matcher(text).replaceAll("A");
        bwc = bwcLowerPattern.matcher(bwc).replaceAll("a");
        bwc = bwcDigitsPattern.matcher(bwc).replaceAll("0");
        return bwcOtherPattern.matcher(bwc).replaceAll("x");
    }

    /** Counts the characters of {@code token} that are end-of-sentence symbols. */
    private int nrEOSSymbolsContained(String token) {
        int count = 0;
        for (int i = 0; i < token.length(); ++i) {
            if (this.eosSymbols.contains(String.valueOf(token.charAt(i)))) {
                ++count;
            }
        }
        return count;
    }

    /** Returns whether the last character of {@code token} is an end-of-sentence symbol. */
    private boolean containsEOSSymbol(String token) {
        return !this.getEOSSymbol(token).isEmpty();
    }

    /**
     * Returns the last character of {@code token} if it is an end-of-sentence symbol,
     * otherwise the empty string.
     */
    private String getEOSSymbol(String token) {
        if (token.isEmpty()) {
            return "";
        }
        String lastChar = token.substring(token.length() - 1);
        return this.eosSymbols.contains(lastChar) ? lastChar : "";
    }

    /** Returns {@code unitRep} without its trailing end-of-sentence symbol, if it has one. */
    private String getPlainUnit(String unitRep) {
        if (this.containsEOSSymbol(unitRep)) {
            return unitRep.substring(0, unitRep.length() - 1);
        }
        return unitRep;
    }

    /**
     * Counts how often each unit string occurs across all given lines.
     *
     * @param lines the lines of one abstract
     * @return map from unit string to its number of occurrences
     */
    private Map<String, Integer> getUnitFrequency(List<String> lines) {
        Map<String, Integer> freq = new HashMap<String, Integer>();
        for (String line : lines) {
            for (Unit unit : this.getUnits(line)) {
                Integer previous = freq.get(unit.rep);
                freq.put(unit.rep, previous == null ? 1 : previous + 1);
            }
        }
        return freq;
    }

    /**
     * Splits a line into units. Every whitespace-delimited token becomes one unit, or,
     * with {@code splitAfterPunctuation}, one unit per punctuation-terminated piece plus
     * one for any remainder. Pieces that end before the token does are flagged as
     * token-internal.
     *
     * <p>Each unit's begin/end offsets delimit exactly its text within {@code line}.
     * Earlier, punctuation-terminated pieces after the first one in a token were given
     * the token's start as their begin offset, which did not match their text.
     *
     * @param line the line to split
     * @return the units of the line in order of appearance; empty if there are none
     */
    private List<Unit> getUnits(String line) {
        List<Unit> units = new ArrayList<Unit>();
        Matcher tokenMatcher = splitPattern.matcher(line);
        while (tokenMatcher.find()) {
            int tokenBegin = tokenMatcher.start();
            int tokenEnd = tokenMatcher.end();
            int unitBegin = tokenBegin;
            if (this.splitAfterPunctuation) {
                // Matcher offsets are relative to the token, hence the tokenBegin shift.
                Matcher punctMatcher = punctuationPattern.matcher(tokenMatcher.group());
                while (punctMatcher.find()) {
                    int unitEnd = tokenBegin + punctMatcher.end();
                    boolean isTokenInternal = unitEnd < tokenEnd;
                    units.add(new Unit(unitBegin, unitEnd, line.substring(unitBegin, unitEnd), isTokenInternal));
                    unitBegin = unitEnd;
                }
            }
            // Remainder after the last punctuation mark, or the whole token if unsplit.
            if (unitBegin < tokenEnd) {
                units.add(new Unit(unitBegin, tokenEnd, line.substring(unitBegin, tokenEnd), false));
            }
        }
        return units;
    }

    /**
     * Deserialization hook: restores the default fields and logs which splitting mode the
     * loaded pipe uses.
     */
    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        log.info("This sentence splitter model allows sentence splits after all punctuation: {}", this.splitAfterPunctuation);
    }
}

