/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.filtering;

import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.tsf.TokenTextCharNGrams;
import cc.mallet.pipe.tsf.TokenTextCharSuffix;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneSet;
import de.julielab.geneexpbase.genemodel.GeneSets;
import de.julielab.genemapper.filtering.ContextPosPipe;
import de.julielab.genemapper.filtering.GeneSequence2SingleGene;
import de.julielab.genemapper.filtering.GeneSet2GeneSequence;
import de.julielab.genemapper.filtering.LowerCasePipe;
import de.julielab.genemapper.filtering.NextCharPipe;
import de.julielab.genemapper.filtering.NumberClassPipe;
import de.julielab.genemapper.filtering.PrevCharPipe;
import de.julielab.genemapper.filtering.RegExPipe;
import de.julielab.genemapper.filtering.TokenPrefixPipe;
import de.julielab.genemapper.filtering.WordClassPipe;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Stream;

/**
 * Builds MALLET {@link InstanceList}s from gene documents using a fixed chain of
 * feature-extraction pipes.
 *
 * <p>Every {@link GeneSet} of a document is turned into one {@link Instance} and
 * pushed through the pipes returned by {@link #createPipes()}. When the creator is
 * not in sequence mode, a {@link GeneSequence2SingleGene} pipe is appended to the
 * end of the chain.
 */
public class InstanceListCreator {
    /**
     * Matches tokens made up solely of punctuation characters.
     *
     * <p>The hyphen is escaped on purpose: written as {@code )-=} inside the
     * character class it would be read as a range from {@code ')'} to {@code '='},
     * which also contains the digits {@code 0-9} and would make purely numeric
     * tokens look like punctuation.
     *
     * <p>NOTE(review): this changes which tokens receive the ISPUNCT feature
     * compared to the unescaped pattern; models trained on the old feature set
     * presumably need to be retrained - confirm.
     */
    private static final Pattern PUNCTUATION_PATTERN =
            Pattern.compile("[`~!@#$%^&*()\\-=_+\\[\\]\\\\{}|;':\\\",./<>?]+");

    /** When {@code false}, {@link GeneSequence2SingleGene} is appended to the pipe chain. */
    private final boolean sequence;

    /**
     * @param sequence {@code true} to keep the pipe chain as returned by
     *                 {@link #createPipes()}; {@code false} to additionally append a
     *                 {@link GeneSequence2SingleGene} pipe
     */
    public InstanceListCreator(boolean sequence) {
        this.sequence = sequence;
    }

    /**
     * Creates an instance list that contains one instance per gene set of every
     * given document.
     *
     * @param documents the documents to convert; the stream is consumed by this call
     * @return the instance list, backed by a {@link SerialPipes} over the feature pipes
     */
    public InstanceList createInstanceList(Stream<GeneDocument> documents) {
        List<Pipe> pipes = createPipes();
        if (!sequence) {
            pipes.add(new GeneSequence2SingleGene());
        }
        InstanceList list = new InstanceList(new SerialPipes(pipes));
        documents.forEach(document -> addDocumentInstances(document, list));
        return list;
    }

    /**
     * Adds one instance per gene set of {@code document} to {@code list}, running
     * each through the list's pipe.
     *
     * @param document the document whose gene sets are converted
     * @param list     the instance list that receives the new instances
     */
    private void addDocumentInstances(GeneDocument document, InstanceList list) {
        for (GeneSet geneSet : document.getGeneSets()) {
            // The gene set is passed as the first constructor argument and the
            // document as the last; the two middle arguments are left null here.
            list.addThruPipe(new Instance(geneSet, null, null, document));
        }
    }

    /**
     * Creates the feature-extraction pipe chain.
     *
     * <p>A new, mutable list with new pipe objects is returned on every call, so
     * callers may append further pipes without affecting other callers.
     *
     * @return the ordered pipes, starting with {@link GeneSet2GeneSequence} and
     *         ending with {@link TokenSequence2FeatureVectorSequence}
     */
    public List<Pipe> createPipes() {
        List<Pipe> pipes = new ArrayList<>();
        pipes.add(new GeneSet2GeneSequence());

        // Context features: neighbouring characters and part-of-speech context.
        // NOTE(review): -5 / 5 look like window offsets to the left / right - confirm
        // against ContextPosPipe.
        pipes.add(new PrevCharPipe("LCHAR="));
        pipes.add(new NextCharPipe("RCHAR="));
        pipes.add(new ContextPosPipe("PREVPOS", -5, true));
        pipes.add(new ContextPosPipe("NEXTPOS", 5, true));

        // Token text and coarse shape classes.
        pipes.add(new LowerCasePipe("W="));
        pipes.add(new NumberClassPipe("NC=", false));
        pipes.add(new NumberClassPipe("BNC=", true));
        pipes.add(new WordClassPipe("WC=", false));
        pipes.add(new WordClassPipe("BWC=", true));

        // Orthographic features: capitalisation.
        pipes.add(new RegExPipe("ALPHA", Pattern.compile("[A-Za-z]+")));
        pipes.add(new RegExPipe("INITCAPS", Pattern.compile("[A-Z].*")));
        pipes.add(new RegExPipe("UPPER-LOWER", Pattern.compile("[A-Z][a-z].*")));
        pipes.add(new RegExPipe("LOWER-UPPER", Pattern.compile("[a-z]+[A-Z]+.*")));
        pipes.add(new RegExPipe("ALLCAPS", Pattern.compile("[A-Z]+")));
        pipes.add(new RegExPipe("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")));
        pipes.add(new RegExPipe("SINGLECHAR", Pattern.compile("[A-Za-z]")));

        // Orthographic features: digits and letter/digit mixtures.
        pipes.add(new RegExPipe("SINGLEDIGIT", Pattern.compile("[0-9]")));
        pipes.add(new RegExPipe("DOUBLEDIGIT", Pattern.compile("[0-9][0-9]")));
        pipes.add(new RegExPipe("NUMBER", Pattern.compile("[0-9,]+")));
        pipes.add(new RegExPipe("HASDIGIT", Pattern.compile(".*[0-9].*")));
        // Two pipes share the ALPHANUMERIC name to cover both orders (digit before
        // letter, letter before digit).
        pipes.add(new RegExPipe("ALPHANUMERIC", Pattern.compile(".*[0-9].*[A-Za-z].*")));
        pipes.add(new RegExPipe("ALPHANUMERIC", Pattern.compile(".*[A-Za-z].*[0-9].*")));
        pipes.add(new RegExPipe("NUMBERS_LETTERS", Pattern.compile("[0-9]+[A-Za-z]+")));
        pipes.add(new RegExPipe("LETTERS_NUMBERS", Pattern.compile("[A-Za-z]+[0-9]+")));

        // Orthographic features: special characters and signed/real numbers.
        pipes.add(new RegExPipe("HAS_DASH", Pattern.compile(".*-.*")));
        pipes.add(new RegExPipe("HAS_QUOTE", Pattern.compile(".*'.*")));
        pipes.add(new RegExPipe("HAS_SLASH", Pattern.compile(".*/.*")));
        // Two pipes share the REALNUMBER name: the first requires integer digits,
        // the second requires fraction digits when a decimal point is present.
        pipes.add(new RegExPipe("REALNUMBER", Pattern.compile("(-|\\+)?[0-9,]+(\\.[0-9]*)?%?")));
        pipes.add(new RegExPipe("REALNUMBER", Pattern.compile("(-|\\+)?[0-9,]*(\\.[0-9]+)?%?")));
        pipes.add(new RegExPipe("START_MINUS", Pattern.compile("-.*")));
        pipes.add(new RegExPipe("START_PLUS", Pattern.compile("\\+.*")));
        pipes.add(new RegExPipe("END_PERCENT", Pattern.compile(".*%")));

        // Affix and character n-gram features.
        pipes.add(new TokenPrefixPipe("2PREFIX=", 2));
        pipes.add(new TokenPrefixPipe("3PREFIX=", 3));
        pipes.add(new TokenPrefixPipe("4PREFIX=", 4));
        pipes.add(new TokenTextCharSuffix("2SUFFIX=", 2));
        pipes.add(new TokenTextCharSuffix("3SUFFIX=", 3));
        pipes.add(new TokenTextCharSuffix("4SUFFIX=", 4));
        pipes.add(new TokenTextCharNGrams("CHARNGRAM=", new int[]{2, 3}, true));

        // Roman numerals, Greek letters (pattern supplied by TermNormalizer) and
        // punctuation-only tokens.
        pipes.add(new RegExPipe("ROMAN", Pattern.compile("[IVXDLCM]+", Pattern.CASE_INSENSITIVE)));
        pipes.add(new RegExPipe("GREEK",
                Pattern.compile(TermNormalizer.GREEK_REGEX, Pattern.CASE_INSENSITIVE)));
        pipes.add(new RegExPipe("ISPUNCT", PUNCTUATION_PATTERN));

        // Must stay last: turns the accumulated token features into feature vectors.
        pipes.add(new TokenSequence2FeatureVectorSequence(true, true));
        return pipes;
    }
}

