/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.filtering.families;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.geneexpbase.genemodel.GeneSet;
import de.julielab.geneexpbase.genemodel.PosTag;
import de.julielab.genemapper.utils.GeneMapperException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Range;

/**
 * Pipe that sets binary features (value {@code 1.0}) on the {@link Token} carried by an instance.
 *
 * <p>The instance data must be a {@link Token} holding a {@link GeneSet} under the property
 * {@code "gs"}; the instance source must be the {@link GeneDocument} the mentions belong to. For
 * every gene mention of the set the pipe inspects the mention text and its extended text spans
 * and may set the features {@code HAS_MOTIF}, {@code TYPE_RECEPTOR}, {@code RELATED},
 * {@code IS_CONTAINING}, {@code IS_ACTIVATED}, {@code SUFFIX_LASE}, {@code HAS_COMPLEX},
 * {@code UNSPEC_KINASE} or {@code SPEC_KINASE}. If the lower-cased mention text is a prefix of an
 * entry of the document's gene name dictionary, it may additionally set {@code HAS_PLURAL},
 * {@code OTHER_GENE_HAS_SPEC_SUFFIX}, {@code NUM_SUFFIX} and {@code UNSPEC_DET}.
 *
 * <p>The pipe keeps a single reusable {@link Matcher} as instance state, so one pipe instance
 * must not be used by several threads at the same time.
 */
public class MentionPrefixPipe extends Pipe {
    private static final long serialVersionUID = 6376862551856291001L;

    /** Suffixes (numbers, roman numerals, greek letter names) checked against the text following a shared name prefix. */
    private static final Pattern SPEC_SUFFIX_PATTERN = Pattern.compile("([0-9]+|i |i$|ii|iii|iv |iv$|v |v$|vi |vi$|vii|viii|(beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega))");
    /** Full-match pattern for "contains a digit"; kept in this form so line terminators behave as with String.matches. */
    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*[0-9].*");
    /** Full-match pattern for a purely numeric string. */
    private static final Pattern ALL_DIGITS = Pattern.compile("[0-9]+");
    /** Terms stripped, in this order, from the normalized mention text before testing whether anything is left. */
    private static final Pattern[] GENERIC_KINASE_TERMS = new Pattern[]{
            Pattern.compile("kinases?"),
            Pattern.compile("tyrosine"),
            Pattern.compile("activity"),
            Pattern.compile("proteins?"),
            Pattern.compile("receptor")
    };
    /** POS tag value that triggers the plural-related features. */
    private static final String PLURAL_TAG = "NNS";
    /** Longer suffixes of other gene mentions are ignored by the prefix comparison. */
    private static final int MAX_SUFFIX_LENGTH = 4;

    // Stored by the constructor; not read anywhere in this class.
    private final String prefix;
    // Matcher is not serializable, hence transient; it is recreated in readObject.
    private transient Matcher m = this.initMatcher();
    // NOTE(review): not referenced anywhere in this class; kept so the instance layout stays as it was.
    private final String[] NONSPEC_KINASE = new String[]{"proteins?", "kinases?", "receptors?", "non", "nonreceptor", "activity", "tyrosine", "phosphatidylinositol", "[0-9]+", "dependent", "cyclin"};

    /**
     * Creates the pipe.
     *
     * @param prefix value stored on the pipe
     * @throws IOException declared for callers; nothing in this constructor body throws it
     * @throws GeneMapperException declared for callers; nothing in this constructor body throws it
     */
    public MentionPrefixPipe(String prefix) throws IOException, GeneMapperException {
        this.prefix = prefix;
    }

    /** Creates a fresh matcher for {@link #SPEC_SUFFIX_PATTERN} bound to the empty string. */
    private Matcher initMatcher() {
        return SPEC_SUFFIX_PATTERN.matcher("");
    }

    /**
     * Adds the features described on the class to the token of the given instance.
     *
     * @param inst instance whose data is a {@link Token} with a {@link GeneSet} property
     *             {@code "gs"} and whose source is a {@link GeneDocument}
     * @return the same instance that was passed in
     * @throws NullPointerException if the gene set is non-empty and the document's gene name
     *                              dictionary has not been built
     */
    @Override
    public Instance pipe(Instance inst) {
        Token t = (Token)inst.getData();
        GeneSet geneSet = (GeneSet)t.getProperty("gs");
        GeneDocument geneDocument = (GeneDocument)inst.getSource();
        for (GeneMention gm : geneSet) {
            AhoCorasickOptimized geneDict = Objects.requireNonNull(geneDocument.getGeneNameDictionary(), "The GeneDocument's gene name dictionary must be built once after the determination of the genes to be selected for each GeneDocument and before using this pipe.");
            String phraseExtendedText = gm.getPhraseExtendedText();
            // Already lower-cased here, so the checks below need no further case folding.
            String rightExtendedText = gm.getRightExtendedText().toLowerCase();
            if (rightExtendedText.contains("motif")) {
                t.setFeatureValue("HAS_MOTIF", 1.0);
            }
            if (gm.getText().startsWith("type") && rightExtendedText.endsWith("receptor")) {
                t.setFeatureValue("TYPE_RECEPTOR", 1.0);
            }
            ArrayList<PosTag> rightExtendedPostags = new ArrayList<>(geneDocument.getOverlappingPosTags(gm.getRightExtendedOffsets()));
            if (rightExtendedText.contains("-related") && (rightExtendedText.contains("substrate") || MentionPrefixPipe.endsWithPluralTag(rightExtendedPostags))) {
                t.setFeatureValue("RELATED", 1.0);
            }
            if (rightExtendedText.contains("-containing")) {
                t.setFeatureValue("IS_CONTAINING", 1.0);
            }
            if (rightExtendedText.contains("-activated")) {
                t.setFeatureValue("IS_ACTIVATED", 1.0);
            }
            if (rightExtendedText.endsWith("lase") || rightExtendedText.endsWith("lases") || rightExtendedText.contains("lase ") || rightExtendedText.contains("lases ")) {
                t.setFeatureValue("SUFFIX_LASE", 1.0);
            }
            if (rightExtendedText.endsWith("complex")) {
                t.setFeatureValue("HAS_COMPLEX", 1.0);
            }
            if (MentionPrefixPipe.isGenericKinaseName(gm.getNormalizedText())) {
                t.setFeatureValue("UNSPEC_KINASE", 1.0);
            } else if (gm.getText().contains("kinase")) {
                t.setFeatureValue("SPEC_KINASE", 1.0);
            }

            // Everything below only applies to mentions that are a prefix of a dictionary entry.
            String textname = gm.getText().toLowerCase();
            if (!geneDict.isEntryPrefix(textname)) {
                continue;
            }
            // Reuses the POS tags fetched above for the same right-extended offsets.
            // NOTE(review): assumes getOverlappingPosTags is a pure lookup - confirm.
            if (rightExtendedPostags.stream().anyMatch(p -> PLURAL_TAG.equals(p.getTag()))) {
                t.setFeatureValue("HAS_PLURAL", 1.0);
            }
            // Does not depend on the other mention, so it is evaluated once per mention.
            boolean nameHasDigit = CONTAINS_DIGIT.matcher(textname).matches();
            for (GeneMention otherGm : geneDocument.getGenesIterable()) {
                String otherText = otherGm.getText().toLowerCase();
                if (otherText.length() <= textname.length() || !otherText.startsWith(textname)) {
                    continue;
                }
                String suffix = otherText.substring(textname.length());
                // Skip long extensions and the bare plural "s".
                if (suffix.length() > MAX_SUFFIX_LENGTH || suffix.equals("s")) {
                    continue;
                }
                // The whole suffix must match, so alternatives longer than MAX_SUFFIX_LENGTH cannot hit here.
                if (this.m.reset(suffix).matches()) {
                    t.setFeatureValue("OTHER_GENE_HAS_SPEC_SUFFIX", 1.0);
                }
                if (!nameHasDigit && ALL_DIGITS.matcher(suffix).matches()) {
                    t.setFeatureValue("NUM_SUFFIX", 1.0);
                    continue;
                }
                if (MentionPrefixPipe.hasLeadingIndefiniteArticle(gm, geneDocument, phraseExtendedText)) {
                    t.setFeatureValue("UNSPEC_DET", 1.0);
                }
            }
        }
        return inst;
    }

    /**
     * Tells whether the last POS tag of the list carries the tag {@link #PLURAL_TAG}.
     * An empty list yields {@code false} instead of failing on index {@code -1}.
     */
    private static boolean endsWithPluralTag(ArrayList<PosTag> posTags) {
        if (posTags.isEmpty()) {
            return false;
        }
        return PLURAL_TAG.equals(posTags.get(posTags.size() - 1).getTag());
    }

    /**
     * Tells whether nothing but whitespace remains after removing the generic kinase terms from
     * the normalized mention text. The terms are removed one after another, in declaration order.
     */
    private static boolean isGenericKinaseName(String normalizedText) {
        String remainder = normalizedText;
        for (Pattern term : GENERIC_KINASE_TERMS) {
            remainder = term.matcher(remainder).replaceAll("");
        }
        return remainder.isBlank();
    }

    /**
     * Tells whether the phrase-extended text starts with {@code "a "} or {@code "an "} and the
     * phrase-extended span begins at document offset 0 or directly after a whitespace character.
     */
    private static boolean hasLeadingIndefiniteArticle(GeneMention gm, GeneDocument geneDocument, String phraseExtendedText) {
        if (!phraseExtendedText.startsWith("a ") && !phraseExtendedText.startsWith("an ")) {
            return false;
        }
        return gm.getPhraseExtendesOffsets().getMinimum() == 0 || Character.isWhitespace(geneDocument.getDocumentText().charAt(gm.getPhraseExtendesOffsets().getMinimum() - 1));
    }

    /**
     * Restores the serialized fields and recreates the transient matcher, which is not part of
     * the serialized form.
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        this.m = this.initMatcher();
    }
}

