/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.filtering;

import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.geneexpbase.genemodel.MentionMappingResult;
import de.julielab.genemapper.filtering.StringHelper;
import de.julielab.java.utilities.spanutils.Span;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class UnspecificNameFilter {
    private static final Logger log = LoggerFactory.getLogger(UnspecificNameFilter.class);
    public static String leftWordBoundary = "(^|[\\s\\(\\[\\/])";
    public static String rightWordBoundary = "([\\s\\,\\)\\]\\/\\;\\!\\?]|$)";
    public static String blacklist_pattern = "mp87xwgtivoucegouasvbc9823vquh";

    public static boolean isUnspecific(String term) {
        HashSet<Integer> speciesIDs = new HashSet<Integer>();
        return UnspecificNameFilter.isUnspecific(term, speciesIDs);
    }

    public static boolean isUnspecific(String term, HashSet<Integer> speciesIDs) {
        term = term.trim();
        if ((term = term.replaceFirst("[\\,\\-\\;]+$", "")).length() == 1 && !speciesIDs.contains(7227)) {
            return true;
        }
        if (term.matches("[\\d\\.\\s]+")) {
            return true;
        }
        if (term.matches("([Aa]utosomal recessive|splicing|protease|checkpoint|C\\-terminal fragments|expressed in .*|murine|gene product|polypeptide|deafness|congenital|nonsyndromic|up\\-regulated|macular|embryonic|juvenile||myopia|catalytic subunits?|regulatory subunits?|T cell differentiation|processing of separase|chromosome|translocated to|mutated|bind|bind DNA|repeats|autosomal recessive deafness|congenital deafness|tandem|lipid raft|glucose|cadherin superfamily|cadherin family|.* domain|processing of separase|molecular weight|[Cc]\\-terminal|immunodeficiency|inhibitor securin|[Ii]nhibitor binding|type I lissencephaly|antisense|secreted|high affinity|Collagen XVIII|transmembrane protease|transmembrane serine protease|regulatory subunit NEMO|interacting|autosomal dominant|actin cytoskeleton|intestinal epithelialRho family|.* chemokine|X\\-linked retinoschisis|secreted photoreceptor|putative secreted photoreceptor|soluble L1|Soluble CD2|loss of heterozygosity|Kv4.2 potassium channel|alpha1 AMPK|K\\(\\+\\)|mm K\\(\\+\\)|Src homology|antiproliferative|focal|gamma IP-10|receptor trafficking|serine protease|homology|unknown function|EST|CNS|body weight|bone mineral content|long bones|renal cystic disease severity|GAGA transcription factor|[Dd]ominant megacolon|sex-peptide|gamma\\(c\\)|Ames dwarf|cellular stress response)")) {
            return true;
        }
        if (term.matches("^(aa|bp\\s[0-9]{1,2}|kd|mg|Ki|nM|CD|Sci|Proc|Acad|and [1Ii]|[Aa] gene|. .|. . .|. . .( .)+|as a|a PS|or if|or is|or in|[A-Za-z] or|a part|[A-Za-z][A-Za-z] as|and \\d+|or \\d+|an \\d+|for \\d+|is \\d+|[Aa]t\\d+|is [VvIiXx]|[A-Z] at|per \\d+|OR \\d+|six \\d+|F \\d+|at \\d+ d|s to|acid 2|HS is|(?:a )?[\\d+\\.][\\-\\s][Kk][Dd][Aa](?: protein)?|factor[\\s\\-]\\d|factor[\\s\\-](alpha|beta|gamma|delta)|receptor\\s\\d|[Ii]soforms?[\\s\\-]?\\d+|[Ii]sozymes?[\\s\\-]\\d+?|[A-Z] receptor|S phase|open reading frame|pulmonary function|MHC\\s[Cc]lass [Ii][Ii]?|Part I|[Uu]rinary protein|urine protein|death[\\-\\s]inducing|early[\\-\\s]response|[Nn]on\\-histone[\\-\\s]chromosomal|membrane[\\s\\-]?bound|proton[\\s\\-]?pump|similar\\sto|rough\\sdeal|alternative\\ssplicing|[a-z]+[\\s\\-]?binding|a catalytic|P\\(k\\)|[Mm]ediator complex|trans-Golgi network|protein\\s[A-Za-z]|protein[\\s\\-][0-9]+|beta[\\s\\-]\\d+|\\d+[\\s\\-]beta([\\s\\-]\\d+)?|alpha[\\s\\-]\\d+|\\d+[\\s\\-]alpha([\\s\\-]\\d+)?|(alpha|beta|gamma|delta|epsilon|eta|kappa|lambda)\\s[A-Za-z0-9]|[A-Za-z0-9][\\s\\-](alpha|beta|gamma|delta|epsilon|eta|kappa|lambda)|(alpha|beta|gamma|delta|epsilon|eta|kappa|lambda)\\schain|[A-Za-z][\\s\\-]protein|[Aa]\\s[0-9]{1,2})$")) {
            return true;
        }
        if (term.endsWith("nesis") || term.endsWith("gression") || term.endsWith("vation")) {
            return true;
        }
        if (term.toLowerCase().endsWith("amp") && term.length() == 4) {
            return true;
        }
        return term.matches("^([\\s\\,\\.\\-\\;\\:\\(\\)\\/]|isoform|subunit|ligand|complement|chain|site|form|domain|autoantigen|antigen|sequence|homolog|type|subtype|motif|group|candidate|molecule|superfamily|family|subfamily|transcript|[Ff]ragment|[fF]actor|regulator|inhibitor|suppressor|translocator|activator|[rR]eceptor|[lL]igand|adaptor|adapter|nucleoprotein|oncoprotein|phosphoprotein|glycoprotein|[Pp]rotein|polypeptide|RNA|DNA|dna|cDNA|rna|mRNA|mrna|mRna|tRNA|tRna|trna|histone|collagen|neuron|caspase|kinase|phosphatase|polymerase|coactivator|activator|transporter|hormone|[eE]xpression|activation|transduction|transcription|adhesion|interaction|release|[aA]ssociated|induced|coupled|related|linked|associated|conserved|mediated|expressed|advanced|localized|activating|regulating|signaling|binding|bound|containing|docking|transforming|export|trafficking|breast|colon|stem|cell|muscle|cellular|extracellular|intestinal|nuclear|surface|membrane|brain|epidermal|ectodermal|vesicle|mitochondrial|pancreatic|ubiquitous|fetal|chicken|mammalian|human|cancer|carcinoma|tumor|obesity|lung 
cancer|apoptosis|death|growth|maturation|necrosis|signal|repair|survival|stress|division|adhesion|control|excision|fusion|cycle|heat|shock|proteoglycan|core|homeobox|chemokine|cytokine|potassium|calcium|sodium|retinol|pyruvate|vitamin|glutamate|[Zz]inc|estrogen|thrombin|arrestin|actin|ubiquitin|mucin|urotensin|disintegrin|activin|chromatin|calmodulin|tubulin|cyclin|immunoglobulin|heparin|GTP|tyrosine|serine|threonine|alanine|arginine|asparagine|cysteine|glutamine|leucine|isoleucine|glycine|methionine|histidine|proline|lysine|phenylalanie|thryptophan|valine|low|high|highly|non|heterogeneous|homogeneous|light|heavy|negative|novel|putative|dependent|accessory|peripheral|regulatory|deficient|terminal|transcriptional|inducible|soluble|dual|specificity|specific|nucleic|acid|putative|peroxisomal|basic|nucleolar|secretory|susceptibility|paired|like|specific|muscle|testis|mobility|programmed|matrix|channel|end|ciliary|neurotrophic|retinoid|germinal|center|neural|finger|fibroblast|lymphokine|[a-z]+ine[\\s\\-]rich|[a-z]+ant|two|to|by|that|like|a|[tT]he|for|of|and|or|with|in|[Aa]ntigen|lymphocyte|cytoplasmic|helicase|retinoic|acid|plasminogen|cytoskeletal|anchor|[Aa]nti|integral|membrane|[Nn]eutrophil|ubiquitin|basic|leucine|zipper|putative|transmembrane|proteasome|responsive)+('?s)?$");
    }

    public static boolean isUnspecificSingleWord(String name) {
        return name.trim().matches(blacklist_pattern);
    }

    public static boolean isUnspecificSingleWordCaseInsensitive(String name) {
        return name.toLowerCase().trim().matches("(for|in|of|at|an|milk|cycling|enabled|blast|lipase|golgi|fusion|proteins?|nuclear|sex|membrane|mitochondrial|oligo|oligo 3|oligo 5|partial|macrophage|condensed|wt)");
    }

    public static boolean isUnspecificAbbreviation(String name, String sentence) {
        HashSet<Integer> speciesIDs = new HashSet<Integer>();
        return UnspecificNameFilter.isUnspecificAbbreviation(name, sentence, speciesIDs);
    }

    public static boolean isUnspecificAbbreviation(String name, String sentence, HashSet<Integer> speciesIDs) {
        if (name.length() > 2) {
            return false;
        }
        if (name.length() <= 2 && !speciesIDs.contains(7227)) {
            if (name.matches("H([2-9]|\\d\\d+)")) {
                return false;
            }
            String maskedName = StringHelper.escapeString(name);
            if (sentence.matches(".*[\\s\\(]" + maskedName + "[\\s\\,\\)]([^\\s]+\\s)?(gene|protein|locus|loci)s?.*")) {
                return false;
            }
            return !sentence.matches(".*(gene|protein|locus\\sfor|loci\\sfor)s?[\\s\\,\\)]([^\\s]+\\s)?[\\(\\s]?" + maskedName + "[\\s\\,\\)].*");
        }
        return false;
    }

    public static boolean isAminoAcid(String name) {
        return name.matches("(Ala|[Aa]lanine|Arg|[Aa]rginine|Asn|[Aa]sparagine|Asp|[Aa]spartic acid|Cys|[Cc]ysteine|Gln|[Gg]lutamine|Gly|[Gg]lycine|Glu|[Gg]lutamic acid|His|[Hh]istidine|Ile|[Ii]soleucine|Leu|[Ll]eucine|Lys|[Ll]ysine?|Met|[Mm]ethionine|Phe|[Pp]enylalanine|Pro|[Pp]roline|Ser|[Ss]erine|Thr|[Tt]hreonine|Trp|[Tt]ryptophane?|Tyr|[Tt]yrosine|Val|[Vv]aline)s?");
    }

    public static boolean isDiseaseName(String name) {
        boolean ret = false;
        if (name.trim().matches(".*([a-z]+(phase|osis|topy|trophy|itis|noma|phoma|axia|emia|stoma)|syndrome|failure|disease|severity)")) {
            ret = true;
        }
        if (name.trim().matches("(NF|[Nn]eurofibromatosis([\\s\\-][12])?|DM|Lu|Lutheran\\sblood\\sgroup|Se|H|Le|Lewis\\sblood\\sgroup|Rb|LW|LW\\sblood\\sgroup|Landsteiner[\\s\\-]Wiener\\sblood\\sgroup|autoimmune susceptibility|severe combined immunodeficiency|FHC|adipose|SLE|multiple sclerosis|anti\\-?phospholipid syndrome|[a-z]+ syndrome|thrombocytopenia|renal amyloidosis|IBD|hepatocellular\\scarcinoma|hereditary\\shemochromatosis|promyelocytic\\sleukemia|retinitis pigmentosa|multiple endocrine neoplasia|ovarian cancer|AML|MDS)")) {
            ret = true;
        }
        return ret;
    }

    public static boolean keepDiseaseName(String name, String sentence) {
        try {
            return sentence.matches(".*" + leftWordBoundary + "(locus|loci|location|chromosom[a-z]+|gene.+associated)" + rightWordBoundary + ".*") || sentence.matches(".*" + leftWordBoundary + name.replaceAll("([\\+\\-\\*\\(\\)\\[\\]\\{\\}])", "\\\\$1") + rightWordBoundary + ".*" + leftWordBoundary + "(gene|protein)s?" + rightWordBoundary + ".*");
        }
        catch (PatternSyntaxException e) {
            return false;
        }
    }

    public static boolean isTissueCellCompartment(String name) {
        return name.trim().matches("([a-z]+(plasmic|plastic)|[a-z]+(skeletal|\\sretina)|amyloid|neuronal|[a-z]+(phil)|[a-z]+(cytes?)|[a-z]+ic cell|[Ss]tem cells?|[Ee]ndothelial cell|T cell|[a-z]ial|[a-z]ic|post\\-?synaptic|endothelial|epithelial|[Ii]ntestinal|skin-derived|lymphoid|skeletal muscle|liver|kidney|cytoskeleton|hematopoietic|retinal|dendritic|retina|testis)");
    }

    public static boolean isCellLine(String name, String sentence) {
        String maskedName = StringHelper.escapeString(name);
        if (sentence.matches(".* (WT|wild[ -]?type)" + leftWordBoundary + maskedName + rightWordBoundary + ".*")) {
            return false;
        }
        if (sentence.matches(".*" + leftWordBoundary + maskedName + rightWordBoundary + "(cell|culture)s?.*")) {
            return true;
        }
        if (name.matches("CD\\d+") && sentence.matches(".*" + leftWordBoundary + maskedName + rightWordBoundary + "([A-Za-z\\-]+ )?(cell)s?.*")) {
            return true;
        }
        if (name.startsWith("MDA") && sentence.matches(".*" + maskedName + "\\-MB.*")) {
            return true;
        }
        return sentence.matches(".*" + leftWordBoundary + maskedName + "[\\-\\/][0-9]+[A-Z]*" + rightWordBoundary + "([a-z]+ ){0,2}(cell|culture)s?.*");
    }

    public static boolean isSpecies(String name) {
        return name.trim().toLowerCase().matches("(human|man|patient|mouse|mice|murine|pig|hiv|rabbit|coli|avian|chimp|chicken|rat|e\\. coli)");
    }

    public static boolean isChromosome(String name, String sentence) {
        if (!name.matches("(X|Y|[\\d]+[pq](\\.[\\d\\.]+)?)")) {
            return false;
        }
        name = StringHelper.escapeString(name);
        String maskedName = StringHelper.escapeString(name);
        return sentence.matches(".*(chromosome " + maskedName + "|" + maskedName + " chromosome).*");
    }

    public static boolean isNegativePair(String name, String sentence) {
        String maskedName = StringHelper.escapeString(name);
        if (name.equals("LPS") && sentence.matches(".*(induce|administ|stimulat).*")) {
            return true;
        }
        if (name.equals("GST") && sentence.matches(".*(pull\\-?down|assay|fusion|purification|\\Wtag\\W|blotting|anti\\-?body).*")) {
            return true;
        }
        if (name.equalsIgnoreCase("polymerase") && sentence.matches(".*(chain[\\-\\s]?reaction|PCR|Pcr).*")) {
            return true;
        }
        if (sentence.matches(".*" + maskedName + " (patient|disease|symptom|syndrome?)s?.*")) {
            return true;
        }
        return sentence.matches(".*(disease|symptom|syndrome?|cancer|[a-z]+oma|[a-z]+itis) \\(" + maskedName + "\\)");
    }

    public static boolean textHasPlural(String name, String text) {
        name = StringHelper.escapeString(name);
        try {
            if (text.matches(".*[\\s\\(\\[\"]" + name + "s[\\s\\,\\.\\]\\)\"].*") && !text.matches(".*(homolog|ortholog|similar)[a-z]*[^\\.\\;\\:]*[\\s\\(\\[]" + name + "s[\\s\\,\\.\\]\\)\"].*")) {
                log.debug("#UNF: Text contains plural form: " + name);
                return true;
            }
        }
        catch (PatternSyntaxException patternSyntaxException) {
            // empty catch block
        }
        return false;
    }

    public void filter(GeneDocument document) {
        int removed = 0;
        int total = (int)document.getGenes().count();
        Iterator unidentifiedGeneNames = document.getGenes().iterator();
        while (unidentifiedGeneNames.hasNext()) {
            GeneMention recognizedGeneName = (GeneMention)unidentifiedGeneNames.next();
            if (recognizedGeneName.isRejected()) continue;
            HashSet<Integer> speciesIDs = new HashSet<Integer>();
            recognizedGeneName.getTaxonomyIds().stream().filter(Objects::nonNull).map(Integer::parseInt).forEach(speciesIDs::add);
            String sentence = document.getCoveredText(document.getOverlappingSentence((Span)recognizedGeneName));
            String maskedGeneName = StringHelper.escapeString(recognizedGeneName.getText());
            String reason = "";
            if (UnspecificNameFilter.isUnspecific(maskedGeneName, speciesIDs)) {
                reason = "unspecific";
            } else if (UnspecificNameFilter.isUnspecificSingleWordCaseInsensitive(maskedGeneName)) {
                reason = "unspecific single word case-insensitive";
            } else if (UnspecificNameFilter.isUnspecificSingleWord(maskedGeneName)) {
                reason = "unspecific single word";
            } else if (UnspecificNameFilter.isTissueCellCompartment(maskedGeneName)) {
                reason = "tissue, cell compartment";
            } else if (UnspecificNameFilter.isUnspecificAbbreviation(recognizedGeneName.getText(), sentence, speciesIDs)) {
                reason = "unspecific abbreviation";
            } else if (UnspecificNameFilter.isSpecies(maskedGeneName)) {
                reason = "species";
            } else if (UnspecificNameFilter.isAminoAcid(maskedGeneName)) {
                reason = "amino acid";
            } else if (UnspecificNameFilter.textHasPlural(maskedGeneName, document.getDocumentText())) {
                reason = "text has plural form";
            } else if (UnspecificNameFilter.isChromosome(recognizedGeneName.getText(), sentence)) {
                reason = "chromosome";
            } else if (UnspecificNameFilter.isDiseaseName(recognizedGeneName.getText())) {
                reason = "disease name";
            } else if (UnspecificNameFilter.isDiseaseName(maskedGeneName)) {
                reason = "disease name (masked)";
            } else if (UnspecificNameFilter.isNegativePair(recognizedGeneName.getText(), sentence)) {
                reason = "not a gene in this context";
            } else if (sentence.matches(".*" + maskedGeneName + "([\\-\\/][A-Za-z0-9]*[A-Z0-9][A-Za-z0-9]*)?( [a-z]+)? (gene|protein) family([\\.\\,\\;\\:]| [a-z]+ [a-z]+[\\s\\,\\.\\:\\;]).*")) {
                reason = "gene family";
            }
            if (reason.length() > 0) {
                log.debug("UNF: removing " + recognizedGeneName.getText() + " in sentence " + sentence + "; reason: " + reason);
                recognizedGeneName.reject(MentionMappingResult.RejectReason.IS_UNSPECIFIC);
                ++removed;
            }
            if (!UnspecificNameFilter.isNegativePair(recognizedGeneName.getText(), sentence)) continue;
            document.getGenes().filter((? super T gm) -> gm.getText().equals(recognizedGeneName.getText())).forEach(gm -> gm.reject(MentionMappingResult.RejectReason.HAS_SAME_TEXT_AS_OTHER_UNSPECIFIC_MENTION));
            log.debug("UNF: removing all occurrences of " + recognizedGeneName.getText() + "; reason: " + reason);
        }
        log.debug(" " + this.getClass().getSimpleName() + ": removed " + removed + " names out of " + total);
    }

    static {
        HashSet<String> blacks = new HashSet<String>();
        try {
            String line;
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("data/blacklist.txt")));
            while ((line = br.readLine()) != null) {
                if (line.startsWith("#") || line.trim().length() <= 0) continue;
                blacks.add(line);
            }
            br.close();
            br = null;
        }
        catch (IOException ioe) {
            ioe.printStackTrace();
        }
        if (blacks.size() > 0) {
            blacklist_pattern = "(" + StringHelper.joinStringSet(blacks, "|") + ")";
            String[] iterms = blacklist_pattern.split("\\|");
            System.err.println("#INFO using blacklist of " + iterms.length + " terms.");
        }
    }
}

