/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.geneexpbase;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import de.julielab.geneexpbase.GeneExpRuntimeException;
import de.julielab.geneexpbase.TermNormalizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Collection of token-based heuristics used to compare a searched gene/protein mention with a
 * found synonym and to decide whether the found candidate should be discarded.
 *
 * <p>The static patterns {@link #patternUnspecifieds} and {@link #patternPreMods} are built once,
 * during class initialisation, from the classpath resources {@code /unspecified_proteins} and
 * {@code /premodifiers}.
 *
 * <p>Thread-safety: {@link #isUnspecified(String)} and {@link #isNonDescriptive(String)} reuse a
 * per-instance {@link Matcher}; {@code Matcher} instances are not safe for concurrent use, so an
 * instance of this class should not be shared between threads without external synchronisation.
 */
public class CandidateFilter {
    /**
     * Maps a single letter to a Greek letter name: for every entry of {@code TermNormalizer.GREEK}
     * its first character is mapped to the entry, the first entry per character wins.
     */
    public static final Map<String, String> greekAbbrMap = new HashMap<>();
    /** Alternation of Greek letter names; does not contain "alpha". */
    public static final String SUB_GREEK = "(beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";
    private static final Logger LOGGER = LoggerFactory.getLogger(CandidateFilter.class);
    private static final String UNSPECIFIEDS_FILE = "/unspecified_proteins";
    private static final String PREMOD_FILE = "/premodifiers";
    private static final Pattern NUM_GREEK_LATIN_PATTERN = Pattern.compile("[0-9]+|" + TermNormalizer.GREEK_REGEX + "|" + TermNormalizer.LAT_NUM_REGEX);
    /** A single ASCII letter standing alone between word boundaries. */
    private static final Pattern SINGLE_LETTER_PATTERN = Pattern.compile("\\b[a-zA-Z]\\b");
    /** A token consisting of digits only. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("[0-9]+");
    /** A single letter, a number or a Greek token (as defined by TermNormalizer.GREEK_REGEX). */
    private static final Pattern SINGLE_SYMBOL_PATTERN = Pattern.compile("[a-zA-Z]|[0-9]+|" + TermNormalizer.GREEK_REGEX);
    /** Tokens made up only of the letters a-e (either case) and digits. */
    private static final Pattern LETTER_DIGIT_SPECIFIER_PATTERN = Pattern.compile("[abcdeABCDE0-9]+");
    private static final Pattern GREEK_PATTERN = Pattern.compile(TermNormalizer.GREEK_REGEX);
    private static final Pattern LAT_NUM_PATTERN = Pattern.compile(TermNormalizer.LAT_NUM_REGEX);
    // NOTE(review): "tranporters?" looks like a misspelling of "transporters?" (which is therefore
    // not matched), and "enhancers?" / "responders?" are listed twice. The regex is kept
    // byte-identical because changing it alters filtering results - confirm before fixing.
    public static String MODIFIER = "(receptors?|cofactors?|factors?|tranporters?|regulators?|inhibitors?|activators?|suppressors?|enhancers?|repressors?|adaptors?|interactors?|modulators?|mediators?|inducers?|effectors?|coactivators?|supressors?|integrators?|facilitators?|binders?|terminators?|acceptors?|responders?|proactivators?|exchangers?|enhancers?|adapters?|responders?|modifiers?|ligands?)";
    public static String NON_DESCRIPTIVE = "(^|\\s)(constructs?|fragments?|antigens?|precursors?|proteins?|genes?|chains?|domains?|kinases?|homologues?|homologs?|isoforms?|isologs?|isotypes?|motifs?|orthologues?|orthologs?|products?|sequences?|subtypes?|subunits?|promoters?|onco proteins?|oncoproteins?|proto oncogenes?|protooncogenes?|proteases?|binding sites?|transcripts?|elements?|constructs?|si rnas?|prem rnas?|pre m rnas?|m rna ?s?|rnas?|locus|gene products?|products?|reporter genes?|reporters?|genes?|proteins?|c dnas?|molecules?|pseudogenes?|autoantigens?|peptides?|polypeptides?|enzymes?|cytokines?|cells?|surfaces?)(\\s|$)";
    public static String AMINO_ACIDS = "(alanine|arginine|asparagine|aspartic|cysteine|glutamine|glutamic|glycine|histidine|isoleucine|leucine|lysine|methionine|phenylalanine|proline|serine|threonine|tryptophan|tyrosine|valine)";
    public static Set<String> NON_GENE_WORDS = Set.of("cyclin", "protease", "polypeptide", "lyase", "kinase", "synapsin", "dynein", "inv", "activin", "chemokine", "lysophospholipase");
    // NOTE(review): contains the same "tranporters" spelling as MODIFIER - confirm before fixing.
    public static String DOMAIN_FAMILIES = "^.*(acceptors|acid|activators|adapters|adaptors|antibodi|antibody|binders|binding|binding site|binding sites|box|boxe|chromosome|coactivators|cofactors|complex|domain|dyneins|effectors|element|enhancers|epitope|erythrocyte|exchangers|exon|facilitators|factors|familie|family|filament|finger|helicases|histone|histones|homeodomain|inducers|inhibitors|integrators|interactors|intron|kinases|kinesins|lectins|ligands|mediators|member|membrane|modifiers|modulators|motif|myosins|proactivators|proteases|proteasome|proteins|reductases|region|regulators|repeat|repressors|residue|responders|sequence|site|subdomain|subfamily|subunits|superfamily|suppressors|supressors|syndrome|tail|terminal|terminators|terminus|tranporters|transferases|zinc finger)e?s?";
    /** Compiled form of {@link #NON_DESCRIPTIVE}; set in the static initialiser. */
    public static Pattern patternNonDesc;
    /** Compiled form of {@link #DOMAIN_FAMILIES}; set in the static initialiser. */
    public static Pattern patternDomainFamilies;
    /** Compiled form of {@link #UNSPECIFIEDS}; set by the static initialiser. */
    public static Pattern patternUnspecifieds;
    /** {@link #PREMODS} followed by {@code .*}; set by {@link #initPreModifiers()}. */
    public static Pattern patternPreMods;
    /** Regex built from the entries of the {@code /unspecified_proteins} resource. */
    public static String UNSPECIFIEDS;
    /** Regex built from the entries of the {@code /premodifiers} resource. */
    public static String PREMODS;
    private final Pattern num;
    private final Pattern singChar;
    private final Pattern specWords;
    // The static patterns are assigned during class initialisation, i.e. before any instance
    // field initialiser can run, so they are non-null here.
    public Matcher matcherNonDesc = patternNonDesc.matcher("");
    public Matcher matcherUnspecifieds = patternUnspecifieds.matcher("");

    /**
     * Creates a filter and compiles the per-instance patterns used by
     * {@link #filterOut(String, String)}.
     */
    public CandidateFilter() {
        this.num = Pattern.compile("[0-9]*");
        this.singChar = Pattern.compile("([a-z]|[0-9])");
        // NOTE(review): the "||" introduces an empty alternative, so the empty string also counts
        // as a "special word". This looks like a typo but is kept to preserve behaviour.
        this.specWords = Pattern.compile("(" + TermNormalizer.GREEK_REGEX + "|" + MODIFIER + "||" + NON_DESCRIPTIVE + ")");
    }

    /**
     * Small manual check: prints the unspecified-proteins pattern and whether it matches "fos".
     *
     * @param args ignored
     * @throws IOException declared for interface compatibility; not thrown by the current body
     */
    public static void main(String[] args) throws IOException {
        Pattern p = patternUnspecifieds;
        System.out.println(p.pattern());
        System.out.println(p.matcher("fos").matches() ? "yes" : "no");
    }

    /**
     * Replaces every stand-alone single ASCII letter in {@code s2} by the value stored for it in
     * {@link #greekAbbrMap}; letters without a mapping are kept unchanged.
     *
     * @param s2 the string to expand
     * @return the expanded string, or {@code s2} itself if it contains no stand-alone letter
     */
    public static String expendGreek(String s2) {
        Matcher letterMatcher = SINGLE_LETTER_PATTERN.matcher(s2);
        StringBuilder expanded = new StringBuilder();
        int copiedUpTo = 0;
        while (letterMatcher.find()) {
            expanded.append(s2, copiedUpTo, letterMatcher.start());
            copiedUpTo = letterMatcher.end();
            String letter = letterMatcher.group();
            expanded.append(greekAbbrMap.getOrDefault(letter, letter));
        }
        // Every match is exactly one character long, so copiedUpTo == 0 means "no match at all".
        if (copiedUpTo == 0) {
            return s2;
        }
        // Always append the complete tail; the previous bound (length - 1) silently dropped a
        // tail that was exactly one character long.
        expanded.append(s2, copiedUpTo, s2.length());
        return expanded.toString();
    }

    /**
     * Checks whether both strings contain exactly the same multiset of number tokens.
     *
     * @param normalizedMention whitespace-separated mention
     * @param synonym whitespace-separated synonym
     * @return {@code true} if the number tokens (with multiplicity) of both strings are equal
     */
    public static boolean isNumberCompatible(String normalizedMention, String synonym) {
        Multiset<String> mentionNumbers = getNumbers(normalizedMention.split("\\s"));
        Multiset<String> synNumbers = getNumbers(synonym.split("\\s"));
        return mentionNumbers.size() == synNumbers.size()
                && Multisets.intersection(mentionNumbers, synNumbers).size() == mentionNumbers.size();
    }

    /**
     * Checks whether both strings contain number tokens but share none of them.
     *
     * @param searchedMention whitespace-separated mention
     * @param foundSynonym whitespace-separated synonym
     * @return {@code true} if both have at least one number token and no number token in common
     */
    public static boolean hasContradictingNumber(String searchedMention, String foundSynonym) {
        Multiset<String> mentionNumbers = getNumbers(searchedMention.split("\\s"));
        Multiset<String> synNumbers = getNumbers(foundSynonym.split("\\s"));
        return !mentionNumbers.isEmpty()
                && !synNumbers.isEmpty()
                && Multisets.intersection(mentionNumbers, synNumbers).isEmpty();
    }

    /**
     * Collects the tokens that consist of digits only.
     *
     * @param tokens the tokens to inspect
     * @return multiset of the all-digit tokens
     */
    public static Multiset<String> getNumbers(String[] tokens) {
        HashMultiset<String> numberTokens = HashMultiset.create();
        for (String token : tokens) {
            if (NUMBER_PATTERN.matcher(token).matches()) {
                numberTokens.add(token);
            }
        }
        return numberTokens;
    }

    /**
     * Collects the tokens that are a single ASCII letter, a number or match
     * {@code TermNormalizer.GREEK_REGEX}.
     *
     * @param tokens the tokens to inspect
     * @return multiset of the matching tokens
     */
    public static Multiset<String> getSingleSymbols(String[] tokens) {
        HashMultiset<String> singleSymbolTokens = HashMultiset.create();
        for (String token : tokens) {
            if (SINGLE_SYMBOL_PATTERN.matcher(token).matches()) {
                singleSymbolTokens.add(token);
            }
        }
        return singleSymbolTokens;
    }

    /**
     * Tells whether the whole token is a number or matches {@code TermNormalizer.GREEK_REGEX} or
     * {@code TermNormalizer.LAT_NUM_REGEX}.
     *
     * @param token the token to test
     * @return {@code true} if the complete token matches one of the three alternatives
     */
    public static boolean isNumberGreekOrLatin(String token) {
        return NUM_GREEK_LATIN_PATTERN.matcher(token).matches();
    }

    /**
     * Complement of {@link #getSingleSymbols(String[])}: collects all tokens that are neither a
     * single ASCII letter, nor a number, nor a match of {@code TermNormalizer.GREEK_REGEX}.
     *
     * @param tokens the tokens to inspect
     * @return multiset of the remaining tokens
     */
    public static Multiset<String> getContentTokens(String[] tokens) {
        HashMultiset<String> contentTokens = HashMultiset.create();
        for (String token : tokens) {
            if (!SINGLE_SYMBOL_PATTERN.matcher(token).matches()) {
                contentTokens.add(token);
            }
        }
        return contentTokens;
    }

    /**
     * Computes the multiset intersection of the whitespace-separated tokens of both strings.
     * Despite the name, the common tokens themselves are returned, not only their count.
     *
     * @param normalizedMention whitespace-separated mention
     * @param synonym whitespace-separated synonym
     * @return the tokens common to both strings (with multiplicity)
     */
    public static Multiset<String> getNumberOfCommonTokens(String normalizedMention, String synonym) {
        HashMultiset<String> mentionTokens = HashMultiset.create();
        for (String mentionToken : normalizedMention.split("\\s")) {
            mentionTokens.add(mentionToken);
        }
        HashMultiset<String> synTokens = HashMultiset.create();
        for (String synToken : synonym.split("\\s")) {
            synTokens.add(synToken);
        }
        return Multisets.intersection(mentionTokens, synTokens);
    }

    /**
     * Reads a classpath resource line by line. Lines starting with {@code ##} are skipped, all
     * other lines are returned trimmed (blank lines are returned as empty strings).
     *
     * @param resourceName absolute classpath name of the resource
     * @return the trimmed non-comment lines in file order
     * @throws GeneExpRuntimeException if the resource is missing or cannot be read
     */
    private static ArrayList<String> readResourceLines(String resourceName) {
        InputStream in = CandidateFilter.class.getResourceAsStream(resourceName);
        if (in == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear message
            // instead of a bare NullPointerException from the reader constructor.
            throw new GeneExpRuntimeException(new IOException("Classpath resource not found: " + resourceName));
        }
        ArrayList<String> lines = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying stream) on failure as well.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("##")) {
                    continue;
                }
                lines.add(line.trim());
            }
        } catch (IOException e) {
            throw new GeneExpRuntimeException(e);
        }
        return lines;
    }

    /**
     * Builds {@link #UNSPECIFIEDS} and {@link #patternUnspecifieds} from the
     * {@code /unspecified_proteins} resource. Each entry is normalised with a
     * {@link TermNormalizer}; the resulting pattern is {@code ^(e1|e2|...)e?s?$}. Entries that
     * are empty after normalisation are skipped so that they cannot add an empty alternative.
     */
    private static void initUnspecifieds() {
        TermNormalizer normalizer = new TermNormalizer();
        ArrayList<String> alternatives = new ArrayList<>();
        for (String entry : readResourceLines(UNSPECIFIEDS_FILE)) {
            String normalized = normalizer.normalize(entry).trim();
            if (!normalized.isEmpty()) {
                alternatives.add(normalized);
            }
        }
        UNSPECIFIEDS = "^(" + String.join("|", alternatives) + ")e?s?$";
        patternUnspecifieds = Pattern.compile(UNSPECIFIEDS);
        LOGGER.debug("Initializing unspecified proteins pattern from file: {}", patternUnspecifieds);
    }

    /**
     * Builds {@link #PREMODS} ({@code ^(e1|e2|...) } - note the trailing blank) and
     * {@link #patternPreMods} ({@code PREMODS + ".*"}) from the {@code /premodifiers} resource.
     * Blank entries are skipped so that they cannot add an empty alternative.
     */
    public static void initPreModifiers() {
        ArrayList<String> alternatives = new ArrayList<>();
        for (String entry : readResourceLines(PREMOD_FILE)) {
            if (!entry.isEmpty()) {
                alternatives.add(entry);
            }
        }
        PREMODS = "^(" + String.join("|", alternatives) + ") ";
        patternPreMods = Pattern.compile(PREMODS + ".*");
        LOGGER.debug("Initializing protein void premodifiers from file: {}", patternPreMods);
    }

    /**
     * Returns the distinct words that occur in both arrays.
     *
     * @param firstArray first word list
     * @param secondArray second word list
     * @return sorted set of the words present in both arrays
     */
    public static TreeSet<String> getCommonWords(String[] firstArray, String[] secondArray) {
        TreeSet<String> intersection = new TreeSet<>(Arrays.asList(firstArray));
        intersection.retainAll(new TreeSet<>(Arrays.asList(secondArray)));
        return intersection;
    }

    /**
     * Returns the distinct words of the first array that do not occur in the second array.
     *
     * @param firstArray first word list
     * @param secondArray second word list
     * @return sorted set of the words only present in {@code firstArray}
     */
    public static TreeSet<String> getDifferentWords(String[] firstArray, String[] secondArray) {
        TreeSet<String> different = new TreeSet<>(Arrays.asList(firstArray));
        different.removeAll(new TreeSet<>(Arrays.asList(secondArray)));
        return different;
    }

    /**
     * Computes the number of distinct common words divided by the token count of each string and
     * returns the smaller of the two ratios. Tokens are obtained by splitting at single blanks.
     *
     * @param first first term
     * @param second second term
     * @return the smaller of the two overlap ratios
     */
    public static double getOverlapRatio(String first, String second) {
        String[] firstTokens = first.split(" ");
        String[] secondTokens = second.split(" ");
        double common = getCommonWords(firstTokens, secondTokens).size();
        return Math.min(common / firstTokens.length, common / secondTokens.length);
    }

    /**
     * Creates all bigrams of consecutive blank-separated tokens, joined by a single blank.
     *
     * @param term the term to split
     * @return the bigrams in order of occurrence; empty if the term has fewer than two tokens
     */
    public static ArrayList<String> makeBigrams(String term) {
        String[] tokens = term.split(" ");
        ArrayList<String> bigrams = new ArrayList<>();
        for (int i = 1; i < tokens.length; ++i) {
            bigrams.add((tokens[i - 1] + " " + tokens[i]).trim());
        }
        return bigrams;
    }

    /**
     * Creates all bigrams of consecutive blank-separated tokens, each joined by an underscore,
     * and concatenates them separated by blanks. A term that splits into exactly one token is
     * returned trimmed.
     *
     * @param term the term to split
     * @return the blank-separated underscore bigrams
     */
    public static String makeUnderScoreBigrams(String term) {
        String[] tokens = term.split(" ");
        if (tokens.length == 1) {
            return term.trim();
        }
        StringBuilder bigrams = new StringBuilder();
        for (int i = 1; i < tokens.length; ++i) {
            bigrams.append(tokens[i - 1]).append('_').append(tokens[i]).append(' ');
        }
        return bigrams.toString().trim();
    }

    /**
     * Checks whether one term has exactly one distinct token more than the other, the terms
     * differ in exactly that token, and the token fully matches the regex {@code type}.
     *
     * @param searchTerm the searched term
     * @param foundTerm the found term
     * @param type regex the surplus token has to match completely
     * @return {@code true} if the only difference is one additional token of the given type
     */
    private boolean differInTypeOfOneTerm(String searchTerm, String foundTerm, String type) {
        if (searchTerm.equals(foundTerm)) {
            return false;
        }
        TreeSet<String> searchTokens = this.getSet(searchTerm.split("\\s+"));
        TreeSet<String> foundTokens = this.getSet(foundTerm.split("\\s+"));
        TreeSet<String> diff;
        if (searchTokens.size() == foundTokens.size() + 1) {
            diff = searchTokens;
            diff.removeAll(foundTokens);
        } else if (foundTokens.size() == searchTokens.size() + 1) {
            // Symmetric case: the found term carries the surplus token. The previous code
            // repeated the first condition here, which made this branch unreachable.
            diff = foundTokens;
            diff.removeAll(searchTokens);
        } else {
            return false;
        }
        return diff.size() == 1 && Pattern.matches(type, diff.first());
    }

    /**
     * Copies the array elements into a sorted set.
     *
     * @param array the elements
     * @return sorted set of the distinct elements
     */
    private TreeSet<String> getSet(String[] array) {
        return new TreeSet<>(Arrays.asList(array));
    }

    /**
     * Counts the whitespace-separated tokens of {@code term} that fully match the regex.
     *
     * @param term the term to tokenize
     * @param occurrence regex a token has to match completely
     * @return number of matching tokens
     */
    private int getNumberOfOccurrences(String term, String occurrence) {
        Pattern tokenPattern = Pattern.compile(occurrence);
        int count = 0;
        for (String token : term.split("\\s+")) {
            if (tokenPattern.matcher(token).matches()) {
                ++count;
            }
        }
        return count;
    }

    /**
     * Checks whether both terms contain exactly one token of the given type and are identical
     * apart from that token, while the tokens of that type differ.
     *
     * @param searchTerm the searched term
     * @param foundTerm the found term
     * @param type regex of the token type; assumed to contribute exactly one capturing group so
     *             that group 2 is the typed token and group 3 the suffix - TODO confirm for
     *             {@code TermNormalizer.GREEK_REGEX}
     * @return {@code true} if prefix and suffix are equal but the typed token differs
     */
    private boolean onlyDifferentTypes(String searchTerm, String foundTerm, String type) {
        if (this.getNumberOfOccurrences(searchTerm, type) != 1 || this.getNumberOfOccurrences(foundTerm, type) != 1) {
            return false;
        }
        Pattern framed = Pattern.compile("([a-z0-9 ]*?) ?" + type + " ?([a-z0-9 ]*?)");
        Matcher searchMatcher = framed.matcher(searchTerm);
        Matcher foundMatcher = framed.matcher(foundTerm);
        if (!searchMatcher.matches() || !foundMatcher.matches()) {
            return false;
        }
        return !searchMatcher.group(2).equals(foundMatcher.group(2))
                && searchMatcher.group(1).equals(foundMatcher.group(1))
                && searchMatcher.group(3).equals(foundMatcher.group(3));
    }

    /**
     * Tells whether every word fully matches the pattern (vacuously true for no words).
     *
     * @param pattern the pattern each word has to match
     * @param words the words to test
     * @return {@code true} if no word fails to match
     */
    private static boolean allMatch(Pattern pattern, Iterable<String> words) {
        for (String word : words) {
            if (!pattern.matcher(word).matches()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Decides whether {@code foundTerm} should be rejected as a candidate for {@code searchTerm}.
     * A candidate is rejected if the blank-separated words common to both terms consist only of
     * numbers, only of single characters/digits or only of special words, or if the terms differ
     * in just one number, Greek token or modifier. Note that an empty overlap is rejected too,
     * because the first check is vacuously true.
     *
     * @param searchTerm the searched term
     * @param foundTerm the found candidate term
     * @return {@code true} if the candidate should be filtered out
     */
    public boolean filterOut(String searchTerm, String foundTerm) {
        TreeSet<String> commonWords = CandidateFilter.getCommonWords(searchTerm.split(" "), foundTerm.split(" "));
        if (allMatch(this.num, commonWords)) {
            LOGGER.debug("filtered out because: overlap only numbers: '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (allMatch(this.singChar, commonWords)) {
            LOGGER.debug("filtered out because: overlap only single characters or single digits: '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (allMatch(this.specWords, commonWords)) {
            LOGGER.debug("filtered out because: overlap consists only of special words (greek, modifiers, non-descriptive): '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (this.onlyDifferentTypes(searchTerm, foundTerm, "([0-9]+)")) {
            LOGGER.debug("filtered out because: terms differ in one number only: '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (this.onlyDifferentTypes(searchTerm, foundTerm, TermNormalizer.GREEK_REGEX)) {
            LOGGER.debug("filtered out because: terms differ in one greek token only: '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (this.differInTypeOfOneTerm(searchTerm, foundTerm, "([02-9]|[1-9]{2,})")) {
            LOGGER.debug("filtered out because: one has a number and the other doesn't (1 is excluded): '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (this.differInTypeOfOneTerm(searchTerm, foundTerm, SUB_GREEK)) {
            LOGGER.debug("filtered out because: one has a greek and the other doesn't (alpha is excluded): '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        if (this.differInTypeOfOneTerm(searchTerm, foundTerm, MODIFIER)) {
            LOGGER.debug("filtered out because: one has a modifier and the other doesn't: '{}' <-> '{}'", searchTerm, foundTerm);
            return true;
        }
        return false;
    }

    /**
     * Tells whether the word consists only of the letters a-e (either case) and digits, or fully
     * matches {@code TermNormalizer.GREEK_REGEX} or {@code TermNormalizer.LAT_NUM_REGEX}.
     *
     * @param word the word to test
     * @return {@code true} if the word is regarded as a specifier
     */
    public static boolean isSpecifier(String word) {
        return LETTER_DIGIT_SPECIFIER_PATTERN.matcher(word).matches()
                || GREEK_PATTERN.matcher(word).matches()
                || LAT_NUM_PATTERN.matcher(word).matches();
    }

    /**
     * Tells whether the last whitespace-separated token of the name is a specifier.
     *
     * @param normalizedGeneName the gene name to inspect
     * @return {@code true} if the last token is a specifier; {@code false} if there is no token
     */
    public static boolean hasSpecifier(String normalizedGeneName) {
        String[] tokens = normalizedGeneName.split("\\s+");
        // A whitespace-only name splits into an empty array; there is no last token then.
        if (tokens.length == 0) {
            return false;
        }
        return CandidateFilter.isSpecifier(tokens[tokens.length - 1]);
    }

    /**
     * Not implemented: always returns {@code false}.
     *
     * @param s1 ignored
     * @param s2 ignored
     * @return always {@code false}
     */
    public boolean hasContradictingGreek(String s1, String s2) {
        return false;
    }

    /**
     * Tells whether the whole word matches {@link #patternUnspecifieds}. Reuses this instance's
     * matcher and is therefore not safe for concurrent calls on the same instance.
     *
     * @param word the word to test
     * @return {@code true} if the word is listed as unspecified
     */
    public boolean isUnspecified(String word) {
        return this.matcherUnspecifieds.reset(word).matches();
    }

    /**
     * Tells whether the whole word matches {@link #patternNonDesc}. Reuses this instance's
     * matcher and is therefore not safe for concurrent calls on the same instance.
     *
     * @param word the word to test
     * @return {@code true} if the word is non-descriptive
     */
    public boolean isNonDescriptive(String word) {
        return this.matcherNonDesc.reset(word).matches();
    }

    // Runs after all static field initialisers above: fills the Greek abbreviation map and
    // builds the patterns that depend on resources or on the regex constants.
    static {
        for (String greekChar : TermNormalizer.GREEK) {
            // The first Greek name seen for a starting letter wins.
            greekAbbrMap.putIfAbsent(greekChar.substring(0, 1), greekChar);
        }
        CandidateFilter.initUnspecifieds();
        CandidateFilter.initPreModifiers();
        patternDomainFamilies = Pattern.compile(DOMAIN_FAMILIES);
        patternNonDesc = Pattern.compile(NON_DESCRIPTIVE);
    }
}

