/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.geneexpbase;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.IOStreamUtilities;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.CallSite;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.Range;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballProgram;

public class TermNormalizer {
    // Class-wide logger.
    private static final Logger log = LoggerFactory.getLogger(TermNormalizer.class);
    // Dictionary matcher built once in the static initializer from the classpath resource
    // 'specialistLexiconEmbeddedGreekLowHigh.txt.gz' plus the words "kinase" and "kinases".
    private static AhoCorasickOptimized specialistLexEmbeddedGreekAC;
    // Maps single Greek characters to a spelled-out name; filled in the static initializer.
    private static final Map<String, String> greekCharacterNormalizationMap;
    // Spelled-out Greek letter names.
    public static final String[] GREEK;
    // Alternation "(alpha|beta|...)" over GREEK.
    public static String GREEK_REGEX;
    // Alternation over the Roman numerals I to XX, sorted in reverse natural order.
    public static String LAT_NUM_REGEX;
    // Compiled form of LAT_NUM_REGEX.
    public static final Pattern ROMAN_NUMBERS_PATTERN;
    // NOTE(review): the String constants below duplicate the regular expressions of the compiled
    // Pattern fields further down; apart from NUMBERPATTERN none of them is referenced in the visible code.
    private final String GREEK_CHAR_PATTERN = "\u03b1|\u03b2|\u03b3|\u03b4|\u03b5|\u03b6|\u03b7|\u03b8|\u03b9|\u03ba|\u03bb|\u03bc|\u03bd|\u03be|\u03bf|\u03c0|\u03c1|\u03c2|\u03c3|\u03c4|\u03c5|\u03c6|\u03c7|\u03c8";
    private final String NON_DESCRIPTIVES_FILE = "/non_descriptives";
    // Letters directly followed by digits, e.g. "abc12"; compiled into NUMBER_SPECIFIER_PATTERN.
    private static final String NUMBERPATTERN = "([A-Za-z]+)([0-9]+)";
    private final String SHORTFORMPATTERN = "((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))";
    private final String SHORTFORMEND_WITH_NUMBER_PATTERN = "(.* )(ra|rb|rg|bp)( [0-9]*)?";
    private final String SHORTFORMEND_NO_NUMBER_PATTERN = "(.* )(a|b)";
    private final String TOKENSPLITPATTERN = "(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)";
    private final String DOTREMOVAL = "(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)";
    // Shared across all instances; lazily filled by initNonDescriptives() / initStopwords().
    private static TreeSet<String> nonDescriptives;
    private static TreeSet<String> stopwords;
    // Plural -> singular lookup; only assigned by initPlurals(), which the visible constructor does not call.
    private HashMap<String, String> plurals;
    private static final Pattern NUMBER_SPECIFIER_PATTERN;
    // Tokens ending in L/R (or consisting only of l/r/L/R); used by replaceShortForms().
    private final Pattern shortFormPattern = Pattern.compile("((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))");
    private final Pattern shortFormEndWithNumberPattern;
    private final Pattern shortFormEndNoNumberPattern;
    // Case/digit boundaries inside a token at which specialTokenSplit() inserts a space.
    private final Pattern tokenSplitPattern = Pattern.compile("(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)");
    // A dot between a letter and a letter/digit; used by removeSpecialCharacters().
    private final Pattern dotRemovalPattern = Pattern.compile("(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)");
    private final Pattern greekCharPattern;
    // Matcher over GREEK plus "high", "low" and "kinase".
    private final AhoCorasickOptimized greekHighLowKinaseAC;
    // Matcher over GREEK only.
    private final AhoCorasickOptimized greekAC;
    // Snowball stemmer instantiated reflectively in the constructor.
    private final SnowballProgram stemmer;

    /**
     * Creates a normalizer: compiles the remaining instance patterns, builds the Aho-Corasick matchers
     * over the Greek letter names (one of them extended by "high", "low" and "kinase"), makes sure the
     * shared stop word and non-descriptive sets exist and instantiates the English Snowball stemmer
     * reflectively.
     *
     * @throws RuntimeException if the stemmer class cannot be loaded or instantiated; the reflective
     *                          failure is attached as the cause
     */
    public TermNormalizer() {
        this.shortFormEndWithNumberPattern = Pattern.compile("(.* )(ra|rb|rg|bp)( [0-9]*)?");
        this.shortFormEndNoNumberPattern = Pattern.compile("(.* )(a|b)");
        this.greekCharPattern = Pattern.compile("\u03b1|\u03b2|\u03b3|\u03b4|\u03b5|\u03b6|\u03b7|\u03b8|\u03b9|\u03ba|\u03bb|\u03bc|\u03bd|\u03be|\u03bf|\u03c0|\u03c1|\u03c2|\u03c3|\u03c4|\u03c5|\u03c6|\u03c7|\u03c8");
        // Explicit ArrayList: Collectors.toList() gives no guarantee that the result is mutable.
        List<String> patterns = new ArrayList<>(Arrays.asList(GREEK));
        patterns.add("high");
        patterns.add("low");
        patterns.add("kinase");
        this.greekHighLowKinaseAC = new AhoCorasickOptimized(patterns);
        this.greekAC = new AhoCorasickOptimized(GREEK);
        this.initStopwords();
        this.initNonDescriptives();
        try {
            Class<?> stemClass = Class.forName("org.tartarus.snowball.ext.EnglishStemmer");
            this.stemmer = (SnowballProgram) stemClass.getDeclaredConstructor().newInstance();
        } catch (ReflectiveOperationException e) {
            // Same message as before, but the original exception is kept as the cause.
            throw new RuntimeException(e.toString(), e);
        }
    }

    /**
     * Command line entry point: normalizes the file given as first argument and writes the result to
     * the file given as second argument. Prints a usage message and exits with status -1 when the
     * number of arguments is not exactly two.
     *
     * @param args input file path and output file path
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("usage:\nTermNormalizer <inputFile> <outputFile>");
            System.exit(-1);
            return;
        }
        final File source = new File(args[0]);
        final File target = new File(args[1]);
        final TermNormalizer normalizer = new TermNormalizer();
        normalizer.normalizeFile(source, target);
    }

    /**
     * Applies the four {@link CandidateFilter} removals in a fixed order: unspecifieds, domain
     * families, pre-modifiers and non-descriptives. A removal is only applied when the corresponding
     * pattern matches the whole current term; the term is trimmed between the steps.
     *
     * @param normalizedSearchTerm the term to strip
     * @return the trimmed term after all applicable removals
     */
    public static String removeModifiers(String normalizedSearchTerm) {
        log.debug("TRYING to remove modifiers or even complete term: {}", normalizedSearchTerm);
        String current = normalizedSearchTerm;

        // Step 1: unspecifieds (matched against the untrimmed input).
        if (CandidateFilter.patternUnspecifieds.matcher(current).matches()) {
            log.debug("IS UNSPECIFIED: {}", current);
            current = current.replaceFirst(CandidateFilter.UNSPECIFIEDS, "");
            log.debug("UNSPECIFIED REMOVED: |{}|", current);
        }

        // Step 2: domain families.
        current = current.trim();
        if (CandidateFilter.patternDomainFamilies.matcher(current).matches()) {
            log.debug("IS DOMAIN: {}", current);
            current = current.replaceFirst(CandidateFilter.DOMAIN_FAMILIES, "");
            log.debug("DOMAIN REMOVED: |{}|", current);
        }

        // Step 3: pre-modifiers.
        current = current.trim();
        if (CandidateFilter.patternPreMods.matcher(current).matches()) {
            log.debug("PREMODIFIER: {}", current);
            current = current.replaceFirst(CandidateFilter.PREMODS, "");
        }

        // Step 4: non-descriptives.
        current = current.trim();
        if (CandidateFilter.patternNonDesc.matcher(current).matches()) {
            log.debug("IS NONDESC: {}", current);
            current = current.replaceFirst(CandidateFilter.NON_DESCRIPTIVE, "");
            log.debug("NONDESC REMOVED: |{}|", current);
        }
        return current.trim();
    }

    /**
     * Removes the first occurrence of {@link CandidateFilter#UNSPECIFIEDS} from the term, provided that
     * {@link CandidateFilter#patternUnspecifieds} matches the whole term.
     *
     * @param normalizedSearchTerm the term to strip
     * @return the trimmed, possibly shortened term
     */
    public static String removeUnspecifieds(String normalizedSearchTerm) {
        Matcher matcher = CandidateFilter.patternUnspecifieds.matcher(normalizedSearchTerm);
        // Parameterized logging: no message is concatenated unless debug output is enabled.
        log.debug("TRYING to remove modifiers or even complete term: {}", normalizedSearchTerm);
        if (matcher.matches()) {
            log.debug("IS UNSPECIFIED: {}", normalizedSearchTerm);
            normalizedSearchTerm = normalizedSearchTerm.replaceFirst(CandidateFilter.UNSPECIFIEDS, "");
            log.debug("UNSPECIFIED REMOVED: |{}|", normalizedSearchTerm);
        }
        return normalizedSearchTerm.trim();
    }

    /**
     * Cuts every occurrence of capture group 2 of {@link CandidateFilter#patternNonDesc} out of the
     * term and strips leading and trailing whitespace (as defined by
     * {@link Character#isWhitespace(char)}) from what is left.
     *
     * @param normalizedSearchTerm the term to strip
     * @return the remaining text
     */
    public static String removeNondescriptives(String normalizedSearchTerm) {
        final Matcher nonDescMatcher = CandidateFilter.patternNonDesc.matcher(normalizedSearchTerm);
        final StringBuilder kept = new StringBuilder();
        int copyFrom = 0;
        // Each search starts where the previously removed group ended.
        while (nonDescMatcher.find(copyFrom)) {
            if (log.isDebugEnabled()) {
                log.debug("IS NONDESC: {} because of {}", normalizedSearchTerm, nonDescMatcher.group(2));
            }
            kept.append(normalizedSearchTerm, copyFrom, nonDescMatcher.start(2));
            if (log.isDebugEnabled()) {
                log.debug("NONDESC REMOVED: |{}|", nonDescMatcher.group(2));
            }
            copyFrom = nonDescMatcher.end(2);
        }
        kept.append(normalizedSearchTerm, copyFrom, normalizedSearchTerm.length());

        // Strip surrounding whitespace by narrowing a window instead of deleting characters.
        int begin = 0;
        int end = kept.length();
        while (begin < end && Character.isWhitespace(kept.charAt(begin))) {
            ++begin;
        }
        while (end > begin && Character.isWhitespace(kept.charAt(end - 1))) {
            --end;
        }
        return kept.substring(begin, end);
    }

    /**
     * Removes the first occurrence of {@link CandidateFilter#DOMAIN_FAMILIES} from the term, provided
     * that {@link CandidateFilter#patternDomainFamilies} matches the whole term.
     *
     * @param normalizedSearchTerm the term to strip
     * @return the trimmed, possibly shortened term
     */
    public static String removeDomainFamilies(String normalizedSearchTerm) {
        Matcher matcher = CandidateFilter.patternDomainFamilies.matcher(normalizedSearchTerm);
        if (matcher.matches()) {
            // Parameterized logging: no message is concatenated unless debug output is enabled.
            log.debug("IS DOMAIN: {}", normalizedSearchTerm);
            normalizedSearchTerm = normalizedSearchTerm.replaceFirst(CandidateFilter.DOMAIN_FAMILIES, "");
            log.debug("DOMAIN REMOVED: |{}|", normalizedSearchTerm);
        }
        return normalizedSearchTerm.trim();
    }

    /**
     * Removes the first occurrence of {@link CandidateFilter#PREMODS} from the term, provided that
     * {@link CandidateFilter#patternPreMods} matches the whole term.
     *
     * @param normalizedSearchTerm the term to strip
     * @return the trimmed, possibly shortened term
     */
    public static String removePremodifiers(String normalizedSearchTerm) {
        Matcher matcher = CandidateFilter.patternPreMods.matcher(normalizedSearchTerm);
        if (matcher.matches()) {
            // Parameterized logging: no message is concatenated unless debug output is enabled.
            log.debug("PREMODIFIER: {}", normalizedSearchTerm);
            normalizedSearchTerm = normalizedSearchTerm.replaceFirst(CandidateFilter.PREMODS, "");
        }
        return normalizedSearchTerm.trim();
    }

    /**
     * Runs the complete normalization pipeline on a term: stop word removal, Greek character
     * handling, special character removal, dictionary based decomposition, repeated number and
     * case-boundary splitting until nothing changes any more, Roman numeral replacement, splitting
     * away embedded character strings and lower-casing.
     *
     * @param term the raw term
     * @return the normalized tokens joined by single spaces
     */
    public String normalize(String term) {
        List<String> tokens = this.removeStopwords(term);
        tokens = this.splitAndNormalizeGreekCharacters(tokens);
        tokens = this.removeSpecialCharacters(tokens);
        tokens = this.decomposition(tokens);

        // Alternate the two split steps until a round yields a list equal to the one it started from.
        List<String> roundInput;
        do {
            roundInput = tokens;
            tokens = this.specialTokenSplit(this.splitAwayNumbers(tokens));
        } while (!tokens.equals(roundInput));

        tokens = this.replaceRomanNumbers(tokens);
        tokens = this.splitAwayCharacterStrings(tokens);
        tokens = this.toLowerCase(tokens);
        return this.ArrayList2String(tokens).trim();
    }

    /**
     * Splits a token into two parts when the dictionary matcher finds one match starting at the
     * beginning of the token and another match that starts where the first one ends and reaches to the
     * end of the token. Tokens without such a pair are passed through unchanged.
     * <p>
     * NOTE(review): when several prefix/suffix pairs cover the same token, the parts of all of them are
     * emitted one after another; confirm that this is intended.
     *
     * @param newTerm the tokens to decompose; not modified
     * @return a new list with the decomposed tokens
     */
    protected List<String> decomposition(List<String> newTerm) {
        List<String> ret = new ArrayList<>();
        for (String t : newTerm) {
            // Match spans as [start, end + 1], i.e. the maximum is one past the last matched index.
            LinkedHashSet<Range<Integer>> spans = new LinkedHashSet<>();
            specialistLexEmbeddedGreekAC.match(t, (start, end, entry) -> spans.add(Range.between(start, end + 1)));

            final int tokenLength = t.length();
            List<Range<Integer>> decomposition = new ArrayList<>();
            for (Range<Integer> prefix : spans) {
                if (prefix.getMinimum() != 0) {
                    continue;
                }
                for (Range<Integer> suffix : spans) {
                    if (prefix == suffix || suffix.isBefore(prefix.getMinimum())) {
                        continue;
                    }
                    // Compare the boxed bounds by value; '!=' on two Integer objects compares references.
                    boolean adjacent = prefix.getMaximum().equals(suffix.getMinimum());
                    if (adjacent && suffix.getMaximum() == tokenLength) {
                        decomposition.add(prefix);
                        decomposition.add(suffix);
                        break;
                    }
                }
            }

            if (decomposition.isEmpty()) {
                ret.add(t);
                continue;
            }
            for (Range<Integer> subrange : decomposition) {
                ret.add(t.substring(subrange.getMinimum(), subrange.getMaximum()));
            }
        }
        return ret;
    }

    /**
     * Surrounds every character matched by the Greek character pattern with spaces, splits each
     * token at whitespace and replaces the pieces that have an entry in the Greek normalization map
     * by the mapped value.
     *
     * @param newTerm the tokens to process; not modified
     * @return a new list with the resulting pieces
     */
    protected List<String> splitAndNormalizeGreekCharacters(List<String> newTerm) {
        final List<String> pieces = new ArrayList<>();
        final Matcher greekMatcher = this.greekCharPattern.matcher("");
        for (String token : newTerm) {
            String spaced = greekMatcher.reset(token).replaceAll(" $0 ");
            for (String piece : spaced.split("\\s+")) {
                pieces.add(greekCharacterNormalizationMap.getOrDefault(piece, piece));
            }
        }
        return pieces;
    }

    /**
     * Produces spelling variants of a term: one without hyphens between non-digit characters, one with
     * trailing Roman numerals split away, and two in which "alpha", "beta", "gamma" and "delta" are
     * abbreviated to their first letter (the second one also swallowing one preceding whitespace
     * character). Duplicates are removed while the order of first occurrence is kept.
     *
     * @param term the term to derive variants from
     * @return the distinct variants
     */
    public List<String> generateVariants(String term) {
        final LinkedHashSet<String> variants = new LinkedHashSet<>();

        variants.add(term.replaceAll("([^-0-9])\\-([^0-9])", "$1$2"));

        List<String> whitespaceTokens = Arrays.asList(term.split("\\s+"));
        variants.add(String.join(" ", this.splitAwayRomanNumbers(whitespaceTokens)));

        variants.add(term.replaceAll("alpha", "a")
                .replaceAll("beta", "b")
                .replaceAll("gamma", "g")
                .replaceAll("delta", "d"));

        variants.add(term.replaceAll("\\s?alpha", "a")
                .replaceAll("\\s?beta", "b")
                .replaceAll("\\s?gamma", "g")
                .replaceAll("\\s?delta", "d"));

        return new ArrayList<>(variants);
    }

    /**
     * Joins a trailing single-character specifier to the rest of the term by dropping the punctuation
     * character in front of it, e.g. {@code "IL-2"} becomes {@code "IL2"}.
     * <p>
     * The previous implementation only cut off the last character before appending the specifier
     * again. That returned such terms unchanged, duplicated the specifier for terms ending in
     * punctuation and threw an {@link ArrayIndexOutOfBoundsException} for terms consisting solely of
     * punctuation.
     *
     * @param term the term to process
     * @return the term without the punctuation before a trailing letter or digit, or the unchanged
     *         term if it does not end in such a specifier
     */
    public String concatenateTrailingSpecifier(String term) {
        final int length = term.length();
        // Needs at least one leading character, the punctuation and the specifier.
        if (length < 3) {
            return term;
        }
        final char specifier = term.charAt(length - 1);
        final char separator = term.charAt(length - 2);
        if (Character.isLetterOrDigit(specifier) && isPunctuation(separator)) {
            return term.substring(0, length - 2) + specifier;
        }
        return term;
    }

    /** Tells whether the character belongs to one of the Unicode punctuation categories (regex {@code \p{P}}). */
    private static boolean isPunctuation(char c) {
        switch (Character.getType(c)) {
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Stems every whitespace separated token of the term with the instance's Snowball stemmer.
     *
     * @param normalizedTerm the term whose tokens are stemmed
     * @return the stemmed tokens joined by single spaces
     * @throws IOException declared for callers; not thrown by the visible code
     */
    public String stemNameTokens(String normalizedTerm) throws IOException {
        final String[] tokens = normalizedTerm.split("\\s+");
        final ArrayList<String> stems = new ArrayList<String>(tokens.length);
        for (int idx = 0; idx < tokens.length; idx++) {
            this.stemmer.setCurrent(tokens[idx]);
            this.stemmer.stem();
            String stem = this.stemmer.getCurrent();
            stems.add(stem);
        }
        return StringUtils.join(stems, " ");
    }

    /**
     * Collects Roman numerals from the whitespace separated tokens of a name. A numeral is reported
     * when it ends its token without starting it, or, for the last token only, when it is followed by
     * exactly one upper case character that ends the token.
     *
     * @param normalizedName the name to inspect
     * @return the numerals found, in order of occurrence
     */
    public Stream<String> getRomanNumbers(String normalizedName) {
        final Stream.Builder<String> found = Stream.builder();
        final String[] tokens = normalizedName.split("\\s+");
        final int lastIndex = tokens.length - 1;
        int index = 0;
        for (String token : tokens) {
            final boolean isLastToken = index++ == lastIndex;
            final Matcher numeral = ROMAN_NUMBERS_PATTERN.matcher(token);
            while (numeral.find()) {
                boolean isProperSuffix = numeral.start() != 0 && numeral.end() == token.length();
                if (isProperSuffix) {
                    found.accept(numeral.group());
                    continue;
                }
                boolean beforeFinalUpperCase = token.length() == numeral.end() + 1
                        && Character.isUpperCase(token.charAt(token.length() - 1));
                if (isLastToken && beforeFinalUpperCase) {
                    found.accept(numeral.group());
                }
            }
        }
        return found.build();
    }

    /**
     * Returns every match of the Greek letter name matcher in the given name.
     *
     * @param normalizedName the name to search
     * @return the matched strings in the order the matcher reports them
     */
    public Stream<String> getGreekCharacters(String normalizedName) {
        // Typed builder instead of the raw one, so no unchecked conversion is needed on return.
        Stream.Builder<String> builder = Stream.builder();
        this.greekAC.match(normalizedName, (start, end, match) -> builder.accept(match));
        return builder.build();
    }

    /**
     * Lazily finds, in every whitespace separated token of the name, the substrings that consist of
     * letters directly followed by digits.
     *
     * @param normalizedName the name to search
     * @return the matching substrings in order of occurrence
     */
    public Stream<String> getNumbers(String normalizedName) {
        // One matcher is reused for all tokens of the returned (sequential) stream.
        final Matcher specifier = NUMBER_SPECIFIER_PATTERN.matcher("");
        return Stream.of(normalizedName)
                .flatMap(name -> Arrays.stream(name.split("\\s+")))
                .flatMap(token -> {
                    List<String> hits = new ArrayList<>();
                    specifier.reset(token);
                    while (specifier.find()) {
                        hits.add(specifier.group());
                    }
                    return hits.stream();
                });
    }

    /**
     * Normalizes a tab separated file. Every input line must have two or three columns; the first
     * column is the term. For each line the normalized term and the normalized variants of the term
     * are written to the output file together with the unchanged remaining columns. Lines with
     * another number of columns are counted and skipped, duplicate output lines are written only once
     * and the output order is not defined because the lines are processed in parallel.
     * <p>
     * I/O errors are printed to standard error and do not propagate to the caller.
     *
     * @param inputFile  the file to read
     * @param outputFile the file to write
     */
    public void normalizeFile(File inputFile, File outputFile) {
        System.out.println("Normalizing file " + inputFile.getAbsolutePath() + " and writing the result to " + outputFile.getAbsolutePath());
        AtomicInteger ignoredLines = new AtomicInteger(0);
        try (BufferedReader br = new BufferedReader(new FileReader(inputFile));
             FileWriter fileOut = new FileWriter(outputFile)) {
            br.lines().parallel()
                    .map(line -> line.split("\t"))
                    .filter(columns -> {
                        if (columns.length != 2 && columns.length != 3) {
                            ignoredLines.incrementAndGet();
                            System.err.println("wrong line format, ignoring line: " + Arrays.toString(columns));
                            return false;
                        }
                        return true;
                    })
                    .flatMap(this::buildOutputLines)
                    .unordered()
                    .distinct()
                    .forEach(line -> {
                        try {
                            // The writer is shared by all worker threads of the parallel stream.
                            synchronized (fileOut) {
                                fileOut.write(line);
                            }
                        } catch (IOException e) {
                            System.err.println("Could not write line: " + line);
                            e.printStackTrace();
                        }
                    });
        } catch (IOException io) {
            io.printStackTrace();
        }
        System.out.println("\n\n\ndone");
        System.out.println("number of ignored lines (due to wrong format): " + ignoredLines);
    }

    /**
     * Builds the output lines for one input line: the normalized term followed by its normalized
     * variants, each with the remaining columns appended. Yields nothing when the term normalizes to
     * the empty string.
     */
    private Stream<String> buildOutputLines(String[] columns) {
        String normalizedSyn = this.normalize(columns[0]);
        if (normalizedSyn.isEmpty()) {
            return Stream.empty();
        }
        // The filter in normalizeFile guarantees two or three columns.
        String remainder = columns.length == 3
                ? "\t" + columns[1] + "\t" + columns[2] + "\n"
                : "\t" + columns[1] + "\n";
        Stream.Builder<String> toWrite = Stream.builder();
        toWrite.accept(normalizedSyn + remainder);
        for (String variant : this.generateVariants(columns[0])) {
            toWrite.accept(this.normalize(variant) + remainder);
        }
        return toWrite.build();
    }

    /**
     * Inserts spaces at the case and letter/digit boundaries described by the token split pattern and
     * then breaks every element apart at those spaces.
     * <p>
     * The elements of the given list are overwritten with their space separated form; the returned
     * list is a new one.
     *
     * @param newTerm the tokens to split; its elements are replaced in place
     * @return a new list containing the individual parts, without empty elements of the input
     */
    public List<String> specialTokenSplit(List<String> newTerm) {
        for (int i = 0; i < newTerm.size(); ++i) {
            String current = newTerm.get(i);
            // Keep splitting until the pattern does not find another boundary.
            String split;
            while ((split = this.splitOnce(current)) != null) {
                current = split;
            }
            newTerm.set(i, current);
        }
        List<String> finalTerms = new ArrayList<>();
        for (String token : newTerm) {
            if (token.isEmpty()) {
                continue;
            }
            finalTerms.addAll(Arrays.asList(token.split(" ")));
        }
        return finalTerms;
    }

    /**
     * Inserts one space at the boundary found by the token split pattern.
     *
     * @return the token with the space inserted, or {@code null} if the pattern does not match
     */
    private String splitOnce(String token) {
        Matcher m = this.tokenSplitPattern.matcher(token);
        if (!m.matches()) {
            return null;
        }
        // The pattern has four alternatives with two groups each: (1,2), (3,4), (5,6), (7,8).
        for (int group = 1; group <= 7; group += 2) {
            if (m.group(group) != null && m.group(group + 1) != null) {
                return m.group(group) + " " + m.group(group + 1);
            }
        }
        return null;
    }

    /**
     * @return the matcher built over the spelled-out Greek letter names in {@link #GREEK}
     */
    public AhoCorasickOptimized getGreekAC() {
        return this.greekAC;
    }

    /**
     * @return the matcher built over the names in {@link #GREEK} plus "high", "low" and "kinase"
     */
    public AhoCorasickOptimized getGreekHighLowKinaseAC() {
        return this.greekHighLowKinaseAC;
    }

    /**
     * Splits Greek letter names and the words "high", "low" and "kinase" away from the tokens they
     * are embedded in, unless the whole token is a match of the specialist dictionary matcher or the
     * embedded word lies inside a larger dictionary match.
     * <p>
     * The list is modified in place: a split token is replaced by its lower-cased parts.
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    protected List<String> splitAwayCharacterStrings(List<String> term) {
        AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
        for (int i = 0; i < term.size(); ++i) {
            callback.clear();
            String currentPart = term.get(i).toLowerCase();
            specialistLexEmbeddedGreekAC.match(currentPart, callback);
            TreeMap<Range<Integer>, String> specialistDictMatches = callback.getLongestMatches();
            // The whole token is a dictionary entry: leave it alone.
            if (specialistDictMatches.containsValue(currentPart)) continue;
            callback.clear();
            this.greekHighLowKinaseAC.match(currentPart, callback);
            TreeMap<Range<Integer>, String> longestMatches = callback.getLongestMatches();
            // Nothing embedded, or the token itself is the only match: nothing to split.
            if (longestMatches.isEmpty() || longestMatches.size() == 1 && longestMatches.firstEntry().getValue().equals(currentPart)) continue;
            int currentPos = 0;
            // NOTE(review): this flag is never reset inside the loop below, so once one match lies within a
            // dictionary match all following matches of the token are skipped as well - confirm intent.
            boolean isWithinSpecialistMatch = false;
            for (Range<Integer> match : longestMatches.keySet()) {
                for (Range<Integer> specialistMatchSpan : specialistDictMatches.keySet()) {
                    // Only a dictionary span that contains the match and is not itself contained in it counts.
                    if (!specialistMatchSpan.containsRange(match) || specialistMatchSpan.getMinimum() >= match.getMinimum() && specialistMatchSpan.getMaximum() <= match.getMaximum()) continue;
                    isWithinSpecialistMatch = true;
                }
                if (isWithinSpecialistMatch) continue;
                Range<Integer> textBeforeMatch = Range.between(currentPos, match.getMinimum());
                if (currentPos == 0) {
                    // First match of this token: overwrite the element at i.
                    if (textBeforeMatch.getMaximum() > 0) {
                        term.set(i, currentPart.substring(textBeforeMatch.getMinimum(), textBeforeMatch.getMaximum()));
                        term.add(++i, longestMatches.get(match));
                        ++i;
                    } else {
                        term.set(i, longestMatches.get(match));
                        ++i;
                    }
                } else {
                    // Later matches: insert the text between the matches (if any) and the match itself.
                    if (textBeforeMatch.getMaximum() > textBeforeMatch.getMinimum()) {
                        term.add(i, currentPart.substring(textBeforeMatch.getMinimum(), textBeforeMatch.getMaximum()));
                        ++i;
                    }
                    term.add(i, longestMatches.get(match));
                    ++i;
                }
                // Presumably the callback reports inclusive end offsets, hence the + 1 - TODO confirm.
                currentPos = match.getMaximum() + 1;
            }
            // NOTE(review): with 'length() - 1' a single remaining character after the last match is not
            // added; also, when nothing is added here, i already points at the next original element and
            // the loop increment steps over it - verify both against the intended behavior.
            if (isWithinSpecialistMatch || currentPos >= currentPart.length() - 1) continue;
            term.add(i, currentPart.substring(currentPos));
        }
        return term;
    }

    /**
     * Expands the short forms "L"/"l" to "ligand" and "R"/"r" to "receptor" for tokens matched by the
     * short form pattern. A matching token is replaced by its base (possibly empty) and the expansion
     * is inserted directly behind it.
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    private List<String> replaceShortForms(List<String> term) {
        int pos = 0;
        while (pos < term.size()) {
            final Matcher shortForm = this.shortFormPattern.matcher(term.get(pos));
            if (!shortForm.matches()) {
                pos++;
                continue;
            }
            String base = "";
            String substitute = "";
            final String upperMarker = shortForm.group(3);
            final String lowerMarker = shortForm.group(5);
            if (upperMarker != null) {
                // "<base>L" or "<base>R"
                base = shortForm.group(2);
                substitute = upperMarker.equals("L") ? "ligand"
                        : upperMarker.equals("R") ? "receptor" : upperMarker;
            } else if (lowerMarker != null) {
                // "<base ending in a digit>l" or "...r"
                base = shortForm.group(4);
                substitute = lowerMarker.equals("l") ? "ligand"
                        : lowerMarker.equals("r") ? "receptor" : lowerMarker;
            } else if (shortForm.group(6) != null) {
                // The token consists of the marker only.
                final String whole = shortForm.group(1);
                if (whole.equalsIgnoreCase("l")) {
                    substitute = "ligand";
                } else if (whole.equalsIgnoreCase("r")) {
                    substitute = "receptor";
                }
            }
            term.set(pos, base);
            term.add(pos + 1, substitute);
            // Skip the base and the expansion that was just inserted.
            pos += 2;
        }
        return term;
    }

    /**
     * Expands a short form at the end of a term. "ra", "rb", "rg" and "bp" (optionally followed by a
     * space and digits) become "receptor alpha", "receptor beta", "receptor gamma" and
     * "binding protein"; a final "a" or "b" becomes "alpha" or "beta". The short form must be preceded
     * by a space.
     *
     * @param term the term to process
     * @return the term with the trailing short form expanded, or the unchanged term
     */
    private String replaceShortFormsAtEnd(String term) {
        Matcher withNumber = this.shortFormEndWithNumberPattern.matcher(term);
        if (withNumber.matches()) {
            // Group 2 can only be one of the four alternatives of the pattern; the former "a"/"b"
            // branches at this point were unreachable and have been removed.
            String replacement;
            switch (withNumber.group(2)) {
                case "ra":
                    replacement = "receptor alpha";
                    break;
                case "rb":
                    replacement = "receptor beta";
                    break;
                case "rg":
                    replacement = "receptor gamma";
                    break;
                case "bp":
                    replacement = "binding protein";
                    break;
                default:
                    replacement = "";
                    break;
            }
            if (!replacement.isEmpty()) {
                String number = withNumber.group(3) != null ? withNumber.group(3) : "";
                return withNumber.group(1) + replacement + number;
            }
        }
        Matcher noNumber = this.shortFormEndNoNumberPattern.matcher(term);
        if (noNumber.matches()) {
            String suffix = noNumber.group(2);
            if (suffix.equals("a")) {
                return noNumber.group(1) + "alpha";
            }
            if (suffix.equals("b")) {
                return noNumber.group(1) + "beta";
            }
        }
        return term;
    }

    /**
     * Replaces the tokens "il" and "IL" by "interleukin".
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    private List<String> replaceKnownAcronyms(List<String> term) {
        int index = 0;
        while (index < term.size()) {
            String token = term.get(index);
            boolean isInterleukinAcronym = token.equals("il") || token.equals("IL");
            if (isInterleukinAcronym) {
                term.set(index, "interleukin");
            }
            index++;
        }
        return term;
    }

    /**
     * Splits tokens that consist of letters directly followed by digits into the letter part and the
     * digit part.
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    public List<String> splitAwayNumbers(List<String> term) {
        int pos = 0;
        while (pos < term.size()) {
            final Matcher lettersThenDigits = NUMBER_SPECIFIER_PATTERN.matcher(term.get(pos));
            if (lettersThenDigits.matches()) {
                term.set(pos, lettersThenDigits.group(1));
                term.add(pos + 1, lettersThenDigits.group(2));
                // Step over the digit part that was just inserted.
                pos += 2;
            } else {
                pos += 1;
            }
        }
        return term;
    }

    /**
     * Splits Roman numerals away from the tokens they are attached to. A numeral that ends its token
     * (without starting it) becomes a token of its own; for the last token, a numeral followed by
     * exactly one upper case character is split into prefix, numeral and that character.
     *
     * @param term the tokens to process; not modified
     * @return a new list with the split tokens
     */
    public List<String> splitAwayRomanNumbers(List<String> term) {
        final ArrayList<String> result = new ArrayList<String>(term);
        for (int i = 0; i < result.size(); ++i) {
            final String token = result.get(i);
            final int tokenLength = token.length();
            final Matcher numeral = ROMAN_NUMBERS_PATTERN.matcher(token);
            while (numeral.find()) {
                final String prefix = token.substring(0, numeral.start());
                boolean isProperSuffix = numeral.start() != 0 && numeral.end() == tokenLength;
                if (isProperSuffix) {
                    result.set(i, prefix);
                    i++;
                    result.add(i, numeral.group());
                    continue;
                }
                // Evaluated against the current index, which earlier insertions may have moved.
                boolean isLastToken = i == result.size() - 1;
                boolean beforeFinalUpperCase = tokenLength == numeral.end() + 1
                        && Character.isUpperCase(token.charAt(tokenLength - 1));
                if (isLastToken && beforeFinalUpperCase) {
                    result.set(i, prefix);
                    i++;
                    result.add(i, numeral.group());
                    i++;
                    result.add(i, String.valueOf(token.charAt(tokenLength - 1)));
                }
            }
        }
        return result;
    }

    /**
     * Replaces the tokens "I", "II", "III" and "IV" by "1", "2", "3" and "4". Lists with fewer than
     * two tokens are left untouched.
     *
     * @param synonym the tokens to process; modified in place
     * @return the same list instance
     */
    protected List<String> replaceRomanNumbers(List<String> synonym) {
        if (synonym.size() <= 1) {
            return synonym;
        }
        for (int pos = 0; pos < synonym.size(); pos++) {
            switch (synonym.get(pos)) {
                case "I":
                    synonym.set(pos, "1");
                    break;
                case "II":
                    synonym.set(pos, "2");
                    break;
                case "III":
                    synonym.set(pos, "3");
                    break;
                case "IV":
                    synonym.set(pos, "4");
                    break;
                default:
                    break;
            }
        }
        return synonym;
    }

    /**
     * Replaces known plural tokens by their singular form.
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    private List<String> transformPlurals(List<String> term) {
        // The plural map is not set up by the constructor; create it on first use instead of
        // failing with a NullPointerException.
        if (this.plurals == null) {
            this.initPlurals();
        }
        for (int i = 0; i < term.size(); ++i) {
            String singular = this.plurals.get(term.get(i));
            if (singular != null) {
                term.set(i, singular);
            }
        }
        return term;
    }

    /**
     * Trims and lower-cases every token.
     * <p>
     * Lower-casing uses {@link java.util.Locale#ROOT} so that the result does not depend on the
     * default locale of the JVM (under a Turkish locale, for instance, "IL" would otherwise become
     * "ıl" with a dotless i).
     *
     * @param term the tokens to process; modified in place
     * @return the same list instance
     */
    protected List<String> toLowerCase(List<String> term) {
        term.replaceAll(token -> token.trim().toLowerCase(java.util.Locale.ROOT));
        return term;
    }

    /**
     * Replaces every character that is neither an ASCII letter, an ASCII digit nor a dot by a space,
     * replaces one dot standing between a letter and a letter or digit by a space, collapses runs of
     * spaces and splits the result at the spaces.
     *
     * @param term the tokens to clean; not modified
     * @return a new list with the cleaned, non-empty parts
     */
    protected List<String> removeSpecialCharacters(List<String> term) {
        final List<String> cleaned = new ArrayList<>();
        for (String rawToken : term) {
            String token = rawToken.replaceAll("[\\W_&&[^\\.]]", " ");
            final Matcher dot = this.dotRemovalPattern.matcher(token);
            if (dot.matches()) {
                // The pattern covers the whole token, so the token can be rebuilt from its groups.
                token = dot.group(1) + dot.group(2) + " " + dot.group(3) + dot.group(4);
            }
            token = token.replaceAll("[ ]+", " ").trim();
            if (token.isEmpty()) {
                continue;
            }
            for (String part : token.split(" ")) {
                cleaned.add(part);
            }
        }
        return cleaned;
    }

    /**
     * Replaces every hyphen in every token by a space. Despite the method name, dots are left as
     * they are.
     *
     * @param term the tokens to process; not modified
     * @return a new list with the processed tokens
     */
    private List<String> removeDotAndHyphen(List<String> term) {
        final List<String> withoutHyphens = new ArrayList<>();
        for (int idx = 0; idx < term.size(); idx++) {
            withoutHyphens.add(term.get(idx).replace('-', ' '));
        }
        return withoutHyphens;
    }

    /**
     * Splits the term at single spaces and drops the stop words. A term consisting of a single token
     * is returned as is, even when that token is a stop word.
     *
     * @param term the term to tokenize
     * @return a new list with the remaining tokens
     */
    protected List<String> removeStopwords(String term) {
        final String[] tokens = term.split(" ");
        final List<String> kept = new ArrayList<>(tokens.length);
        if (tokens.length == 1) {
            kept.add(tokens[0]);
            return kept;
        }
        for (String token : tokens) {
            if (!stopwords.contains(token)) {
                kept.add(token);
            }
        }
        return kept;
    }

    /**
     * Splits the term at single spaces, drops every token contained in the non-descriptive set and
     * joins the rest with single spaces.
     *
     * @param term the term to filter
     * @return the term without non-descriptive tokens
     */
    public String removeNonDescriptives(String term) {
        final String[] tokens = term.split(" ");
        final List<String> descriptive = new ArrayList<>(tokens.length);
        for (String token : tokens) {
            if (!nonDescriptives.contains(token)) {
                descriptive.add(token);
            }
        }
        return this.ArrayList2String(descriptive);
    }

    /**
     * Checks whether the given string is an exact member of the non-descriptive set.
     *
     * @param term the string to look up
     * @return {@code true} if the set contains the string
     */
    public boolean isNonDescriptive(String term) {
        return nonDescriptives.contains(term);
    }

    /**
     * Creates the shared stop word set on first use.
     * <p>
     * The set is static, so the lock is taken on the class rather than on the instance, and the set is
     * only published after it has been filled completely. With the former instance-level lock two
     * normalizers constructed concurrently could observe a partially filled set.
     */
    private void initStopwords() {
        synchronized (TermNormalizer.class) {
            if (stopwords != null) {
                return;
            }
            TreeSet<String> words = new TreeSet<>();
            words.add("of");
            words.add("for");
            words.add("and");
            words.add("or");
            words.add("the");
            stopwords = words;
        }
    }

    /**
     * Sets up the plural to singular lookup of this instance.
     */
    private void initPlurals() {
        final String[][] pluralToSingular = {
                {"receptors", "receptor"},
                {"proteins", "protein"},
                {"factors", "factor"},
                {"ligands", "ligand"},
                {"chains", "chain"},
                {"antigens", "antigen"},
                {"genes", "gene"},
                {"transcripts", "transcript"},
        };
        final HashMap<String, String> lookup = new HashMap<>();
        for (String[] pair : pluralToSingular) {
            lookup.put(pair[0], pair[1]);
        }
        this.plurals = lookup;
    }

    /**
     * Loads the shared set of non-descriptive words from the resource {@code /non_descriptives}
     * (one trimmed line per entry) on first use.
     * <p>
     * The set is static, so the lock is taken on the class rather than on the instance. The set is
     * only published after the resource has been read completely; a failed load therefore no longer
     * leaves an empty set behind that later instances would silently reuse. The reader is closed even
     * when reading fails.
     *
     * @throws IllegalStateException if the resource cannot be found or read
     */
    private void initNonDescriptives() {
        synchronized (TermNormalizer.class) {
            if (nonDescriptives != null) {
                return;
            }
            TreeSet<String> loaded = new TreeSet<>();
            try (InputStream in = FileUtilities.findResource("/non_descriptives")) {
                // Guard in case the resource lookup yields null instead of throwing.
                if (in == null) {
                    throw new IllegalStateException("Could not find the resource /non_descriptives");
                }
                try (BufferedReader nonDescReader = new BufferedReader(new InputStreamReader(in))) {
                    String line;
                    while ((line = nonDescReader.readLine()) != null) {
                        loaded.add(line.trim());
                    }
                }
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            nonDescriptives = loaded;
        }
    }

    /**
     * Joins the tokens with single spaces and trims the result.
     *
     * @param term the tokens to join
     * @return the joined and trimmed string; empty for an empty list
     */
    protected String ArrayList2String(List<String> term) {
        return String.join(" ", term).trim();
    }

    // Loads the dictionary matcher and fills the Greek letter tables and the shared patterns.
    static {
        // try-with-resources closes the resource reader, which was previously left open.
        try (BufferedReader reader = IOStreamUtilities.getReaderFromInputStream(FileUtilities.findResource("specialistLexiconEmbeddedGreekLowHigh.txt.gz"))) {
            // Explicit ArrayList: the list is extended below and Collectors.toList() does not
            // guarantee a mutable result.
            List<String> specialistTerms = reader.lines().collect(Collectors.toCollection(ArrayList::new));
            specialistTerms.add("kinase");
            specialistTerms.add("kinases");
            specialistLexEmbeddedGreekAC = new AhoCorasickOptimized(specialistTerms);
        }
        catch (IOException e) {
            // Fall back to an empty dictionary; the exception is passed on to the logger.
            log.error("Could not find the SPECIALIST Lexicon entries filtered for embedded mentions of greek symbols or the words 'low' and 'high' on the classpath as 'specialistLexiconEmbeddedGreekLowHigh.txt.gz'. Term normalization will split words containing greek symbols by coincidence.", e);
            specialistLexEmbeddedGreekAC = new AhoCorasickOptimized(new String[0]);
        }
        greekCharacterNormalizationMap = new HashMap<String, String>();
        greekCharacterNormalizationMap.put("\u03b1", "alpha");
        greekCharacterNormalizationMap.put("\u03b2", "beta");
        greekCharacterNormalizationMap.put("\u03b3", "gamma");
        greekCharacterNormalizationMap.put("\u03b4", "delta");
        greekCharacterNormalizationMap.put("\u03b5", "epsilon");
        greekCharacterNormalizationMap.put("\u03b6", "zeta");
        greekCharacterNormalizationMap.put("\u03b7", "eta");
        greekCharacterNormalizationMap.put("\u03b8", "theta");
        greekCharacterNormalizationMap.put("\u03b9", "iota");
        greekCharacterNormalizationMap.put("\u03ba", "kappa");
        // U+03BB is lambda; it used to be mapped to "delta".
        greekCharacterNormalizationMap.put("\u03bb", "lambda");
        greekCharacterNormalizationMap.put("\u03bc", "mu");
        greekCharacterNormalizationMap.put("\u03bd", "nu");
        greekCharacterNormalizationMap.put("\u03be", "xi");
        greekCharacterNormalizationMap.put("\u03bf", "omicron");
        greekCharacterNormalizationMap.put("\u03c0", "pi");
        greekCharacterNormalizationMap.put("\u03c1", "rho");
        greekCharacterNormalizationMap.put("\u03c3", "sigma");
        greekCharacterNormalizationMap.put("\u03c4", "tau");
        greekCharacterNormalizationMap.put("\u03c5", "upsilon");
        greekCharacterNormalizationMap.put("\u03c6", "phi");
        greekCharacterNormalizationMap.put("\u03c7", "chi");
        // U+03C8 is psi; it used to be mapped to "omega". Omega itself is U+03C9.
        greekCharacterNormalizationMap.put("\u03c8", "psi");
        greekCharacterNormalizationMap.put("\u03c9", "omega");
        GREEK = new String[]{"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"};
        GREEK_REGEX = "(" + Stream.of(GREEK).collect(Collectors.joining("|")) + ")";
        // Reverse natural order puts longer numerals before their own prefixes in the alternation.
        LAT_NUM_REGEX = "(" + Stream.of("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX").sorted(Comparator.reverseOrder()).collect(Collectors.joining("|")) + ")";
        ROMAN_NUMBERS_PATTERN = Pattern.compile(LAT_NUM_REGEX);
        NUMBER_SPECIFIER_PATTERN = Pattern.compile(NUMBERPATTERN);
    }
}

