/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jules.ae.genemapping.utils.norm;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.jules.ae.genemapping.AhoCorasickLongestMatchCallback;
import de.julielab.jules.ae.genemapping.CandidateFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.CallSite;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.Range;
import org.apache.commons.lang3.StringUtils;
import org.tartarus.snowball.SnowballProgram;

/**
 * Normalizes term strings into a lower-cased, space-separated token form.
 *
 * <p>{@link #normalize(String)} removes stopwords and special characters, splits tokens at
 * letter/digit and case boundaries, separates embedded character strings known to the
 * Aho-Corasick matcher (the entries of {@code CandidateFilter.GREEK} plus "high" and "low"),
 * replaces the Roman numerals I-IV by Arabic digits and lower-cases the result.
 *
 * <p>The class can also be run from the command line to normalize a tab-separated file, see
 * {@link #main(String[])} and {@link #normalizeFile(File, File)}.
 *
 * <p>NOTE(review): {@link #stemNameTokens(String)} uses one shared stemmer instance and is
 * therefore presumably not safe for concurrent calls - confirm before using it from several
 * threads.
 */
public class TermNormalizer {
    private static final String NON_DESCRIPTIVES_FILE = "/non_descriptives";
    private static final String NUMBERPATTERN = "([A-Za-z]+)([0-9]+)";
    private static final String SHORTFORMPATTERN = "((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))";
    private static final String SHORTFORMEND_WITH_NUMBER_PATTERN = "(.* )(ra|rb|rg|bp)( [0-9]*)?";
    private static final String SHORTFORMEND_NO_NUMBER_PATTERN = "(.* )(a|b)";
    private static final String TOKENSPLITPATTERN = "(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)";
    private static final String DOTREMOVAL = "(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)";
    private static final String STEMMER_CLASS = "org.tartarus.snowball.ext.EnglishStemmer";

    private final Set<String> nonDescriptives = new TreeSet<>();
    private final Set<String> stopwords = new TreeSet<>();
    private final Map<String, String> plurals = new HashMap<>();
    // Compiled patterns are immutable and may be shared; matchers are created per call.
    private final Pattern numberPattern = Pattern.compile(NUMBERPATTERN);
    private final Pattern shortFormPattern = Pattern.compile(SHORTFORMPATTERN);
    private final Pattern shortFormEndWithNumberPattern = Pattern.compile(SHORTFORMEND_WITH_NUMBER_PATTERN);
    private final Pattern shortFormEndNoNumberPattern = Pattern.compile(SHORTFORMEND_NO_NUMBER_PATTERN);
    private final Pattern tokenSplitPattern = Pattern.compile(TOKENSPLITPATTERN);
    private final Pattern dotRemovalPattern = Pattern.compile(DOTREMOVAL);
    private final Pattern romanNumberPattern = Pattern.compile(CandidateFilter.LAT_NUM_REGEX);
    private final AhoCorasickOptimized greekAC;
    private final SnowballProgram stemmer;

    /**
     * Creates a normalizer: builds the Aho-Corasick matcher, fills the stopword, plural and
     * non-descriptive collections and instantiates the English Snowball stemmer.
     *
     * @throws IllegalStateException if the {@code /non_descriptives} classpath resource is missing
     * @throws RuntimeException if the stemmer class cannot be loaded or instantiated
     */
    public TermNormalizer() {
        // toCollection guarantees a mutable list; Collectors.toList() makes no such promise.
        List<String> patterns = Arrays.stream(CandidateFilter.GREEK).collect(Collectors.toCollection(ArrayList::new));
        patterns.add("high");
        patterns.add("low");
        this.greekAC = new AhoCorasickOptimized(patterns);
        this.initStopwords();
        this.initPlurals();
        this.initNonDescriptives();
        this.stemmer = createStemmer();
    }

    /**
     * Command line entry point. Expects exactly two arguments, the input file and the output
     * file; otherwise prints a usage message and exits with status -1.
     *
     * @param args {@code <inputFile> <outputFile>}
     * @throws IOException declared for compatibility
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 2) {
            File unnormalizedFile = new File(args[0]);
            File outputFile = new File(args[1]);
            new TermNormalizer().normalizeFile(unnormalizedFile, outputFile);
        } else {
            System.err.println("usage:\nTermNormalizer <inputFile> <outputFile>");
            System.exit(-1);
        }
    }

    /**
     * Normalizes a single term.
     *
     * @param term the raw term
     * @return the normalized term: lower-cased tokens joined by single spaces, trimmed
     */
    public String normalize(String term) {
        List<String> tokens = this.removeSpecialCharacters(this.removeStopwords(term));
        // Alternate number splitting and boundary splitting until the boundary split
        // no longer changes anything.
        boolean changed;
        do {
            List<String> numbersSplit = this.splitAwayNumbers(tokens);
            tokens = this.specialTokenSplit(numbersSplit);
            changed = !tokens.equals(numbersSplit);
        } while (changed);
        tokens = this.splitAwayCharacterStrings(tokens);
        tokens = this.replaceRomanNumbers(tokens);
        tokens = this.toLowerCase(tokens);
        return this.joinTokens(tokens);
    }

    /**
     * Generates four spelling variants of a term, in this order: (1) hyphens between non-digit
     * characters removed, (2) trailing Roman numerals split off whitespace-separated tokens,
     * (3) "alpha", "beta", "gamma", "delta" abbreviated to their first letter, (4) like (3) but
     * also removing one optional whitespace character directly before the word.
     * Variants may be equal to the input or to each other.
     *
     * @param term the term to create variants for
     * @return a list of exactly four variants
     */
    public List<String> generateVariants(String term) {
        List<String> ret = new ArrayList<>(4);
        ret.add(term.replaceAll("([^-0-9])\\-([^0-9])", "$1$2"));
        ret.add(String.join(" ", this.splitAwayRomanNumbers(Arrays.asList(term.split("\\s+")))));
        String variant = term.replaceAll("alpha", "a");
        variant = variant.replaceAll("beta", "b");
        variant = variant.replaceAll("gamma", "g");
        variant = variant.replaceAll("delta", "d");
        ret.add(variant);
        variant = term.replaceAll("\\s?alpha", "a");
        variant = variant.replaceAll("\\s?beta", "b");
        variant = variant.replaceAll("\\s?gamma", "g");
        variant = variant.replaceAll("\\s?delta", "d");
        ret.add(variant);
        return ret;
    }

    /**
     * Stems every whitespace-separated token of the given term with the Snowball stemmer.
     *
     * @param normalizedTerm the term whose tokens should be stemmed
     * @return the stemmed tokens joined by single spaces
     * @throws IOException declared for compatibility with existing callers
     */
    public String stemNameTokens(String normalizedTerm) throws IOException {
        String[] split = normalizedTerm.split("\\s+");
        List<String> stemmedTokens = new ArrayList<>(split.length);
        for (String token : split) {
            this.stemmer.setCurrent(token);
            this.stemmer.stem();
            stemmedTokens.add(this.stemmer.getCurrent());
        }
        return String.join(" ", stemmedTokens);
    }

    /**
     * Normalizes a tab-separated file. Each input line must have exactly three columns; other
     * lines are counted, reported on stderr and skipped. For every valid line whose first
     * column normalizes to a non-empty string, the normalized term and the normalized forms of
     * its {@link #generateVariants(String) variants} are written, each followed by the unchanged
     * second and third column. Lines are processed in parallel, de-duplicated and written in no
     * particular order. I/O errors are printed to stderr and not rethrown.
     *
     * @param inputFile the file to read
     * @param outputFile the file to write
     */
    public void normalizeFile(File inputFile, File outputFile) {
        System.out.println("Normalizing file " + inputFile.getAbsolutePath() + " and writing the result to " + outputFile.getAbsolutePath());
        AtomicInteger ignoredLines = new AtomicInteger(0);
        try (BufferedReader br = new BufferedReader(new FileReader(inputFile));
             FileWriter fileOut = new FileWriter(outputFile)) {
            br.lines().parallel()
                    .map(line -> line.split("\t"))
                    .filter(split -> {
                        if (split.length != 3) {
                            ignoredLines.incrementAndGet();
                            System.err.println("wrong line format, ignoring line: " + Arrays.toString(split));
                            return false;
                        }
                        return true;
                    })
                    .flatMap(this::toOutputLines)
                    .unordered()
                    .distinct()
                    .forEach(line -> writeLine(fileOut, line));
        }
        catch (IOException io) {
            io.printStackTrace();
        }
        System.out.println("\n\n\ndone");
        System.out.println("number of ignored lines (due to wrong format): " + ignoredLines);
    }

    /** Builds the output lines (normalized term plus normalized variants) for one three-column input line. */
    private Stream<String> toOutputLines(String[] split) {
        Stream.Builder<String> toWrite = Stream.builder();
        String normalizedSyn = this.normalize(split[0]);
        if (!normalizedSyn.isEmpty()) {
            String remainingColumns = "\t" + split[1] + "\t" + split[2] + "\n";
            toWrite.accept(normalizedSyn + remainingColumns);
            for (String variant : this.generateVariants(split[0])) {
                toWrite.accept(this.normalize(variant) + remainingColumns);
            }
        }
        return toWrite.build();
    }

    /** Writes one line to the shared writer; synchronized because it is called from a parallel stream. */
    private static void writeLine(FileWriter fileOut, String line) {
        try {
            synchronized (fileOut) {
                fileOut.write(line);
            }
        }
        catch (IOException e) {
            System.err.println("Could not write line: " + line);
            e.printStackTrace();
        }
    }

    /** Instantiates the English Snowball stemmer reflectively, preserving the cause on failure. */
    private static SnowballProgram createStemmer() {
        try {
            return (SnowballProgram) Class.forName(STEMMER_CLASS).getDeclaredConstructor().newInstance();
        }
        catch (ReflectiveOperationException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Splits every token at the boundaries described by the token split pattern
     * (lower-case letter followed by upper-case letter or digit, upper-case letter followed by
     * digit, digit followed by letter, two upper-case letters followed by a lower-case letter).
     * Empty tokens are dropped. The input list is not modified.
     */
    private List<String> specialTokenSplit(List<String> newTerm) {
        List<String> finalTerms = new ArrayList<>();
        for (String token : newTerm) {
            String spaced = this.insertSplitSpaces(token);
            if (spaced.isEmpty()) {
                continue;
            }
            finalTerms.addAll(Arrays.asList(spaced.split(" ")));
        }
        return finalTerms;
    }

    /** Repeatedly inserts one space at a split boundary until the split pattern no longer matches. */
    private String insertSplitSpaces(String token) {
        String current = token;
        Matcher m = this.tokenSplitPattern.matcher(current);
        while (m.matches()) {
            String spaced = null;
            // The pattern has four alternatives with two groups each: (1,2), (3,4), (5,6), (7,8).
            for (int g = 1; g <= 7 && spaced == null; g += 2) {
                if (m.group(g) != null && m.group(g + 1) != null) {
                    spaced = m.group(g) + " " + m.group(g + 1);
                }
            }
            if (spaced == null || spaced.equals(current)) {
                break;
            }
            current = spaced;
            m = this.tokenSplitPattern.matcher(current);
        }
        return current;
    }

    /**
     * Separates substrings known to the Aho-Corasick matcher from the rest of each token.
     * A token without a match, or one that consists of exactly one match, is kept unchanged.
     * Otherwise the token is lower-cased and replaced by the sequence text-before-match, match,
     * ..., text-after-last-match (empty pieces are omitted).
     *
     * <p>Match ranges are treated as inclusive on both ends. Unlike the previous implementation,
     * a single character following the last match is kept, and the token following a split
     * token is no longer skipped. NOTE(review): resources built with the previous behavior may
     * need to be regenerated.
     */
    private List<String> splitAwayCharacterStrings(List<String> term) {
        AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
        List<String> result = new ArrayList<>(term.size());
        for (String token : term) {
            callback.clear();
            String currentPart = token.toLowerCase(Locale.ROOT);
            this.greekAC.match(currentPart, callback);
            TreeMap<Range<Integer>, String> longestMatches = callback.getLongestMatches();
            boolean tokenIsSingleMatch = longestMatches.size() == 1
                    && longestMatches.firstEntry().getValue().equals(currentPart);
            if (longestMatches.isEmpty() || tokenIsSingleMatch) {
                result.add(token);
                continue;
            }
            int currentPos = 0;
            for (Map.Entry<Range<Integer>, String> match : longestMatches.entrySet()) {
                int matchStart = match.getKey().getMinimum();
                if (matchStart > currentPos) {
                    result.add(currentPart.substring(currentPos, matchStart));
                }
                result.add(match.getValue());
                currentPos = match.getKey().getMaximum() + 1;
            }
            if (currentPos < currentPart.length()) {
                result.add(currentPart.substring(currentPos));
            }
        }
        return result;
    }

    /**
     * Expands a trailing or stand-alone "L"/"R" (or "l"/"r" after a digit) into the separate
     * tokens "ligand"/"receptor". Modifies and returns the given list. Currently unused.
     */
    private List<String> replaceShortForms(List<String> term) {
        for (int i = 0; i < term.size(); ++i) {
            Matcher m = this.shortFormPattern.matcher(term.get(i));
            if (!m.matches()) {
                continue;
            }
            String base = "";
            String letter;
            if (m.group(3) != null) {
                base = m.group(2);
                letter = m.group(3);
            } else if (m.group(5) != null) {
                base = m.group(4);
                letter = m.group(5);
            } else {
                // Third alternative: the whole token is one of r, l, R, L.
                letter = m.group(1);
            }
            String substitute = "l".equals(letter.toLowerCase(Locale.ROOT)) ? "ligand" : "receptor";
            term.set(i, base);
            term.add(++i, substitute);
        }
        return term;
    }

    /**
     * Expands the short forms "ra", "rb", "rg", "bp" (optionally followed by a number) or
     * "a", "b" at the end of a term. Returns the term unchanged if none applies. Currently unused.
     */
    private String replaceShortFormsAtEnd(String term) {
        Matcher m = this.shortFormEndWithNumberPattern.matcher(term);
        if (m.matches()) {
            String replacement;
            switch (m.group(2)) {
                case "ra":
                    replacement = "receptor alpha";
                    break;
                case "rb":
                    replacement = "receptor beta";
                    break;
                case "rg":
                    replacement = "receptor gamma";
                    break;
                default:
                    // "bp" is the only remaining alternative of the pattern.
                    replacement = "binding protein";
                    break;
            }
            String number = m.group(3) != null ? m.group(3) : "";
            return m.group(1) + replacement + number;
        }
        m = this.shortFormEndNoNumberPattern.matcher(term);
        if (m.matches()) {
            return m.group(1) + ("a".equals(m.group(2)) ? "alpha" : "beta");
        }
        return term;
    }

    /** Replaces the tokens "il" and "IL" by "interleukin" in place. Currently unused. */
    private List<String> replaceKnownAcronyms(List<String> term) {
        term.replaceAll(token -> token.equals("il") || token.equals("IL") ? "interleukin" : token);
        return term;
    }

    /** Splits tokens of the form letters-then-digits into a letter token and a digit token. */
    private List<String> splitAwayNumbers(List<String> term) {
        List<String> result = new ArrayList<>(term.size() + 1);
        for (String token : term) {
            Matcher m = this.numberPattern.matcher(token);
            if (m.matches()) {
                result.add(m.group(1));
                result.add(m.group(2));
            } else {
                result.add(token);
            }
        }
        return result;
    }

    /**
     * Splits a match of {@code CandidateFilter.LAT_NUM_REGEX} off the end of each token, provided
     * the match does not start at the beginning of the token. The input list is not modified.
     *
     * @param term the tokens to process
     * @return a new list with the split tokens
     */
    public List<String> splitAwayRomanNumbers(List<String> term) {
        List<String> ret = new ArrayList<>(term);
        for (int i = 0; i < ret.size(); ++i) {
            String token = ret.get(i);
            Matcher romNumMatcher = this.romanNumberPattern.matcher(token);
            while (romNumMatcher.find()) {
                if (romNumMatcher.start() == 0 || romNumMatcher.end() != token.length()) {
                    continue;
                }
                ret.set(i, token.substring(0, romNumMatcher.start()));
                ret.add(++i, romNumMatcher.group());
                // The match reached the end of the token, so this token is done.
                break;
            }
        }
        return ret;
    }

    /** Replaces the tokens I, II, III, IV by 1, 2, 3, 4 in place, but only in multi-token terms. */
    private List<String> replaceRomanNumbers(List<String> synonym) {
        if (synonym.size() > 1) {
            for (int i = 0; i < synonym.size(); ++i) {
                switch (synonym.get(i)) {
                    case "I":
                        synonym.set(i, "1");
                        break;
                    case "II":
                        synonym.set(i, "2");
                        break;
                    case "III":
                        synonym.set(i, "3");
                        break;
                    case "IV":
                        synonym.set(i, "4");
                        break;
                    default:
                        break;
                }
            }
        }
        return synonym;
    }

    /** Replaces known plural tokens by their singular form in place. Currently unused. */
    private List<String> transformPlurals(List<String> term) {
        term.replaceAll(token -> this.plurals.getOrDefault(token, token));
        return term;
    }

    /** Trims and lower-cases every token in place, independent of the default locale. */
    private List<String> toLowerCase(List<String> term) {
        term.replaceAll(token -> token.trim().toLowerCase(Locale.ROOT));
        return term;
    }

    /**
     * Replaces every non-word character and underscore except '.' by a space, replaces one dot
     * standing between a letter and a letter/digit by a space (the last such dot, because the
     * leading group is greedy), collapses runs of spaces and splits the result into tokens.
     * Empty tokens are dropped.
     */
    private List<String> removeSpecialCharacters(List<String> term) {
        List<String> newTerm = new ArrayList<>();
        for (String rawToken : term) {
            String token = rawToken.replaceAll("[\\W_&&[^\\.]]", " ");
            Matcher m = this.dotRemovalPattern.matcher(token);
            if (m.matches()) {
                token = m.group(1) + m.group(2) + " " + m.group(3) + m.group(4);
            }
            token = token.replaceAll("[ ]+", " ").trim();
            if (token.isEmpty()) {
                continue;
            }
            newTerm.addAll(Arrays.asList(token.split(" ")));
        }
        return newTerm;
    }

    /** Replaces hyphens by spaces in every token (dots are left untouched). Currently unused. */
    private List<String> removeDotAndHyphen(List<String> term) {
        List<String> newTerm = new ArrayList<>(term.size());
        for (String token : term) {
            newTerm.add(token.replaceAll("\\-", " "));
        }
        return newTerm;
    }

    /**
     * Splits the term at single spaces and drops stopwords. A term that consists of a single
     * token is returned as is, even if that token is a stopword.
     */
    private List<String> removeStopwords(String term) {
        String[] tokens = term.split(" ");
        List<String> newTerm = new ArrayList<>(tokens.length);
        if (tokens.length == 1) {
            newTerm.add(tokens[0]);
            return newTerm;
        }
        for (String token : tokens) {
            if (!this.stopwords.contains(token)) {
                newTerm.add(token);
            }
        }
        return newTerm;
    }

    /**
     * Removes all tokens listed in the non-descriptives resource from a space-separated term.
     *
     * @param term the space-separated term
     * @return the remaining tokens joined by single spaces
     */
    public String removeNonDescriptives(String term) {
        String[] tokens = term.split(" ");
        List<String> newTerm = new ArrayList<>(tokens.length);
        for (String token : tokens) {
            if (!this.nonDescriptives.contains(token)) {
                newTerm.add(token);
            }
        }
        return this.joinTokens(newTerm);
    }

    /**
     * @param term the string to look up
     * @return {@code true} if the string is an entry of the non-descriptives resource
     */
    public boolean isNonDescriptive(String term) {
        return this.nonDescriptives.contains(term);
    }

    private void initStopwords() {
        this.stopwords.addAll(Arrays.asList("of", "for", "and", "or", "the"));
    }

    private void initPlurals() {
        this.plurals.put("receptors", "receptor");
        this.plurals.put("proteins", "protein");
        this.plurals.put("factors", "factor");
        this.plurals.put("ligands", "ligand");
        this.plurals.put("chains", "chain");
        this.plurals.put("antigens", "antigen");
        this.plurals.put("genes", "gene");
        this.plurals.put("transcripts", "transcript");
    }

    /**
     * Loads the trimmed lines of the {@code /non_descriptives} classpath resource. A read error
     * is printed to stderr and leaves the set with the lines read so far.
     */
    private void initNonDescriptives() {
        InputStream in = this.getClass().getResourceAsStream(NON_DESCRIPTIVES_FILE);
        if (in == null) {
            throw new IllegalStateException("Classpath resource not found: " + NON_DESCRIPTIVES_FILE);
        }
        try (BufferedReader nonDescReader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = nonDescReader.readLine()) != null) {
                this.nonDescriptives.add(line.trim());
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Joins tokens with single spaces and trims the result. */
    private String joinTokens(List<String> term) {
        return String.join(" ", term).trim();
    }
}

