/*
 * Decompiled with CFR 0.152.
 */
package uk.ac.man.entitytagger.matching.matchers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import uk.ac.man.documentparser.dataholders.Document;
import uk.ac.man.entitytagger.Mention;
import uk.ac.man.entitytagger.matching.Matcher;

/**
 * Heuristic matcher that scans text word by word for two- and three-word
 * name candidates (a capitalised or abbreviated first word followed by a
 * lower-case second word, optionally followed by a third word), skipping
 * any word found in a stop-word dictionary, and then reports every
 * occurrence of each candidate in the original text as a {@link Mention}.
 *
 * <p>The dictionary is loaded once in the constructor; each line of the
 * dictionary file is one entry and is compared against the lower-cased
 * word with {@code '.'} and {@code ','} removed.
 */
public class TaxonGrabMatcher
extends Matcher {
    /**
     * Stand-in entry pushed onto the candidate list when the second word is a
     * parenthesised capitalised word; it is replaced if a valid third word
     * follows and must never be searched for in the text.
     */
    private static final String PLACEHOLDER = "temporary, should be deleted";

    /*
     * All patterns below are applied with Matcher.matches(), i.e. against the
     * whole word, exactly as the String.matches() calls they replace.
     */

    /**
     * Word that clears all scanner state.
     * NOTE(review): because of alternation precedence this matches words that
     * END with one of the listed symbols or START with a digit - confirm that
     * this, rather than "contains", is the intended rule.
     */
    private static final Pattern RESET_WORD = Pattern.compile(".*[\\$\\%\\|\\{\\}\\*\\+\\?\\=\\-\\'\\^\\/\\@\\&]|[0-9].*");

    /** Word containing "(" directly followed by a lower-case letter or whitespace; its parentheses are stripped. */
    private static final Pattern PAREN_LOWER = Pattern.compile(".*[(][\\sa-z]+.*");

    /** Word that starts with at least two letters/parentheses; completes a pending long name. */
    private static final Pattern LONG_NAME_TAIL = Pattern.compile(".*^[A-Za-z()]{2,}.*");

    /** Capitalised word of two or more letters, or an abbreviation such as "H." / "Hs.". */
    private static final Pattern FIRST_WORD = Pattern.compile(".*\\A(?:((^[A-Z][a-z]{1,})|(^[A-Z][a-z]?\\.)))\\z.*");

    /**
     * Words rejected as a first word.
     * NOTE(review): as written this means "ends with var" or "starts with subsp".
     */
    private static final Pattern FIRST_WORD_EXCLUDED = Pattern.compile(".*var|subsp.*");

    /** Three or more lower-case letters followed by exactly one further character. */
    private static final Pattern SECOND_WORD = Pattern.compile(".*^[a-z]{3,}.\\z.*");

    /** Parenthesised capitalised word such as "(Abcd)". */
    private static final Pattern PAREN_CAPITALISED = Pattern.compile(".*\\A\\([A-Z][a-z]{3,}\\)\\z.*");

    /** Third word: two or more letters/parentheses and nothing else. */
    private static final Pattern THIRD_WORD = Pattern.compile("^[A-Za-z()]{2,}");

    /**
     * Third word that announces a further name part.
     * NOTE(review): as written this means "ends with var", equals "subsp",
     * equals "subg", or "starts with ssp".
     */
    private static final Pattern RANK_MARKER = Pattern.compile(".*var|subsp|subg|ssp.*");

    /** Entries that must never be treated as part of a name. */
    private final Set<String> dict;

    /**
     * Creates a matcher backed by the given dictionary file and reports
     * progress on standard output.
     *
     * @param dictFile file with one dictionary entry per line
     */
    public TaxonGrabMatcher(File dictFile) {
        System.out.print("Loading TaxonGrab dictionary...");
        this.dict = this.loadDict(dictFile);
        System.out.println(" done, loaded " + this.dict.size() + " entries.");
    }

    /**
     * Reads every line of {@code dictFile} into a set. Lines are stored
     * verbatim (no trimming or case folding).
     *
     * <p>Any failure is printed to standard error and terminates the JVM with
     * exit status -1.
     *
     * @param dictFile file to read
     * @return the set of lines read
     */
    private Set<String> loadDict(File dictFile) {
        Set<String> entries = new HashSet<String>();
        try {
            BufferedReader reader = new BufferedReader(new FileReader(dictFile));
            String line;
            while ((line = reader.readLine()) != null) {
                entries.add(line);
            }
            reader.close();
        }
        catch (Exception e) {
            System.err.println(e);
            e.printStackTrace();
            System.exit(-1);
        }
        return entries;
    }

    /**
     * Finds name candidates in {@code text} and returns one mention per
     * occurrence of each candidate in the original, unmodified text.
     *
     * @param text text to scan; must not be {@code null}
     * @param doc  optional document; when present its ID is attached to each
     *             mention and only spans accepted by {@code doc.isValid} are kept
     * @return the mentions found, possibly empty
     */
    @Override
    public List<Mention> match(String text, Document doc) {
        String docID = doc != null ? doc.getID() : null;
        List<String> candidates = this.collectCandidates(text);
        List<Mention> matches = new ArrayList<Mention>();
        Set<String> processed = new HashSet<String>();
        for (String candidate : candidates) {
            // The placeholder is bookkeeping only; searching for it could
            // produce a bogus mention if the text happened to contain it.
            if (PLACEHOLDER.equals(candidate) || processed.contains(candidate)) {
                continue;
            }
            String name = TaxonGrabMatcher.trimCandidate(candidate);
            // De-duplicate on the trimmed form as well, so "X y" and "X y."
            // are not both searched (which would emit every mention twice).
            if (!processed.add(name)) {
                continue;
            }
            this.locate(name, text, doc, docID, matches);
        }
        return matches;
    }

    /**
     * Runs the word-level state machine over a normalised copy of
     * {@code text} and returns the raw candidate strings in discovery order
     * (duplicates and the placeholder entry may be present).
     */
    private List<String> collectCandidates(String text) {
        String firstWord = "";
        String secondWord = "";
        // first + second + rank marker, waiting for the word after the marker
        String pendingLongName = "";
        List<String> candidates = new ArrayList<String>();

        // Re-join words hyphenated across line breaks and drop CRs and tabs.
        text = text.replace(" -\r", " - ");
        text = text.replace(" -\n", " - ");
        text = text.replace("-\r", "");
        text = text.replace("-\n", "");
        text = text.replace("\r", " ");
        text = text.replace("\t", "");

        for (String line : text.split("\n")) {
            line = line.replace(":", " ");
            line = line.replace(";", " ");
            // Guarantee that every '.' ends a word.
            line = line.replace(".", ". ");
            for (String word : line.split("\\s")) {
                if (RESET_WORD.matcher(word).matches()) {
                    firstWord = "";
                    secondWord = "";
                    pendingLongName = "";
                }
                if (PAREN_LOWER.matcher(word).matches()) {
                    word = word.replaceAll("[()]", "");
                }
                String wordKey = word.toLowerCase();
                wordKey = wordKey.replace(".", "");
                wordKey = wordKey.replace(",", "");
                if (!pendingLongName.equals("") && LONG_NAME_TAIL.matcher(word).matches()) {
                    candidates.add(pendingLongName + " " + word);
                    pendingLongName = "";
                }
                if (this.dict.contains(wordKey)) {
                    firstWord = "";
                    secondWord = "";
                    continue;
                }
                if (FIRST_WORD.matcher(word).matches() && !FIRST_WORD_EXCLUDED.matcher(word).matches()) {
                    firstWord = word;
                    secondWord = "";
                    continue;
                }
                if (!firstWord.equals("") && secondWord.equals("")) {
                    word = word.replace(",", "");
                    if (SECOND_WORD.matcher(word).matches()) {
                        secondWord = word;
                        candidates.add(firstWord + " " + secondWord);
                        continue;
                    }
                    if (PAREN_CAPITALISED.matcher(word).matches()) {
                        secondWord = word;
                        // Keeps the "last entry belongs to the current name"
                        // invariant that the third-word branch relies on.
                        candidates.add(PLACEHOLDER);
                        continue;
                    }
                    firstWord = "";
                    secondWord = "";
                    continue;
                }
                if (!firstWord.equals("") && !secondWord.equals("") && word.length() > 2) {
                    word = word.replace(",", "");
                    if (THIRD_WORD.matcher(word).matches()) {
                        // Replace the two-word entry (or placeholder) just added.
                        candidates.remove(candidates.size() - 1);
                        if (RANK_MARKER.matcher(word).matches()) {
                            pendingLongName = firstWord + " " + secondWord + " " + word;
                        } else if (!word.contains(".")) {
                            candidates.add(firstWord + " " + secondWord + " " + word);
                        }
                    }
                    firstWord = "";
                    secondWord = "";
                    continue;
                }
                firstWord = "";
                secondWord = "";
            }
        }
        return candidates;
    }

    /**
     * Removes one trailing {@code '.'} and then, if the string contains no
     * {@code '('}, one trailing {@code ')'}.
     */
    private static String trimCandidate(String candidate) {
        String name = candidate;
        if (name.endsWith(".")) {
            name = name.substring(0, name.length() - 1);
        }
        if (name.endsWith(")") && name.indexOf("(") == -1) {
            name = name.substring(0, name.length() - 1);
        }
        return name;
    }

    /**
     * Appends to {@code out} one mention for every occurrence of {@code name}
     * in {@code orgtext}. Names containing {@code ". "} are searched with a
     * pattern in which the space after each such dot is optional; all other
     * names are searched literally (overlapping occurrences included).
     */
    private void locate(String name, String orgtext, Document doc, String docID, List<Mention> out) {
        if (name.contains(". ")) {
            java.util.regex.Matcher matcher = TaxonGrabMatcher.abbreviationPattern(name).matcher(orgtext);
            while (matcher.find()) {
                int s = matcher.start();
                int e = matcher.end();
                Mention m = new Mention(new String[0], s, e, orgtext.substring(s, e));
                m.setComment("taxongrab (" + name + ")");
                m.setDocid(docID);
                if (doc != null && !doc.isValid(m.getStart(), m.getEnd())) continue;
                out.add(m);
            }
        } else {
            int x = orgtext.indexOf(name);
            while (x != -1) {
                Mention m = new Mention(new String[0], x, x + name.length(), name);
                m.setComment("taxongrab (" + name + ")");
                m.setDocid(docID);
                if (doc == null || doc.isValid(m.getStart(), m.getEnd())) {
                    out.add(m);
                }
                x = orgtext.indexOf(name, x + 1);
            }
        }
    }

    /**
     * Builds a pattern that matches {@code name} literally, except that the
     * space following each {@code ". "} is optional. Every literal piece is
     * quoted, so regex metacharacters inside a candidate can neither change
     * the meaning of the pattern nor make compilation fail.
     */
    private static Pattern abbreviationPattern(String name) {
        StringBuilder regex = new StringBuilder();
        int from = 0;
        int dot = name.indexOf(". ", from);
        while (dot != -1) {
            regex.append(Pattern.quote(name.substring(from, dot))).append("\\. ?");
            from = dot + 2;
            dot = name.indexOf(". ", from);
        }
        regex.append(Pattern.quote(name.substring(from)));
        return Pattern.compile(regex.toString());
    }
}

