/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.ner;

import de.datexis.annotator.Annotator;
import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Annotation;
import de.datexis.model.Document;
import de.datexis.model.Token;
import de.datexis.ner.MentionAnnotation;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import net.amygdalum.stringsearchalgorithms.search.MatchOption;
import net.amygdalum.stringsearchalgorithms.search.StringFinder;
import net.amygdalum.stringsearchalgorithms.search.StringFinderOption;
import net.amygdalum.stringsearchalgorithms.search.StringMatch;
import net.amygdalum.stringsearchalgorithms.search.chars.SetBackwardOracleMatching;
import net.amygdalum.stringsearchalgorithms.search.chars.StringSearchAlgorithm;
import net.amygdalum.util.io.CharProvider;
import net.amygdalum.util.io.StringCharProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Dictionary-based annotator: searches the text of each {@link Document} for a set of
 * previously loaded terms and adds one {@link MentionAnnotation} per match whose span
 * starts and ends exactly on token boundaries.
 *
 * <p>Terms are loaded with one of the {@code loadTermsToMatch} methods; every load or
 * delete rebuilds the underlying {@link SetBackwardOracleMatching} search structure.
 * Until terms were loaded (or {@link #clearTermsToMatch()} was called) no search
 * structure exists and {@link #annotate(Iterable, Annotation.Source)} does nothing.
 */
public class MatchingAnnotator extends Annotator {
    protected static final Logger log = LoggerFactory.getLogger(MatchingAnnotator.class);

    /** Terms shorter than this are dropped by every strategy except {@code CASE_SENSITIVE}. */
    protected int minimumWordLength = 3;
    /** Type string set on every created {@link MentionAnnotation}. */
    protected String type = "GENERIC";
    /** Selects the words (4 or more word characters) that are candidates for lowercasing. */
    protected Pattern wordLengthMatcher = Pattern.compile("\\b\\w{4,}\\b");
    /** Matches words consisting only of uppercase ASCII letters and digits. */
    protected Pattern uppercaseMatcher = Pattern.compile("^[A-Z0-9]+$");
    /** Search structure over {@link #terms}; {@code null} until terms were loaded or cleared. */
    protected StringSearchAlgorithm stringSearch;
    protected WordHelpers wordHelpers = new WordHelpers(WordHelpers.Language.EN);
    /**
     * Converted terms currently in the dictionary. A set (with insertion order) so that
     * repeated loads, e.g. one per file of a directory, cannot accumulate duplicates.
     */
    Collection<String> terms = new LinkedHashSet<String>();
    protected MatchingStrategy matchingStrategy = MatchingStrategy.CASE_SENSITIVE;
    /** Source assigned to annotations created by {@link #annotate(Collection)}. */
    protected Annotation.Source source = Annotation.Source.SILVER;

    /** Creates a case-sensitive annotator that produces {@code SILVER} annotations. */
    public MatchingAnnotator() {
        this(MatchingStrategy.CASE_SENSITIVE, Annotation.Source.SILVER);
    }

    /**
     * Creates an annotator with the given strategy that produces {@code SILVER} annotations.
     *
     * @param matchingStrategy how terms (and document text) are normalized before matching
     */
    public MatchingAnnotator(MatchingStrategy matchingStrategy) {
        this(matchingStrategy, Annotation.Source.SILVER);
    }

    /**
     * @param matchLowercase the matching strategy to use
     * @param source         source assigned to created annotations
     */
    public MatchingAnnotator(MatchingStrategy matchLowercase, Annotation.Source source) {
        this.matchingStrategy = matchLowercase;
        this.source = source;
    }

    /**
     * @param matchLowercase the matching strategy to use
     * @param source         source assigned to created annotations
     * @param type           type string set on created annotations
     */
    public MatchingAnnotator(MatchingStrategy matchLowercase, Annotation.Source source, String type) {
        this(matchLowercase, source);
        this.type = type;
    }

    /**
     * @param matchLowercase the matching strategy to use
     * @param source         source assigned to created annotations
     * @param type           type string set on created annotations
     * @param minWordLength  minimum length of a term to be kept (not applied by
     *                       {@code CASE_SENSITIVE}, see {@link #convertTerms(Stream)})
     */
    public MatchingAnnotator(MatchingStrategy matchLowercase, Annotation.Source source, String type, int minWordLength) {
        this(matchLowercase, source, type);
        this.minimumWordLength = minWordLength;
    }

    /**
     * Normalizes raw terms according to the configured {@link MatchingStrategy} and
     * removes duplicates within the given stream.
     *
     * @param terms raw terms; the stream is consumed
     * @return the converted, de-duplicated terms
     * @throws UnsupportedOperationException for {@code LEMMA}, via {@link #removePlurals(String)}
     */
    protected Collection<String> convertTerms(Stream<String> terms) {
        Stream<String> converted;
        switch (this.matchingStrategy) {
            case LOWERCASE:
                converted = terms
                    .filter(w -> w.length() >= this.minimumWordLength)
                    .map(this::convertToLowercase);
                break;
            case LEMMA:
                converted = terms
                    .filter(w -> w.length() >= this.minimumWordLength)
                    .map(w -> this.removePlurals(this.convertToLowercase(w)));
                break;
            case SKIP_STOPWORDS:
                converted = terms
                    .filter(w -> w.length() >= this.minimumWordLength && !this.wordHelpers.isStopWord(w));
                break;
            default:
                // CASE_SENSITIVE: terms are used verbatim, no length filter is applied.
                converted = terms;
                break;
        }
        return converted.distinct().collect(Collectors.toList());
    }

    /** Removes all terms and rebuilds the (now empty) search structure. */
    public void clearTermsToMatch() {
        this.terms.clear();
        this.stringSearch = new SetBackwardOracleMatching(this.terms);
    }

    /**
     * Adds the given terms to the dictionary.
     *
     * @param terms raw terms, converted via {@link #convertTerms(Stream)}
     */
    public void loadTermsToMatch(Collection<String> terms) {
        this.loadTermsToMatch(terms.stream());
    }

    /**
     * Adds the given terms to the dictionary and rebuilds the search structure.
     *
     * @param terms raw terms, converted via {@link #convertTerms(Stream)}; the stream is consumed
     */
    public void loadTermsToMatch(Stream<String> terms) {
        this.terms.addAll(this.convertTerms(terms));
        log.info("Rebuildung dictionary with {} distinct terms", this.terms.size());
        this.stringSearch = new SetBackwardOracleMatching(this.terms);
    }

    /**
     * Loads terms from a file (one term per line, read as UTF-8) or, recursively, from
     * every regular file below a directory. A file inside a directory that cannot be
     * read is logged and skipped so that the remaining files are still loaded.
     *
     * @param path file or directory to read
     * @throws FileNotFoundException if {@code path} is neither a file nor a directory
     * @throws IOException           if the file or directory itself cannot be read
     */
    public void loadTermsToMatch(Resource path) throws IOException {
        if (path.isDirectory()) {
            // Files.walk holds an open directory handle and must be closed explicitly.
            try (Stream<Path> files = Files.walk(path.getPath())) {
                files.filter(p -> Files.isRegularFile(p, LinkOption.NOFOLLOW_LINKS)).forEach(p -> {
                    try {
                        this.loadTermsToMatch(Resource.fromFile(p.toString()));
                    } catch (IOException ex) {
                        // best effort: keep loading the other files, but keep path and cause
                        log.error("could not load terms from " + p, ex);
                    }
                });
            }
        } else if (path.isFile()) {
            try (BufferedReader br = openUtf8Reader(path)) {
                this.loadTermsToMatch(br.lines());
            }
        } else {
            throw new FileNotFoundException("cannot open path: " + path.toString());
        }
    }

    /**
     * Removes the given terms from the dictionary.
     *
     * @param terms raw terms, converted via {@link #convertTerms(Stream)} before removal
     */
    public void deleteTermsToMatch(Collection<String> terms) {
        this.deleteTermsToMatch(terms.stream());
    }

    /**
     * Removes the given terms from the dictionary and rebuilds the search structure.
     *
     * @param terms raw terms, converted via {@link #convertTerms(Stream)} before removal;
     *              the stream is consumed
     */
    public void deleteTermsToMatch(Stream<String> terms) {
        for (String term : this.convertTerms(terms)) {
            this.terms.remove(term);
        }
        log.info("Rebuildung dictionary with {} distinct terms", this.terms.size());
        this.stringSearch = new SetBackwardOracleMatching(this.terms);
    }

    /**
     * Removes all terms listed in the given resource (one term per line, read as UTF-8).
     *
     * @param path resource to read
     * @throws IOException if the resource cannot be read
     */
    public void deleteTermsToMatch(Resource path) throws IOException {
        try (BufferedReader br = openUtf8Reader(path)) {
            this.deleteTermsToMatch(br.lines());
        }
    }

    /** @return the number of (converted) terms currently in the dictionary */
    public int countTerms() {
        return this.terms.size();
    }

    /**
     * Lowercases every word of at least four word characters, except words shorter than
     * eight characters that consist only of uppercase letters and digits (presumably
     * acronyms), which are kept as they are. Shorter words and all non-word characters
     * are copied unchanged, so character offsets in the result equal those in {@code text}.
     *
     * @param text the text to convert
     * @return the partially lowercased text
     */
    protected String convertToLowercase(String text) {
        Matcher m = this.wordLengthMatcher.matcher(text);
        // StringBuffer because Matcher.appendReplacement(StringBuilder, ..) requires Java 9+
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            String word = m.group();
            boolean shortAllCaps = word.length() < 8 && this.uppercaseMatcher.matcher(word).matches();
            if (shortAllCaps) {
                // Not appended here: the untouched word is copied by the next
                // appendReplacement or by appendTail.
                continue;
            }
            // Locale.ROOT keeps the result independent of the platform locale (e.g. Turkish "I").
            m.appendReplacement(sb, word.toLowerCase(Locale.ROOT));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Placeholder for lemma matching.
     *
     * @throws UnsupportedOperationException always
     */
    protected String removePlurals(String text) {
        throw new UnsupportedOperationException("Lemma matching is not yet implemented.");
    }

    /**
     * Annotates the documents using the source configured for this annotator.
     *
     * @param docs documents to annotate
     */
    public void annotate(Collection<Document> docs) {
        this.annotate(docs, this.source);
    }

    /**
     * Searches each document's text for the loaded terms and adds a
     * {@link MentionAnnotation} of this annotator's type for every longest,
     * non-overlapping match that is aligned with token boundaries. With the
     * {@code LOWERCASE} strategy the text is first passed through
     * {@link #convertToLowercase(String)}. If no terms were loaded, a warning is
     * logged and nothing is annotated.
     *
     * @param docs   documents to annotate
     * @param source source assigned to the created annotations
     */
    public void annotate(Iterable<Document> docs, Annotation.Source source) {
        if (this.stringSearch == null) {
            log.warn("MatchingAnnotator called without terms loaded");
            return;
        }
        boolean lowercaseText = this.matchingStrategy == MatchingStrategy.LOWERCASE;
        for (Document doc : docs) {
            String text = lowercaseText ? this.convertToLowercase(doc.getText()) : doc.getText();
            StringCharProvider chars = new StringCharProvider(text, 0);
            StringFinder finder = this.stringSearch.createFinder((CharProvider) chars,
                new StringFinderOption[]{MatchOption.LONGEST_MATCH, MatchOption.NON_OVERLAP});
            for (StringMatch match : finder.findAll()) {
                int begin = (int) match.start();
                int end = (int) match.end();
                List<Token> tokens = doc.streamTokensInRange(begin, end, true).collect(Collectors.toList());
                if (!this.spanIsAtTokenBoundaries(tokens, begin, end, doc)) {
                    continue;
                }
                MentionAnnotation ann = new MentionAnnotation(source, tokens);
                ann.setType(this.type);
                doc.addAnnotation((Annotation) ann);
            }
        }
    }

    /**
     * Checks that a matched span does not cut through a token.
     *
     * @param list  tokens found for the span, in order
     * @param begin begin offset of the match
     * @param end   end offset of the match
     * @param doc   unused
     * @return {@code true} if {@code list} is non-empty, its first token begins at
     *         {@code begin} and its last token ends at {@code end}
     */
    private boolean spanIsAtTokenBoundaries(List<Token> list, int begin, int end, Document doc) {
        if (list.isEmpty()) {
            return false;
        }
        Token first = list.get(0);
        Token last = list.get(list.size() - 1);
        return first.getBegin() == begin && last.getEnd() == end;
    }

    /** How dictionary terms (and, for {@code LOWERCASE}, document text) are normalized. */
    public enum MatchingStrategy {
        /** Terms are used as given; only duplicates are removed. */
        CASE_SENSITIVE,
        /** Terms below the minimum length are dropped; terms and text are partially lowercased. */
        LOWERCASE,
        /** Like {@code LOWERCASE} plus plural removal, which is not implemented and throws. */
        LEMMA,
        /** Terms below the minimum length and stop words are dropped; case is kept. */
        SKIP_STOPWORDS;
    }

    /** Opens the resource's input stream as a buffered UTF-8 reader. */
    private static BufferedReader openUtf8Reader(Resource path) throws IOException {
        return new BufferedReader(new InputStreamReader(path.getInputStream(), StandardCharsets.UTF_8));
    }
}

