/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.ae.jsbd.main;

import de.julielab.jcore.ae.jsbd.SentenceSplitter;
import de.julielab.jcore.ae.jsbd.Unit;
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.utility.JCoReAnnotationIndexMerger;
import de.julielab.jcore.utility.JCoReCondensedDocumentText;
import de.julielab.jcore.utility.JCoReTools;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * UIMA annotator that runs a {@link SentenceSplitter} model over the document text and adds
 * {@link Sentence} annotations to the CAS.
 *
 * <p>Sentence splitting operates on a {@link JCoReCondensedDocumentText}, built from the CAS and
 * the configured cut-away types. Offsets produced by the splitter are mapped back to original
 * document offsets before annotations are created. Optionally, sentences are confined to the spans
 * between the borders of configured delimiter annotation types, split at newlines, and/or split
 * further when they exceed a maximum length.
 *
 * <p>The regular expression matchers are instance state that is reset on every use, so a single
 * instance must not process documents concurrently.
 */
public class SentenceAnnotator
extends JCasAnnotator_ImplBase {
    public static final String PARAM_MODEL_FILE = "ModelFilename";
    public static final String PARAM_POSTPROCESSING = "Postprocessing";
    public static final String PARAM_SENTENCE_DELIMITER_TYPES = "SentenceDelimiterTypes";
    public static final String PARAM_CUT_AWAY_TYPES = "CutAwayTypes";
    public static final String PARAM_MAX_SENTENCE_LENGTH = "MaximumSentenceLength";
    public static final String PARAM_ALWAYS_SPLIT_NEWLINE = "AlwaysSplitAtNewlines";
    private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotator.class);
    /** Unit label that marks the last unit of a sentence. */
    private static final String EOS_LABEL = "EOS";
    /** Matches a letter with optional combining marks; sentences without any letter are discarded. */
    private final Matcher letterMatcher = Pattern.compile("\\p{L}\\p{M}*").matcher("");
    private final Matcher eolMatcher = Pattern.compile("(\\r\\n|\\r|\\n)").matcher("");
    private final Matcher semicoliMatcher = Pattern.compile(";").matcher("");
    private final Matcher wsMatcher = Pattern.compile(" ").matcher("");
    @ConfigurationParameter(name=PARAM_POSTPROCESSING, mandatory=false, defaultValue={"false"}, description="One of 'biomed' or 'medical'. Does some post processing to e.g. respect parenthesis and don't put a sentence boundary withing in a pair of opening and closing parenthesis.")
    private String postprocessingFilter = null;
    @ConfigurationParameter(name=PARAM_SENTENCE_DELIMITER_TYPES, mandatory=false, description="An array of annotation types that should never begin or end within a sentence. For example, sentences should never reach out of a paragraph or a section heading.")
    private Set<String> sentenceDelimiterTypes;
    @ConfigurationParameter(name=PARAM_MODEL_FILE, mandatory=true)
    private String modelFilename;
    @ConfigurationParameter(name=PARAM_CUT_AWAY_TYPES, mandatory=false, description="An array of fully qualified type names. Document text covered by annotations of these types will be ignored from sentence splitting. This means that sentence splitting happens as if the covered text of these annotations would not exist in the text. This helps for references, for example, which otherwise might confuse the sentence splitting. A post-processing step tries to extend sentences include such annotations if they appear directly after the sentence (e.g. references: '...as Smith et al. have shown.1 Further text follows...').")
    private Set<String> cutAwayTypes;
    @ConfigurationParameter(name=PARAM_MAX_SENTENCE_LENGTH, mandatory=false, description="Optional. If given, this parameter defines the maximum length in characters any sentence will have. If the machine learning algorithm produces sentences exceeding the given maximum length, they will be split first by newline and, if necessary, also at semicoli. If there are still too large sentences then, they will be split at whitespaces to stay within the given bound. Defaults to 0 which means no maximum length.")
    private int maxSentenceLength;
    @ConfigurationParameter(name=PARAM_ALWAYS_SPLIT_NEWLINE, mandatory=false, description="Optional. If true, newlines are also used as sentence boundaries.")
    private boolean alwaysSplitAtNewlines;
    private SentenceSplitter sentenceSplitter;

    /**
     * Loads the sentence splitting model and reads all configuration parameters from the context.
     *
     * @param aContext the UIMA context providing the configuration parameter values
     * @throws ResourceInitializationException if reading the model fails
     * @throws IllegalArgumentException if the model is found neither as a file nor on the classpath
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);
        try {
            this.sentenceSplitter = new SentenceSplitter();
            LOGGER.info("initializing JSBD Annotator ...");
            this.modelFilename = (String) aContext.getConfigParameterValue(PARAM_MODEL_FILE);
            // try-with-resources: the stream is released even when reading the model fails.
            try (InputStream modelIs = this.openModelStream(this.modelFilename)) {
                this.sentenceSplitter.readModel(modelIs);
            }

            Object postprocessing = aContext.getConfigParameterValue(PARAM_POSTPROCESSING);
            if (postprocessing != null) {
                this.postprocessingFilter = (String) postprocessing;
            }
            String[] delimiterTypes = (String[]) aContext.getConfigParameterValue(PARAM_SENTENCE_DELIMITER_TYPES);
            if (delimiterTypes != null) {
                this.sentenceDelimiterTypes = new LinkedHashSet<>(Arrays.asList(delimiterTypes));
            }
            String[] ignoredTypes = (String[]) aContext.getConfigParameterValue(PARAM_CUT_AWAY_TYPES);
            if (ignoredTypes != null) {
                this.cutAwayTypes = Stream.of(ignoredTypes).collect(Collectors.toSet());
            }
            Integer configuredMaxLength = (Integer) aContext.getConfigParameterValue(PARAM_MAX_SENTENCE_LENGTH);
            this.maxSentenceLength = configuredMaxLength != null ? configuredMaxLength : 0;
            Boolean configuredNewlineSplit = (Boolean) aContext.getConfigParameterValue(PARAM_ALWAYS_SPLIT_NEWLINE);
            this.alwaysSplitAtNewlines = configuredNewlineSplit != null && configuredNewlineSplit;
        }
        catch (IOException | ClassNotFoundException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Opens the model, preferring an existing file over a classpath resource of the same name.
     *
     * @param filename file system path or classpath resource name of the model
     * @return an open stream on the model; the caller is responsible for closing it
     * @throws IOException if the file exists but cannot be opened
     * @throws IllegalArgumentException if the model is found in neither location
     */
    private InputStream openModelStream(String filename) throws IOException {
        File modelFile = new File(filename);
        if (modelFile.exists()) {
            return new FileInputStream(modelFile);
        }
        LOGGER.debug("File \"{}\" does not exist. Searching for the model as a classpath resource.", filename);
        String resourceName = filename.startsWith("/") ? filename : "/" + filename;
        InputStream resourceStream = this.getClass().getResourceAsStream(resourceName);
        if (resourceStream == null) {
            throw new IllegalArgumentException("The model file \"" + filename + "\" could be found neither in the file system nor in the classpath.");
        }
        return resourceStream;
    }

    /**
     * Segments the document text into sentences and adds them to the CAS indexes. Documents with a
     * blank text are skipped with a warning.
     *
     * @param aJCas the CAS to annotate
     * @throws AnalysisEngineProcessException if a configured annotation type cannot be resolved
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        if (StringUtils.isBlank(aJCas.getDocumentText())) {
            LOGGER.warn("The document text of document {} is empty.", JCoReTools.getDocId(aJCas));
            return;
        }
        final JCoReCondensedDocumentText documentText;
        try {
            documentText = new JCoReCondensedDocumentText(aJCas, this.cutAwayTypes);
        }
        catch (ClassNotFoundException e) {
            throw new AnalysisEngineProcessException(e);
        }
        if (this.sentenceDelimiterTypes == null) {
            this.doSegmentation(documentText, documentText.getCodensedText(), 0);
            return;
        }
        try {
            this.segmentBetweenDelimiters(aJCas, documentText);
        }
        catch (ClassNotFoundException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Segments each span between consecutive borders of the delimiter annotations separately, so
     * that no sentence crosses such a border.
     *
     * @param aJCas the CAS holding the delimiter annotations
     * @param documentText the condensed view of the document text
     * @throws ClassNotFoundException if a delimiter type cannot be resolved
     */
    private void segmentBetweenDelimiters(JCas aJCas, JCoReCondensedDocumentText documentText) throws ClassNotFoundException {
        String condensedText = documentText.getCodensedText();
        JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(this.sentenceDelimiterTypes, false, null, aJCas);
        // All borders are offsets into the condensed text because the spans are cut out of it below;
        // the closing border therefore has to be the condensed length, not the original one.
        List<Integer> borders = new ArrayList<>();
        borders.add(0);
        borders.add(condensedText.length());
        while (indexMerger.incrementAnnotation()) {
            Annotation delimiter = (Annotation) indexMerger.getAnnotation();
            borders.add(documentText.getCondensedOffsetForOriginalOffset(delimiter.getBegin()));
            borders.add(documentText.getCondensedOffsetForOriginalOffset(delimiter.getEnd()));
        }
        borders.sort(null);
        for (int i = 1; i < borders.size(); ++i) {
            int start = borders.get(i - 1);
            int end = borders.get(i);
            // Skip leading whitespace of the span, read from the same text the offsets refer to.
            while (start < end && Character.isWhitespace(condensedText.charAt(start))) {
                ++start;
            }
            String textSpan = condensedText.substring(start, end);
            if (StringUtils.isBlank(textSpan)) {
                continue;
            }
            this.doSegmentation(documentText, textSpan, start);
        }
    }

    /**
     * Runs the sentence splitter on the given text and creates the resulting annotations.
     *
     * @param documentText the condensed view of the document text
     * @param text the text to split
     * @param offset the offset of {@code text} within the condensed document text
     */
    private void doSegmentation(JCoReCondensedDocumentText documentText, String text, int offset) {
        List<String> lines = new ArrayList<>();
        lines.add(text);
        List<Unit> units = this.sentenceSplitter.predict(lines, this.postprocessingFilter);
        this.addAnnotations(documentText, units, offset);
    }

    /**
     * Turns the labelled units into sentence annotations. A sentence ends at every unit labelled
     * {@code EOS} and at the last unit; it starts at offset 0 for the first sentence and at the
     * begin of the first unit after the previous sentence otherwise. Offsets are mapped to the
     * original document text and trimmed of surrounding whitespace. Sentences that are empty after
     * trimming or contain no letter are not created.
     *
     * @param documentText the condensed view of the document text
     * @param units the units returned by the sentence splitter
     * @param offset the offset of the split text within the condensed document text
     */
    private void addAnnotations(JCoReCondensedDocumentText documentText, List<Unit> units, int offset) {
        int start = 0;
        for (int i = 0; i < units.size(); ++i) {
            Unit unit = units.get(i);
            if (start == -1) {
                start = unit.begin;
            }
            boolean isLastUnit = i == units.size() - 1;
            if (!EOS_LABEL.equals(unit.label) && !isLastUnit) {
                continue;
            }
            int begin = this.adjustBeginOffsetForWhitespaces(documentText.getOriginalOffsetForCondensedOffset(start + offset), documentText);
            int end = this.adjustEndOffsetForWhitespaces(documentText.getOriginalOffsetForCondensedOffset(unit.end + offset), documentText);
            if (begin < end) {
                Sentence sentence = new Sentence(documentText.getCas(), begin, end);
                sentence.setComponentId(this.getClass().getName());
                try {
                    this.letterMatcher.reset(sentence.getCoveredText());
                }
                catch (StringIndexOutOfBoundsException e) {
                    LOGGER.error("Document {}. Invalid sentence offsets: {}-{}. Document text length: {}.", JCoReTools.getDocId(documentText.getCas()), begin, end, documentText.getCas().getDocumentText().length());
                    throw e;
                }
                if (this.letterMatcher.find()) {
                    this.indexSentence(documentText, sentence);
                }
            }
            // Marks that the next unit opens a new sentence.
            start = -1;
        }
    }

    /**
     * Adds the sentence to the CAS indexes, splitting it first if it exceeds the maximum length or
     * if splitting at newlines is enabled.
     */
    private void indexSentence(JCoReCondensedDocumentText documentText, Sentence sentence) {
        int length = sentence.getEnd() - sentence.getBegin();
        if (LOGGER.isTraceEnabled()) {
            String docId = JCoReTools.getDocId(documentText.getCas());
            LOGGER.trace("Adding sentence with offsets {}-{}, length {} to document {}", sentence.getBegin(), sentence.getEnd(), length, docId);
        }
        if (this.maxSentenceLength > 0 && length > this.maxSentenceLength) {
            for (Sentence part : this.splitOverlongSentence(documentText, sentence)) {
                part.addToIndexes();
            }
        } else if (this.alwaysSplitAtNewlines) {
            LOGGER.debug("Splitting at newlines.");
            Set<Sentence> lines = new HashSet<>();
            this.splitAtRegex(documentText, sentence, this.eolMatcher, lines);
            if (lines.isEmpty()) {
                lines.add(sentence);
            }
            for (Sentence line : lines) {
                line.addToIndexes();
            }
        } else {
            sentence.addToIndexes();
        }
    }

    /**
     * Splits an overlong sentence in three escalating stages: at newlines, then at semicolons for
     * parts that are still too long, then at spaces for parts that are still too long.
     *
     * @return the parts that replace the given sentence; none of them is added to the indexes yet
     */
    private Set<Sentence> splitOverlongSentence(JCoReCondensedDocumentText documentText, Sentence sentence) {
        LOGGER.debug("Sentence length {} exceeds maximum sentence length of {}. It is split into smaller chunks.", sentence.getEnd() - sentence.getBegin(), this.maxSentenceLength);
        LOGGER.debug("Splitting at newlines.");
        Set<Sentence> parts = new HashSet<>();
        this.splitAtRegex(documentText, sentence, this.eolMatcher, parts);
        if (parts.isEmpty()) {
            parts.add(sentence);
        }

        Set<Sentence> semicolonParts = new HashSet<>();
        for (Iterator<Sentence> it = parts.iterator(); it.hasNext();) {
            Sentence part = it.next();
            if (part.getEnd() - part.getBegin() <= this.maxSentenceLength) {
                continue;
            }
            LOGGER.debug("Newline splitting still produces overlong sentences. Splitting at semicoli.");
            it.remove();
            int countBefore = semicolonParts.size();
            this.splitAtRegex(documentText, part, this.semicoliMatcher, semicolonParts);
            if (countBefore == semicolonParts.size()) {
                // Nothing was produced for this part; keep it so that it is not lost.
                semicolonParts.add(part);
            }
        }
        parts.addAll(semicolonParts);

        Set<Sentence> whitespaceParts = new HashSet<>();
        for (Iterator<Sentence> it = parts.iterator(); it.hasNext();) {
            Sentence part = it.next();
            if (part.getEnd() - part.getBegin() <= this.maxSentenceLength) {
                continue;
            }
            LOGGER.debug("Newline and semicoli splitting still produce overlong sentences. Chunking at whitespaces.");
            it.remove();
            this.splitAtWhitespaces(documentText, part, whitespaceParts);
        }
        parts.addAll(whitespaceParts);
        return parts;
    }

    /**
     * Greedily chunks an overlong sentence at space characters so that every chunk is at most
     * {@code maxSentenceLength} characters long. A stretch without any space that is longer than the
     * maximum cannot be shortened and becomes a chunk of its own.
     *
     * @param documentText the condensed view of the document text
     * @param overlongSentence the sentence to chunk
     * @param subSentences receives the created chunks
     */
    private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentence overlongSentence, Set<Sentence> subSentences) {
        this.wsMatcher.reset(overlongSentence.getCoveredText());
        final int sentenceBegin = overlongSentence.getBegin();
        final int sentenceEnd = overlongSentence.getEnd();
        // Document offset at which the chunk currently being built starts.
        int chunkBegin = this.adjustBeginOffsetForWhitespaces(sentenceBegin, documentText);
        // Document offset of the most recently seen space; a cut candidate if it lies inside the chunk.
        int lastSpace = -1;
        while (this.wsMatcher.find()) {
            // Matcher offsets are relative to the sentence text, so translate to document offsets.
            int space = sentenceBegin + this.wsMatcher.start();
            if (space - chunkBegin > this.maxSentenceLength && lastSpace > chunkBegin) {
                // Extending the chunk up to this space would exceed the limit: cut at the previous space.
                chunkBegin = this.addWhitespaceChunk(documentText, chunkBegin, lastSpace, subSentences);
            }
            if (space - chunkBegin > this.maxSentenceLength) {
                // Still too long, i.e. a single stretch without spaces: the best we can do is cut here.
                chunkBegin = this.addWhitespaceChunk(documentText, chunkBegin, space, subSentences);
            }
            lastSpace = space;
        }
        if (sentenceEnd - chunkBegin > this.maxSentenceLength && lastSpace > chunkBegin) {
            chunkBegin = this.addWhitespaceChunk(documentText, chunkBegin, lastSpace, subSentences);
        }
        this.addWhitespaceChunk(documentText, chunkBegin, sentenceEnd, subSentences);
    }

    /**
     * Adds the chunk {@code [begin, end)} to the given set if it is non-empty after trimming.
     *
     * @return the begin offset of the next chunk, i.e. the first non-whitespace offset at or after
     *         {@code end}
     */
    private int addWhitespaceChunk(JCoReCondensedDocumentText documentText, int begin, int end, Set<Sentence> subSentences) {
        Sentence chunk = this.createSubSentence(documentText, begin, end, "whitespace");
        if (chunk != null) {
            subSentences.add(chunk);
        }
        return this.adjustBeginOffsetForWhitespaces(end, documentText);
    }

    /**
     * Splits a sentence at every match of the given matcher's pattern. Each part ends before the
     * match; the next part starts at the first non-whitespace character at or after the end of the
     * previous part, so a non-whitespace delimiter stays at the start of the following part. The
     * remainder after the last created part is only added if it spans more than two characters
     * counted from the end of that part.
     *
     * @param documentText the condensed view of the document text
     * @param originalOverlongSentence the sentence to split
     * @param splitMatcher matcher whose pattern defines the split positions; it is reset here
     * @param subSentences receives the created parts; stays unchanged if no part could be created
     */
    private void splitAtRegex(JCoReCondensedDocumentText documentText, Sentence originalOverlongSentence, Matcher splitMatcher, Set<Sentence> subSentences) {
        splitMatcher.reset(originalOverlongSentence.getCoveredText());
        final int sentenceBegin = originalOverlongSentence.getBegin();
        final int sentenceEnd = originalOverlongSentence.getEnd();
        int lastEnd = sentenceBegin;
        while (splitMatcher.find()) {
            Sentence part = this.createSubSentence(documentText, lastEnd, sentenceBegin + splitMatcher.start(), "regex");
            if (part != null) {
                subSentences.add(part);
                lastEnd = part.getEnd();
            }
        }
        if (lastEnd < sentenceEnd - 2) {
            Sentence remainder = this.createSubSentence(documentText, lastEnd, sentenceEnd, "regex");
            if (remainder != null) {
                subSentences.add(remainder);
            }
        }
    }

    /**
     * Creates a sentence for the whitespace-trimmed span {@code [begin, end)} without adding it to
     * the indexes.
     *
     * @param segmentation name of the segmentation kind, used in the warning message only
     * @return the new sentence, or {@code null} (after logging a warning) if the trimmed span is empty
     */
    private Sentence createSubSentence(JCoReCondensedDocumentText documentText, int begin, int end, String segmentation) {
        int subBegin = this.adjustBeginOffsetForWhitespaces(begin, documentText);
        int subEnd = this.adjustEndOffsetForWhitespaces(end, documentText);
        if (subBegin >= subEnd) {
            LOGGER.warn("Not creating {}-segmented sub-sentence because its offsets would be invalid: {}-{}", segmentation, subBegin, subEnd);
            return null;
        }
        Sentence subSentence = new Sentence(documentText.getCas(), subBegin, subEnd);
        subSentence.setComponentId(this.getClass().getName());
        return subSentence;
    }

    /**
     * Moves a begin offset forward past any whitespace in the original document text.
     *
     * @return the first offset at or after {@code begin} that is not whitespace, or the text length
     */
    private int adjustBeginOffsetForWhitespaces(int begin, JCoReCondensedDocumentText documentText) {
        String text = documentText.getCas().getDocumentText();
        int adjusted = begin;
        while (adjusted < text.length() && Character.isWhitespace(text.charAt(adjusted))) {
            ++adjusted;
        }
        return adjusted;
    }

    /**
     * Moves an (exclusive) end offset backward past any whitespace in the original document text.
     *
     * @return the largest offset at or before {@code end} that is not preceded by whitespace, or 0
     */
    private int adjustEndOffsetForWhitespaces(int end, JCoReCondensedDocumentText documentText) {
        String text = documentText.getCas().getDocumentText();
        int adjusted = end;
        while (adjusted > 0 && Character.isWhitespace(text.charAt(adjusted - 1))) {
            --adjusted;
        }
        return adjusted;
    }
}

