/*
 * Decompiled with CFR 0.152.
 */
package org.apache.uima.ruta.engine;

import java.util.HashSet;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.ruta.engine.HtmlConverterPSpan;
import org.apache.uima.ruta.engine.HtmlConverterPSpanReplacement;
import org.apache.uima.ruta.engine.HtmlConverterVisitor;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.Level;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HtmlConverter
extends JCasAnnotator_ImplBase {
    /**
     * Package prefix string. Not referenced anywhere else in this class; presumably the namespace of
     * the HTML annotation types - verify against the type system descriptor.
     */
    public static final String NAMESPACE = "org.apache.uima.ruta.type.html.";
    /** Default value of the {@link #PARAM_OUTPUT_VIEW} parameter. */
    public static final String DEFAULT_MODIFIED_VIEW = "plaintext";
    /** Line-break sequence assumed by {@code process} when the document contains no {@code "\r\n"}. */
    public static final String LINEBREAK = "\n";
    /**
     * Name of the view that receives the converted text and the mapped annotations. {@code process}
     * reuses a view with this name if one exists (logging a warning) and creates it otherwise.
     */
    public static final String PARAM_OUTPUT_VIEW = "outputView";
    @ConfigurationParameter(name="outputView", mandatory=true, defaultValue={"plaintext"})
    private String modifiedViewName;
    /**
     * Optional name of the view to read the HTML from. When unset, the JCas handed to
     * {@code process} is used directly. {@code initialize} rejects a value equal to the output view.
     */
    public static final String PARAM_INPUT_VIEW = "inputView";
    @ConfigurationParameter(name="inputView", mandatory=false)
    private String inputViewName;
    /**
     * When true, {@code process} rewrites line-break sequences found in the visible text spans using
     * {@link #PARAM_LINEBREAK_REPLACEMENT}.
     */
    public static final String PARAM_REPLACE_LINEBREAKS = "replaceLinebreaks";
    @ConfigurationParameter(name="replaceLinebreaks", mandatory=false, defaultValue={"true"})
    private boolean replaceLinebreaks;
    /** Forwarded unchanged to {@code HtmlConverterVisitor}; its exact effect is defined there. */
    public static final String PARAM_SKIP_WHITESPACES = "skipWhitespaces";
    @ConfigurationParameter(name="skipWhitespaces", mandatory=false, defaultValue={"true"})
    private boolean skipWhitespaces;
    /** Forwarded unchanged to {@code HtmlConverterVisitor}; its exact effect is defined there. */
    public static final String PARAM_PROCESS_ALL = "processAll";
    @ConfigurationParameter(name="processAll", mandatory=true, defaultValue={"false"})
    private boolean processAll;
    /**
     * Controls annotations whose mapped begin is not smaller than their mapped end. When false they
     * are dropped; when true {@code mapAnnotations} gives them the offsets of a neighbouring
     * annotation of the output view and indexes them.
     */
    public static final String PARAM_EXPAND_OFFSETS = "expandOffsets";
    @ConfigurationParameter(name="expandOffsets", mandatory=true, defaultValue={"false"})
    private boolean expandOffsets;
    /**
     * Text substituted for each line break when {@link #PARAM_REPLACE_LINEBREAKS} is enabled.
     * {@code initialize} turns a missing value into the empty string, in which case line breaks are
     * removed without a substitute.
     */
    public static final String PARAM_LINEBREAK_REPLACEMENT = "linebreakReplacement";
    @ConfigurationParameter(name="linebreakReplacement", mandatory=false, defaultValue={""})
    private String linebreakReplacement;
    /**
     * Tag names forwarded to {@code HtmlConverterVisitor}; presumably the tags that produce a line
     * break in the output - confirm in the visitor.
     */
    public static final String PARAM_NEWLINE_INDUCING_TAGS = "newlineInducingTags";
    @ConfigurationParameter(name="newlineInducingTags", mandatory=false, defaultValue={"br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"})
    private String[] newlineInducingTags;
    /**
     * Regular expression forwarded to {@code HtmlConverterVisitor}; presumably an alternative way to
     * select newline-inducing tags - confirm in the visitor.
     */
    public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp";
    @ConfigurationParameter(name="newlineInducingTagRegExp", mandatory=false)
    private String newlineInducingTagRegExp;
    /**
     * Tag names forwarded to {@code HtmlConverterVisitor}; presumably the tags that insert
     * {@link #PARAM_GAP_TEXT} into the output - confirm in the visitor.
     */
    public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags";
    @ConfigurationParameter(name="gapInducingTags", mandatory=true, defaultValue={})
    private String[] gapInducingTags;
    /**
     * Gap text forwarded to {@code HtmlConverterVisitor}. Overwritten with a single space by
     * {@code initialize} when {@link #PARAM_USE_SPACE_GAP} is true.
     */
    public static final String PARAM_GAP_TEXT = "gapText";
    @ConfigurationParameter(name="gapText", mandatory=true, defaultValue={""})
    private String gapText;
    /** When true, {@code initialize} replaces the configured gap text with a single space. */
    public static final String PARAM_USE_SPACE_GAP = "useSpaceGap";
    @ConfigurationParameter(name="useSpaceGap", mandatory=true, defaultValue={"false"})
    private boolean useSpaceGap;
    /**
     * Patterns applied one after another by {@code process} when the conversion policy is
     * "explicit". Each entry is compiled as a regular expression by {@code handleConversion}.
     */
    public static final String PARAM_CONVERSION_PATTERNS = "conversionPatterns";
    @ConfigurationParameter(name="conversionPatterns", mandatory=false, defaultValue={"&nbsp;", "&laquo;", "&raquo;", "&quot;", "&amp;", "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;", "&auml;", "&uuml;", "&#160;"})
    private String[] conversionPatterns;
    /**
     * Selects how HTML entities are converted. {@code initialize} accepts "heuristic", "explicit"
     * and "none"; {@code process} decodes entities found by a generic regular expression for
     * "heuristic", applies the configured pattern/replacement pairs for "explicit", and performs no
     * entity conversion otherwise.
     */
    public static final String PARAM_CONVERSION_POLICY = "conversionPolicy";
    @ConfigurationParameter(name="conversionPolicy", mandatory=true, defaultValue={"heuristic"})
    private String conversionPolicy;
    /**
     * Replacements used index-by-index with {@link #PARAM_CONVERSION_PATTERNS}. When not configured,
     * {@code initialize} derives them by HTML-4 unescaping each pattern.
     */
    public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements";
    @ConfigurationParameter(name="conversionReplacements", mandatory=false)
    private String[] conversionReplacements;
    /**
     * Offset map rebuilt by every {@code process} call: the index is an offset in the input text
     * (array length is input length + 1), the value is the corresponding offset in the output text.
     */
    private int[] map;

    /**
     * Reads and validates the configuration.
     *
     * <ul>
     * <li>A missing line-break replacement becomes the empty string.</li>
     * <li>A blank conversion policy becomes "heuristic"; any value other than "heuristic",
     * "explicit" or "none" is rejected. The normalised value is stored in the field that
     * {@code process} reads.</li>
     * <li>Input and output view names must differ.</li>
     * <li>Missing conversion replacements are derived by HTML-4 unescaping each conversion
     * pattern.</li>
     * <li>With {@code useSpaceGap}, the gap text is forced to a single space.</li>
     * </ul>
     *
     * @param aContext the UIMA context providing the parameter values
     * @throws ResourceInitializationException if the conversion policy is unknown, the view names are
     *         equal, or the "explicit" policy is configured with fewer replacements than patterns
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        String configuredLinebreak = (String)aContext.getConfigParameterValue(PARAM_LINEBREAK_REPLACEMENT);
        this.linebreakReplacement = configuredLinebreak == null ? "" : configuredLinebreak;

        String policy = (String)aContext.getConfigParameterValue(PARAM_CONVERSION_POLICY);
        if (StringUtils.isBlank(policy)) {
            policy = "heuristic";
        } else if (!policy.equals("heuristic") && !policy.equals("explicit") && !policy.equals("none")) {
            throw new ResourceInitializationException("illegal conversionPolicy parameter value", new Object[0]);
        }
        // Store the normalised value in the field; a local of the same name previously shadowed it,
        // so the normalisation never reached process().
        this.conversionPolicy = policy;

        if (this.modifiedViewName.equals(this.inputViewName)) {
            throw new ResourceInitializationException("input and output view names must differ!", new Object[0]);
        }

        this.conversionReplacements = (String[])aContext.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
        if (this.conversionReplacements == null) {
            String[] derived = new String[this.conversionPatterns.length];
            for (int i = 0; i < derived.length; ++i) {
                derived[i] = StringEscapeUtils.unescapeHtml4(this.conversionPatterns[i]);
            }
            this.conversionReplacements = derived;
        } else if (policy.equals("explicit") && this.conversionReplacements.length < this.conversionPatterns.length) {
            // process() pairs patterns and replacements by index; fail here instead of with an
            // ArrayIndexOutOfBoundsException on the first document.
            throw new ResourceInitializationException("conversionReplacements must provide a replacement for each of the conversionPatterns", new Object[0]);
        }

        if (this.useSpaceGap) {
            this.gapText = " ";
        }
    }

    /**
     * Converts the HTML text of the input view into plain text, stores it in the output view and
     * copies the annotations of the input view over with adjusted offsets.
     *
     * <p>Steps: parse the document and let {@code HtmlConverterVisitor} collect the visible text
     * spans plus the line-break and gap spans induced by tags; optionally rewrite line breaks inside
     * the visible text; convert HTML entities according to the conversion policy; concatenate all
     * spans into the output text while recording the input-to-output offset map; finally map the
     * annotations.
     *
     * @param jcaz the JCas to process; the configured input view is taken from it if one is set
     * @throws AnalysisEngineProcessException if a view cannot be obtained or created, the HTML cannot
     *         be parsed, or the annotations cannot be mapped
     */
    @Override
    public void process(JCas jcaz) throws AnalysisEngineProcessException {
        JCas jcas;
        try {
            jcas = this.inputViewName != null ? jcaz.getView(this.inputViewName) : jcaz;
        }
        catch (CASException e) {
            // Wrap the exception itself; its cause may be null and would hide the actual failure.
            throw new AnalysisEngineProcessException(e);
        }
        String documentText = jcas.getDocumentText();
        String splitSeq = documentText.contains("\r\n") ? "\r\n" : LINEBREAK;

        JCas modview;
        try {
            modview = this.findOrCreateOutputView(jcas);
        }
        catch (CASException e) {
            // Without an output view nothing can be produced; report it like the other failures
            // of this method instead of printing a stack trace and returning silently.
            throw new AnalysisEngineProcessException(e);
        }

        SortedSet<HtmlConverterPSpan> visibleSpansSoFar;
        // Wildcards: only addAll(...) is needed below, and the element type declared by the
        // visitor for these two sets is not visible from this class.
        SortedSet<? extends HtmlConverterPSpan> linebreaksFromHtmlTags;
        SortedSet<? extends HtmlConverterPSpan> gapsFromHtmlTags;
        try {
            Parser parser = new Parser(documentText);
            NodeList nodes = parser.parse(null);
            HtmlConverterVisitor visitor = new HtmlConverterVisitor(this.newlineInducingTags, this.newlineInducingTagRegExp, this.gapInducingTags, this.gapText, this.skipWhitespaces, this.processAll);
            nodes.visitAllNodesWith(visitor);
            visibleSpansSoFar = visitor.getTextSpans();
            linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
            gapsFromHtmlTags = visitor.getGapsFromHtmlTags();
        }
        catch (ParserException e) {
            throw new AnalysisEngineProcessException(e);
        }

        if (this.replaceLinebreaks) {
            visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq);
        }
        if (this.conversionPolicy.equals("heuristic")) {
            visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar);
        } else if (this.conversionPolicy.equals("explicit")) {
            for (int i = 0; i < this.conversionPatterns.length; ++i) {
                visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, this.conversionPatterns[i], this.conversionReplacements[i]);
            }
        }
        // Tag-induced spans are added after the conversions so that they are not converted themselves.
        visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
        visibleSpansSoFar.addAll(gapsFromHtmlTags);

        modview.setDocumentText(this.renderVisibleText(documentText, visibleSpansSoFar));
        try {
            this.mapAnnotations(jcas, this.map, this.modifiedViewName);
        }
        catch (CASException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Returns the view named by the output-view parameter, creating it if it does not exist yet.
     * A warning is logged when the view already exists.
     */
    private JCas findOrCreateOutputView(JCas jcas) throws CASException {
        JCas existing = null;
        Iterator<JCas> viewIterator = jcas.getViewIterator();
        while (viewIterator.hasNext()) {
            JCas candidate = viewIterator.next();
            if (candidate.getViewName().equals(this.modifiedViewName)) {
                existing = candidate;
                this.getContext().getLogger().log(Level.WARNING, "view with name \"" + this.modifiedViewName + "\" already exists.");
            }
        }
        return existing != null ? existing : jcas.createView(this.modifiedViewName);
    }

    /**
     * Concatenates the given spans into the output text and rebuilds {@code this.map}, which maps
     * every offset of {@code documentText} to an offset of the returned text.
     */
    private String renderVisibleText(String documentText, SortedSet<HtmlConverterPSpan> spans) {
        int[] offsets = new int[documentText.length() + 1];
        StringBuilder out = new StringBuilder(documentText.length());
        int inOffset = 0;
        int outOffset = 0;
        for (HtmlConverterPSpan span : spans) {
            int begin = span.getBegin();
            int end = span.getEnd();
            // Invisible input in front of the span collapses onto the current output position.
            while (inOffset < begin) {
                offsets[inOffset++] = outOffset;
            }
            String text;
            if (span instanceof HtmlConverterPSpanReplacement) {
                text = span.getTxt();
                int outAfter = outOffset + text.length();
                // Pair input and output characters one-to-one, but never beyond the source span:
                // a replacement longer than its source would otherwise run into the following
                // spans or past the end of the offset array.
                int pairedEnd = Math.min(end, begin + text.length());
                while (inOffset < pairedEnd) {
                    offsets[inOffset++] = outOffset++;
                }
                // Remaining source characters (replacement shorter than source) share one position.
                while (inOffset < end) {
                    offsets[inOffset++] = outOffset;
                }
                // Keep the output offset in step with the text that is appended below.
                outOffset = outAfter;
            } else {
                text = documentText.substring(begin, end);
                while (inOffset < end) {
                    offsets[inOffset++] = outOffset++;
                }
            }
            out.append(text);
        }
        while (inOffset < documentText.length()) {
            offsets[inOffset++] = outOffset;
        }
        // NOTE(review): the end-of-document entry points one past the output length, so an
        // annotation ending exactly at the input's end fails the bounds check during mapping.
        // Kept as found - confirm whether this is intended.
        offsets[documentText.length()] = outOffset + 1;
        this.map = offsets;
        return out.toString();
    }

    /**
     * Copies the annotations of {@code fromJcas} into the view {@code toView}, translating their
     * offsets with {@code map}.
     *
     * <p>Annotations whose type is subsumed by {@code uima.tcas.DocumentAnnotation} are skipped. A
     * copy with a non-empty mapped range is indexed if the range lies within the target document,
     * otherwise a warning is logged and the copy is dropped. A copy whose mapped begin is not smaller
     * than its mapped end is dropped, unless {@code expandOffsets} is set: then it receives the
     * offsets of the annotation chosen by {@code getNextBestAnnotation}, has its
     * {@code expandedOffsets} feature set to true (if its type declares one) and is indexed.
     *
     * @param fromJcas the view holding the original annotations
     * @param map offset map; index is an offset in the source text, value the offset in the target
     * @param toView name of the target view
     * @throws CASException if the target view cannot be obtained
     */
    private void mapAnnotations(JCas fromJcas, int[] map, String toView) throws CASException {
        JCas modview = fromJcas.getView(toView);
        TypeSystem typeSystem = fromJcas.getTypeSystem();
        Type docType = typeSystem.getType("uima.tcas.DocumentAnnotation");
        // Pure type-system lookup, identical for every annotation.
        Feature sofaFeature = modview.getTypeSystem().getFeatureByFullName("uima.cas.AnnotationBase:sofa");
        CasCopier casCopier = new CasCopier(fromJcas.getCas(), modview.getCas());
        HashSet<Annotation> toExpand = new HashSet<Annotation>();

        AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex();
        for (Annotation annotation : annotationIndex) {
            if (typeSystem.subsumes(docType, annotation.getType())) {
                continue;
            }
            Annotation clone = (Annotation)casCopier.copyFs(annotation);
            // Re-home the copy in the target view.
            clone.setFeatureValue(sofaFeature, modview.getSofa());
            int mappedBegin = map[clone.getBegin()];
            int mappedEnd = map[clone.getEnd()];

            if (mappedBegin >= mappedEnd) {
                // Nothing of the annotation survived the conversion.
                if (this.expandOffsets) {
                    clone.setBegin(mappedBegin);
                    clone.setEnd(mappedEnd);
                    toExpand.add(clone);
                }
                continue;
            }
            if (mappedEnd > fromJcas.getCas().getDocumentAnnotation().getEnd()) {
                this.getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
                continue;
            }
            int max = modview.getCas().getDocumentAnnotation().getEnd();
            boolean withinTarget = mappedBegin < max && mappedEnd <= max && mappedBegin >= 0 && mappedEnd > 0;
            if (withinTarget) {
                clone.setBegin(mappedBegin);
                clone.setEnd(mappedEnd);
                modview.addFsToIndexes(clone);
            } else {
                this.getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
            }
        }

        // Second pass: the collapsed annotations need the fully populated target index.
        for (Annotation each : toExpand) {
            Annotation nextBestAnnotation = this.getNextBestAnnotation(each, modview);
            if (nextBestAnnotation == null) {
                continue;
            }
            each.setBegin(nextBestAnnotation.getBegin());
            each.setEnd(nextBestAnnotation.getEnd());
            Feature expandedOffsetsFeature = each.getType().getFeatureByBaseName("expandedOffsets");
            if (expandedOffsetsFeature != null) {
                each.setBooleanValue(expandedOffsetsFeature, true);
            }
            modview.addFsToIndexes(each);
        }
    }

    /**
     * Picks the annotation of {@code jcas} whose offsets a collapsed annotation should take over.
     *
     * <p>Candidates are tried in this order:
     * <ol>
     * <li>the annotation the index iterator is positioned on when started at {@code source};</li>
     * <li>the annotation the iterator is positioned on when started at a one-character annotation
     * beginning at {@code source.getBegin()};</li>
     * <li>the first annotation of the index if {@code source} begins in the first half of the
     * document, otherwise the last one.</li>
     * </ol>
     *
     * @param source the collapsed annotation (already carrying its mapped offsets)
     * @param jcas the view to search
     * @return the chosen annotation, or {@code null} if the index offers none
     */
    private Annotation getNextBestAnnotation(Annotation source, JCas jcas) {
        FSIterator<Annotation> iterator = jcas.getAnnotationIndex().iterator(source);
        if (iterator.isValid()) {
            return iterator.get();
        }

        // Retry from a one-character anchor at the same begin offset.
        Annotation dummy = new Annotation(jcas, source.getBegin(), source.getBegin() + 1);
        iterator = jcas.getAnnotationIndex().iterator(dummy);
        if (iterator.isValid()) {
            // Previously a hit here was discarded and null returned, which made this retry
            // pointless whenever it succeeded.
            return iterator.get();
        }

        // Nothing at or after the position: fall back to the nearer end of the index.
        if (jcas.getDocumentText().length() / 2 > source.getBegin()) {
            iterator.moveToFirst();
        } else {
            iterator.moveToLast();
        }
        return iterator.isValid() ? iterator.get() : null;
    }

    /**
     * Rewrites the line breaks inside the given spans.
     *
     * @param visibleSpansSoFar the spans collected so far; not modified
     * @param splitSeq the line-break sequence to look for
     * @return a new span set in which every occurrence of {@code splitSeq} has been converted using
     *         the configured line-break replacement
     */
    private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText(SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String splitSeq) {
        String substitute = this.linebreakReplacement;
        SortedSet<HtmlConverterPSpan> converted = this.handleConversion(visibleSpansSoFar, splitSeq, substitute);
        return converted;
    }

    /**
     * Matches a named entity of 2-6 alphanumeric characters or a decimal character reference of
     * 2-5 digits. Compiled once instead of on every call.
     */
    private static final Pattern HTML_ENTITY_PATTERN = Pattern.compile("(&[a-zA-Z0-9]{2,6};)|(&#\\d{2,5};)");

    /**
     * Decodes HTML entities inside the given spans ("heuristic" conversion policy).
     *
     * <p>Every span whose text contains at least one entity is replaced by a sequence of spans:
     * a replacement span carrying the HTML-4 unescaped text for each entity, and plain spans for the
     * text in front of, between and behind the entities. Spans without entities are kept as they are.
     *
     * @param visibleSpansSoFar the spans collected so far; not modified
     * @return a new span set with the entities decoded
     */
    private SortedSet<HtmlConverterPSpan> htmlDecoding(SortedSet<HtmlConverterPSpan> visibleSpansSoFar) {
        // NOTE(review): offsets are derived from positions inside getTxt(); this assumes the span
        // text is aligned with the source offsets of the span - confirm for replacement spans.
        TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar);
        for (HtmlConverterPSpan pSpan : visibleSpansSoFar) {
            String spanTxt = pSpan.getTxt();
            Matcher matcher = HTML_ENTITY_PATTERN.matcher(spanTxt);
            if (!matcher.find()) {
                continue;
            }
            copy.remove(pSpan);
            int spanBegin = pSpan.getBegin();
            // Index into spanTxt up to which the text has been turned into new spans.
            int consumed = 0;
            do {
                int matchStart = matcher.start();
                int matchEnd = matcher.end();
                String decoded = StringEscapeUtils.unescapeHtml4(matcher.group());
                copy.add(new HtmlConverterPSpanReplacement(spanBegin + matchStart, spanBegin + matchEnd, decoded));
                if (matchStart > consumed) {
                    // Plain text between the previous entity (or span start) and this entity.
                    copy.add(new HtmlConverterPSpan(spanBegin + consumed, spanBegin + matchStart, spanTxt.substring(consumed, matchStart)));
                }
                consumed = matchEnd;
            } while (matcher.find());

            int tailBegin = spanBegin + consumed;
            if (tailBegin < pSpan.getEnd()) {
                // Plain text behind the last entity.
                copy.add(new HtmlConverterPSpan(tailBegin, pSpan.getEnd(), spanTxt.substring(consumed, pSpan.getEnd() - spanBegin)));
            }
        }
        return copy;
    }

    /**
     * Converts every occurrence of a pattern inside the given spans.
     *
     * <p>Each span whose text contains a non-empty match is replaced by a sequence of spans: plain
     * spans for the text in front of, between and behind the matches and, if {@code replacement} is
     * not empty, a replacement span for each match. With an empty replacement the matched text is
     * simply left out. Spans without a match are kept as they are.
     *
     * <p>Offsets are taken from the actual match bounds. They were previously advanced by the length
     * of {@code patternString}, which is only right when the pattern is a plain literal.
     *
     * @param visibleSpansSoFar the spans collected so far; not modified
     * @param patternString regular expression to search for
     * @param replacement text to substitute for each match; {@code null} or empty removes the match
     * @return a new span set with the conversion applied
     */
    private SortedSet<HtmlConverterPSpan> handleConversion(SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String patternString, String replacement) {
        // NOTE(review): offsets are derived from positions inside getTxt(); this assumes the span
        // text is aligned with the source offsets of the span - confirm for replacement spans.
        TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar);
        Pattern patt = Pattern.compile(patternString);
        boolean hasReplacement = !StringUtils.isEmpty(replacement);
        for (HtmlConverterPSpan pSpan : visibleSpansSoFar) {
            String spanTxt = pSpan.getTxt();
            Matcher matcher = patt.matcher(spanTxt);
            int spanBegin = pSpan.getBegin();
            // Index into spanTxt up to which the text has been turned into new spans.
            int consumed = 0;
            boolean split = false;
            while (matcher.find()) {
                int matchStart = matcher.start();
                int matchEnd = matcher.end();
                if (matchStart == matchEnd) {
                    // An empty match covers no source text; there is nothing to convert.
                    continue;
                }
                if (!split) {
                    copy.remove(pSpan);
                    split = true;
                }
                if (hasReplacement) {
                    copy.add(new HtmlConverterPSpanReplacement(spanBegin + matchStart, spanBegin + matchEnd, replacement));
                }
                if (matchStart > consumed) {
                    // Plain text between the previous match (or span start) and this match.
                    copy.add(new HtmlConverterPSpan(spanBegin + consumed, spanBegin + matchStart, spanTxt.substring(consumed, matchStart)));
                }
                consumed = matchEnd;
            }
            if (!split) {
                continue;
            }
            int tailBegin = spanBegin + consumed;
            if (tailBegin < pSpan.getEnd()) {
                // Plain text behind the last match.
                copy.add(new HtmlConverterPSpan(tailBegin, pSpan.getEnd(), spanTxt.substring(consumed, pSpan.getEnd() - spanBegin)));
            }
        }
        return copy;
    }
}

