/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.resources.uima;

import de.julielab.genemapper.WikipediaCategoryManager;
import de.julielab.genemapper.resources.MultiStreamBZip2InputStream;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.Header;
import de.julielab.jcore.types.wikipedia.Title;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.LineIterator;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name="JCoRe GeneMapper Wikipedia Reader", description="Reads the XML Wikipedia dump. Extracts a portion of the first text line of each page to capture the term definition.")
public class WikipediaReader
extends JCasCollectionReader_ImplBase {
    public static final String PARAM_WIKIPEDIA_XML = "WikipediaXML";
    public static final String PARAM_EXCERPT_LENGTH = "ExcerptLength";
    public static final String PARAM_TITLE_WHITELIST = "TitleWhitelist";
    public static final String PARAM_WIKIPEDIA_CATEGORY_TREE_PATH = "WikipediaCategoryTreePath";
    private static final Logger log = LoggerFactory.getLogger(WikipediaReader.class);
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[[^]|]+\\|([^]]+)\\]\\]");
    private static final Pattern WIKI_MARKUP_ELEMENTS = Pattern.compile("[]\\[{}']+");
    private static final Pattern XML_REF_ELEMENT_PATTERN = Pattern.compile("<ref[^<]+</ref>");
    private static final Pattern XML_MARKUP_ELEMENTS = Pattern.compile("<[^>]+>");
    private static final Pattern NON_WS_PATTERN = Pattern.compile("[^\\s]");
    private static final Set<Character> NON_TEXT_CHARS = Set.of(Character.valueOf('{'), Character.valueOf('}'), Character.valueOf('#'), Character.valueOf('|'), Character.valueOf('<'), Character.valueOf('['), Character.valueOf('*'));
    private static WikipediaCategoryManager wikipediaCategoryManager;
    private final XMLInputFactory factory = XMLInputFactory.newInstance();
    @ConfigurationParameter(name="WikipediaXML")
    private String wikipediaXml;
    @ConfigurationParameter(name="ExcerptLength", description="Maximum number of characters to be kept from the first line of each page. Defaults to 1000.", mandatory=false, defaultValue={"1000"})
    private int excerptLength;
    @ConfigurationParameter(name="TitleWhitelist", description="Path to a file. If given, only pages that have a title on the list will be returned as a CAS.", mandatory=false)
    private String titleWhiteListFilePath;
    @ConfigurationParameter(name="WikipediaCategoryTreePath", mandatory=false, description="Optional. File created by GeNo's 'WikipediaCategoryTreeAndRedirectsExtractor' class that represents a map from page and category titles to categories they belong to. Will be used to filter for pages that are in some way related to the Molecular Biology category.")
    private String wikipediaCategoryTreePath;
    private Set<String> titleWhitelist;
    private XMLStreamReader parser;
    private ParsingStatus currentPage;
    private int processedPages;

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        try {
            this.wikipediaXml = (String)context.getConfigParameterValue(PARAM_WIKIPEDIA_XML);
            this.excerptLength = Optional.ofNullable((Integer)context.getConfigParameterValue(PARAM_EXCERPT_LENGTH)).orElse(1000);
            this.titleWhiteListFilePath = (String)context.getConfigParameterValue(PARAM_TITLE_WHITELIST);
            this.wikipediaCategoryTreePath = (String)context.getConfigParameterValue(PARAM_WIKIPEDIA_CATEGORY_TREE_PATH);
            log.info("Reading Wikipedia dump from {}.", (Object)this.wikipediaXml);
            log.info("Maximum excerpt length: {}", (Object)this.excerptLength);
            if (this.titleWhiteListFilePath != null) {
                try (BufferedReader br = FileUtilities.getReaderFromFile((File)new File(this.titleWhiteListFilePath));){
                    this.titleWhitelist = br.lines().filter(Predicate.not(line -> line.startsWith("#") || line.isBlank())).collect(Collectors.toSet());
                    log.info("Received Wikipedia title whitelist from {} with {} entries.", (Object)this.titleWhiteListFilePath, (Object)this.titleWhitelist.size());
                }
            }
            FileInputStream fin = new FileInputStream(this.wikipediaXml);
            BufferedInputStream bis = new BufferedInputStream(fin);
            Object bis2 = this.wikipediaXml.endsWith(".bz2") ? new MultiStreamBZip2InputStream(bis) : bis;
            BufferedReader br = new BufferedReader(new InputStreamReader((InputStream)bis2));
            this.parser = this.factory.createXMLStreamReader(br);
            this.currentPage = this.getNextPage();
            this.processedPages = 0;
        }
        catch (IOException | XMLStreamException e) {
            log.error("Exception while initializing WikipediaReader", (Throwable)e);
            throw new ResourceInitializationException((Throwable)e);
        }
        Class<WikipediaReader> clazz = WikipediaReader.class;
        synchronized (WikipediaReader.class) {
            if (this.wikipediaCategoryTreePath != null && wikipediaCategoryManager == null) {
                log.info("Creating Dijkstra tree for Category:Biology");
                wikipediaCategoryManager = new WikipediaCategoryManager(this.wikipediaCategoryTreePath, true);
                wikipediaCategoryManager.buildDijkstraTree("Category:Biology");
            }
            // ** MonitorExit[var2_2] (shouldn't be in output)
            return;
        }
    }

    public void getNext(JCas jCas) throws CollectionException {
        try {
            if (this.currentPage != null) {
                String lcText = this.currentPage.getText().toLowerCase();
                String lcTitle = this.currentPage.getTitle().toLowerCase();
                int i = lcText.indexOf(lcTitle);
                int l = lcTitle.length();
                if (i < 0 && lcTitle.endsWith("s")) {
                    i = lcText.indexOf(lcTitle.substring(0, lcTitle.length() - 1));
                    l = lcTitle.length() - 1;
                }
                Title title = null;
                if (i >= 0) {
                    title = new Title(jCas, i, i + l);
                    title.addToIndexes();
                }
                int headerBegin = title != null ? title.getBegin() : 0;
                int headerEnd = title != null ? title.getEnd() : lcText.indexOf(" ");
                Header header = headerBegin >= 0 && headerEnd > headerBegin ? new Header(jCas, headerBegin, headerEnd) : new Header(jCas);
                header.setDocId(this.currentPage.getPageId());
                header.setTitle(this.currentPage.getTitle());
                header.addToIndexes();
                jCas.setDocumentText(this.currentPage.getText());
                this.currentPage = this.getNextPage();
                ++this.processedPages;
                if (this.processedPages % 100000 == 0) {
                    log.info("Processed {} pages.", (Object)this.processedPages);
                }
            }
        }
        catch (Throwable e) {
            log.error("Error while reading Wikipedia", e);
            throw new CollectionException(e);
        }
    }

    @Nullable
    private ParsingStatus getNextPage() throws XMLStreamException {
        ParsingStatus ps = null;
        boolean pageParsed = false;
        while (this.parser.hasNext() && (!pageParsed || this.currentPage != null && this.currentPage.isSkip())) {
            int eventType = this.parser.next();
            if (eventType == 1) {
                if (this.parser.getLocalName().equalsIgnoreCase("page")) {
                    ps = new ParsingStatus();
                }
                if (ps == null || ps.isSkip()) continue;
                if (this.parser.getLocalName().equalsIgnoreCase("title")) {
                    List path;
                    String pageTitle = this.parser.getElementText();
                    if (wikipediaCategoryManager != null && (path = wikipediaCategoryManager.getShortestPathToDijkstraTreeRoot(pageTitle, null)).isEmpty()) {
                        ps.skip();
                    }
                    ps.setTitle(pageTitle);
                    continue;
                }
                if (this.parser.getLocalName().equalsIgnoreCase("ns")) {
                    ps.setNamespace(this.parser.getElementText());
                    continue;
                }
                if (this.parser.getLocalName().equalsIgnoreCase("text") && ps.getNamespace().equals("0")) {
                    if (this.titleWhitelist != null && !this.titleWhitelist.isEmpty() && !this.titleWhitelist.contains(ps.getTitle())) continue;
                    this.parseText(this.parser.getElementText(), ps);
                    continue;
                }
                if (ps == null || ps.getPageId() != null || !this.parser.getLocalName().equalsIgnoreCase("id")) continue;
                ps.setPageId(this.parser.getElementText());
                continue;
            }
            if (eventType != 2 || !this.parser.getLocalName().equalsIgnoreCase("page") || ps == null) continue;
            pageParsed = true;
            if (ps.getText() != null && !ps.getText().isBlank()) continue;
            pageParsed = false;
            ps = null;
        }
        return ps;
    }

    private void parseText(String elementText, ParsingStatus ps) {
        LineIterator lineIt = new LineIterator((Reader)new StringReader(elementText));
        while (lineIt.hasNext()) {
            String line = lineIt.next();
            Matcher nonWsMatcher = NON_WS_PATTERN.matcher(line);
            if (!nonWsMatcher.find() || line.isBlank() || NON_TEXT_CHARS.contains(Character.valueOf(line.charAt(nonWsMatcher.start())))) continue;
            String textWithoutLinks = WIKI_LINK_PATTERN.matcher(line).replaceAll("$1");
            String textWithoutWikiMarkup = WIKI_MARKUP_ELEMENTS.matcher(textWithoutLinks).replaceAll("");
            String textWithoutXmlRefElements = XML_REF_ELEMENT_PATTERN.matcher(textWithoutWikiMarkup).replaceAll("");
            String textWithoutXmlMarkup = XML_MARKUP_ELEMENTS.matcher(textWithoutXmlRefElements).replaceAll("");
            ps.setText(textWithoutXmlMarkup.substring(0, Math.min(textWithoutXmlMarkup.length(), this.excerptLength)));
            break;
        }
    }

    public boolean hasNext() throws IOException, CollectionException {
        return this.currentPage != null;
    }

    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.processedPages, 0, "pages")};
    }

    private class ParsingStatus {
        private String title;
        private String text;
        private String namespace;
        private String pageId;
        private boolean skip;

        private ParsingStatus() {
        }

        public boolean isSkip() {
            return this.skip;
        }

        public String getText() {
            return this.text;
        }

        public void setText(String text) {
            this.text = text;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getNamespace() {
            return this.namespace;
        }

        public void setNamespace(String namespace) {
            this.namespace = namespace;
        }

        public String getPageId() {
            return this.pageId;
        }

        public void setPageId(String pageId) {
            this.pageId = pageId;
        }

        public void skip() {
            this.skip = true;
        }
    }
}

