/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.resources;

import de.julielab.genemapper.resources.MultiStreamBZip2InputStream;
import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Streams a bzip2-compressed Wikipedia XML dump and writes two tab-separated files:
 * <ul>
 *   <li>a category file with the header {@code page title<TAB>category}, holding one row per
 *       {@code [[Category:...]]} line found on a page that is neither a redirect nor a
 *       disambiguation page;</li>
 *   <li>a redirect file with the header {@code target title<TAB>redirected title}, holding one row
 *       per redirect page and, for pages marked with {@code {{disambig}}}, one row per wiki link
 *       found in the page text.</li>
 * </ul>
 * Only pages whose {@code <ns>} value is contained in {@link #ACCEPTED_NAMESPACES} are written.
 */
public class WikipediaCategoryTreeAndRedirectsExtractor {
    private static final Logger log = LoggerFactory.getLogger(WikipediaCategoryTreeAndRedirectsExtractor.class);
    /** Matches a category link and captures everything between {@code Category:} and the closing brackets. */
    private static final Pattern CATEGORY_PATTERN = Pattern.compile("\\[\\[Category:([^]]+)\\]\\]");
    /** Matches XML/HTML comments, including those spanning multiple lines. */
    private static final Pattern XML_COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?-->");
    /** Matches a wiki link and captures its target, i.e. the text up to the first {@code ]} or {@code |}. */
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[(.*?)[]|]");
    /** Values of the {@code <ns>} element for which pages are processed. */
    private static final Set<String> ACCEPTED_NAMESPACES = Set.of("0", "14");
    /** Number of written pages after which a progress message is logged. */
    private static final int PROGRESS_LOG_INTERVAL = 100000;

    private final File wikipediaXml;
    private final File categoriesOutputFile;
    private final File redirectsOutputFile;

    /**
     * @param wikipediaXml           the bzip2-compressed Wikipedia XML dump to read
     * @param categoryTreeOutputFile destination of the page-to-category table
     * @param redirectsOutputFile    destination of the redirect table
     */
    public WikipediaCategoryTreeAndRedirectsExtractor(File wikipediaXml, File categoryTreeOutputFile, File redirectsOutputFile) {
        this.wikipediaXml = wikipediaXml;
        this.categoriesOutputFile = categoryTreeOutputFile;
        this.redirectsOutputFile = redirectsOutputFile;
    }

    /**
     * Command line entry point.
     *
     * @param args {@code <wikipedia XML dump> <category tree output path> <redirect map output path>}
     * @throws IOException        if reading the dump or writing an output file fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    public static void main(String[] args) throws IOException, XMLStreamException {
        if (args.length < 3) {
            System.err.println("Usage: " + WikipediaCategoryTreeAndRedirectsExtractor.class.getSimpleName() + " <wikipedia XML dump> <category tree file output path> <redirect map file output path>");
            // Missing arguments are an error; signal that to the calling shell with a non-zero status.
            System.exit(1);
        }
        File wikipediaXml = new File(args[0]);
        File categoryTreeOutputFile = new File(args[1]);
        File redirectsOutputFile = new File(args[2]);
        WikipediaCategoryTreeAndRedirectsExtractor creator = new WikipediaCategoryTreeAndRedirectsExtractor(wikipediaXml, categoryTreeOutputFile, redirectsOutputFile);
        log.info("Reading Wikipedia dump from {}. Writing category map to {} and redirect map to {}.", wikipediaXml, categoryTreeOutputFile, redirectsOutputFile);
        creator.create();
        log.info("Finished the creation of the Wikipedia category and redirect maps. They are stored at {} and {}, respectively.", categoryTreeOutputFile, redirectsOutputFile);
    }

    /**
     * Parses the dump page by page and writes both output files.
     *
     * @throws IOException        if reading the dump or writing an output file fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The dump is an externally obtained file: do not process DTDs or resolve external entities.
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        log.debug("Creating input and output streams.");
        try (FileInputStream fin = new FileInputStream(this.wikipediaXml);
             BufferedInputStream bis = new BufferedInputStream(fin);
             MultiStreamBZip2InputStream bzip = new MultiStreamBZip2InputStream(bis);
             // Decode explicitly as UTF-8 instead of relying on the platform default charset.
             BufferedReader reader = new BufferedReader(new InputStreamReader(bzip, StandardCharsets.UTF_8));
             BufferedWriter categoryWriter = FileUtilities.getWriterToFile(this.categoriesOutputFile);
             BufferedWriter redirectWriter = FileUtilities.getWriterToFile(this.redirectsOutputFile)) {
            writeRow(categoryWriter, "page title", "category");
            writeRow(redirectWriter, "target title", "redirected title");
            log.debug("Starting to parse Wikipedia XML.");
            XMLStreamReader parser = factory.createXMLStreamReader(reader);
            try {
                ParsingStatus ps = null;
                int written = 0;
                while (parser.hasNext()) {
                    int eventType = parser.next();
                    if (eventType == XMLStreamReader.START_ELEMENT) {
                        String elementName = parser.getLocalName();
                        if (elementName.equalsIgnoreCase("page")) {
                            ps = new ParsingStatus();
                        } else if (ps != null) {
                            // Elements outside of a page carry no page data and are skipped.
                            this.readPageElement(parser, elementName, ps);
                        }
                    } else if (eventType == XMLStreamReader.END_ELEMENT && ps != null && parser.getLocalName().equalsIgnoreCase("page")) {
                        if (isAcceptedNamespace(ps)) {
                            if (!ps.isRedirect() && !ps.isDisambiguationPage()) {
                                this.writeCategories(ps, categoryWriter);
                            } else {
                                this.writeRedirect(ps, redirectWriter);
                            }
                            if (++written % PROGRESS_LOG_INTERVAL == 0) {
                                log.info("{} lines written.", written);
                            }
                        }
                        ps = null;
                    }
                }
            } finally {
                // XMLStreamReader is not AutoCloseable, so release it explicitly.
                parser.close();
            }
        } catch (IOException | XMLStreamException e) {
            log.error("XML parsing error", e);
            throw e;
        }
    }

    /**
     * Stores the content of a child element of {@code <page>} in the parsing status. Elements
     * other than {@code title}, {@code ns}, {@code text}, {@code id} and {@code redirect} are ignored.
     *
     * @param parser      the reader, positioned on the start tag named {@code elementName}
     * @param elementName local name of the current start tag
     * @param ps          status of the page currently being read
     * @throws XMLStreamException if the element content cannot be read
     */
    private void readPageElement(XMLStreamReader parser, String elementName, ParsingStatus ps) throws XMLStreamException {
        if (elementName.equalsIgnoreCase("title")) {
            ps.setTitle(parser.getElementText());
        } else if (elementName.equalsIgnoreCase("ns")) {
            ps.setNamespace(parser.getElementText());
        } else if (elementName.equalsIgnoreCase("text") && isAcceptedNamespace(ps)) {
            this.parseText(parser.getElementText(), ps);
        } else if (elementName.equalsIgnoreCase("id")) {
            // Keep only the first id seen for the page; later <id> elements nested in the page
            // (e.g. below the revision) must not overwrite it.
            if (ps.getId() == null) {
                ps.setId(parser.getElementText());
            }
        } else if (elementName.equals("redirect")) {
            // Look the attribute up by name rather than by position so that a <redirect/> element
            // without attributes yields null (page is then not treated as a redirect) instead of failing.
            ps.setRedirectTitle(parser.getAttributeValue(null, "title"));
        }
    }

    /**
     * Tells whether the page's namespace is one of {@link #ACCEPTED_NAMESPACES}. The explicit null
     * check is required because sets created by {@code Set.of} throw on {@code contains(null)}.
     */
    private static boolean isAcceptedNamespace(ParsingStatus ps) {
        String namespace = ps.getNamespace();
        return namespace != null && ACCEPTED_NAMESPACES.contains(namespace);
    }

    /** Writes one two-column, tab-separated row followed by a line break. */
    private static void writeRow(BufferedWriter writer, String first, String second) throws IOException {
        writer.write(first);
        writer.write("\t");
        writer.write(second);
        writer.newLine();
    }

    /**
     * Writes the redirect rows of a page: a single {@code redirect target -> page title} row for a
     * redirect page, otherwise one {@code linked title -> page title} row per collected link.
     *
     * @param ps  the completed page
     * @param bw2 writer of the redirect file
     * @throws IOException if writing fails
     */
    private void writeRedirect(ParsingStatus ps, BufferedWriter bw2) throws IOException {
        if (ps.isRedirect()) {
            writeRow(bw2, ps.getRedirectTitle(), ps.getTitle());
        } else {
            for (String referredToTitle : ps.getReferredToTitles()) {
                writeRow(bw2, referredToTitle, ps.getTitle());
            }
        }
    }

    /**
     * Writes one {@code page title -> category} row for each category of the page.
     *
     * @param ps the completed page
     * @param bw writer of the category file
     * @throws IOException if writing fails
     */
    private void writeCategories(ParsingStatus ps, BufferedWriter bw) throws IOException {
        for (String category : ps.getCategories()) {
            writeRow(bw, ps.getTitle(), category);
        }
    }

    /**
     * Extracts categories and disambiguation links from the wiki text of a page. Comments are
     * removed first. Every line starting with {@code [[Category:} contributes one category, with
     * anything after the last {@code |} cut off. The first line containing {@code {{disambig}}}
     * marks the page as a disambiguation page and collects the targets of all wiki links in the text.
     *
     * @param elementText content of the page's {@code <text>} element
     * @param ps          status of the page currently being read
     */
    private void parseText(String elementText, ParsingStatus ps) {
        String text = XML_COMMENT_PATTERN.matcher(elementText).replaceAll("");
        LineIterator lineIt = new LineIterator(new StringReader(text));
        Matcher categoryMatcher = CATEGORY_PATTERN.matcher("");
        while (lineIt.hasNext()) {
            String line = lineIt.next();
            if (line.startsWith("[[Category:")) {
                categoryMatcher.reset(line);
                if (categoryMatcher.find()) {
                    String category = categoryMatcher.group(1);
                    int pipe = category.lastIndexOf('|');
                    if (pipe >= 0) {
                        category = category.substring(0, pipe);
                    }
                    ps.addCategory(category.trim());
                }
            } else if (line.contains("{{disambig}}") && !ps.isDisambiguationPage()) {
                // The links are gathered from the whole text, so do it only for the first marker
                // line; repeating it for further marker lines would just duplicate every link.
                ps.setIsDisambiguationPage(true);
                Matcher linkMatcher = WIKI_LINK_PATTERN.matcher(text);
                while (linkMatcher.find()) {
                    ps.addReferredToTitle(linkMatcher.group(1));
                }
            }
        }
    }

    /** Mutable accumulator for the data of the page that is currently being parsed. */
    private static final class ParsingStatus {
        private String title;
        private String namespace;
        private String id;
        private final List<String> categories = new ArrayList<>();
        private final List<String> referredToTitles = new ArrayList<>();
        private String redirectTitle;
        private boolean isDisambiguationPage;

        private ParsingStatus() {
        }

        public List<String> getCategories() {
            return this.categories;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getNamespace() {
            return this.namespace;
        }

        public void setNamespace(String namespace) {
            this.namespace = namespace;
        }

        public String getId() {
            return this.id;
        }

        public void setId(String id) {
            this.id = id;
        }

        public void addCategory(String category) {
            this.categories.add(category);
        }

        public String getRedirectTitle() {
            return this.redirectTitle;
        }

        /** A page counts as a redirect as soon as a redirect target has been set. */
        public boolean isRedirect() {
            return this.redirectTitle != null;
        }

        public void setRedirectTitle(String redirectTitle) {
            this.redirectTitle = redirectTitle;
        }

        public boolean isDisambiguationPage() {
            return this.isDisambiguationPage;
        }

        public void setIsDisambiguationPage(boolean isDisambiguationPage) {
            this.isDisambiguationPage = isDisambiguationPage;
        }

        public List<String> getReferredToTitles() {
            return this.referredToTitles;
        }

        public void addReferredToTitle(String title) {
            this.referredToTitles.add(title);
        }
    }
}

