/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.genemapper.resources;

import de.julielab.genemapper.resources.MultiStreamBZip2InputStream;
import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Creates a title dictionary from a Wikipedia XML dump.
 *
 * <p>The dump file is read through {@link MultiStreamBZip2InputStream} and streamed with StAX.
 * For every {@code <page>} whose {@code <ns>} is {@code "0"} and whose {@code <text>} contains at
 * least one line that looks like plain text, one line of the form
 * {@code <title>\t<page id>} is written to the output file.
 */
public class WikipediaTitleDictionaryCreator {
    /**
     * A line whose first non-whitespace character is one of these is not considered plain text
     * (presumably wiki markup such as templates, lists, links or tables).
     */
    private static final Set<Character> NON_TEXT_CHARS = Set.of('{', '}', '#', '|', '<', '[', '*');
    private static final Logger log = LoggerFactory.getLogger(WikipediaTitleDictionaryCreator.class);
    /** Value of the {@code <ns>} element for pages that are added to the dictionary. */
    private static final String ARTICLE_NAMESPACE = "0";
    /** Maximum number of characters kept from the first plain-text line of a page. */
    private static final int MAX_TEXT_LENGTH = 1000;
    /** A progress message is logged each time this many pages have been processed. */
    private static final int PROGRESS_LOG_INTERVAL = 100000;

    private final File wikipediaXml;
    private final File outputFile;

    /**
     * @param wikipediaXml the compressed Wikipedia XML dump to read
     * @param outputFile   the dictionary file to write
     */
    public WikipediaTitleDictionaryCreator(File wikipediaXml, File outputFile) {
        this.wikipediaXml = wikipediaXml;
        this.outputFile = outputFile;
    }

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]}: path to the Wikipedia dump, {@code args[1]}: path of the
     *             dictionary file to write
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException              if reading the dump or writing the dictionary fails
     * @throws XMLStreamException       if the dump is not well-formed XML
     */
    public static void main(String[] args) throws IOException, XMLStreamException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: " + WikipediaTitleDictionaryCreator.class.getSimpleName()
                    + " <wikipedia dump file> <output dictionary file>");
        }
        File wikipediaXml = new File(args[0]);
        File outputFile = new File(args[1]);
        WikipediaTitleDictionaryCreator creator = new WikipediaTitleDictionaryCreator(wikipediaXml, outputFile);
        log.info("Reading Wikipedia dump from {}. Writing dictionary to {}.", wikipediaXml, outputFile);
        creator.create();
    }

    /**
     * Streams the dump and writes one dictionary line per eligible page.
     *
     * @throws IOException        if reading the dump or writing the dictionary fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The dump is external input: do not process DTDs or resolve external entities (XXE).
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try (FileInputStream fin = new FileInputStream(this.wikipediaXml);
             BufferedInputStream bis = new BufferedInputStream(fin);
             MultiStreamBZip2InputStream bzIn = new MultiStreamBZip2InputStream(bis);
             // Decode explicitly as UTF-8 instead of relying on the platform default charset.
             BufferedReader br = new BufferedReader(new InputStreamReader(bzIn, StandardCharsets.UTF_8));
             BufferedWriter bw = FileUtilities.getWriterToFile(this.outputFile)) {
            XMLStreamReader parser = factory.createXMLStreamReader(br);
            try {
                ParsingStatus ps = new ParsingStatus();
                int pagesProcessed = 0;
                int pagesInDict = 0;
                while (parser.hasNext()) {
                    int eventType = parser.next();
                    if (eventType == XMLStreamReader.START_ELEMENT) {
                        this.handleStartElement(parser, ps);
                    } else if (eventType == XMLStreamReader.END_ELEMENT
                            && "page".equalsIgnoreCase(parser.getLocalName())) {
                        if (this.writeEntryIfEligible(bw, ps)) {
                            ++pagesInDict;
                        }
                        if (++pagesProcessed % PROGRESS_LOG_INTERVAL == 0) {
                            log.info("{} pages processed, {} pages added to dictionary.", pagesProcessed, pagesInDict);
                        }
                        ps.setInPage(false);
                    }
                }
                log.info("Finished: {} pages processed, {} pages added to dictionary.", pagesProcessed, pagesInDict);
            } finally {
                closeQuietly(parser);
            }
        }
    }

    /**
     * Updates the parsing status for a start element. Elements outside of a {@code <page>} are
     * ignored; inside a page the namespace, title, first id and text are recorded.
     */
    private void handleStartElement(XMLStreamReader parser, ParsingStatus ps) throws XMLStreamException {
        String name = parser.getLocalName();
        if ("page".equalsIgnoreCase(name)) {
            // Clear everything left over from the previous page. Without this, the id of a page
            // that was not written would stick and be attributed to the following page.
            ps.reset();
            ps.setInPage(true);
            return;
        }
        if (!ps.isInPage()) {
            return;
        }
        if ("ns".equalsIgnoreCase(name)) {
            ps.setCurrentNameSpace(parser.getElementText());
        } else if ("title".equalsIgnoreCase(name)) {
            ps.setTitle(parser.getElementText());
        } else if ("id".equalsIgnoreCase(name) && ps.getPageId() == null) {
            // Only the first <id> within the page is kept; later <id> elements are ignored.
            ps.setPageId(parser.getElementText());
        } else if ("text".equalsIgnoreCase(name) && ARTICLE_NAMESPACE.equals(ps.getNameSpace())) {
            this.parseText(parser.getElementText(), ps);
        }
    }

    /**
     * Writes {@code title \t pageId} for the current page if it is in the article namespace and a
     * plain-text line was found in its text.
     *
     * @return {@code true} if a dictionary line was written
     */
    private boolean writeEntryIfEligible(BufferedWriter bw, ParsingStatus ps) throws IOException {
        String text = ps.getText();
        if (!ARTICLE_NAMESPACE.equals(ps.getNameSpace()) || text == null || text.isBlank()) {
            return false;
        }
        if (ps.getTitle() == null || ps.getPageId() == null) {
            // Writing null would throw a NullPointerException; skip the incomplete page instead.
            log.warn("Skipping page with missing title or id (title: {}, id: {}).", ps.getTitle(), ps.getPageId());
            return false;
        }
        bw.write(ps.getTitle());
        bw.write("\t");
        bw.write(ps.getPageId());
        bw.newLine();
        return true;
    }

    /**
     * Finds the first line of {@code elementText} that is not blank and does not start (after
     * leading whitespace) with one of {@link #NON_TEXT_CHARS}, and stores up to
     * {@link #MAX_TEXT_LENGTH} characters of it in {@code ps}. If there is no such line, the text
     * of {@code ps} is left unchanged.
     */
    private void parseText(String elementText, ParsingStatus ps) {
        LineIterator lineIt = new LineIterator(new StringReader(elementText));
        while (lineIt.hasNext()) {
            String line = lineIt.next();
            int firstNonWhitespaceCharacter = this.findFirstNonWhitespaceCharacter(line);
            if (firstNonWhitespaceCharacter < 0) {
                // Empty or whitespace-only line.
                continue;
            }
            if (NON_TEXT_CHARS.contains((char) firstNonWhitespaceCharacter)) {
                continue;
            }
            ps.setText(line.substring(0, Math.min(line.length(), MAX_TEXT_LENGTH)));
            break;
        }
    }

    /**
     * @return the first character of {@code str} for which {@link Character#isWhitespace(char)}
     *         is {@code false}, as an {@code int}, or {@code -1} if there is no such character
     */
    private int findFirstNonWhitespaceCharacter(String str) {
        for (int i = 0; i < str.length(); ++i) {
            char c = str.charAt(i);
            if (!Character.isWhitespace(c)) {
                return c;
            }
        }
        return -1;
    }

    /** Closes the parser, logging instead of throwing so that an earlier exception is not masked. */
    private static void closeQuietly(XMLStreamReader parser) {
        try {
            parser.close();
        } catch (XMLStreamException e) {
            log.warn("Could not close the XML stream reader.", e);
        }
    }

    /** Mutable holder for the values collected while parsing a single {@code <page>} element. */
    private static final class ParsingStatus {
        private boolean inPage;
        private String nameSpace;
        private String title;
        private String pageId;
        private String text;

        private ParsingStatus() {
        }

        /** Clears all per-page values so that nothing carries over from one page to the next. */
        private void reset() {
            this.inPage = false;
            this.nameSpace = null;
            this.title = null;
            this.pageId = null;
            this.text = null;
        }

        public boolean isInPage() {
            return this.inPage;
        }

        public void setInPage(boolean inPage) {
            this.inPage = inPage;
        }

        public String getNameSpace() {
            return this.nameSpace;
        }

        public void setCurrentNameSpace(String currentNameSpace) {
            this.nameSpace = currentNameSpace;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getPageId() {
            return this.pageId;
        }

        public void setPageId(String pageId) {
            this.pageId = pageId;
        }

        public String getText() {
            return this.text;
        }

        public void setText(String text) {
            this.text = text;
        }
    }
}

