package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Set;

/**
 * Builds a title dictionary from a Wikipedia XML dump.
 * <p>
 * The dump is streamed through {@code MultiStreamBZip2InputStream} and a StAX parser. For every
 * {@code <page>} whose {@code <ns>} is {@code "0"} and whose {@code <text>} contains at least one
 * line that does not start with a wiki markup character, one line of the form
 * {@code title<TAB>pageId} is written to the output file.
 */
public class WikipediaTitleDictionaryCreator {
    /**
     * A line whose first non-whitespace character is one of these is treated as markup
     * (templates, tables, links, lists, ...) rather than running text.
     */
    private static final Set<Character> NON_TEXT_CHARS = Set.of('{', '}', '#', '|', '<', '[', '*');
    private final static Logger log = LoggerFactory.getLogger(WikipediaTitleDictionaryCreator.class);
    /** Only pages in this namespace are written; presumably MediaWiki's main (article) namespace. */
    private static final String DICTIONARY_NAMESPACE = "0";
    /** Upper bound on the number of characters kept from the first text line of a page. */
    private static final int MAX_TEXT_LENGTH = 1000;
    /** A progress message is logged every time this many pages have been processed. */
    private static final int LOG_INTERVAL = 100000;

    private final File wikipediaXml;
    private final File outputFile;

    /**
     * @param wikipediaXml the Wikipedia XML dump to read
     * @param outputFile   the file the tab-separated dictionary is written to
     */
    public WikipediaTitleDictionaryCreator(File wikipediaXml, File outputFile) {
        this.wikipediaXml = wikipediaXml;
        this.outputFile = outputFile;
    }

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]} is the path to the Wikipedia dump, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException              if reading the dump or writing the dictionary fails
     * @throws XMLStreamException       if the dump is not well-formed XML
     */
    public static void main(String[] args) throws IOException, XMLStreamException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: " + WikipediaTitleDictionaryCreator.class.getSimpleName()
                    + " <wikipedia XML dump> <output file>");
        }
        File wikipediaXml = new File(args[0]);
        File outputFile = new File(args[1]);
        WikipediaTitleDictionaryCreator creator = new WikipediaTitleDictionaryCreator(wikipediaXml, outputFile);
        log.info("Reading Wikipedia dump from {}. Writing dictionary to {}.", wikipediaXml, outputFile);
        creator.create();
    }

    /**
     * Streams the dump and writes one {@code title<TAB>pageId} line per accepted page.
     *
     * @throws IOException        if reading the dump or writing the dictionary fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The dump needs neither DTDs nor external entities; switching them off avoids XXE-style problems.
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try (FileInputStream fin = new FileInputStream(wikipediaXml);
             BufferedInputStream bis = new BufferedInputStream(fin);
             MultiStreamBZip2InputStream bis2 = new MultiStreamBZip2InputStream(bis);
             // Decode explicitly as UTF-8 instead of relying on the platform default charset.
             BufferedReader br = new BufferedReader(new InputStreamReader(bis2, StandardCharsets.UTF_8));
             BufferedWriter bw = FileUtilities.getWriterToFile(outputFile)
        ) {
            XMLStreamReader parser = factory.createXMLStreamReader(br);
            try {
                ParsingStatus ps = new ParsingStatus();
                int pagesProcessed = 0;
                int pagesInDict = 0;
                while (parser.hasNext()) {
                    int eventType = parser.next();
                    if (eventType == XMLStreamReader.START_ELEMENT) {
                        handleStartElement(parser, ps);
                    } else if (eventType == XMLStreamReader.END_ELEMENT
                            && parser.getLocalName().equalsIgnoreCase("page")) {
                        if (writeEntryIfAccepted(bw, ps)) {
                            ++pagesInDict;
                        }
                        // Always clear the per-page state, also for skipped pages. Otherwise the id of a
                        // skipped page would survive and be written for the next accepted page.
                        ps.reset();
                        if (++pagesProcessed % LOG_INTERVAL == 0) {
                            log.info("{} pages processed, {} pages added to dictionary.", pagesProcessed, pagesInDict);
                        }
                    }
                }
            } finally {
                parser.close();
            }
        }
    }

    /**
     * Records the page-level values carried by the element the parser is currently positioned on.
     * Elements outside of a {@code <page>} are ignored.
     */
    private void handleStartElement(XMLStreamReader parser, ParsingStatus ps) throws XMLStreamException {
        String elementName = parser.getLocalName();
        if (elementName.equalsIgnoreCase("page")) {
            ps.setInPage(true);
            return;
        }
        if (!ps.isInPage()) {
            return;
        }
        if (elementName.equalsIgnoreCase("ns")) {
            ps.setCurrentNameSpace(parser.getElementText());
        } else if (elementName.equalsIgnoreCase("title")) {
            ps.setTitle(parser.getElementText());
        } else if (elementName.equalsIgnoreCase("id") && ps.getPageId() == null) {
            // Only the first <id> inside a page is kept; later ones presumably belong to nested
            // elements such as the revision or the contributor.
            ps.setPageId(parser.getElementText());
        } else if (elementName.equalsIgnoreCase("text") && DICTIONARY_NAMESPACE.equals(ps.getNameSpace())) {
            parseText(parser.getElementText(), ps);
        }
    }

    /**
     * Writes the dictionary line for the page described by {@code ps} if the page qualifies.
     *
     * @return {@code true} if a line was written
     */
    private boolean writeEntryIfAccepted(BufferedWriter bw, ParsingStatus ps) throws IOException {
        boolean hasText = ps.getText() != null && !ps.getText().isBlank();
        if (!DICTIONARY_NAMESPACE.equals(ps.getNameSpace()) || !hasText
                || ps.getTitle() == null || ps.getPageId() == null) {
            return false;
        }
        bw.write(ps.getTitle());
        bw.write("\t");
        bw.write(ps.getPageId());
        bw.newLine();
        return true;
    }

    /**
     * Looks for the first line of the wiki markup that looks like running text, i.e. that is not
     * blank and whose first non-whitespace character is not in {@link #NON_TEXT_CHARS}. The first
     * {@value #MAX_TEXT_LENGTH} characters of that line are stored in {@code ps}; if there is no
     * such line, {@code ps} is left untouched.
     */
    private void parseText(String elementText, ParsingStatus ps) {
        LineIterator lineIt = new LineIterator(new StringReader(elementText));
        while (lineIt.hasNext()) {
            String line = lineIt.next();
            int firstNonWhitespaceCharacter = findFirstNonWhitespaceCharacter(line);
            if (firstNonWhitespaceCharacter == -1) {
                // Blank line, nothing to inspect.
                continue;
            }
            if (!NON_TEXT_CHARS.contains((char) firstNonWhitespaceCharacter)) {
                ps.setText(line.substring(0, Math.min(line.length(), MAX_TEXT_LENGTH)));
                break;
            }
        }
    }

    /**
     * @return the first character of {@code str} that is not whitespace according to
     * {@link Character#isWhitespace(char)}, or {@code -1} if there is none
     */
    private int findFirstNonWhitespaceCharacter(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (!Character.isWhitespace(c)) {
                return c;
            }
        }
        return -1;
    }


    /**
     * Mutable holder for the values collected while the parser is inside one {@code <page>} element.
     */
    private static final class ParsingStatus {
        private boolean inPage;
        private String nameSpace;
        private String title;
        private String pageId;
        private String text;

        /** Clears all per-page values so that nothing leaks into the next page. */
        public void reset() {
            inPage = false;
            nameSpace = null;
            title = null;
            pageId = null;
            text = null;
        }

        public boolean isInPage() {
            return inPage;
        }

        public void setInPage(boolean inPage) {
            this.inPage = inPage;
        }

        public String getNameSpace() {
            return nameSpace;
        }

        public void setCurrentNameSpace(String currentNameSpace) {
            this.nameSpace = currentNameSpace;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getPageId() {
            return pageId;
        }

        public void setPageId(String pageId) {
            this.pageId = pageId;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }
    }
}
