package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Streams a Wikipedia XML dump (read through {@code MultiStreamBZip2InputStream}) and writes two
 * tab-separated files:
 * <ul>
 *     <li>a category file with the columns {@code page title} and {@code category}, holding one row per
 *     category link found on a page that is neither a redirect nor a disambiguation page;</li>
 *     <li>a redirect file with the columns {@code target title} and {@code redirected title}, holding one row
 *     per redirect page and, for disambiguation pages, one row per wiki link found in the page text.</li>
 * </ul>
 * Only pages whose {@code <ns>} value is contained in {@link #ACCEPTED_NAMESPACES} are considered.
 */
public class WikipediaCategoryTreeAndRedirectsExtractor {
    private static final Logger log = LoggerFactory.getLogger(WikipediaCategoryTreeAndRedirectsExtractor.class);

    /**
     * Matches a category link such as {@code [[Category:Social theories]]}. Group 1 is everything between the
     * colon and the closing brackets, i.e. it may still contain a {@code |sort key} suffix.
     */
    private static final Pattern CATEGORY_PATTERN = Pattern.compile("\\[\\[Category:([^]]+)\\]\\]");
    /** Matches XML/HTML comments, including comments that span several lines. */
    private static final Pattern XML_COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?-->");
    /** Matches the target of a wiki link: the text between {@code [[} and the first {@code ]} or {@code |}. */
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[(.*?)[]|]");

    /**
     * Values of the {@code <ns>} element for which pages are processed. In MediaWiki, 0 is the main (article)
     * namespace and 14 is the Category namespace.
     */
    private static final Set<String> ACCEPTED_NAMESPACES = Set.of("0", "14");

    private final File wikipediaXml;
    private final File categoriesOutputFile;
    private final File redirectsOutputFile;

    /**
     * @param wikipediaXml           the Wikipedia dump to read
     * @param categoryTreeOutputFile destination of the page title / category table
     * @param redirectsOutputFile    destination of the target title / redirected title table
     */
    public WikipediaCategoryTreeAndRedirectsExtractor(File wikipediaXml, File categoryTreeOutputFile, File redirectsOutputFile) {
        this.wikipediaXml = wikipediaXml;
        this.categoriesOutputFile = categoryTreeOutputFile;
        this.redirectsOutputFile = redirectsOutputFile;
    }

    /**
     * Command line entry point. Expects three arguments: the dump file, the category output path and the
     * redirect output path. Prints a usage message and exits with a non-zero status when fewer are given.
     *
     * @param args the command line arguments
     * @throws IOException        if reading the dump or writing an output file fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    public static void main(String[] args) throws IOException, XMLStreamException {
        if (args.length < 3) {
            System.err.println("Usage: " + WikipediaCategoryTreeAndRedirectsExtractor.class.getSimpleName() + " <wikipedia XML dump> <category tree file output path> <redirect map file output path>");
            // A missing argument is an error; signal it to calling scripts with a non-zero exit code.
            System.exit(1);
        }
        File wikipediaXml = new File(args[0]);
        File categoryTreeOutputFile = new File(args[1]);
        File redirectsOutputFile = new File(args[2]);
        WikipediaCategoryTreeAndRedirectsExtractor creator = new WikipediaCategoryTreeAndRedirectsExtractor(wikipediaXml, categoryTreeOutputFile, redirectsOutputFile);
        log.info("Reading Wikipedia dump from {}. Writing category map to {} and redirect map to {}.", wikipediaXml, categoryTreeOutputFile, redirectsOutputFile);
        creator.create();
        log.info("Finished the creation of the Wikipedia category and redirect maps. They are stored at {} and {}, respectively.", categoryTreeOutputFile, redirectsOutputFile);
    }

    /**
     * Opens the dump and both output files, writes the header rows and processes all pages of the dump.
     *
     * @throws IOException        if reading the dump or writing an output file fails
     * @throws XMLStreamException if the dump is not well-formed XML
     */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The dump needs neither DTDs nor external entities; disabling them guards against XXE.
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        log.debug("Creating input and output streams.");
        try (FileInputStream fin = new FileInputStream(wikipediaXml);
             BufferedInputStream bis = new BufferedInputStream(fin);
             MultiStreamBZip2InputStream bis2 = new MultiStreamBZip2InputStream(bis);
             // Decode explicitly as UTF-8 instead of relying on the platform default charset.
             BufferedReader br = new BufferedReader(new InputStreamReader(bis2, StandardCharsets.UTF_8));
             BufferedWriter bw = FileUtilities.getWriterToFile(categoriesOutputFile);
             BufferedWriter bw2 = FileUtilities.getWriterToFile(redirectsOutputFile)) {
            writeRow(bw, "page title", "category");
            writeRow(bw2, "target title", "redirected title");

            log.debug("Starting to parse Wikipedia XML.");
            XMLStreamReader parser = factory.createXMLStreamReader(br);
            try {
                processPages(parser, bw, bw2);
            } finally {
                // Releases parser resources only; the underlying reader is closed by try-with-resources.
                parser.close();
            }
        } catch (IOException | XMLStreamException e) {
            log.error("XML parsing error", e);
            throw e;
        }
    }

    /**
     * Consumes the XML event stream, collecting the state of each {@code <page>} element and writing it out
     * when the page ends.
     */
    private void processPages(XMLStreamReader parser, BufferedWriter categoriesWriter, BufferedWriter redirectsWriter) throws IOException, XMLStreamException {
        // Non-null only while the parser is inside a <page> element.
        ParsingStatus ps = null;
        // Number of pages handed to one of the writers; used for progress logging only.
        int written = 0;
        while (parser.hasNext()) {
            int eventType = parser.next();
            if (eventType == XMLStreamReader.START_ELEMENT) {
                String name = parser.getLocalName();
                if (name.equalsIgnoreCase("page")) {
                    ps = new ParsingStatus();
                } else if (ps != null) {
                    // Elements outside of a page (e.g. in the dump header) carry no page information.
                    readPageElement(parser, name, ps);
                }
            } else if (eventType == XMLStreamReader.END_ELEMENT && ps != null && parser.getLocalName().equalsIgnoreCase("page")) {
                if (isAcceptedNamespace(ps.getNamespace())) {
                    if (!ps.isRedirect() && !ps.isDisambiguationPage()) {
                        writeCategories(ps, categoriesWriter);
                    } else {
                        writeRedirect(ps, redirectsWriter);
                    }
                    ++written;
                    if (written % 100000 == 0) {
                        log.info("{} lines written.", written);
                    }
                }
                ps = null;
            }
        }
    }

    /**
     * Stores the information of a single child element of the current page in {@code ps}. The parser must be
     * positioned on the start tag of the element called {@code name}.
     */
    private void readPageElement(XMLStreamReader parser, String name, ParsingStatus ps) throws XMLStreamException {
        if (name.equalsIgnoreCase("title")) {
            ps.setTitle(parser.getElementText());
        } else if (name.equalsIgnoreCase("ns")) {
            ps.setNamespace(parser.getElementText());
        } else if (name.equalsIgnoreCase("text")) {
            if (isAcceptedNamespace(ps.getNamespace())) {
                parseText(parser.getElementText(), ps);
            }
        } else if (name.equalsIgnoreCase("id")) {
            // Nested elements (revision, contributor) have <id> children too; keep the first one, which
            // belongs to the page itself.
            if (ps.getId() == null) {
                ps.setId(parser.getElementText());
            }
        } else if (name.equals("redirect")) {
            // Look the attribute up by name; yields null (= not a redirect) when it is absent.
            ps.setRedirectTitle(parser.getAttributeValue(null, "title"));
        }
    }

    /**
     * Null-safe membership test for {@link #ACCEPTED_NAMESPACES}. Sets created by {@code Set.of} throw a
     * {@code NullPointerException} on {@code contains(null)}, which would otherwise abort the run for a page
     * without a namespace element.
     */
    private static boolean isAcceptedNamespace(String namespace) {
        return namespace != null && ACCEPTED_NAMESPACES.contains(namespace);
    }

    /** Writes one two-column, tab-separated row followed by a line break. */
    private static void writeRow(BufferedWriter writer, String first, String second) throws IOException {
        writer.write(first);
        writer.write("\t");
        writer.write(second);
        writer.newLine();
    }

    /**
     * Writes the redirect rows of a page. For a redirect page this is a single row mapping the redirect
     * target to the page title. For any other page one row is written for each title the page refers to.
     */
    private void writeRedirect(ParsingStatus ps, BufferedWriter bw2) throws IOException {
        if (ps.isRedirect()) {
            writeRow(bw2, ps.getRedirectTitle(), ps.getTitle());
        } else {
            for (String referredToTitle : ps.getReferredToTitles()) {
                writeRow(bw2, referredToTitle, ps.getTitle());
            }
        }
    }

    /** Writes one row (page title, category) for each category collected for the page. */
    private void writeCategories(ParsingStatus ps, BufferedWriter bw) throws IOException {
        for (String category : ps.getCategories()) {
            writeRow(bw, ps.getTitle(), category);
        }
    }

    /**
     * Scans the wiki markup of a page line by line. Lines starting with {@code [[Category:} contribute a
     * category (without sort key) to {@code ps}. The first line containing the literal {@code {{disambig}}}
     * marks the page as a disambiguation page and registers the targets of all wiki links of the text as
     * referred-to titles. XML comments are removed before scanning.
     *
     * @param elementText the content of the page's {@code <text>} element
     * @param ps          the status object of the page the text belongs to
     */
    private void parseText(String elementText, ParsingStatus ps) {
        elementText = XML_COMMENT_PATTERN.matcher(elementText).replaceAll("");
        LineIterator lineIt = new LineIterator(new StringReader(elementText));
        Matcher m = CATEGORY_PATTERN.matcher("");
        while (lineIt.hasNext()) {
            String line = lineIt.next();
            if (line.startsWith("[[Category:")) {
                m.reset(line);
                if (m.find()) {
                    String category = m.group(1);
                    // The link target ends at the first pipe; everything after it is the sort key.
                    int pipe = category.indexOf('|');
                    if (pipe >= 0) {
                        category = category.substring(0, pipe);
                    }
                    category = category.trim();
                    if (!category.isEmpty()) {
                        ps.addCategory(category);
                    }
                }
            } else if (!ps.isDisambiguationPage() && line.contains("{{disambig}}")) {
                // Collect the links only once, even if the marker shows up on several lines; otherwise
                // every link would be registered (and later written) repeatedly.
                ps.setIsDisambiguationPage(true);
                Matcher linkM = WIKI_LINK_PATTERN.matcher(elementText);
                while (linkM.find()) {
                    ps.addReferredToTitle(linkM.group(1));
                }
            }
        }
    }


    /**
     * Mutable holder for the information collected about the page that is currently being parsed. It does not
     * use the enclosing instance and is therefore a static nested class.
     */
    private static final class ParsingStatus {
        private String title;
        private String text;
        private String namespace;
        private String id;
        private final List<String> categories = new ArrayList<>();
        private final List<String> referredToTitles = new ArrayList<>();
        /** Target of the redirect; {@code null} if the page is not a redirect. */
        private String redirectTitle;
        private boolean isDisambiguationPage;

        public List<String> getCategories() {
            return categories;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getNamespace() {
            return namespace;
        }

        public void setNamespace(String namespace) {
            this.namespace = namespace;
        }

        public String getId() {
            return id;
        }

        public void setId(String id) {
            this.id = id;
        }

        public void addCategory(String category) {
            categories.add(category);
        }

        public String getRedirectTitle() {
            return redirectTitle;
        }

        /** @return {@code true} if a redirect target has been set for the page */
        public boolean isRedirect() {
            return this.redirectTitle != null;
        }

        public void setRedirectTitle(String redirectTitle) {
            this.redirectTitle = redirectTitle;
        }

        public boolean isDisambiguationPage() {
            return isDisambiguationPage;
        }

        public void setIsDisambiguationPage(boolean isDisambiguationPage) {
            this.isDisambiguationPage = isDisambiguationPage;
        }

        public List<String> getReferredToTitles() {
            return referredToTitles;
        }

        public void addReferredToTitle(String title) {
            referredToTitles.add(title);
        }
    }


}
