/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.sector.reader;

import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Document;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.RawTextDatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads a raw text file whose sections are introduced by marker lines of the form
 * {@code ========,<level>,<heading>.} and converts it into a {@link Document} with one
 * GOLD {@link SectionAnnotation} per section.
 *
 * <p>Consecutive text lines between two accepted markers form one section. Every
 * non-blank line becomes one sentence of that section. The placeholder tokens
 * {@code ***LIST***}, {@code ***formula***} and {@code ***codice***} are removed from
 * the text before it is processed.
 */
public class Wiki727Reader extends RawTextDatasetReader {

    protected static final Logger log = LoggerFactory.getLogger(Wiki727Reader.class);

    /** Separator placed between the parts of a nested heading path. */
    private static final String HEADING_SEPARATOR = " | ";

    /** Literal placeholder tokens that are stripped from every text line. */
    private static final String[] PLACEHOLDERS = {"***LIST***", "***formula***", "***codice***"};

    /**
     * Deepest heading level that starts a new section. {@code 0} accepts every level.
     * Marker lines with a deeper level are dropped and the text that follows them stays
     * in the section that is currently open.
     */
    protected int sectionLevel = 2;

    /** If {@code true}, sections whose top-level heading is "preface" are left out of the document. */
    protected boolean skipPrefaceText = false;

    /** Not read anywhere in this class; kept for subclasses that may rely on it. */
    protected boolean skipPrefaceAnnotation = false;

    /**
     * Matches a section marker line: group 1 is the numeric level, group 2 the heading
     * without its trailing dot. Left as a non-final instance field so that existing
     * subclasses can still replace it.
     */
    protected Pattern SECTION_PATTERN = Pattern.compile("^========,(\\d+),(.+?)\\.$");

    /**
     * Sets the deepest heading level that opens a new section.
     *
     * @param level maximum accepted level, or {@code 0} to accept all levels
     * @return this reader, for call chaining
     */
    public Wiki727Reader withSectionLevel(int level) {
        this.sectionLevel = level;
        return this;
    }

    /**
     * Controls whether sections headed "preface" are omitted from the resulting document.
     *
     * @param skip {@code true} to omit preface sections
     * @return this reader, for call chaining
     */
    public Wiki727Reader withSkipPreface(boolean skip) {
        this.skipPrefaceText = skip;
        return this;
    }

    /**
     * Reads one file and splits it into annotated sections.
     *
     * @param file the resource to read; decoded strictly as UTF-8
     * @return a document of type {@code "wiki"} whose id is the file name
     * @throws RuntimeException if the resource cannot be opened or closed; the
     *         underlying {@link IOException} is attached as the cause
     */
    public Document readDocumentFromFile(Resource file) {
        try (InputStream in = file.getInputStream()) {
            CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
            BufferedReader br = new BufferedReader(new InputStreamReader(in, utf8));
            LineIterator it = new LineIterator(br);

            Document doc = new Document();
            doc.setId(file.getFileName());
            doc.setSource((Object) file.toString());
            doc.setType("wiki");

            StringBuilder text = new StringBuilder();
            String sectionHeading = "";
            while (it.hasNext()) {
                String line = (String) it.next();
                if (line.startsWith("=====")) {
                    Matcher matcher = this.SECTION_PATTERN.matcher(line);
                    if (matcher.matches()) {
                        int level = Integer.parseInt(matcher.group(1));
                        if (this.sectionLevel == 0 || level <= this.sectionLevel) {
                            // Close the section collected so far, then open the new one.
                            this.addToDocument(text.toString(), sectionHeading, doc);
                            sectionHeading = extendHeadingPath(sectionHeading, level, matcher.group(2));
                            text = new StringBuilder();
                        }
                        // Marker lines never become part of the section text,
                        // even when their level is too deep to open a section.
                        continue;
                    }
                }
                String cleaned = stripPlaceholders(line);
                if (cleaned.trim().isEmpty()) {
                    continue;
                }
                text.append(cleaned).append("\n");
            }
            // Flush the trailing section; addToDocument ignores blank text.
            this.addToDocument(text.toString(), sectionHeading, doc);
            return doc;
        } catch (IOException ex) {
            log.error(ex.toString(), ex);
            // Chain the IOException itself: its own cause is usually null, which
            // would otherwise discard the original stack trace.
            throw new RuntimeException(ex.toString(), ex);
        }
    }

    /**
     * Computes the heading path for a newly opened section.
     *
     * <p>The current path is cut after its first {@code level - 2} separators and the new
     * heading is appended. Levels 1 and 2 therefore both start a fresh path. When the
     * current path has fewer parts than requested, the new heading is appended to the
     * whole path.
     *
     * @param currentPath heading path of the section that was open so far
     * @param level level of the new heading as given by the marker line
     * @param heading text of the new heading
     * @return the heading path of the new section
     */
    private static String extendHeadingPath(String currentPath, int level, String heading) {
        int split = 0;
        // Stop at the first miss: continuing after indexOf returned -1 would restart
        // the search at index 0 and cut the path at the wrong separator.
        for (int depth = level - 1; depth > 1 && split >= 0; depth--) {
            split = currentPath.indexOf(HEADING_SEPARATOR, split + 1);
        }
        String prefix;
        if (split > 0) {
            prefix = currentPath.substring(0, split) + HEADING_SEPARATOR;
        } else if (split < 0) {
            prefix = currentPath + HEADING_SEPARATOR;
        } else {
            prefix = "";
        }
        return prefix + heading;
    }

    /**
     * Removes all placeholder tokens from a line, one token kind after the other.
     *
     * @param line raw input line
     * @return the line without placeholder tokens
     */
    private static String stripPlaceholders(String line) {
        String result = line;
        for (String placeholder : PLACEHOLDERS) {
            result = result.replace(placeholder, "");
        }
        return result;
    }

    /**
     * Appends one section to the document and annotates it with its heading.
     *
     * <p>Nothing is added when the text is blank, when the heading is {@code null}, or
     * when preface skipping is enabled and the first part of the heading path equals
     * "preface" (compared in lower case).
     *
     * @param text section text with one paragraph per line
     * @param sectionHeading heading path used as annotation label
     * @param doc document that receives the section and its annotation
     */
    private void addToDocument(String text, String sectionHeading, Document doc) {
        if (text.trim().length() == 0 || sectionHeading == null) {
            return;
        }
        // Decide about skipping first so that skipped sections are never tokenised.
        String sectionHead = sectionHeading.replaceFirst("\\|.+$", "").trim().toLowerCase();
        if (this.skipPrefaceText && sectionHead.equals("preface")) {
            return;
        }

        Document section = new Document();
        for (String paragraph : text.split("\n")) {
            String trimmed = paragraph.trim();
            if (trimmed.isEmpty()) {
                continue;
            }
            // Each paragraph is tokenised on its own and added as a single sentence.
            Document temp = DocumentFactory.fromText(trimmed + "\n", DocumentFactory.Newlines.KEEP);
            section.addSentence(DocumentFactory.createSentenceFromTokens((List) temp.getTokens()));
        }

        // Append before reading the offsets, exactly as the annotation bounds are
        // taken from the section after it has been added to the document.
        doc.append(section);
        SectionAnnotation sectionAnn = new SectionAnnotation(Annotation.Source.GOLD, doc.getType(), sectionHeading);
        sectionAnn.setSectionLabel(sectionHeading);
        sectionAnn.setBegin(section.getBegin());
        sectionAnn.setEnd(section.getEnd());
        doc.addAnnotation((Annotation) sectionAnn);
    }
}

