/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.sector.reader;

import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads a line-oriented text resource into a {@link Dataset} of {@link Document}s in which
 * every section is marked with a GOLD {@link SectionAnnotation}.
 *
 * <p>Each input line has to match {@link #LINE_PATTERN}: a document number, a sentence number
 * and a heading separated by commas, followed by the control character U+0001 and the sentence
 * text. Lines that do not match are logged and skipped. A line with sentence number {@code 1}
 * starts a new document; a change of heading starts a new section inside the current document.
 */
public class WikiCitiesReader
implements DatasetReader {
    protected static final Logger log = LoggerFactory.getLogger(WikiCitiesReader.class);

    /** When {@code true}, lines whose heading equals {@link #TOPLEVEL_STRING} are dropped. */
    protected boolean skipTopLevelSegment = true;

    /** Line format: group 1 = document number, 2 = sentence number, 3 = heading, 4 = sentence. */
    protected Pattern LINE_PATTERN = Pattern.compile("^(\\d+),(\\d+),(.+?)\u0001(.+?)$");

    /** Heading value of the top-level segment; it is labeled "preface" when it is not skipped. */
    protected String TOPLEVEL_STRING = "TOP-LEVEL SEGMENT";

    /**
     * Configures whether lines belonging to the top-level segment are skipped.
     *
     * @param skip {@code true} to drop lines whose heading equals {@link #TOPLEVEL_STRING}
     * @return this reader, to allow call chaining
     */
    public WikiCitiesReader withSkipTopLevelSegment(boolean skip) {
        this.skipTopLevelSegment = skip;
        return this;
    }

    /**
     * Reads all documents contained in the given resource.
     *
     * <p>The input is decoded as UTF-8. Sentences of one section are joined with a single space
     * and each sentence is followed by {@code " ."} before the section text is handed to
     * {@link DocumentFactory#fromTokenizedText}. Documents without tokens are not added to the
     * result.
     *
     * @param file the resource to read; its file name is used as the dataset name
     * @return the dataset holding every non-empty document found in the resource
     * @throws IOException declared by the signature; an {@link IOException} raised inside this
     *     method is logged and rethrown wrapped in a {@link RuntimeException}
     * @throws RuntimeException if opening, reading or closing the resource fails with an
     *     {@link IOException}; the original exception is attached as the cause
     */
    public Dataset read(Resource file) throws IOException {
        try (InputStream in = file.getInputStream()) {
            CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
            LineIterator it = new LineIterator(new BufferedReader(new InputStreamReader(in, utf8)));
            Dataset result = new Dataset(file.getFileName());
            result.setName(file.getFileName());
            Document doc = new Document();
            StringBuilder text = new StringBuilder();
            String sectionHeading = "";
            while (it.hasNext()) {
                String line = (String) it.next();
                Matcher matcher = this.LINE_PATTERN.matcher(line);
                if (!matcher.matches()) {
                    log.error("matcher did not match for line:\n{}", line);
                    continue;
                }
                String documentNo = matcher.group(1);
                int sentenceNo = Integer.parseInt(matcher.group(2));
                String heading = matcher.group(3);
                String sentence = matcher.group(4);
                if (sentenceNo == 1) {
                    // First sentence of a new document: flush the pending section and the
                    // previous document before starting over. This happens even when the
                    // line itself is skipped below.
                    this.addToDocument(text.toString(), sectionHeading, doc);
                    addNonEmptyDocument(result, doc);
                    doc = new Document();
                    doc.setId(documentNo);
                    text.setLength(0);
                    sectionHeading = "";
                }
                if (this.skipTopLevelSegment && heading.equals(this.TOPLEVEL_STRING)) {
                    continue;
                }
                if (!heading.equals(sectionHeading)) {
                    // Heading changed: close the section collected so far and open a new one.
                    this.addToDocument(text.toString(), sectionHeading, doc);
                    text.setLength(0);
                    sectionHeading = heading;
                }
                if (text.length() > 0) {
                    text.append(" ");
                }
                text.append(sentence).append(" .");
            }
            // Flush whatever is still pending after the last line.
            this.addToDocument(text.toString(), sectionHeading, doc);
            addNonEmptyDocument(result, doc);
            return result;
        } catch (IOException ex) {
            // Pass the exception itself (not its cause) so the stack trace is preserved.
            log.error(ex.toString(), ex);
            throw new RuntimeException(ex.toString(), ex);
        }
    }

    /** Adds {@code doc} to {@code result} unless the document contains no tokens. */
    private static void addNonEmptyDocument(Dataset result, Document doc) {
        if (doc.countTokens() > 0) {
            result.addDocument(doc);
        }
    }

    /**
     * Appends one section to {@code doc} and annotates it with its heading.
     *
     * <p>Blank text is ignored. The heading is normalized to {@code ""} for {@code null},
     * {@code "preface"} for {@link #TOPLEVEL_STRING}, and otherwise to its trimmed lower-case
     * form; the section label is that heading with whitespace runs replaced by {@code "_"}.
     *
     * @param text the tokenized text of the section
     * @param sectionHeading the raw heading of the section, may be {@code null}
     * @param doc the document the section is appended to
     */
    private void addToDocument(String text, String sectionHeading, Document doc) {
        if (text.trim().isEmpty()) {
            return;
        }
        Document section = DocumentFactory.fromTokenizedText(text);
        String heading;
        if (sectionHeading == null) {
            heading = "";
        } else if (sectionHeading.equals(this.TOPLEVEL_STRING)) {
            heading = "preface";
        } else {
            // Locale.ROOT keeps the labels identical regardless of the JVM default locale.
            heading = sectionHeading.trim().toLowerCase(Locale.ROOT);
        }
        // The section is appended before its begin/end offsets are read, as in the original
        // statement order.
        doc.append(section);
        SectionAnnotation sectionAnn = new SectionAnnotation(Annotation.Source.GOLD, "wiki", heading);
        sectionAnn.setSectionLabel(heading.replaceAll("\\s+", "_"));
        sectionAnn.setBegin(section.getBegin());
        sectionAnn.setEnd(section.getEnd());
        // Explicit cast kept so the same addAnnotation overload is selected as before.
        doc.addAnnotation((Annotation) sectionAnn);
    }
}

