/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.sector.reader;

import de.datexis.common.InternalResource;
import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.RawTextDatasetReader;
import de.datexis.sector.model.SectionAnnotation;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads "multiseg" documents, i.e. text files named {@code <base>.<number>} whose
 * section boundaries are stored in a sibling {@code <base>.label} file.
 *
 * <p>Every section found in a document is appended to the resulting {@link Document}
 * and marked with a GOLD {@link SectionAnnotation} whose label is the running section
 * index (0, 1, 2, ...) within that document.
 */
public class MultiSegReader
extends RawTextDatasetReader {
    protected static final Logger log = LoggerFactory.getLogger(MultiSegReader.class);

    /** Splits a document path into (1) a leading path prefix, (2) the base name and (3) the numeric suffix. */
    private static final Pattern FILE_PATTERN = Pattern.compile("^(.+?/)(.+?)\\.(\\d+)$");

    /**
     * Reads a dataset from a directory of multiseg files or from a single multiseg file.
     *
     * @param path directory or file to read
     * @return the dataset; for a single file it is named after the file and holds one document
     * @throws FileNotFoundException if {@code path} is neither a directory nor a file
     * @throws IOException declared for reading failures of the directory reader
     */
    public Dataset read(Resource path) throws IOException {
        if (path.isDirectory()) {
            return this.readDatasetFromDirectory(path, "^(.+?)\\.(\\d+)$");
        }
        if (path.isFile()) {
            Document doc = this.readDocumentFromFile(path);
            Dataset data = new Dataset(path.getFileName());
            data.addDocument(doc);
            return data;
        }
        throw new FileNotFoundException("cannot open path: " + path.toString());
    }

    /**
     * Reads one multiseg document and splits it into sections according to its label file.
     *
     * <p>Lines are counted from 0. A line listed as a section start discards any text
     * collected so far; a line listed as a section end closes the current section
     * (the end line is included). Sections that contain only whitespace are skipped
     * and do not consume a section index.
     *
     * @param file document file whose path must look like {@code <path>/<base>.<number>}
     * @return the document with one GOLD section annotation per non-empty section
     * @throws IllegalArgumentException if the path does not match the expected naming scheme
     * @throws RuntimeException if the document or its label file cannot be read
     */
    public Document readDocumentFromFile(Resource file) {
        // Validate the name first so that no stream is opened for an unusable file.
        Matcher m = FILE_PATTERN.matcher(file.toString());
        if (!m.matches()) {
            throw new IllegalArgumentException("invalid file name: " + file.toString());
        }
        String basePath = m.group(1);
        String baseDoc = m.group(2);
        int docNum = Integer.parseInt(m.group(3));
        Resource labels = file instanceof InternalResource
            ? Resource.fromJAR(basePath + baseDoc + ".label")
            : Resource.fromFile(basePath, baseDoc + ".label");

        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        try (InputStream in = file.getInputStream();
             BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            TreeSet[] sections = this.readSectionsFromLabel(labels, docNum);
            TreeSet<?> sectionStarts = sections[0];
            TreeSet<?> sectionEnds = sections[1];

            Document doc = new Document();
            doc.setId(file.getFileName());
            doc.setSource(file.toString());
            doc.setType("multiseg");

            LineIterator it = new LineIterator(br);
            StringBuilder text = new StringBuilder();
            int lineNum = 0;
            int sectionId = 0;
            while (it.hasNext()) {
                String line = (String) it.next();
                if (sectionStarts.contains(lineNum)) {
                    // a new section begins here: drop text that belongs to no section
                    text = new StringBuilder();
                }
                text.append(line).append("\n");
                if (sectionEnds.contains(lineNum)) {
                    String sectionText = text.toString();
                    if (sectionText.trim().length() > 0) {
                        this.addToDocument(sectionText, sectionId++, doc);
                    }
                    text = new StringBuilder();
                }
                ++lineNum;
            }
            return doc;
        }
        catch (IOException ex) {
            log.error(ex.toString(), ex);
            // keep the IOException itself as the cause so the stack trace is not lost
            throw new RuntimeException(ex.toString(), ex);
        }
    }

    /**
     * Appends one section to {@code doc} and annotates it.
     *
     * <p>Each line of {@code text} becomes one sentence built from its tokens. The section
     * annotation uses the decimal {@code sectionId} both as type argument and as section label.
     *
     * @param text section text, one paragraph per line; blank text is ignored
     * @param sectionId index used as the section label
     * @param doc document that receives the section and its annotation
     */
    private void addToDocument(String text, int sectionId, Document doc) {
        if (text.trim().length() == 0) {
            return;
        }
        Document section = new Document();
        for (String paragraph : text.split("\n")) {
            Document temp = DocumentFactory.fromTokenizedText(paragraph.trim() + "\n");
            section.addSentence(DocumentFactory.createSentenceFromTokens((List)temp.getTokens()));
        }
        String sectionHead = Integer.toString(sectionId);
        // append before reading begin/end, exactly as the offsets are consumed below
        doc.append(section);
        SectionAnnotation sectionAnn = new SectionAnnotation(Annotation.Source.GOLD, doc.getType(), sectionHead);
        sectionAnn.setSectionLabel(sectionHead);
        sectionAnn.setBegin(section.getBegin());
        sectionAnn.setEnd(section.getEnd());
        doc.addAnnotation(sectionAnn);
    }

    /**
     * Reads the section boundaries of one document from a label file.
     *
     * <p>Each non-blank line consists of space-separated tokens: the first token is a topic
     * label (ignored here), every following token has the form {@code <docId>::<start>-<end>}.
     * Only tokens whose {@code docId} equals {@code docNum} are used. Blank lines are skipped.
     *
     * @param file label file, decoded as UTF-8
     * @param docNum number of the document whose sections are wanted
     * @return array of two sets of line numbers: index 0 holds section starts, index 1 section ends
     * @throws IOException if the label file cannot be read or is not valid UTF-8
     * @throws NumberFormatException if a document id or line number is not an integer
     */
    protected TreeSet[] readSectionsFromLabel(Resource file, int docNum) throws IOException {
        TreeSet<Integer> sectionStarts = new TreeSet<Integer>();
        TreeSet<Integer> sectionEnds = new TreeSet<Integer>();
        try (InputStream in = file.getInputStream()) {
            CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
            BufferedReader br = new BufferedReader(new InputStreamReader(in, utf8));
            String line;
            while ((line = br.readLine()) != null) {
                StringTokenizer tokens = new StringTokenizer(line, " ");
                if (!tokens.hasMoreTokens()) {
                    // blank line: nothing to parse
                    continue;
                }
                // the first token is the topic label, which is not needed for segmentation
                tokens.nextToken();
                while (tokens.hasMoreTokens()) {
                    String token = tokens.nextToken();
                    String[] segs = token.split("::", -1);
                    int docId = Integer.parseInt(segs[0]);
                    String[] segPoints = segs[1].split("-", -1);
                    int start = Integer.parseInt(segPoints[0]);
                    int end = Integer.parseInt(segPoints[1]);
                    if (docId != docNum) {
                        continue;
                    }
                    sectionStarts.add(start);
                    sectionEnds.add(end);
                }
            }
        }
        return new TreeSet[]{sectionStarts, sectionEnds};
    }
}

