/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.ner.reader;

import de.datexis.common.Resource;
import de.datexis.common.WordHelpers;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Token;
import de.datexis.model.tag.BIO2Tag;
import de.datexis.model.tag.Tag;
import de.datexis.ner.MentionAnnotation;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads column-formatted (CoNLL-style) files into a {@link Dataset}.
 *
 * <p>Format as understood by this reader:
 * <ul>
 *   <li>every non-empty line holds one token; columns are separated by whitespace, the first
 *       column is the token text and the tag column is chosen via {@link #withTagIndex(int)}
 *       (last column by default);</li>
 *   <li>a line starting with {@code -DOCSTART-} closes the current document and starts a new
 *       one;</li>
 *   <li>an empty line inside a document is turned into a {@code "\n"} token tagged {@code O}.</li>
 * </ul>
 * Tags are stored on the tokens as {@link BIO2Tag}s under the configured
 * {@link Annotation.Source} and converted into mention annotations per document.
 */
public class CoNLLDatasetReader implements DatasetReader {
    private static final Logger log = LoggerFactory.getLogger(CoNLLDatasetReader.class);

    /** Prefix of the line that separates two documents. */
    private static final String LINE_START = "-DOCSTART-";

    /** If {@code true}, the text of the first sentence is copied into the document title. */
    protected boolean useFirstSentenceAsTitle = false;

    /** Source under which the tags read from the file are stored. */
    protected Annotation.Source annotationSource = Annotation.Source.GOLD;

    /** Zero-based column of the tag; a negative value selects the last column of each line. */
    protected int tagIndex = -1;

    /** If non-null, replaces the entity type of every tag read from the file. */
    protected String type = null;

    /** If non-null, used as dataset name instead of the name derived from the file name. */
    protected String name;

    /**
     * Sets the name given to datasets returned by {@link #read(Resource, Charset)}.
     *
     * @param name dataset name; {@code null} derives the name from the file name
     * @return this reader
     */
    public CoNLLDatasetReader withName(String name) {
        this.name = name;
        return this;
    }

    /**
     * Selects the column that holds the tag.
     *
     * @param tagIndex zero-based column index; a negative value selects the last column
     * @return this reader
     */
    public CoNLLDatasetReader withTagIndex(int tagIndex) {
        this.tagIndex = tagIndex;
        return this;
    }

    /**
     * Controls whether the first sentence of every document is used as its title.
     *
     * @param useFirstSentence {@code true} to copy the first sentence's text into the title
     * @return this reader
     */
    public CoNLLDatasetReader withFirstSentenceAsTitle(boolean useFirstSentence) {
        this.useFirstSentenceAsTitle = useFirstSentence;
        return this;
    }

    /**
     * Sets the source under which tags are attached to tokens and documents.
     *
     * @param annotationSource source to use for all tags read
     * @return this reader
     */
    public CoNLLDatasetReader withAnnotationSource(Annotation.Source annotationSource) {
        this.annotationSource = annotationSource;
        return this;
    }

    /**
     * Forces a single entity type for all tags, ignoring the type given in the file.
     *
     * @param type type to assign; {@code null} keeps the types from the file
     * @return this reader
     */
    public CoNLLDatasetReader withGenericType(String type) {
        this.type = type;
        return this;
    }

    /**
     * Reads a dataset from a UTF-8 encoded file.
     *
     * @param path resource to read
     * @return the parsed dataset
     * @throws IOException if the resource cannot be read
     */
    public Dataset read(Resource path) throws IOException {
        return this.read(path, Charset.UTF_8);
    }

    /**
     * Reads a dataset from the given resource.
     *
     * <p>The dataset is named after {@link #withName(String)} if set; otherwise the file name
     * with everything from its first dot removed is used.
     *
     * @param path resource to read
     * @param charset encoding of the file
     * @return the parsed dataset
     * @throws IOException if the resource cannot be read
     * @throws NullPointerException if {@code charset} is {@code null}
     */
    public Dataset read(Resource path, Charset charset) throws IOException {
        // Fail before opening the stream instead of part-way through the read.
        if (charset == null) {
            throw new NullPointerException("charset");
        }
        log.info("Reading Dataset from `{}`...", path.toString());
        Dataset data;
        try (InputStream in = path.getInputStream()) {
            CharsetDecoder decoder = charset == Charset.UTF_8
                    ? StandardCharsets.UTF_8.newDecoder()
                    : StandardCharsets.ISO_8859_1.newDecoder();
            BufferedReader br = new BufferedReader(new InputStreamReader(in, decoder));
            data = this.readLines(new LineIterator(br));
        }
        if (this.name != null) {
            data.setName(this.name);
        } else {
            data.setName(path.getFileName().replaceFirst("\\..+$", ""));
        }
        return data;
    }

    /**
     * Convenience method: reads a dataset with a default-configured reader and names it.
     *
     * @param path resource to read
     * @param name name assigned to the returned dataset
     * @param charset encoding of the file
     * @return the parsed dataset
     * @throws IOException if the resource cannot be read
     */
    public static Dataset readDataset(Resource path, String name, Charset charset) throws IOException {
        CoNLLDatasetReader reader = new CoNLLDatasetReader();
        Dataset data = reader.read(path, charset);
        data.setName(name);
        return data;
    }

    /**
     * Parses the given lines into documents and collects them in a new dataset.
     *
     * @param lines lines of the file, in order
     * @return dataset containing one document per non-empty document section
     */
    protected Dataset readLines(Iterator<String> lines) {
        Dataset result = new Dataset();
        ArrayList<Token> tokens = new ArrayList<>();
        String prevType = null; // entity type of the previous token, null at sentence/document start
        int cursor = 0;         // character offset at which the next token starts
        String last = "";       // text of the previous token
        while (lines.hasNext()) {
            String line = lines.next().trim();
            if (line.startsWith(LINE_START)) {
                // Document boundary: emit what was collected and reset all per-document state.
                this.flushDocument(result, tokens);
                cursor = 0;
                tokens = new ArrayList<>();
                prevType = null;
                last = "";
                continue;
            }
            if (line.isEmpty()) {
                // Blank line: represented as a newline token, but never as the first token.
                if (!tokens.isEmpty()) {
                    Token newline = new Token("\n", cursor, cursor + 1);
                    newline.putTag(this.annotationSource, (Tag)BIO2Tag.O());
                    tokens.add(newline);
                    cursor = newline.getEnd();
                    last = newline.getText();
                }
                prevType = null;
                continue;
            }
            Token token = this.createTokenFromLine(line, cursor, prevType);
            if (token == null) {
                continue; // unreadable line, already logged
            }
            // Shift the token by one character unless WordHelpers lists the previous token in
            // skipSpaceAfter or this token in skipSpaceBefore.
            if (!WordHelpers.skipSpaceAfter.contains(last) && !WordHelpers.skipSpaceBefore.contains(token.getText())) {
                token.setBegin(token.getBegin() + 1);
                token.setEnd(token.getEnd() + 1);
            }
            tokens.add(token);
            cursor = token.getEnd();
            prevType = ((BIO2Tag)token.getTag(this.annotationSource, BIO2Tag.class)).getType();
            last = token.getText();
        }
        this.flushDocument(result, tokens);
        for (Document doc : result.getDocuments()) {
            if (this.useFirstSentenceAsTitle) {
                if (doc.countSentences() > 0) {
                    doc.setTitle(doc.getSentence(0).getText());
                } else {
                    doc.setTitle("");
                }
            }
            doc.setTagAvailable(this.annotationSource, BIO2Tag.class, true);
        }
        log.info(String.format("Finished reading dataset (%,d docs, %,d sentences, %,d tokens, %,d mentions)", result.countDocuments(), result.countSentences(), result.countTokens(), result.countAnnotations()));
        return result;
    }

    /** Builds a document from the collected tokens and adds it to the dataset; no-op if empty. */
    private void flushDocument(Dataset result, ArrayList<Token> tokens) {
        if (tokens.isEmpty()) {
            return;
        }
        Document document = DocumentFactory.fromTokens(tokens);
        MentionAnnotation.annotateFromTags(document, this.annotationSource, BIO2Tag.class);
        result.addDocument(document);
    }

    /**
     * Creates a tagged token from one line of the file.
     *
     * @param line trimmed, non-empty line with whitespace-separated columns
     * @param cursor character offset at which the token begins
     * @param prevType entity type of the preceding token, or {@code null}
     * @return the token, or {@code null} if the line could not be parsed (a warning is logged)
     */
    protected Token createTokenFromLine(String line, int cursor, String prevType) {
        try {
            String[] csv = line.split("\\s+");
            int pos = this.tagIndex >= 0 ? this.tagIndex : csv.length - 1;
            String text = csv[0];
            BIO2Tag tag = this.createTag(csv[pos], prevType);
            Token token = new Token(text, cursor, cursor + text.length());
            token.putTag(this.annotationSource, (Tag)tag);
            return token;
        }
        catch (Exception e) {
            // Best effort: skip the line, but keep the cause (e.g. tag column out of range) in the log.
            log.warn("could not read line: {}", line, e);
            return null;
        }
    }

    /**
     * Converts a label such as {@code B-PER}, {@code I-creative-work} or {@code O} into a tag.
     *
     * <p>The label is split at its first hyphen only, so entity types that contain hyphens are
     * kept intact. A missing or empty type becomes {@code "GENERIC"}; a type configured via
     * {@link #withGenericType(String)} overrides the type from the label. An {@code I} label
     * whose type differs from {@code prevType} is turned into {@code B}. Any other label is
     * logged and mapped to {@code O}.
     *
     * @param label label text from the tag column
     * @param prevType entity type of the preceding token, or {@code null}
     * @return the resulting tag, never {@code null}
     */
    protected BIO2Tag createTag(String label, String prevType) {
        int dash = label.indexOf('-');
        String tag = dash < 0 ? label : label.substring(0, dash);
        String suffix = dash < 0 ? "" : label.substring(dash + 1);
        String type;
        if (this.type != null) {
            type = this.type;
        } else if (!suffix.isEmpty()) {
            type = suffix;
        } else {
            type = "GENERIC";
        }
        switch (tag) {
            case "O": {
                return new BIO2Tag(BIO2Tag.Label.O, null);
            }
            case "B": {
                return new BIO2Tag(BIO2Tag.Label.B, type);
            }
            case "I": {
                if (type.equals(prevType)) {
                    return new BIO2Tag(BIO2Tag.Label.I, type);
                }
                // An I without a preceding tag of the same type starts a new mention.
                return new BIO2Tag(BIO2Tag.Label.B, type);
            }
        }
        log.warn("reading unknown tag {}", label);
        return new BIO2Tag(BIO2Tag.Label.O, null);
    }

    /** File encodings supported by this reader. */
    public static enum Charset {
        UTF_8,
        ISO_8859_1;

    }
}

