/*
 * Decompiled with CFR 0.152.
 */
package banner.eval.dataset;

import banner.eval.dataset.Dataset;
import banner.tokenization.Tokenizer;
import banner.types.EntityType;
import banner.types.Mention;
import banner.types.Sentence;
import banner.types.Token;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.configuration.HierarchicalConfiguration;
import org.apache.commons.configuration.SubnodeConfiguration;

public class BC2GMDataset
extends Dataset {
    public BC2GMDataset(Tokenizer tokenizer) {
        this.tokenizer = tokenizer;
    }

    public BC2GMDataset() {
    }

    @Override
    public void load(HierarchicalConfiguration config) {
        SubnodeConfiguration localConfig = config.configurationAt(this.getClass().getPackage().getName());
        String sentenceFilename = localConfig.getString("sentenceFilename");
        String mentionsFilename = localConfig.getString("mentionTestFilename");
        String alternateMentionsFilename = localConfig.getString("mentionAlternateFilename");
        String geneLabel = localConfig.getString("geneLabel");
        this.load(sentenceFilename, mentionsFilename, alternateMentionsFilename, geneLabel);
    }

    public void load(String sentenceFilename, String mentionsFilename, String alternateMentionsFilename, String geneLabel) {
        try {
            BufferedReader mentionTestFile = new BufferedReader(new FileReader(mentionsFilename));
            HashMap<String, LinkedList<Dataset.Tag>> tags = this.getTags(mentionTestFile, geneLabel);
            mentionTestFile.close();
            HashMap<String, LinkedList<Dataset.Tag>> alternateTags = null;
            if (alternateMentionsFilename != null) {
                BufferedReader mentionAlternateFile = new BufferedReader(new FileReader(alternateMentionsFilename));
                alternateTags = new HashMap<String, LinkedList<Dataset.Tag>>(this.getAlternateTags(mentionAlternateFile, geneLabel));
                mentionAlternateFile.close();
            }
            Pattern ws = Pattern.compile("\\s+");
            BufferedReader sentenceFile = new BufferedReader(new FileReader(sentenceFilename));
            String line = sentenceFile.readLine();
            while (line != null) {
                Matcher matcher = ws.matcher(line);
                matcher.find();
                String id = line.substring(0, matcher.start()).trim();
                String sentenceText = line.substring(matcher.end()).trim();
                Sentence sentence = this.getSentence(id, sentenceText, this.tokenizer, tags);
                if (alternateTags != null) {
                    this.addAlternateMentions(sentence, alternateTags);
                }
                this.sentences.add(sentence);
                line = sentenceFile.readLine();
            }
            sentenceFile.close();
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    protected HashMap<String, LinkedList<Dataset.Tag>> getTags(BufferedReader tagFile, String geneLabel) throws IOException {
        EntityType type = EntityType.getType(geneLabel != null && !geneLabel.isEmpty() ? geneLabel : "GENE");
        HashMap<String, LinkedList<Dataset.Tag>> tags = new HashMap<String, LinkedList<Dataset.Tag>>();
        String line = tagFile.readLine();
        while (line != null) {
            String[] split = line.split("\\s|\\|");
            LinkedList<Dataset.Tag> tagList = tags.get(split[0]);
            if (tagList == null) {
                tagList = new LinkedList();
            }
            Dataset.Tag tag = new Dataset.Tag(type, Integer.parseInt(split[1]), Integer.parseInt(split[2]));
            Iterator tagIterator = tagList.iterator();
            boolean add = true;
            while (tagIterator.hasNext() && add) {
                Dataset.Tag tag2 = (Dataset.Tag)tagIterator.next();
                if (tag.contains(tag2)) {
                    tagIterator.remove();
                    continue;
                }
                if (!tag2.contains(tag)) continue;
                add = false;
            }
            if (add) {
                tagList.add(tag);
                tags.put(split[0], tagList);
            }
            line = tagFile.readLine();
        }
        return tags;
    }

    protected HashMap<String, LinkedList<Dataset.Tag>> getAlternateTags(BufferedReader tagFile, String geneLabel) throws IOException {
        HashMap<String, LinkedList<Dataset.Tag>> tags = new HashMap<String, LinkedList<Dataset.Tag>>();
        String line = tagFile.readLine();
        while (line != null) {
            String[] split = line.split(" |\\|");
            LinkedList<Dataset.Tag> tagList = tags.get(split[0]);
            if (tagList == null) {
                tagList = new LinkedList();
            }
            EntityType type = EntityType.getType(geneLabel != null && !geneLabel.isEmpty() ? geneLabel : "GENE");
            Dataset.Tag tag = new Dataset.Tag(type, Integer.parseInt(split[1]), Integer.parseInt(split[2]));
            tagList.add(tag);
            tags.put(split[0], tagList);
            line = tagFile.readLine();
        }
        return tags;
    }

    protected Sentence getSentence(String id, String sentenceText, Tokenizer tokenizer, HashMap<String, LinkedList<Dataset.Tag>> tags) {
        Sentence sentence = new Sentence(id, null, sentenceText);
        tokenizer.tokenize(sentence);
        List<Token> tokens = sentence.getTokens();
        LinkedList<Dataset.Tag> tagList = tags.get(id);
        if (tagList != null) {
            for (Dataset.Tag tag : tagList) {
                int start = BC2GMDataset.getTokenIndex(tokens, tag.start);
                assert (start >= 0);
                int end = BC2GMDataset.getTokenIndex(tokens, tag.end);
                assert (end >= start);
                sentence.addMention(new Mention(sentence, start, end + 1, tag.type, Mention.MentionType.Required));
            }
        }
        return sentence;
    }

    protected void addAlternateMentions(Sentence sentence, HashMap<String, LinkedList<Dataset.Tag>> tags) {
        List<Token> tokens = sentence.getTokens();
        LinkedList<Dataset.Tag> tagList = tags.get(sentence.getSentenceId());
        if (tagList != null) {
            for (Dataset.Tag tag : tagList) {
                int start = BC2GMDataset.getTokenIndex(tokens, tag.start);
                assert (start >= 0);
                int end = BC2GMDataset.getTokenIndex(tokens, tag.end);
                assert (end >= start);
                sentence.addMention(new Mention(sentence, start, end + 1, tag.type, Mention.MentionType.Allowed));
            }
        }
    }

    protected static int getTokenIndex(List<Token> tokens, int index) {
        int chars = 0;
        for (int i = 0; i < tokens.size(); ++i) {
            int length = tokens.get(i).getText().length();
            if (index >= chars && index <= chars + length - 1) {
                return i;
            }
            chars += length;
        }
        return -1;
    }

    @Override
    public List<Dataset> split(int n) {
        ArrayList<Dataset> splitDatasets = new ArrayList<Dataset>();
        for (int i = 0; i < n; ++i) {
            BC2GMDataset dataset = new BC2GMDataset(this.tokenizer);
            splitDatasets.add(dataset);
        }
        Random r = new Random();
        for (Sentence sentence : this.sentences) {
            int num = r.nextInt(n);
            ((Dataset)splitDatasets.get((int)num)).sentences.add(sentence);
        }
        return splitDatasets;
    }
}

