/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.reader.bc2gm;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.Header;
import de.julielab.jcore.types.Sentence;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name="JCoRe BioCreative II Gene Mention reader", description="This component reads gene annotated sentences in the BioCreative II Gene Mention challenge format. Each CAS will contain one annotated sentence.")
@TypeCapability(outputs={"de.julielab.jcore.types.Gene"})
public class BC2GMReader
extends JCasCollectionReader_ImplBase {
    private static final Logger log = LoggerFactory.getLogger(BC2GMReader.class);
    public static final String PARAM_SENTENCES = "SentencesFile";
    public static final String PARAM_GENES = "GenesFile";
    @ConfigurationParameter(name="SentencesFile", description="The BC2GM data is comprised of one file holding one sentence per line and another file holding the annotations. This parameter should be set to the file containing the sentences.")
    private String sentenceFile;
    @ConfigurationParameter(name="GenesFile", mandatory=false, description="The BC2GM data is comprised of one file holding one sentence per line and another file holding the annotations. This parameter should be set to the file holding the gene annotations.")
    private String genesFile;
    private Multimap<String, GeneAnnotation> geneAnnotations;
    private Iterator<String> sentencesIterator;

    public static TreeMap<Integer, Integer> createNumWsMap(String input) {
        TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
        map.put(0, 0);
        int numWs = 0;
        boolean lastCharWasWs = false;
        for (int i = 0; i < input.length(); ++i) {
            char c;
            if (lastCharWasWs) {
                map.put(i, numWs);
            }
            if (Character.isWhitespace(c = input.charAt(i))) {
                ++numWs;
                lastCharWasWs = true;
                continue;
            }
            lastCharWasWs = false;
        }
        return map;
    }

    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.sentenceFile = (String)context.getConfigParameterValue(PARAM_SENTENCES);
        this.genesFile = (String)context.getConfigParameterValue(PARAM_GENES);
        if (null == this.sentenceFile) {
            throw new ResourceInitializationException((Throwable)new IllegalArgumentException("Sentences file parameter is null."));
        }
        log.info("Reading sentences from {}", (Object)this.sentenceFile);
        if (null != this.genesFile) {
            try {
                log.info("Reading gene annotations from {}", (Object)this.genesFile);
                this.geneAnnotations = this.readGeneAnnotations(this.genesFile);
            }
            catch (IOException e) {
                throw new ResourceInitializationException((Throwable)e);
            }
        } else {
            log.info("No gene annotation file specified.");
        }
        try {
            this.sentencesIterator = Files.readAllLines(new File(this.sentenceFile).toPath(), Charset.forName("UTF-8")).iterator();
        }
        catch (IOException e) {
            throw new ResourceInitializationException((Throwable)e);
        }
    }

    private Multimap<String, GeneAnnotation> readGeneAnnotations(String genesFile) throws FileNotFoundException, IOException {
        HashMultimap annotations = HashMultimap.create();
        try (FileInputStream is = new FileInputStream(genesFile);){
            Iterator lineIterator = IOUtils.readLines((InputStream)is).iterator();
            while (lineIterator.hasNext()) {
                GeneAnnotation geneAnnotation = new GeneAnnotation();
                String genesLine = (String)lineIterator.next();
                String[] record = genesLine.split("\\|");
                String[] offsets = record[1].split(" ");
                geneAnnotation.sentenceId = record[0];
                geneAnnotation.start = Integer.parseInt(offsets[0].trim());
                geneAnnotation.end = Integer.parseInt(offsets[1].trim());
                geneAnnotation.text = record[2];
                annotations.put((Object)geneAnnotation.sentenceId, (Object)geneAnnotation);
            }
        }
        if (log.isInfoEnabled()) {
            log.info("Got {} gene annotations in {} sentences", (Object)annotations.size(), (Object)annotations.keySet().size());
        }
        return annotations;
    }

    public void getNext(JCas cas) throws IOException, CollectionException {
        String sentenceLine = this.sentencesIterator.next();
        String[] split = sentenceLine.split(" ", 2);
        String id = split[0];
        String sentence = split[1];
        cas.setDocumentText(sentence);
        Header header = new Header(cas);
        header.setDocId(id);
        header.addToIndexes();
        new Sentence(cas, 0, sentence.length()).addToIndexes();
        TreeMap<Integer, Integer> wsMap = BC2GMReader.createNumWsMap(sentence);
        if (this.geneAnnotations != null) {
            Collection sentenceAnnotations = this.geneAnnotations.get((Object)id);
            for (GeneAnnotation ga : sentenceAnnotations) {
                int start = ga.start;
                String text = ga.text;
                int textStart = 0;
                boolean positionFound = false;
                while ((textStart = sentence.indexOf(text, textStart)) != -1) {
                    Integer numWs = wsMap.floorEntry(textStart).getValue();
                    if (start + numWs == textStart || start + numWs == textStart + 1 || start + numWs == textStart - 1) {
                        int end = ga.end + wsMap.floorEntry(textStart + text.length()).getValue() + 1;
                        Gene gene = new Gene(cas, start + numWs, end);
                        gene.addToIndexes();
                        positionFound = true;
                        break;
                    }
                    textStart += text.length();
                }
                if (positionFound) continue;
                throw new IllegalStateException("The document-relative, whitespace-including position of the gene \"" + text + "\" with BC2GM offsets " + ga.start + "-" + ga.end + " in sentence " + id + " could not be found.");
            }
        }
    }

    public void close() throws IOException {
    }

    public Progress[] getProgress() {
        return null;
    }

    public boolean hasNext() throws IOException, CollectionException {
        return this.sentencesIterator.hasNext();
    }

    public class GeneAnnotation {
        public String sentenceId;
        public int start;
        public int end;
        public String text;
    }
}

