/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.jcore.reader.nlmgene;

import com.pengyifan.bioc.BioCAnnotation;
import com.pengyifan.bioc.BioCCollection;
import com.pengyifan.bioc.BioCDocument;
import com.pengyifan.bioc.BioCPassage;
import com.pengyifan.bioc.io.BioCCollectionReader;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.ResourceEntry;
import de.julielab.jcore.types.Title;
import de.julielab.jcore.types.pubmed.AbstractText;
import de.julielab.jcore.types.pubmed.Header;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.stream.XMLStreamException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name="JCoRe NLM-Gene Reader", description="Collection reader for the BioC format of the NLM-Gene corpus.", vendor="JULIE Lab Jena, Germany")
@TypeCapability(inputs={}, outputs={"de.julielab.jcore.types.Gene", "de.julielab.jcore.types.ResourceEntry"})
public class NLMGeneReader
extends JCasCollectionReader_ImplBase {
    public static final String PARAM_INPUT_DIR = "InputDirectory";
    public static final String PARAM_ID_LIST_PATH = "IdList";
    private static final Logger log = LoggerFactory.getLogger(NLMGeneReader.class);
    @ConfigurationParameter(name="InputDirectory", description="Path to the directory that contains the BioC XML files of the NLM-Gene corpus.")
    private String inputDir;
    @ConfigurationParameter(name="IdList", mandatory=false, description="Path to a file with a list of IDs to restrict the read files to. This will typically be the list with IDs for the training or for the test set of the corpus. When no list is specified, the whole corpus is read.")
    private String idList;
    private Iterator<Path> corpusFileIterator;
    private int numRead;

    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.inputDir = (String)context.getConfigParameterValue(PARAM_INPUT_DIR);
        this.idList = (String)context.getConfigParameterValue(PARAM_ID_LIST_PATH);
        try {
            this.corpusFileIterator = this.readInputFiles(this.inputDir, this.idList);
        }
        catch (IOException e) {
            log.error("Could not read NLM-Gene corpus input files.", (Throwable)e);
            throw new ResourceInitializationException((Throwable)e);
        }
        this.numRead = 0;
    }

    private Iterator<Path> readInputFiles(String inputDir, String idList) throws IOException {
        Path inputPath = Path.of(inputDir, new String[0]);
        Path idListPath = idList != null ? Path.of(idList, new String[0]) : null;
        Set ids = idListPath != null && Files.exists(idListPath, new LinkOption[0]) ? Files.readAllLines(idListPath).stream().collect(Collectors.toSet()) : Collections.emptySet();
        return Files.list(inputPath).filter(p -> p.toString().toLowerCase().endsWith(".xml") || p.toString().toLowerCase().endsWith(".xml.gz")).filter(p -> ids.isEmpty() ? true : ids.contains(p.getFileName().toString().replaceAll("(?i)\\.bioc\\.xml(\\.gz)?", ""))).iterator();
    }

    public void getNext(JCas jCas) throws CollectionException {
        Path nextFile = this.corpusFileIterator.next();
        try {
            BioCCollectionReader reader = new BioCCollectionReader(nextFile);
            BioCCollection collection = reader.readCollection();
            if (collection.getDocmentCount() > 1) {
                throw new IllegalArgumentException("A single document per BioC collection is expected but the collection of file " + nextFile + " has " + collection.getDocmentCount() + " documents. This case is not supported.");
            }
            BioCDocument document = collection.getDocument(0);
            this.handleHeader(jCas, document);
            StringBuilder textBuilder = new StringBuilder();
            for (BioCPassage p : document.getPassages()) {
                int previousTextLength = textBuilder.length();
                textBuilder.append((String)p.getText().get());
                this.handlePassageStructureType(jCas, textBuilder, p, previousTextLength);
                this.handleAnnotation(jCas, document, p, textBuilder);
                textBuilder.append(System.getProperty("line.separator"));
            }
            jCas.setDocumentText(textBuilder.toString());
        }
        catch (IOException | XMLStreamException e) {
            log.error("Could not read NLM-Gene corpus file {}", (Object)nextFile, (Object)e);
            throw new CollectionException((Throwable)e);
        }
    }

    private void handleHeader(JCas jCas, BioCDocument document) {
        Header h = new Header(jCas);
        h.setDocId(document.getID());
        h.setComponentId(((Object)((Object)this)).getClass().getSimpleName());
        h.setSource("NLM-Gene");
        h.addToIndexes();
    }

    private void handleAnnotation(JCas jCas, BioCDocument document, BioCPassage p, StringBuilder textBuilder) {
        for (BioCAnnotation a : p.getAnnotations()) {
            Gene g = new Gene(jCas, a.getTotalLocation().getOffset(), a.getTotalLocation().getOffset() + a.getTotalLocation().getLength());
            g.setComponentId(((Object)((Object)this)).getClass().getSimpleName());
            Optional typeInfon = a.getInfon("type");
            Optional codeInfon = a.getInfon("code");
            this.handleErrors(document, p, a, g, typeInfon, textBuilder);
            this.handleGeneId(jCas, a, g);
            this.handleSpecificType(g, typeInfon, codeInfon);
            g.addToIndexes();
        }
    }

    private void handleSpecificType(Gene g, Optional<String> typeInfon, Optional<String> codeInfon) {
        g.setSpecificType(typeInfon.get());
        if (codeInfon.isPresent()) {
            g.setSpecificType(typeInfon.get() + "-" + codeInfon.get());
        }
    }

    private void handleErrors(BioCDocument document, BioCPassage p, BioCAnnotation a, Gene g, Optional<String> typeInfon, StringBuilder textBuilder) {
        if (!typeInfon.isPresent()) {
            throw new IllegalStateException("The annotation " + a.getID() + " of passage " + (String)p.getInfon("type").get() + " of document " + document.getID() + " does not specify a type.");
        }
    }

    private void handleGeneId(JCas jCas, BioCAnnotation a, Gene g) {
        Optional ncbiGeneId = a.getInfon("NCBI Gene identifier");
        if (ncbiGeneId.isPresent()) {
            ResourceEntry re = new ResourceEntry(jCas, g.getBegin(), g.getEnd());
            re.setEntryId((String)ncbiGeneId.get());
            if (((String)ncbiGeneId.get()).contains("|")) {
                re.setEntryId(((String)ncbiGeneId.get()).split("\\|")[0]);
            }
            re.setComponentId(((Object)((Object)this)).getClass().getSimpleName());
            FSArray resourceEntryList = new FSArray(jCas, 1);
            resourceEntryList.set(0, (FeatureStructure)re);
            g.setResourceEntryList(resourceEntryList);
        }
    }

    private void handlePassageStructureType(JCas jCas, StringBuilder textBuilder, BioCPassage p, int previousTextLength) {
        Optional typeInfon = p.getInfon("type");
        if (typeInfon.isPresent() && ((String)typeInfon.get()).equals("title")) {
            Title t = new Title(jCas, previousTextLength, textBuilder.length());
            t.setTitleType("document");
            t.setComponentId(((Object)((Object)this)).getClass().getSimpleName());
            t.addToIndexes();
        } else if (typeInfon.isPresent() && ((String)typeInfon.get()).equals("abstract")) {
            AbstractText abstractText = new AbstractText(jCas, previousTextLength, textBuilder.length());
            abstractText.setComponentId(((Object)((Object)this)).getClass().getSimpleName());
            abstractText.addToIndexes();
        }
    }

    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.numRead, 0, "documents")};
    }

    public boolean hasNext() {
        return this.corpusFileIterator.hasNext();
    }
}

