/*
 * Decompiled with CFR 0.152.
 */
package de.julielab.topicmodeling.services;

import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.ArrayIterator;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.topics.TopicInferencer;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.TokenSequence;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.Lemma;
import de.julielab.jcore.types.Token;
import de.julielab.jcore.types.pubmed.Header;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.topicmodeling.businessobjects.Document;
import de.julielab.topicmodeling.businessobjects.Model;
import de.julielab.topicmodeling.businessobjects.TMSearchResult;
import de.julielab.topicmodeling.businessobjects.Topic;
import de.julielab.topicmodeling.services.ITopicModeling;
import de.julielab.xml.JulieXMLTools;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.configuration2.HierarchicalConfiguration;
import org.apache.commons.configuration2.XMLConfiguration;
import org.apache.commons.configuration2.builder.BuilderParameters;
import org.apache.commons.configuration2.builder.FileBasedConfigurationBuilder;
import org.apache.commons.configuration2.builder.fluent.Parameters;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.apache.commons.configuration2.tree.ImmutableNode;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MalletTopicModeling
implements ITopicModeling {
    private static final Logger LOGGER = LoggerFactory.getLogger(MalletTopicModeling.class);
    // XPath handed to JulieXMLTools.constructRowIterator in readDocuments as the
    // "for each" expression; presumably selects one element per citation — TODO confirm.
    String forEach = "/PubmedArticleSet/PubmedArticle/MedlineCitation";
    // XPath of the field whose value readDocuments stores in Document.id.
    String idField = "PMID";
    // XPath of the field whose value readDocuments stores in Document.text.
    String textField = "Article/Abstract/AbstractText";
    // XPath of a second text field; when present, readDocuments lets it replace the value of textField.
    String alternativeTextField = "OtherAbstract/AbstractText";

    /**
     * Computes the cosine similarity of two vectors.
     *
     * <p>The loop runs over the length of {@code vectorA}; {@code vectorB} must be at
     * least as long, and any additional components of {@code vectorB} are ignored.
     *
     * @param vectorA first vector; determines how many components are compared
     * @param vectorB second vector
     * @return the cosine of the angle between the vectors, or {@code 0.0} when either
     *         vector has zero magnitude (including empty vectors), where the cosine
     *         is undefined
     * @throws ArrayIndexOutOfBoundsException if {@code vectorB} is shorter than {@code vectorA}
     */
    public static double computeSimilarity(double[] vectorA, double[] vectorB) {
        double dotProduct = 0.0;
        double normA = 0.0;
        double normB = 0.0;
        for (int i = 0; i < vectorA.length; ++i) {
            dotProduct += vectorA[i] * vectorB[i];
            normA += vectorA[i] * vectorA[i];
            normB += vectorB[i] * vectorB[i];
        }
        double magnitude = Math.sqrt(normA) * Math.sqrt(normB);
        if (magnitude == 0.0) {
            // Avoid 0/0 = NaN, which would poison any ranking built on this score.
            return 0.0;
        }
        return dotProduct / magnitude;
    }

    /**
     * Loads an XML configuration file through a file-based configuration builder.
     *
     * @param configFile name of the XML configuration file
     * @return the configuration produced by the builder
     * @throws ConfigurationException if the builder cannot create the configuration
     */
    public XMLConfiguration loadConfig(String configFile) throws ConfigurationException {
        FileBasedConfigurationBuilder<XMLConfiguration> configBuilder =
                new FileBasedConfigurationBuilder<XMLConfiguration>(XMLConfiguration.class);
        configBuilder.configure(new Parameters().xml().setFileName(configFile));
        return configBuilder.getConfiguration();
    }

    /**
     * Preprocesses the given documents and trains a Mallet topic model on them.
     *
     * <p>Training parameters are read from {@code train.parameters.parameter.*}, the model
     * identity from {@code model.meta.ID} and {@code model.meta.Version}. Training is
     * best-effort: if preprocessing or estimation fails, the error is logged and the
     * returned model has no {@code malletModel} set.
     *
     * @param docs      documents to train on; documents without text are skipped by preprocessing
     * @param xmlConfig configuration holding training parameters and model metadata
     * @return the model carrying id, version, the Mallet-ID-to-PMID mapping and, on success,
     *         the trained Mallet model
     */
    @Override
    public Model train(List<Document> docs, XMLConfiguration xmlConfig) {
        int numTopics = xmlConfig.getInt("train.parameters.parameter.numTopics");
        double alphaSum = xmlConfig.getDouble("train.parameters.parameter.alphaSum");
        double beta = xmlConfig.getDouble("train.parameters.parameter.beta");
        int numThreads = xmlConfig.getInt("train.parameters.parameter.numThreads");
        int numIterations = xmlConfig.getInt("train.parameters.parameter.numIterations");
        int optimizationInterval = xmlConfig.getInt("train.parameters.parameter.optimizationInterval");
        String modelId = xmlConfig.getString("model.meta.ID");
        String modelVersion = xmlConfig.getString("model.meta.Version");
        LOGGER.info("Chosen number of topics: " + numTopics);
        LOGGER.info("Chosen Dirichlet-alpha: " + alphaSum);
        LOGGER.info("Chosen Dirichlet-beta: " + beta);
        LOGGER.info("Chosen training iterations: " + numIterations);
        LOGGER.info("Chosen optimization interval (if 0, optim. is deactivated): " + optimizationInterval);
        ParallelTopicModel malletParallelModel = new ParallelTopicModel(numTopics, alphaSum, beta);
        Model model = new Model();
        model.modelId = modelId;
        model.modelVersion = modelVersion;
        try {
            LOGGER.info("Start preprocessing");
            InstanceList instances = this.preprocess(docs);
            malletParallelModel.addInstances(instances);
            malletParallelModel.setNumThreads(numThreads);
            malletParallelModel.setNumIterations(numIterations);
            malletParallelModel.setOptimizeInterval(optimizationInterval);
            malletParallelModel.estimate();
            model.malletModel = malletParallelModel;
            LOGGER.info("Model: " + modelId + " Version: " + modelVersion + " is trained");
        }
        catch (Exception e) {
            LOGGER.error("Training of model " + modelId + " version " + modelVersion + " failed", e);
        }
        // Preprocessing skips documents without text, so only documents that actually
        // became Mallet instances may take part in the positional ID mapping; otherwise
        // every ID after the first skipped document would point to the wrong PMID.
        List<Document> trainedDocs = new ArrayList<Document>(docs.size());
        for (Document doc : docs) {
            if (doc.text != null) {
                trainedDocs.add(doc);
            }
        }
        this.mapMalletIdToPubmedId(trainedDocs, model);
        LOGGER.info("PubMed citation IDs (PMIDs) are mapped to Mallet document IDs");
        return model;
    }

    /**
     * Trains a Mallet topic model on already prepared Mallet instances.
     *
     * <p>Training parameters are read from {@code train.parameters.parameter.*}, the model
     * identity from {@code model.id} and {@code model.version}. Training is best-effort:
     * if estimation fails, the error is logged and the returned model has no
     * {@code malletModel} set.
     *
     * @param instances Mallet instances to train on
     * @param xmlConfig configuration holding training parameters and model metadata
     * @return the model carrying id, version and, on success, the trained Mallet model
     */
    public Model train(InstanceList instances, XMLConfiguration xmlConfig) {
        String modelId = xmlConfig.getString("model.id");
        String modelVersion = xmlConfig.getString("model.version");
        int numTopics = xmlConfig.getInt("train.parameters.parameter.numTopics");
        double alphaSum = xmlConfig.getDouble("train.parameters.parameter.alphaSum");
        double beta = xmlConfig.getDouble("train.parameters.parameter.beta");
        int numThreads = xmlConfig.getInt("train.parameters.parameter.numThreads");
        int numIterations = xmlConfig.getInt("train.parameters.parameter.numIterations");
        int optimizationInterval = xmlConfig.getInt("train.parameters.parameter.optimizationInterval");
        LOGGER.info("Chosen number of topics: " + numTopics);
        LOGGER.info("Chosen Dirichlet-alpha: " + alphaSum);
        LOGGER.info("Chosen Dirichlet-beta: " + beta);
        LOGGER.info("Chosen training iterations: " + numIterations);
        LOGGER.info("Chosen optimization interval (if 0, optim. is deactivated): " + optimizationInterval);
        ParallelTopicModel malletParallelModel = new ParallelTopicModel(numTopics, alphaSum, beta);
        Model model = new Model();
        // Identity is known regardless of the training outcome, so set it up front.
        model.modelId = modelId;
        model.modelVersion = modelVersion;
        try {
            malletParallelModel.addInstances(instances);
            malletParallelModel.setNumThreads(numThreads);
            malletParallelModel.setNumIterations(numIterations);
            malletParallelModel.setOptimizeInterval(optimizationInterval);
            LOGGER.info("Start training");
            malletParallelModel.estimate();
            model.malletModel = malletParallelModel;
            LOGGER.info("Model is trained");
        }
        catch (Exception e) {
            LOGGER.error("Training of model " + modelId + " version " + modelVersion + " failed", e);
        }
        return model;
    }

    /**
     * Writes the Mallet model contained in {@code model} to the given file.
     *
     * <p>Nothing is written when the model carries no Mallet model. Failures are logged
     * and not propagated.
     *
     * @param model model whose {@code malletModel} is written
     * @param file  destination file
     */
    public void saveMalletModel(Model model, File file) {
        if (model.malletModel == null) {
            LOGGER.warn("No Mallet model was found in ModelID: " + model.modelId + ", ModelVersion: " + model.modelVersion);
            return;
        }
        try {
            model.malletModel.write(file);
            LOGGER.info("Mallet model is saved in " + file.getName());
        }
        catch (Exception e) {
            LOGGER.error("Could not save Mallet model to " + file.getName(), e);
        }
    }

    /**
     * Serializes the given model to a file using Java object serialization.
     *
     * <p>Failures are logged and not propagated. Both streams are closed even when
     * writing fails.
     *
     * @param model    model to serialize
     * @param filename path of the destination file
     */
    @Override
    public void saveModel(Model model, String filename) {
        try {
            try (OutputStream fileOut = FileUtilities.getOutputStreamToFile(new File(filename));
                    ObjectOutputStream out = new ObjectOutputStream(fileOut)) {
                out.writeObject(model);
            }
            // Report success only after the streams were flushed and closed without error.
            LOGGER.info("Serialized Model is saved in " + filename);
        }
        catch (IOException e) {
            LOGGER.error("Could not serialize model to " + filename, e);
        }
    }

    /**
     * Reads documents from an XML file, or from all XML files of a directory.
     *
     * <p>For a directory, files ending in {@code .xml}, {@code .xml.gz}, {@code .xml.gzip}
     * or {@code .xml.zip} are read. The number of files read can be limited with the
     * configuration key {@code evaluate.heldout.files.number}; the limit is capped at the
     * number of files actually found.
     *
     * @param file      an XML file or a directory containing XML files
     * @param xmlConfig configuration; only consulted for the optional file limit
     * @return the documents found; empty if the directory cannot be listed
     */
    @Override
    public List<Document> readDocuments(File file, XMLConfiguration xmlConfig) {
        if (file.isDirectory()) {
            return this.readDocumentsFromDirectory(file, xmlConfig);
        }
        String[] fieldPaths = new String[]{this.idField, this.textField, this.alternativeTextField};
        List<Map<String, String>> fields = new ArrayList<Map<String, String>>();
        for (int i = 0; i < fieldPaths.length; ++i) {
            Map<String, String> field = new HashMap<String, String>();
            field.put("name", "fieldvalue" + i);
            field.put("xpath", fieldPaths[i]);
            fields.add(field);
        }
        Iterator<?> rowIterator = JulieXMLTools.constructRowIterator(file.getAbsolutePath(), 1024, this.forEach, fields, false);
        List<Document> docs = new ArrayList<Document>();
        while (rowIterator.hasNext()) {
            Map<?, ?> row = (Map<?, ?>) rowIterator.next();
            String idValue = (String) row.get("fieldvalue0");
            String textValue = (String) row.get("fieldvalue1");
            String alternativeTextValue = (String) row.get("fieldvalue2");
            Document doc = new Document();
            if (idValue != null) {
                doc.id = idValue;
            }
            if (textValue != null) {
                doc.text = textValue;
            }
            // NOTE(review): the alternative text replaces the primary abstract whenever both
            // are present. The field name suggests it was meant as a fallback only — confirm
            // the intended precedence before changing it.
            if (alternativeTextValue != null) {
                doc.text = alternativeTextValue;
            }
            docs.add(doc);
        }
        LOGGER.info("Total citations found: " + docs.size());
        return docs;
    }

    /** Reads the documents of the XML files directly contained in {@code directory}. */
    private List<Document> readDocumentsFromDirectory(File directory, XMLConfiguration xmlConfig) {
        FilenameFilter xmlFilter = new FilenameFilter(){

            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".xml.gz") || name.endsWith(".xml.zip") || name.endsWith(".xml.gzip") || name.endsWith(".xml");
            }
        };
        List<Document> docs = new ArrayList<Document>();
        File[] xmlFiles = directory.listFiles(xmlFilter);
        if (xmlFiles == null) {
            // listFiles returns null on I/O errors or when the directory is not readable.
            LOGGER.warn("Could not list files in directory " + directory.getAbsolutePath());
            return docs;
        }
        int configuredCount = xmlConfig.getInteger("evaluate.heldout.files.number", Integer.valueOf(xmlFiles.length));
        // Never read past the files that exist, even if the configuration asks for more.
        int fileCount = Math.min(configuredCount, xmlFiles.length);
        for (int i = 0; i < fileCount; ++i) {
            LOGGER.info("Attempt to read " + xmlFiles[i].getName() + ", no. " + (i + 1) + " of total " + fileCount);
            docs.addAll(this.readDocuments(xmlFiles[i], xmlConfig));
        }
        return docs;
    }

    /**
     * Reads documents through the JCoRe XMI database reader and extracts their lemmata.
     *
     * <p>Table name, annotation schema, reset flag and CoStoSys configuration file are
     * taken from the {@code train.corpus.*} configuration keys. Each document read
     * receives its lemma sequence as {@code preprocessedData} and its ID from the CAS.
     *
     * @param tm            instance used to extract lemmata and the document ID from each CAS
     * @param configuration configuration holding the {@code train.corpus.*} settings
     * @return the documents read from the database
     * @throws RuntimeException wrapping any exception raised while reading
     */
    public List<Document> readXmiDb(MalletTopicModeling tm, HierarchicalConfiguration<ImmutableNode> configuration) {
        String subsetTable = configuration.getString("train.corpus.subset.table");
        String annotationSchema = configuration.getString("train.corpus.subset.annotationpgschema");
        boolean resetTable = configuration.getBoolean("train.corpus.subset.reset", false);
        String costosysConfig = configuration.getString("train.corpus.costosys.configurationFile");
        LOGGER.info("Start reading from DB table {} with CoStoSys configuration file {}", (Object)subsetTable, (Object)costosysConfig);

        ArrayList<String> additionalTables = new ArrayList<String>();
        additionalTables.add(Token.class.getCanonicalName());

        ArrayList<Document> readDocs = new ArrayList<Document>();
        try {
            Object[] readerParameters = new Object[]{
                    "AdditionalTables", additionalTables,
                    "Table", subsetTable,
                    "CostosysConfigFile", costosysConfig,
                    "ReadsBaseDocument", true,
                    "ResetTable", resetTable,
                    "AdditionalTablesPostgresSchema", annotationSchema};
            CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-reader", readerParameters);
            CAS cas = JCasFactory.createJCas(new String[]{"de.julielab.jcore.types.jcore-all-types"}).getCas();
            while (reader.hasNext()) {
                reader.getNext(cas);
                JCas filledCas = cas.getJCas();
                Document current = new Document();
                current.preprocessedData = tm.getLemmata(filledCas);
                current.id = tm.getId(filledCas);
                LOGGER.debug("Data for doc " + current.id + ": " + current.preprocessedData);
                readDocs.add(current);
            }
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
        return readDocs;
    }

    /**
     * Ranks the documents of the trained Mallet model and of the model's index by their
     * topic similarity to the query.
     *
     * <p>The query's topic distribution is inferred, topics with a probability of at
     * least {@code search.parameters.parameter.probabilityThreshold} are kept, and every
     * candidate document is compared to the query by cosine similarity over those topics.
     * Hits are returned most similar first, limited to {@code search.results.displayedHits}
     * entries if configured. A document found in both sources is reported once, scored
     * from the index.
     *
     * @param query     query document; preprocessed on demand if it has no preprocessed data
     * @param model     model providing the trained Mallet model, the index and the ID mappings
     * @param xmlConfig configuration holding search and inference parameters
     * @return parallel lists of Mallet IDs, similarities and PubMed IDs; a Mallet ID or
     *         PubMed ID is {@code null} where the model has no mapping for the document
     */
    @Override
    public TMSearchResult search(Document query, Model model, XMLConfiguration xmlConfig) {
        return this.rankDocuments(query, model, xmlConfig, true, true);
    }

    /**
     * Like {@link #search(Document, Model, XMLConfiguration)}, but considers only the
     * documents the Mallet model was trained on.
     *
     * @param query     query document; preprocessed on demand if it has no preprocessed data
     * @param model     model providing the trained Mallet model and the ID mapping
     * @param xmlConfig configuration holding search and inference parameters
     * @return the ranked hits, most similar first
     */
    public TMSearchResult searchModelOnly(Document query, Model model, XMLConfiguration xmlConfig) {
        return this.rankDocuments(query, model, xmlConfig, true, false);
    }

    /**
     * Like {@link #search(Document, Model, XMLConfiguration)}, but considers only the
     * documents stored in the model's index.
     *
     * @param query     query document; preprocessed on demand if it has no preprocessed data
     * @param model     model providing the Mallet model (for inference) and the index
     * @param xmlConfig configuration holding search and inference parameters
     * @return the ranked hits, most similar first
     */
    public TMSearchResult searchIndexOnly(Document query, Model model, XMLConfiguration xmlConfig) {
        return this.rankDocuments(query, model, xmlConfig, false, true);
    }

    /** Shared implementation of the three search variants; the flags select the document sources. */
    private TMSearchResult rankDocuments(Document query, Model model, XMLConfiguration xmlConfig, boolean searchModel, boolean searchIndex) {
        double probabilityThreshold = xmlConfig.getDouble("search.parameters.parameter.probabilityThreshold");
        TMSearchResult result = new TMSearchResult();
        result.malletId = new ArrayList<Integer>();
        result.probabilities = new ArrayList<Double>();
        result.pubmedID = new ArrayList<String>();
        if (query.preprocessedData == null) {
            List<Document> queryInList = new ArrayList<Document>();
            queryInList.add(query);
            List<TokenSequence> preprocessedQueryInList = this.jcorePreprocess(queryInList);
            if (preprocessedQueryInList.isEmpty()) {
                // Preprocessing yields nothing for a query without text or when it fails.
                LOGGER.warn("Query document " + query.id + " could not be preprocessed; returning an empty search result");
                return result;
            }
            query.preprocessedData = preprocessedQueryInList.get(0);
        }
        List<Topic> queryTopics = this.inferLabel(query, model, xmlConfig).get(query.id);

        // Topics are addressed by their position in the inferred list, which equals their ID.
        List<Integer> relevantTopics = new ArrayList<Integer>();
        for (int i = 0; i < queryTopics.size(); ++i) {
            if (queryTopics.get(i).probability >= probabilityThreshold) {
                relevantTopics.add(i);
            }
        }
        double[] queryProbabilities = new double[relevantTopics.size()];
        Map<Integer, Integer> positionOfTopic = new HashMap<Integer, Integer>();
        for (int k = 0; k < relevantTopics.size(); ++k) {
            Integer topicId = relevantTopics.get(k);
            queryProbabilities[k] = queryTopics.get(topicId).probability;
            positionOfTopic.put(topicId, k);
        }

        // Keyed by PubMed ID where known (a String), otherwise by Mallet ID (an Integer);
        // keys of different types never collide, so unmapped model documents stay distinct.
        Map<Object, SearchHit> hits = new HashMap<Object, SearchHit>();
        if (searchModel) {
            this.collectModelHits(model, relevantTopics, queryProbabilities, hits);
        }
        if (searchIndex) {
            this.collectIndexHits(model, positionOfTopic, queryProbabilities, hits);
        }

        List<SearchHit> ranked = new ArrayList<SearchHit>(hits.values());
        Collections.sort(ranked, new Comparator<SearchHit>(){

            @Override
            public int compare(SearchHit left, SearchHit right) {
                // Descending: the most similar document comes first.
                return Double.compare(right.similarity, left.similarity);
            }
        });
        int configuredHits = xmlConfig.getInt("search.results.displayedHits", ranked.size());
        int displayedHits = Math.max(0, Math.min(configuredHits, ranked.size()));
        for (int i = 0; i < displayedHits; ++i) {
            SearchHit hit = ranked.get(i);
            result.malletId.add(hit.malletId);
            result.probabilities.add(hit.similarity);
            result.pubmedID.add(hit.pubmedId);
        }
        return result;
    }

    /** Scores every document of the trained Mallet model against the query. */
    private void collectModelHits(Model model, List<Integer> relevantTopics, double[] queryProbabilities, Map<Object, SearchHit> hits) {
        double[][] documentsTopics = model.malletModel.getDocumentTopics(false, false);
        for (int docId = 0; docId < documentsTopics.length; ++docId) {
            double[] documentWeights = new double[relevantTopics.size()];
            for (int k = 0; k < documentWeights.length; ++k) {
                documentWeights[k] = documentsTopics[docId][relevantTopics.get(k)];
            }
            Integer malletId = Integer.valueOf(docId);
            String pubmedId = model.ModelIdpubmedId == null ? null : (String)model.ModelIdpubmedId.get(malletId);
            Object key = pubmedId != null ? (Object)pubmedId : (Object)malletId;
            hits.put(key, new SearchHit(malletId, pubmedId, MalletTopicModeling.similarityOrZero(queryProbabilities, documentWeights)));
        }
    }

    /** Scores every document of the model's index against the query; index keys are used as PubMed IDs. */
    private void collectIndexHits(Model model, Map<Integer, Integer> positionOfTopic, double[] queryProbabilities, Map<Object, SearchHit> hits) {
        Map<String, List<Topic>> index = model.index;
        if (index == null) {
            LOGGER.warn("Model " + model.modelId + " has no index; no indexed documents are searched");
            return;
        }
        for (Map.Entry<String, List<Topic>> indexed : index.entrySet()) {
            List<Topic> documentTopics = indexed.getValue();
            if (documentTopics == null) {
                continue;
            }
            double[] documentProbabilities = new double[queryProbabilities.length];
            for (Topic topic : documentTopics) {
                Integer position = positionOfTopic.get(topic.id);
                if (position != null) {
                    documentProbabilities[position] = topic.probability;
                }
            }
            String pubmedId = indexed.getKey();
            SearchHit fromModel = hits.get(pubmedId);
            Integer malletId = null;
            if (fromModel != null) {
                // Same document already found in the trained model: keep its Mallet ID.
                malletId = fromModel.malletId;
            } else if (model.pubmedIdModelId != null) {
                malletId = (Integer)model.pubmedIdModelId.get(pubmedId);
            }
            hits.put(pubmedId, new SearchHit(malletId, pubmedId, MalletTopicModeling.similarityOrZero(queryProbabilities, documentProbabilities)));
        }
    }

    /** Cosine similarity with undefined results (zero-magnitude vectors) mapped to 0 so they rank last. */
    private static double similarityOrZero(double[] queryProbabilities, double[] documentWeights) {
        double similarity = MalletTopicModeling.computeSimilarity(queryProbabilities, documentWeights);
        return Double.isNaN(similarity) ? 0.0 : similarity;
    }

    /** One scored candidate document of a search. */
    private static final class SearchHit {
        final Integer malletId;
        final String pubmedId;
        final double similarity;

        SearchHit(Integer malletId, String pubmedId, double similarity) {
            this.malletId = malletId;
            this.pubmedId = pubmedId;
            this.similarity = similarity;
        }
    }

    /**
     * Infers the topic distribution of a preprocessed document with the trained model.
     *
     * <p>Sampling parameters are read from {@code infer.parameters.parameter.numIterations},
     * {@code savingInterval} (thinning) and {@code firstSavingInterval} (burn-in).
     *
     * @param doc       document whose {@code preprocessedData} holds its token sequence
     * @param model     model providing the trained Mallet model
     * @param xmlConfig configuration holding the inference parameters
     * @return a single-entry map from the document ID to one {@code Topic} per model topic,
     *         where each topic's ID is its position in the list
     * @throws IllegalArgumentException if the document has no preprocessed data
     */
    @Override
    public Map<String, List<Topic>> inferLabel(Document doc, Model model, XMLConfiguration xmlConfig) {
        int numIterations = xmlConfig.getInt("infer.parameters.parameter.numIterations");
        int thinning = xmlConfig.getInt("infer.parameters.parameter.savingInterval");
        int burnIn = xmlConfig.getInt("infer.parameters.parameter.firstSavingInterval");
        ParallelTopicModel malletModel = model.malletModel;
        TopicInferencer inferencer = malletModel.getInferencer();
        TokenSequence jcorePreprocessed = (TokenSequence)doc.preprocessedData;
        if (jcorePreprocessed == null) {
            throw new IllegalArgumentException("Document " + doc.id + " has no preprocessed data to infer topics from");
        }
        if (jcorePreprocessed.isEmpty()) {
            LOGGER.warn("Document tokens are empty");
        }
        Instance instance = this.toModelInstance(jcorePreprocessed, malletModel.getAlphabet());
        double[] distribution = inferencer.getSampledDistribution(instance, numIterations, thinning, burnIn);
        List<Topic> topics = new ArrayList<Topic>(distribution.length);
        for (int i = 0; i < distribution.length; ++i) {
            Topic topic = new Topic();
            topic.probability = distribution[i];
            topic.id = i;
            topics.add(topic);
        }
        Map<String, List<Topic>> result = new HashMap<String, List<Topic>>();
        result.put(doc.id, topics);
        return result;
    }

    /**
     * Converts a token sequence into a Mallet instance whose feature indices refer to the
     * model's own alphabet. A freshly created alphabet would number the words in order of
     * appearance, so the inferencer would look up the statistics of unrelated words.
     */
    private Instance toModelInstance(TokenSequence tokens, Alphabet modelAlphabet) {
        List<Pipe> pipeList = new ArrayList<Pipe>();
        pipeList.add(new TokenSequenceRemoveStopwords(false, false));
        pipeList.add(new TokenSequence2FeatureSequence(modelAlphabet));
        InstanceList instances = new InstanceList((Pipe)new SerialPipes(pipeList));
        List<TokenSequence> data = new ArrayList<TokenSequence>();
        data.add(tokens);
        // Stop growth while piping so unknown words are dropped instead of being appended
        // to the trained model's vocabulary; the previous state is restored afterwards.
        boolean growthWasStopped = modelAlphabet.growthStopped();
        modelAlphabet.stopGrowth();
        try {
            instances.addThruPipe((Iterator)new ArrayIterator(data));
        }
        finally {
            if (!growthWasStopped) {
                modelAlphabet.startGrowth();
            }
        }
        return (Instance)instances.get(0);
    }

    /**
     * Infers the topic distribution for the document held in a CAS.
     *
     * <p>The lemmata of the CAS tokens become the document's preprocessed data and the
     * document ID is obtained via {@code JCoReTools.getDocId}.
     *
     * @param cas       CAS containing tokens with lemma annotations
     * @param model     model providing the trained Mallet model
     * @param xmlConfig configuration holding the inference parameters
     * @return map from the document ID to its inferred topics
     */
    public Map<String, List<Topic>> inferLabel(JCas cas, Model model, XMLConfiguration xmlConfig) {
        TokenSequence lemmata = this.getLemmata(cas);
        Document casDocument = new Document();
        casDocument.id = JCoReTools.getDocId(cas);
        casDocument.preprocessedData = lemmata;
        return this.inferLabel(casDocument, model, xmlConfig);
    }

    /**
     * Reads a Mallet topic model from a file and wraps it in a {@code Model}.
     *
     * <p>Failures are logged and not propagated; in that case the returned model has no
     * {@code malletModel} set.
     *
     * @param file file containing a serialized Mallet {@code ParallelTopicModel}
     * @return a new model carrying the Mallet model if reading succeeded
     */
    public Model readMalletModel(File file) {
        Model model = new Model();
        try {
            model.malletModel = ParallelTopicModel.read(file);
        }
        catch (Exception e) {
            LOGGER.error("Could not read Mallet model from " + file, e);
        }
        return model;
    }

    /**
     * Deserializes a model from a file written with Java object serialization.
     *
     * <p>Failures are logged and not propagated; in that case an empty model is returned.
     * Both streams are closed even when reading fails.
     *
     * @param filename path of the serialized model file
     * @return the deserialized model, or a new empty model if reading failed
     */
    @Override
    public Model readModel(String filename) {
        Model model = new Model();
        // NOTE(review): native Java deserialization executes code paths chosen by the file's
        // content; only load model files from trusted sources.
        try (InputStream fileIn = FileUtilities.getInputStreamFromFile(new File(filename));
                ObjectInputStream in = new ObjectInputStream(fileIn)) {
            model = (Model)in.readObject();
        }
        catch (IOException e) {
            LOGGER.error("Could not read serialized model from " + filename, e);
        }
        catch (ClassNotFoundException c) {
            LOGGER.error("Model class not found", (Throwable)c);
        }
        return model;
    }

    /**
     * Runs the full preprocessing chain: JCoRe lemmatization followed by the Mallet pipes.
     *
     * @param docs documents to preprocess
     * @return the Mallet instances built from the documents' lemmata
     */
    public InstanceList preprocess(List<Document> docs) {
        return this.malletPreprocess(this.jcorePreprocess(docs));
    }

    /**
     * Runs sentence splitting, tokenization, POS tagging and lemmatization on the text of
     * each document and collects the resulting lemma sequences.
     *
     * <p>Documents without text are skipped, so the result may be shorter than
     * {@code docs} and is then not positionally aligned with it. Processing is
     * best-effort: on failure the error is logged and the sequences collected so far are
     * returned. The analysis engines are destroyed in every case.
     *
     * @param docs documents whose {@code text} is processed
     * @return one lemma sequence per document that has text, in input order
     */
    public List<TokenSequence> jcorePreprocess(List<Document> docs) {
        List<TokenSequence> allLemmata = new ArrayList<TokenSequence>();
        AnalysisEngine sentenceDetector = null;
        AnalysisEngine tokenizer = null;
        AnalysisEngine posTagger = null;
        AnalysisEngine bioLemmatizer = null;
        try {
            sentenceDetector = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english", new Object[0]);
            tokenizer = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english", new Object[0]);
            posTagger = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp-postag-ae-biomedical-english", new Object[0]);
            bioLemmatizer = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae", new Object[0]);
            JCas jCas = JCasFactory.createJCas();
            for (Document doc : docs) {
                String sentences = doc.text;
                LOGGER.debug("Attempt to process document: " + doc.id);
                if (sentences == null) {
                    continue;
                }
                jCas.setDocumentText(sentences);
                sentenceDetector.process(jCas);
                tokenizer.process(jCas);
                posTagger.process(jCas);
                bioLemmatizer.process(jCas);
                allLemmata.add(this.getLemmata(jCas));
                jCas.reset();
            }
        }
        catch (Exception e) {
            LOGGER.error("JCoRe preprocessing failed after " + allLemmata.size() + " processed documents", e);
        }
        finally {
            // Release the engines even when creation or processing failed half-way.
            MalletTopicModeling.destroyEngine(sentenceDetector);
            MalletTopicModeling.destroyEngine(tokenizer);
            MalletTopicModeling.destroyEngine(posTagger);
            MalletTopicModeling.destroyEngine(bioLemmatizer);
        }
        LOGGER.info("JCoRe preprocessing finished");
        return allLemmata;
    }

    /** Destroys the engine if it was created. */
    private static void destroyEngine(AnalysisEngine engine) {
        if (engine != null) {
            engine.destroy();
        }
    }

    /**
     * Sends token sequences through a stopword-removal pipe and a pipe converting tokens
     * to a feature sequence, and collects the results in an instance list.
     *
     * @param data token sequences, one per document
     * @return the instance list filled with the piped token sequences
     */
    public InstanceList malletPreprocess(List<TokenSequence> data) {
        ArrayList<Pipe> stages = new ArrayList<Pipe>();
        stages.add(new TokenSequenceRemoveStopwords(false, false));
        stages.add(new TokenSequence2FeatureSequence());
        SerialPipes pipeline = new SerialPipes(stages);
        InstanceList piped = new InstanceList((Pipe)pipeline);
        piped.addThruPipe((Iterator)new ArrayIterator(data));
        return piped;
    }

    /**
     * Collects the lemma strings of all tokens in the CAS, leaving out lemmata that
     * {@link #isNotNum(String)} or {@link #isNotPunctuation(String)} reject.
     *
     * @param aJCas CAS whose tokens carry a lemma feature
     * @return the retained lemmata in token index order
     * @throws IllegalArgumentException if a token has no lemma
     */
    public TokenSequence getLemmata(JCas aJCas) {
        TokenSequence collected = new TokenSequence();
        FSIterator cursor = aJCas.getAnnotationIndex(Token.type).iterator();
        while (cursor.hasNext()) {
            Token current = (Token)cursor.next();
            Lemma tokenLemma = current.getLemma();
            if (tokenLemma == null) {
                throw new IllegalArgumentException("The input UIMA CAS is missing lemma annotations set to the tokens as the lemma feature.");
            }
            String text = tokenLemma.getValue();
            if (!this.isNotNum(text) || !this.isNotPunctuation(text)) {
                continue;
            }
            collected.add(text);
        }
        return collected;
    }

    /**
     * Returns the document ID stored in the CAS header annotation.
     *
     * @param aJCas CAS to read the header from
     * @return the document ID of the last header in the index, or an empty string if the
     *         CAS has no header
     */
    public String getId(JCas aJCas) {
        String docId = "";
        FSIterator headers = aJCas.getAnnotationIndex(Header.type).iterator();
        while (headers.hasNext()) {
            docId = ((Header)headers.next()).getDocId();
            LOGGER.trace("Found id: " + docId);
        }
        return docId;
    }

    /**
     * Tells whether a lemma is something other than a plain number.
     *
     * <p>A number is an optionally negative digit sequence with at most one {@code .} or
     * {@code ,} separator, optionally surrounded by a single whitespace character, e.g.
     * {@code 42}, {@code -7}, {@code 3.14} or {@code 1,000}.
     *
     * @param lemmaString lemma to test; must not be {@code null}
     * @return {@code false} if the lemma is a number, {@code true} otherwise
     */
    public boolean isNotNum(String lemmaString) {
        // The separator must be a literal '.' or ','; an unescaped '.' would match any
        // character and discard tokens such as "3D" or "5a" as numbers.
        String num = "\\s?-?\\d+[.,]?\\d*\\s?";
        return !lemmaString.matches(num);
    }

    /**
     * Tells whether a lemma is something other than a single punctuation character.
     *
     * @param lemmaString lemma to test; must not be {@code null}
     * @return {@code false} if the lemma consists of exactly one of the listed
     *         punctuation characters, {@code true} otherwise
     */
    public boolean isNotPunctuation(String lemmaString) {
        boolean singlePunctuationMark = lemmaString.matches("[.,:;!?\\-\\/()<>\\[\\]%'+=]");
        return !singlePunctuationMark;
    }

    /**
     * Fills {@code model.pubmedIdModelId} with a fresh map from each document's ID to its
     * position in {@code docs}.
     *
     * @param docs  documents in the order assumed to match the Mallet document numbering
     * @param model model receiving the mapping; any previous mapping is replaced
     */
    public void mapPubmedIdToMalletId(List<Document> docs, Model model) {
        model.pubmedIdModelId = new HashMap<>();
        for (int i = 0; i < docs.size(); ++i) {
            String dociId = docs.get(i).id;
            LOGGER.trace("Attempting to map PMID " + dociId + " to mallet doc " + i);
            model.pubmedIdModelId.put(dociId, i);
        }
        // Report completion once, after all documents are mapped, not once per document.
        LOGGER.debug("PubMed citation IDs (PMIDs) are mapped to Mallet document IDs");
    }

    /**
     * Fills {@code model.ModelIdpubmedId} with a fresh map from each document's position
     * in {@code docs} to its ID.
     *
     * @param docs  documents whose list positions serve as Mallet document IDs
     * @param model model receiving the mapping; any previous mapping is replaced
     */
    public void mapMalletIdToPubmedId(List<Document> docs, Model model) {
        model.ModelIdpubmedId = new HashMap<>();
        int malletDocId = 0;
        for (Document document : docs) {
            String pmid = document.id;
            LOGGER.trace("Attempting to map Mallet DocID " + malletDocId + " to PMID " + pmid);
            model.ModelIdpubmedId.put(malletDocId, pmid);
            ++malletDocId;
        }
        LOGGER.debug("Mallet document IDs are mapped to PubMed citation IDs (PMIDs)");
    }

    /**
     * Returns the entries of the Mallet model's alphabet as an array.
     *
     * @param model model providing the trained Mallet model
     * @return the alphabet entries
     */
    public Object[] getVocabulary(Model model) {
        return model.malletModel.getAlphabet().toArray();
    }
}

