/*
 * Decompiled with CFR 0.152.
 */
package de.datexis.index.impl;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import de.datexis.common.ObjectSerializer;
import de.datexis.common.Resource;
import de.datexis.index.ArticleIndex;
import de.datexis.index.ArticleRef;
import de.datexis.index.WikiDataArticle;
import de.datexis.nel.model.Article;
import de.datexis.preprocess.MinimalLowercasePreprocessor;
import info.debatty.java.stringsimilarity.JaroWinkler;
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.icu.ICUFoldingFilterFactory;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LuceneArticleIndex
extends ArticleIndex {
    protected static final Logger log = LoggerFactory.getLogger(LuceneArticleIndex.class);
    protected static final String FIELD_TITLE = "title";
    protected static final String FIELD_TYPE = "type";
    protected static final String FIELD_REFIDS = "refID_";
    protected static final String FIELD_REFIDS_WIKIDATA = "refID_wikidata";
    protected static final String FIELD_REFIDS_WIKIPEDIA = "refID_wikipedia";
    protected static final String FIELD_REFIDS_FREEBASE = "refID_freebase";
    protected static final String FIELD_REFIDS_UMLS = "refID_umls";
    protected static final String FIELD_REFIDS_ICD10 = "refID_icd10";
    protected static final String FIELD_REFURLS_WIKIPEDIA = "refURL_wikipedia";
    protected static final String FIELD_TEXT = "text";
    protected static final String FIELD_DESCRIPTION = "description";
    protected static final String FIELD_NAMES = "name";
    protected static final String FIELD_TERMS = "term";
    protected static final String FIELD_VECTOR = "vector";
    protected static final String PARAM_PROXIMITY = "2";
    protected static final String PARAM_FUZZY = "0.8";
    protected static final int NUM_BM25_CANDIDATES = 1024;
    protected IndexReader reader;
    protected IndexSearcher searcher;
    protected Analyzer analyzer;
    protected final NormalizedStringSimilarity sim = new JaroWinkler();
    protected final TokenPreProcess preprocessor = new MinimalLowercasePreprocessor();
    static final FieldType FIELDTYPE_NAME = new FieldType();
    static final FieldType FIELDTYPE_VECTOR;

    public boolean openIndex(Resource path) {
        try {
            FSDirectory index = FSDirectory.open((Path)path.getPath());
            return this.openIndex((Directory)index);
        }
        catch (IOException e) {
            return false;
        }
    }

    private boolean openIndex(Directory index) {
        try {
            this.reader = DirectoryReader.open((Directory)index);
            this.searcher = new IndexSearcher(this.reader);
            this.analyzer = this.buildAnalyzer();
            return true;
        }
        catch (IOException e) {
            return false;
        }
    }

    public void createIndexRAM(Iterator<? extends Article> articles) {
        RAMDirectory index = new RAMDirectory();
        this.createIndex(articles, (Directory)index);
        this.openIndex((Directory)index);
    }

    public void createIndexDirectory(Iterator<? extends Article> articles, Resource cacheDir) throws IOException {
        FSDirectory index = FSDirectory.open((Path)cacheDir.getPath());
        this.createIndex(articles, (Directory)index);
        this.openIndex((Directory)index);
    }

    public void createIndex(Iterator<? extends Article> articles, Directory index) {
        log.info("creating new index...");
        this.analyzer = this.buildAnalyzer();
        try {
            IndexWriterConfig config = new IndexWriterConfig(this.analyzer);
            IndexWriter writer = new IndexWriter(index, config);
            int num = 0;
            int empty = 0;
            log.info("writing articles...");
            while (articles.hasNext()) {
                WikiDataArticle article = (WikiDataArticle)articles.next();
                Document doc = this.createLuceneDocument(article);
                writer.addDocument((Iterable)doc);
                if (++num % 100000 != 0) continue;
                log.info("wrote " + num + " articles so far");
            }
            writer.close();
            log.info(num + " articles (" + empty + " empty) written to index");
        }
        catch (IOException e) {
            log.error(e.toString());
        }
    }

    @Override
    public List<ArticleRef> queryNames(String name, int hits) {
        List<Document> docs = this.queryIndexNames(name, 1024);
        ArrayList<ArticleRef> candidates = new ArrayList<ArticleRef>(1024);
        for (Document d : docs) {
            Article a = this.createWikidataArticle(d);
            double score = 0.0;
            for (String title : a.getNames()) {
                double jaroSim = this.sim.similarity(this.preprocessor.preProcess(title), this.preprocessor.preProcess(name));
                if (!(jaroSim > score)) continue;
                score = jaroSim;
            }
            ArticleRef ref = new ArticleRef(a);
            ref.setScore(score);
            candidates.add(ref);
        }
        Collections.sort(candidates, new ArticleRef.ScoreComparator());
        return Lists.newArrayList((Iterable)Iterables.limit(candidates, (int)hits));
    }

    @Override
    public List<ArticleRef> queryPrefixNames(String prefix, int hits) {
        return this.queryIndexPrefix(prefix, hits);
    }

    @Override
    public Optional<ArticleRef> queryID(String id) {
        return this.queryWikidataID(id);
    }

    public Optional<ArticleRef> queryWikidataID(String id) {
        Optional<Document> d = this.queryIndexID(FIELD_REFIDS_WIKIDATA, id);
        if (d.isPresent()) {
            ArticleRef ref = this.createWikidataArticleRef(d.get());
            return Optional.of(ref);
        }
        return Optional.empty();
    }

    public Optional<ArticleRef> queryWikipediaURL(String url) {
        if (url == null || url.isEmpty()) {
            return Optional.empty();
        }
        if (url.startsWith("http://")) {
            url = url.replaceFirst("http://", "https://");
        } else if (!url.startsWith("https://")) {
            url = "https://" + url;
        }
        url = this.decodeWikiUrl(url);
        Optional<Document> d = this.queryIndexID(FIELD_REFURLS_WIKIPEDIA, url);
        if (d.isPresent()) {
            ArticleRef ref = this.createWikidataArticleRef(d.get());
            return Optional.of(ref);
        }
        return Optional.empty();
    }

    public Optional<ArticleRef> queryWikipediaPage(String name) {
        Optional<Document> d = this.queryIndexID(FIELD_REFIDS_WIKIPEDIA, name = this.decodeWikiUrl(name));
        if (d.isPresent()) {
            ArticleRef ref = this.createWikidataArticleRef(d.get());
            return Optional.of(ref);
        }
        return Optional.empty();
    }

    protected String decodeWikiUrl(String url) {
        try {
            url = URLDecoder.decode(url, "UTF-8");
        }
        catch (UnsupportedEncodingException ex) {
            log.debug("could not decode URL '" + url + "'");
        }
        url = url.replace(" ", "_").replaceFirst("#.+$", "");
        return url;
    }

    public Collection<String> getAllArticleTitles() {
        return this.getAllFields(FIELD_TITLE);
    }

    public Collection<String> getAllArticleNames() {
        return this.getAllFields(FIELD_NAMES);
    }

    public Collection<String> getAllArticleTerms() {
        return this.getAllFields(FIELD_TERMS);
    }

    public Collection<String> getAllArticleURLs() {
        return this.getAllFields(FIELD_REFURLS_WIKIPEDIA);
    }

    public Collection<String> getAllArticleIDs() {
        return this.getAllFields(FIELD_REFIDS_WIKIDATA);
    }

    protected Collection<String> getAllFields(String field) {
        TreeSet<String> result = new TreeSet<String>();
        try {
            IndexReader reader = this.searcher.getIndexReader();
            TreeSet<String> fields = new TreeSet<String>();
            fields.add(field);
            for (int i = 0; i < reader.maxDoc(); ++i) {
                String[] values;
                Document d = reader.document(i, fields);
                for (String v : values = d.getValues(field)) {
                    result.add(v);
                }
            }
        }
        catch (Exception ex) {
            log.error(ex.toString());
        }
        return result;
    }

    protected Document createLuceneDocument(WikiDataArticle article) {
        Document doc = new Document();
        this.addTextField(doc, FIELD_TITLE, article.getTitle().trim(), Field.Store.YES);
        this.addTextField(doc, FIELD_TYPE, article.getType(), Field.Store.YES);
        this.addTextField(doc, FIELD_DESCRIPTION, article.getDescription(), Field.Store.YES);
        for (String name : article.getNames()) {
            this.addNameField(doc, FIELD_NAMES, name);
        }
        for (String term : article.getTerms()) {
            this.addNameField(doc, FIELD_TERMS, term);
        }
        this.addStringField(doc, FIELD_REFIDS_WIKIDATA, article.getRefID(WikiDataArticle.RefID.WIKIDATA));
        this.addStringField(doc, FIELD_REFIDS_FREEBASE, article.getRefID(WikiDataArticle.RefID.FREEBASE));
        this.addStringField(doc, FIELD_REFIDS_WIKIPEDIA, article.getRefID(WikiDataArticle.RefID.WIKIPEDIA));
        this.addStringField(doc, FIELD_REFIDS_UMLS, article.getRefID(WikiDataArticle.RefID.UMLS));
        this.addStringField(doc, FIELD_REFIDS_ICD10, article.getRefID(WikiDataArticle.RefID.ICD10));
        this.addStringField(doc, FIELD_REFURLS_WIKIPEDIA, article.getUrl());
        return doc;
    }

    protected ArticleRef createWikidataArticleRef(Document doc) {
        ArticleRef art = new ArticleRef();
        art.setTitle(doc.get(FIELD_TITLE));
        art.setType(doc.get(FIELD_TYPE));
        art.setDescription(doc.get(FIELD_DESCRIPTION));
        art.setId(doc.get(FIELD_REFIDS_WIKIDATA));
        art.setUrl(doc.get(FIELD_REFURLS_WIKIPEDIA));
        String vec = doc.get(FIELD_VECTOR);
        if (vec != null) {
            art.setVector(ObjectSerializer.getArrayFromBase64String((String)vec));
        }
        return art;
    }

    protected Article createWikidataArticle(Document doc) {
        Article art = new Article();
        art.setTitle(doc.get(FIELD_TITLE));
        art.setType(doc.get(FIELD_TYPE));
        art.setDescription(doc.get(FIELD_DESCRIPTION));
        art.setId(doc.get(FIELD_REFIDS_WIKIDATA));
        art.setUrl(doc.get(FIELD_REFURLS_WIKIPEDIA));
        for (IndexableField name : doc.getFields(FIELD_NAMES)) {
            art.addName(name.stringValue());
        }
        String vec = doc.get(FIELD_VECTOR);
        if (vec != null) {
            art.setVector(ObjectSerializer.getArrayFromBase64String((String)vec));
        }
        return art;
    }

    protected String splitString(String name, String suffix) {
        String[] parts = name.split("\\s");
        StringBuilder result = new StringBuilder();
        for (String part : parts) {
            if (result.length() > 0) {
                result.append(" ");
            }
            result.append(part).append(suffix);
        }
        return result.toString();
    }

    protected void addTextField(Document doc, String name, String value, Field.Store store) {
        if (value != null) {
            doc.add((IndexableField)new TextField(name, value, store));
        }
    }

    protected void addStringField(Document doc, String name, String value) {
        if (value != null) {
            doc.add((IndexableField)new StringField(name, value, Field.Store.YES));
        }
    }

    protected void addNameField(Document doc, String name, String value) {
        if (value != null) {
            doc.add((IndexableField)new Field(name, value, FIELDTYPE_NAME));
        }
    }

    protected void addVectorField(Document doc, String name, INDArray arr) {
        String value = ObjectSerializer.getArrayAsBase64String((INDArray)arr);
        if (value != null) {
            doc.add((IndexableField)new Field(name, value, FIELDTYPE_VECTOR));
        }
    }

    protected Analyzer buildAnalyzer() {
        TreeMap<String, Object> analyzers = new TreeMap<String, Object>();
        try {
            CustomAnalyzer wordAnalyzer = CustomAnalyzer.builder().withTokenizer(ICUTokenizerFactory.class, new String[0]).addTokenFilter(ICUFoldingFilterFactory.class, new String[0]).build();
            analyzers.put(FIELD_NAMES, wordAnalyzer);
            analyzers.put(FIELD_TERMS, wordAnalyzer);
            KeywordAnalyzer idAnalyzer = new KeywordAnalyzer();
            analyzers.put(FIELD_VECTOR, idAnalyzer);
            analyzers.put(FIELD_REFIDS_WIKIDATA, idAnalyzer);
            analyzers.put(FIELD_REFIDS_WIKIPEDIA, idAnalyzer);
            analyzers.put(FIELD_REFIDS_FREEBASE, idAnalyzer);
            analyzers.put(FIELD_REFIDS_UMLS, idAnalyzer);
            analyzers.put(FIELD_REFIDS_ICD10, idAnalyzer);
            analyzers.put(FIELD_REFURLS_WIKIPEDIA, idAnalyzer);
        }
        catch (IOException e) {
            log.error("Could not create Lucene Analyzer: ");
            log.error(e.toString());
        }
        return new PerFieldAnalyzerWrapper((Analyzer)new StandardAnalyzer(), analyzers);
    }

    protected List<Document> queryIndexNames(String name, int hits) {
        ArrayList<Document> result = new ArrayList<Document>();
        try {
            ScoreDoc[] docs;
            BoostQuery exactQ = new BoostQuery(new QueryParser(FIELD_NAMES, this.analyzer).parse("\"" + name + "\"~" + PARAM_PROXIMITY), 1.0f);
            BooleanQuery query = new BooleanQuery.Builder().add((Query)exactQ, BooleanClause.Occur.SHOULD).build();
            TopDocs top = this.searcher.search((Query)query, hits);
            for (ScoreDoc hit : docs = top.scoreDocs) {
                Document d = this.searcher.doc(hit.doc);
                result.add(d);
            }
        }
        catch (Exception ex) {
            log.error(ex.toString());
        }
        return result;
    }

    protected List<ArticleRef> queryIndexPrefix(String prefix, int hits) {
        ArrayList<ArticleRef> result = new ArrayList<ArticleRef>();
        try {
            ScoreDoc[] docs;
            BoostQuery exactQ = new BoostQuery(new QueryParser(FIELD_NAMES, this.analyzer).parse("\"" + prefix + "\"*"), 1.0f);
            BooleanQuery query = new BooleanQuery.Builder().add((Query)exactQ, BooleanClause.Occur.SHOULD).build();
            TopDocs top = this.searcher.search((Query)query, hits);
            for (ScoreDoc hit : docs = top.scoreDocs) {
                Document d = this.searcher.doc(hit.doc);
                ArticleRef ref = this.createWikidataArticleRef(d);
                ref.setScore(hit.score);
                result.add(ref);
            }
        }
        catch (Exception ex) {
            log.error(ex.toString());
        }
        return result;
    }

    protected Optional<Document> queryIndexID(String field, String id) {
        try {
            Query query = new QueryParser(field, this.analyzer).parse("\"" + id + "\"");
            TopDocs top = this.searcher.search(query, 1);
            if (top.scoreDocs.length > 0) {
                ScoreDoc hit = top.scoreDocs[0];
                Document d = this.searcher.doc(hit.doc);
                return Optional.ofNullable(d);
            }
        }
        catch (Exception ex) {
            log.error(ex.toString());
        }
        return Optional.empty();
    }

    static {
        FIELDTYPE_NAME.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        FIELDTYPE_NAME.setTokenized(true);
        FIELDTYPE_NAME.setStored(true);
        FIELDTYPE_NAME.setOmitNorms(true);
        FIELDTYPE_NAME.freeze();
        FIELDTYPE_VECTOR = new FieldType();
        FIELDTYPE_VECTOR.setIndexOptions(IndexOptions.NONE);
        FIELDTYPE_VECTOR.setTokenized(false);
        FIELDTYPE_VECTOR.setStored(true);
        FIELDTYPE_VECTOR.setOmitNorms(true);
        FIELDTYPE_VECTOR.freeze();
    }
}

