/*
 * Decompiled with CFR 0.152.
 */
package de.jungblut.nlp;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.hash.HashFunction;
import de.jungblut.datastructure.ArrayUtils;
import de.jungblut.math.DoubleVector;
import de.jungblut.math.dense.DenseDoubleVector;
import de.jungblut.math.sparse.SparseDoubleVector;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.stream.Stream;
import org.apache.commons.math3.util.FastMath;

public final class VectorizerUtils {
    public static final String OUT_OF_VOCABULARY = "@__OOV__@";

    /**
     * Builds a dictionary from the given tokenized documents using the default
     * settings: a stop word percentage of 0.9 and a minimum frequency of 0.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @return whatever {@link #buildDictionary(Stream, float, int)} returns for
     *         those defaults.
     */
    public static String[] buildDictionary(Stream<String[]> tokenizedDocuments) {
        final float defaultStopWordPercentage = 0.9f;
        final int defaultMinFrequency = 0;
        return buildDictionary(tokenizedDocuments, defaultStopWordPercentage, defaultMinFrequency);
    }

    /**
     * Builds a sorted dictionary of tokens from the given tokenized documents.
     *
     * <p>Every document contributes the result of {@code ArrayUtils.deduplicate}
     * to a shared multiset (presumably each distinct token once per document, so
     * the counts act as document frequencies - confirm against that helper).
     * A token is dropped when its count is greater than
     * {@code (int) (stopWordPercentage * numberOfDocuments)} or smaller than
     * {@code minFrequency}. The tokens {@code "<START>"} and {@code "<END>"} are
     * never dropped by this filter, and {@link #OUT_OF_VOCABULARY} is always
     * added to the result.
     *
     * <p>The stream is consumed by this method (terminal {@code forEach}).
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param stopWordPercentage fraction in {@code [0, 1]} used to derive the
     *          upper count threshold.
     * @param minFrequency the minimum count a token needs to be kept.
     * @return the remaining tokens plus {@link #OUT_OF_VOCABULARY}, sorted in
     *         natural order so the array can be used with
     *         {@link Arrays#binarySearch(Object[], Object)}.
     * @throws IllegalArgumentException if {@code stopWordPercentage} is outside
     *           {@code [0, 1]}.
     */
    public static String[] buildDictionary(Stream<String[]> tokenizedDocuments, float stopWordPercentage, int minFrequency) {
        Preconditions.checkArgument(stopWordPercentage >= 0.0f && stopWordPercentage <= 1.0f, (Object)("The provided stop word percentage is not between 0 and 1: " + stopWordPercentage));
        // concurrent collections so the forEach below also works on parallel streams
        ConcurrentHashMultiset<String> tokenCounts = ConcurrentHashMultiset.create();
        AtomicLong numDocs = new AtomicLong();
        tokenizedDocuments.forEach(doc -> {
            numDocs.incrementAndGet();
            tokenCounts.addAll(ArrayUtils.deduplicate(doc));
        });
        int threshold = (int)(stopWordPercentage * (float)numDocs.get());
        Set<String> toRemove = new HashSet<String>();
        for (Multiset.Entry<String> entry : tokenCounts.entrySet()) {
            int count = entry.getCount();
            if (count > threshold || count < minFrequency) {
                toRemove.add(entry.getElement());
            }
        }
        // the sentence markers always stay in the dictionary
        toRemove.remove("<START>");
        toRemove.remove("<END>");
        // elementSet() is a live view: removals hit the multiset and the
        // OUT_OF_VOCABULARY token added below shows up in the view as well
        Set<String> vocabulary = tokenCounts.elementSet();
        for (String removal : toRemove) {
            vocabulary.remove(removal);
        }
        tokenCounts.add(OUT_OF_VOCABULARY);
        String[] dictionary = vocabulary.toArray(new String[0]);
        Arrays.sort(dictionary);
        return dictionary;
    }

    /**
     * Maps every token of a document to its index in a sorted dictionary.
     *
     * <p>Tokens that are not found are mapped to the index of
     * {@link #OUT_OF_VOCABULARY}. If the dictionary does not contain that
     * marker either, the slot keeps the array default of {@code 0}, which is
     * indistinguishable from the first dictionary entry.
     *
     * @param dict the dictionary; must be sorted, as it is searched with
     *          {@link Arrays#binarySearch(Object[], Object)}.
     * @param doc the tokens of the document.
     * @return an array of the same length as {@code doc} holding the dictionary
     *         index for every token.
     */
    public static int[] buildTransitionVector(String[] dict, String[] doc) {
        int[] toReturn = new int[doc.length];
        // The OOV position does not depend on the token, so it is looked up once
        // instead of once per unknown token. Empty documents never consulted the
        // dictionary before, so the lookup is skipped for them.
        int oovIndex = doc.length == 0 ? -1 : Arrays.binarySearch(dict, OUT_OF_VOCABULARY);
        for (int i = 0; i < doc.length; ++i) {
            int idx = Arrays.binarySearch(dict, doc[i]);
            if (idx >= 0) {
                toReturn[i] = idx;
            } else if (oovIndex >= 0) {
                toReturn[i] = oovIndex;
            }
        }
        return toReturn;
    }

    /**
     * Builds an inverted index that maps each dictionary token to the positions
     * (list indices) of the documents it occurs in.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param dictionary the dictionary; must be sorted, as it is searched with
     *          {@link Arrays#binarySearch(Object[], Object)}.
     * @return a multimap from token to document indices; tokens that are not
     *         in the dictionary are skipped.
     */
    public static HashMultimap<String, Integer> buildInvertedIndexMap(List<String[]> tokenizedDocuments, String[] dictionary) {
        HashMultimap<String, Integer> termToDocuments = HashMultimap.create();
        int documentIndex = 0;
        for (String[] document : tokenizedDocuments) {
            for (String term : document) {
                boolean inDictionary = Arrays.binarySearch(dictionary, term) >= 0;
                if (inDictionary) {
                    termToDocuments.put(term, documentIndex);
                }
            }
            documentIndex++;
        }
        return termToDocuments;
    }

    /**
     * Builds an inverted index in array form: row {@code i} holds the indices
     * of the documents that contain {@code dictionary[i]}.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param dictionary the sorted dictionary (it is binary searched by
     *          {@link #buildInvertedIndexMap(List, String[])}).
     * @return an array with one row per dictionary entry; a row is empty when
     *         no document contains the token. The order of the indices inside
     *         a row follows the iteration order of the underlying set.
     */
    public static int[][] buildInvertedIndexArray(List<String[]> tokenizedDocuments, String[] dictionary) {
        HashMultimap<String, Integer> invertedIndex = VectorizerUtils.buildInvertedIndexMap(tokenizedDocuments, dictionary);
        int[][] docs = new int[dictionary.length][];
        for (int i = 0; i < dictionary.length; ++i) {
            // typed access: the multimap is keyed by String and yields Set<Integer>
            Set<Integer> documentIndices = invertedIndex.get(dictionary[i]);
            docs[i] = ArrayUtils.toPrimitiveArray(documentIndices.toArray(new Integer[documentIndices.size()]));
        }
        return docs;
    }

    /**
     * Counts, for every dictionary entry, the number of distinct documents that
     * contain it.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param dictionary the sorted dictionary (it is binary searched by
     *          {@link #buildInvertedIndexMap(List, String[])}).
     * @return an array parallel to {@code dictionary}; entry {@code i} is the
     *         number of document indices recorded for {@code dictionary[i]},
     *         or {@code 0} if no document contains it.
     */
    public static int[] buildInvertedIndexDocumentCount(List<String[]> tokenizedDocuments, String[] dictionary) {
        HashMultimap<String, Integer> invertedIndex = VectorizerUtils.buildInvertedIndexMap(tokenizedDocuments, dictionary);
        int[] docs = new int[dictionary.length];
        for (int i = 0; i < dictionary.length; ++i) {
            // typed access: the multimap is keyed by String and yields Set<Integer>
            Set<Integer> documentIndices = invertedIndex.get(dictionary[i]);
            docs[i] = documentIndices.size();
        }
        return docs;
    }

    /**
     * Varargs convenience overload: wraps the given documents into a stream and
     * delegates to {@link #wordFrequencyVectorize(Stream)}.
     *
     * @param vars the tokenized documents, each one an array of tokens.
     * @return the stream of vectors produced by the stream-based overload.
     */
    public static Stream<DoubleVector> wordFrequencyVectorize(String[] ... vars) {
        Stream<String[]> documents = Arrays.stream(vars);
        return wordFrequencyVectorize(documents);
    }

    /**
     * Vectorizes the given documents by word frequency, building the dictionary
     * from the very same documents first.
     *
     * <p>A stream can only be traversed once, but the documents are needed
     * twice here (once for the dictionary, once for the vectors). The input is
     * therefore collected into an array up front and replayed from it, which
     * means all documents are held in memory. The returned stream is parallel
     * if the input stream was.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens; the
     *          stream is consumed by this call.
     * @return a stream with one word frequency vector per document, in
     *         encounter order of the input.
     */
    public static Stream<DoubleVector> wordFrequencyVectorize(Stream<String[]> tokenizedDocuments) {
        // must be asked before the terminal operation below
        boolean parallel = tokenizedDocuments.isParallel();
        String[][] documents = tokenizedDocuments.toArray(String[][]::new);
        String[] dictionary = VectorizerUtils.buildDictionary(Arrays.stream(documents));
        Stream<String[]> replay = Arrays.stream(documents);
        if (parallel) {
            replay = replay.parallel();
        }
        return VectorizerUtils.wordFrequencyVectorize(replay, dictionary);
    }

    /**
     * Lazily maps every document to a sparse vector of token counts.
     *
     * <p>For a token found in the dictionary, the vector entry at the token's
     * dictionary index is set to the number of times the token occurs in the
     * document. For tokens that are not found, the entry at the index of
     * {@link #OUT_OF_VOCABULARY} is set to {@code 1.0} (it is not a count), or
     * nothing is written if the dictionary does not contain that marker.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param dictionary the dictionary; must be sorted, as it is searched with
     *          {@link Arrays#binarySearch(Object[], Object)}.
     * @return a stream with one vector of dimension {@code dictionary.length}
     *         per document.
     */
    public static Stream<DoubleVector> wordFrequencyVectorize(Stream<String[]> tokenizedDocuments, String[] dictionary) {
        final int oovIndex = Arrays.binarySearch(dictionary, OUT_OF_VOCABULARY);
        return tokenizedDocuments.map(tokens -> {
            SparseDoubleVector vector = new SparseDoubleVector(dictionary.length);
            Multiset<String> occurrences = HashMultiset.create(Arrays.asList(tokens));
            for (String token : tokens) {
                int position = Arrays.binarySearch(dictionary, token);
                if (position >= 0) {
                    vector.set(position, (double)occurrences.count(token));
                } else if (oovIndex >= 0) {
                    vector.set(oovIndex, 1.0);
                }
            }
            return vector;
        });
    }

    /**
     * Computes a TF-IDF vector for every document of the list, using the size
     * of the list as the total number of documents.
     *
     * @param tokenizedDocuments the documents, each one an array of tokens.
     * @param dictionary the sorted dictionary.
     * @param termDocumentCount per dictionary index, the number of documents
     *          containing that token.
     * @return one vector per document, in the iteration order of the input.
     */
    public static List<DoubleVector> tfIdfVectorize(List<String[]> tokenizedDocuments, String[] dictionary, int[] termDocumentCount) {
        final int corpusSize = tokenizedDocuments.size();
        ArrayList<DoubleVector> vectors = new ArrayList<DoubleVector>(corpusSize);
        for (String[] tokens : tokenizedDocuments) {
            DoubleVector weighted = tfIdfVectorize(corpusSize, tokens, dictionary, termDocumentCount);
            vectors.add(weighted);
        }
        return vectors;
    }

    /**
     * Computes the TF-IDF vector of a single document.
     *
     * <p>For a token found in the dictionary the entry is
     * {@code count(token in document) * (log(numDocuments) - log(termDocumentCount[index]))}.
     * For tokens that are not found, the entry at the index of
     * {@link #OUT_OF_VOCABULARY} is set to {@code 1.0}, or nothing is written
     * if the dictionary does not contain that marker.
     *
     * @param numDocuments the total number of documents.
     * @param document the tokens of the document.
     * @param dictionary the dictionary; must be sorted, as it is searched with
     *          {@link Arrays#binarySearch(Object[], Object)}.
     * @param termDocumentCount per dictionary index, the number of documents
     *          containing that token.
     * @return a sparse vector of dimension {@code dictionary.length}.
     */
    public static DoubleVector tfIdfVectorize(int numDocuments, String[] document, String[] dictionary, int[] termDocumentCount) {
        SparseDoubleVector weights = new SparseDoubleVector(dictionary.length);
        Multiset<String> termFrequencies = HashMultiset.create(Arrays.asList(document));
        final int oovIndex = Arrays.binarySearch(dictionary, OUT_OF_VOCABULARY);
        final double logNumDocuments = FastMath.log((double)numDocuments);
        for (String term : document) {
            int position = Arrays.binarySearch(dictionary, term);
            if (position < 0) {
                // unknown token: flag the OOV slot if the dictionary has one
                if (oovIndex >= 0) {
                    weights.set(oovIndex, 1.0);
                }
            } else {
                double inverseDocumentFrequency = logNumDocuments - FastMath.log((double)termDocumentCount[position]);
                double termFrequency = (double)termFrequencies.count(term);
                weights.set(position, termFrequency * inverseDocumentFrequency);
            }
        }
        return weights;
    }

    /**
     * Returns all entries of the multiset ordered by descending count, without
     * applying any filter.
     *
     * @param set the multiset to read the entries from.
     * @return the result of {@link #getMostFrequentItems(Multiset, Predicate)}
     *         called with a {@code null} filter.
     */
    public static <E> ArrayList<Multiset.Entry<E>> getMostFrequentItems(Multiset<E> set) {
        final Predicate<Multiset.Entry<E>> noFilter = null;
        return getMostFrequentItems(set, noFilter);
    }

    /**
     * Returns the entries of the multiset ordered by descending count.
     *
     * @param set the multiset to read the entries from.
     * @param filter if not {@code null}, only entries accepted by this
     *          predicate are included; {@code null} includes every entry.
     * @return a new list of the (filtered) entries, highest count first.
     */
    public static <E> ArrayList<Multiset.Entry<E>> getMostFrequentItems(Multiset<E> set, Predicate<Multiset.Entry<E>> filter) {
        Iterable<Multiset.Entry<E>> candidates = set.entrySet();
        if (filter != null) {
            candidates = Iterables.filter(candidates, filter);
        }
        ArrayList<Multiset.Entry<E>> ranked = Lists.newArrayList(candidates);
        // operands are swapped on purpose to get descending order
        Collections.sort(ranked, (left, right) -> Integer.compare(right.getCount(), left.getCount()));
        return ranked;
    }

    /**
     * Projects a feature vector into a dense vector of dimension {@code n} by
     * hashing the indices of its non-zero elements.
     *
     * <p>Each non-zero element's index is hashed; the bucket is the absolute
     * value of {@code hash % n}. The bucket is decremented by one when the hash
     * is negative and incremented by one otherwise. The value of the element
     * itself is not used, only its index.
     *
     * @param inputFeature the vector whose non-zero elements are hashed.
     * @param n the dimension of the resulting vector.
     * @param hashFunction the function used to hash the element indices.
     * @return a dense vector of dimension {@code n}.
     */
    public static DoubleVector hashVectorize(DoubleVector inputFeature, int n, HashFunction hashFunction) {
        DenseDoubleVector dense = new DenseDoubleVector(n);
        Iterator<DoubleVector.DoubleVectorElement> nonZeroElements = inputFeature.iterateNonZero();
        while (nonZeroElements.hasNext()) {
            DoubleVector.DoubleVectorElement element = nonZeroElements.next();
            int hash = hashFunction.hashInt(element.getIndex()).asInt();
            // Take the remainder first and the absolute value second:
            // Math.abs(Integer.MIN_VALUE) is still negative, so abs-then-modulo
            // could produce a negative bucket. For every other hash both
            // orders yield the same bucket.
            int bucket = Math.abs(hash % n);
            dense.set(bucket, dense.get(bucket) + (hash < 0 ? -1.0 : 1.0));
        }
        return dense;
    }

    /**
     * Applies {@link #hashVectorize(DoubleVector, int, HashFunction)} to every
     * vector of the given array.
     *
     * @param features the vectors to hash.
     * @param n the dimension of each resulting vector.
     * @param hashFunction the function used to hash the element indices.
     * @return a new array of the same length holding the hashed vectors in the
     *         same order.
     */
    public static DoubleVector[] hashVectorize(DoubleVector[] features, int n, HashFunction hashFunction) {
        DoubleVector[] hashed = new DoubleVector[features.length];
        int position = 0;
        for (DoubleVector feature : features) {
            hashed[position] = hashVectorize(feature, n, hashFunction);
            position++;
        }
        return hashed;
    }

    /**
     * Lazily maps every document to a hashed vector via
     * {@link #sparseHashVectorize(String[], HashFunction, Supplier)}.
     *
     * <p>While the pipeline runs, each element checks whether
     * {@code documents} reports itself as parallel. If it does, {@code null}
     * is passed on instead of {@code hashFunction}, which makes the
     * per-document overload fall back to {@link String#hashCode()}.
     *
     * <p>NOTE(review): the reason for dropping the supplied hash function on
     * parallel streams is not visible here - confirm that this is intended.
     *
     * @param documents the documents, each one an array of tokens.
     * @param hashFunction the function used to hash the tokens (only used for
     *          sequential streams, see above).
     * @param vectorFactory supplies the vector that is filled for each document.
     * @return a stream with one vector per document.
     */
    public static Stream<DoubleVector> sparseHashVectorize(Stream<String[]> documents, HashFunction hashFunction, Supplier<DoubleVector> vectorFactory) {
        return documents.map(doc -> {
            // deliberately evaluated per element, not once up front
            HashFunction effectiveFunction = documents.isParallel() ? null : hashFunction;
            return sparseHashVectorize(doc, effectiveFunction, vectorFactory);
        });
    }

    /**
     * Hashes every token of a document into a vector obtained from the given
     * factory, incrementing the hit bucket by one per token.
     *
     * <p>The bucket of a token is the absolute value of
     * {@code hash % vector.getDimension()}. The hash is
     * {@link String#hashCode()} when {@code hashFunction} is {@code null},
     * otherwise the given function applied to the token's characters encoded
     * with the platform default charset.
     *
     * <p>NOTE(review): because of the default charset, the same non-ASCII
     * token may land in different buckets on differently configured JVMs -
     * confirm whether that matters for callers.
     *
     * @param doc the tokens of the document.
     * @param hashFunction the function used to hash the tokens, or
     *          {@code null} to use {@link String#hashCode()}.
     * @param vectorFactory supplies the vector to fill; its dimension defines
     *          the number of buckets.
     * @return the vector obtained from the factory, with the token counts added.
     */
    public static DoubleVector sparseHashVectorize(String[] doc, HashFunction hashFunction, Supplier<DoubleVector> vectorFactory) {
        DoubleVector vec = vectorFactory.get();
        for (String token : doc) {
            int hash;
            if (hashFunction == null) {
                hash = token.hashCode();
            } else {
                hash = hashFunction.hashString((CharSequence)token, Charset.defaultCharset()).asInt();
            }
            int bucket = FastMath.abs((int)(hash % vec.getDimension()));
            vec.set(bucket, vec.get(bucket) + 1.0);
        }
        return vec;
    }
}

