/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.util;

import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.IDSorter;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import gnu.trove.TIntIntHashMap;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.logging.Logger;

/**
 * Counts, per document, how often pairs of features (word types) occur together
 * and writes a word-word weights file derived from those counts.
 *
 * <p>Typical use mirrors {@link #main(String[])}: construct with an
 * {@link InstanceList}, call {@link #count()}, then {@link #printCounts()}.
 * The instance data is cast to {@link FeatureSequence}; other data types will
 * fail with a {@link ClassCastException} in {@link #count()}.
 */
public class FeatureCooccurrenceCounter {
    private static Logger logger = MalletLogger.getLogger(FeatureCooccurrenceCounter.class.getName());
    static CommandOption.String inputFile = new CommandOption.String(FeatureCooccurrenceCounter.class, "input", "FILENAME", true, null, "The filename from which to read the list of training instances.  Use - for stdin.  The instances must be FeatureSequence or FeatureSequenceWithBigrams, not FeatureVector", null);
    static CommandOption.String weightsFile = new CommandOption.String(FeatureCooccurrenceCounter.class, "weights-filename", "FILENAME", true, null, "The filename to write the word-word weights file.", null);
    static CommandOption.Double idfCutoff = new CommandOption.Double(FeatureCooccurrenceCounter.class, "idf-cutoff", "NUMBER", true, 3.0, "Words with IDF below this threshold will not be linked to any other word.", null);
    static CommandOption.String unlinkedFile = new CommandOption.String(FeatureCooccurrenceCounter.class, "unlinked-filename", "FILENAME", true, null, "A file to write words that were not linked.", null);

    /** Features appearing in this many documents or fewer are written without any links. */
    private static final int MIN_DOCUMENT_FREQUENCY = 5;
    /** Upper bound on the number of linked words written per feature. */
    private static final int MAX_LINKS_PER_FEATURE = 10;
    /** Links whose weight falls below this value are not written. */
    private static final double MIN_LINK_WEIGHT = 0.05;
    /** Pseudo-count added to every cell of the contingency table in {@link #g2}. */
    private static final double G2_SMOOTHING = 0.01;

    /** For each feature, a map from co-occurring feature to the number of documents containing both. */
    TIntIntHashMap[] featureFeatureCounts;
    InstanceList instances;
    /** Size of the data alphabet at construction time. */
    int numFeatures;
    /** For each feature, the number of documents in which it occurs at least once. */
    int[] documentFrequencies;

    /**
     * Allocates empty count tables sized to the data alphabet of {@code instances}.
     * No counting happens here; call {@link #count()} afterwards.
     *
     * @param instances the documents to analyse
     */
    public FeatureCooccurrenceCounter(InstanceList instances) {
        this.instances = instances;
        this.numFeatures = instances.getDataAlphabet().size();
        this.featureFeatureCounts = new TIntIntHashMap[this.numFeatures];
        for (int feature = 0; feature < this.numFeatures; feature++) {
            this.featureFeatureCounts[feature] = new TIntIntHashMap();
        }
        this.documentFrequencies = new int[this.numFeatures];
    }

    /**
     * Scans every instance once, updating document frequencies and the symmetric
     * pairwise co-occurrence counts. A pair is counted at most once per document,
     * regardless of how many times either feature is repeated in it.
     * Prints the running instance count to standard error every 1000 instances.
     */
    public void count() {
        int processed = 0;
        for (Instance instance : this.instances) {
            FeatureSequence tokens = (FeatureSequence) instance.getData();

            // Collapse the token sequence to the distinct feature ids in this document.
            // Only the key set is used below; the per-document counts are ignored.
            TIntIntHashMap distinct = new TIntIntHashMap();
            for (int position = 0; position < tokens.getLength(); position++) {
                distinct.adjustOrPutValue(tokens.getIndexAtPosition(position), 1, 1);
            }

            int[] types = distinct.keys();
            for (int i = 0; i < types.length; i++) {
                int left = types[i];
                this.documentFrequencies[left]++;
                // Visit each unordered pair once and record it in both directions.
                for (int j = i + 1; j < types.length; j++) {
                    int right = types[j];
                    this.featureFeatureCounts[left].adjustOrPutValue(right, 1, 1);
                    this.featureFeatureCounts[right].adjustOrPutValue(left, 1, 1);
                }
            }

            processed++;
            if (processed % 1000 == 0) {
                System.err.println(processed);
            }
        }
    }

    /**
     * Computes a smoothed log-likelihood-ratio style association score from a
     * 2x2 contingency table described by its marginals.
     *
     * <p>Each of the four cells (both, left only, right only, neither) receives a
     * pseudo-count of 0.01 and the total is increased by 0.04 accordingly, so the
     * smoothed cells sum to the smoothed total. (An earlier version added the
     * pseudo-count to the "neither" cell twice, making the cells sum to
     * total + 0.05 while dividing by total + 0.04.)
     *
     * @param left  number of observations containing the left item
     * @param right number of observations containing the right item
     * @param both  number of observations containing both items
     * @param total total number of observations
     * @return the sum over the four cells of {@code cell * log(observed / expected)};
     *         note that no factor of 2 is applied
     */
    public double g2(double left, double right, double both, double total) {
        double justLeft = left - both + G2_SMOOTHING;
        double justRight = right - both + G2_SMOOTHING;
        double neither = total - left - right + both + G2_SMOOTHING;
        double smoothedBoth = both + G2_SMOOTHING;
        double smoothedTotal = total + 4.0 * G2_SMOOTHING;

        double leftMarginalProb = (justLeft + smoothedBoth) / smoothedTotal;
        double rightMarginalProb = (justRight + smoothedBoth) / smoothedTotal;

        double logLeft = Math.log(leftMarginalProb);
        double logRight = Math.log(rightMarginalProb);
        double logNotLeft = Math.log(1.0 - leftMarginalProb);
        double logNotRight = Math.log(1.0 - rightMarginalProb);

        return smoothedBoth * (Math.log(smoothedBoth / smoothedTotal) - logLeft - logRight)
                + justLeft * (Math.log(justLeft / smoothedTotal) - logLeft - logNotRight)
                + justRight * (Math.log(justRight / smoothedTotal) - logNotLeft - logRight)
                + neither * (Math.log(neither / smoothedTotal) - logNotLeft - logNotRight);
    }

    /**
     * Writes the results of {@link #count()}.
     *
     * <p>If the {@code unlinked-filename} option is set, every feature whose IDF is
     * below the {@code idf-cutoff} option is written to that file, one per line.
     * The weights file named by {@code weights-filename} always receives one line
     * per feature: the feature, a tab, {@code 1.0}, and then up to ten
     * tab-separated (word, weight) pairs.
     *
     * @throws IOException if either file cannot be opened or a write fails
     */
    public void printCounts() throws IOException {
        Alphabet alphabet = this.instances.getDataAlphabet();
        int numDocs = this.instances.size();
        double logTotalDocs = Math.log(numDocs);

        // logCache[n] == log(n) for n >= 1. Slot 0 stays 0.0, so a feature with a
        // document frequency of zero gets the same IDF as one with frequency one.
        double[] logCache = new double[numDocs + 1];
        for (int n = 1; n < logCache.length; n++) {
            logCache[n] = Math.log(n);
        }

        double cutoff = FeatureCooccurrenceCounter.idfCutoff.value;

        if (FeatureCooccurrenceCounter.unlinkedFile.value != null) {
            writeUnlinked(FeatureCooccurrenceCounter.unlinkedFile.value, alphabet, logTotalDocs, logCache, cutoff);
        }
        writeWeights(FeatureCooccurrenceCounter.weightsFile.value, alphabet, logTotalDocs, logCache, cutoff);
    }

    /** Returns log(total documents) - log(document frequency of {@code feature}). */
    private double idf(int feature, double logTotalDocs, double[] logCache) {
        return logTotalDocs - logCache[this.documentFrequencies[feature]];
    }

    /** Writes every feature whose IDF is below {@code cutoff} to {@code filename}, one per line. */
    private void writeUnlinked(String filename, Alphabet alphabet, double logTotalDocs,
                               double[] logCache, double cutoff) throws IOException {
        PrintWriter out = new PrintWriter(filename);
        try {
            for (int feature = 0; feature < this.numFeatures; feature++) {
                if (idf(feature, logTotalDocs, logCache) < cutoff) {
                    out.println(alphabet.lookupObject(feature));
                }
            }
            // PrintWriter never throws on write; surface failures explicitly.
            if (out.checkError()) {
                throw new IOException("Error writing unlinked words to " + filename);
            }
        } finally {
            out.close();
        }
    }

    /** Writes one line per feature to {@code filename}: the feature, "1.0", and its strongest links. */
    private void writeWeights(String filename, Alphabet alphabet, double logTotalDocs,
                              double[] logCache, double cutoff) throws IOException {
        PrintWriter out = new PrintWriter(filename);
        try {
            for (int feature = 0; feature < this.numFeatures; feature++) {
                StringBuilder line = new StringBuilder();
                line.append(alphabet.lookupObject(feature));
                line.append("\t");
                line.append("1.0");

                int documentFrequency = this.documentFrequencies[feature];
                double idfMargin = idf(feature, logTotalDocs, logCache) - cutoff;

                // Rare features and features at or below the IDF cutoff get no links.
                if (documentFrequency > MIN_DOCUMENT_FREQUENCY && idfMargin > 0.0) {
                    appendLinks(line, feature, documentFrequency, idfMargin, alphabet, logTotalDocs, logCache, cutoff);
                }
                out.println(line);
            }
            // PrintWriter never throws on write; surface failures explicitly.
            if (out.checkError()) {
                throw new IOException("Error writing weights to " + filename);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Appends up to {@link #MAX_LINKS_PER_FEATURE} tab-separated (word, weight) pairs
     * for {@code feature} to {@code line}, stopping at the first weight below
     * {@link #MIN_LINK_WEIGHT}.
     */
    private void appendLinks(StringBuilder line, int feature, int documentFrequency, double idfMargin,
                             Alphabet alphabet, double logTotalDocs, double[] logCache, double cutoff) {
        TIntIntHashMap cooccurrences = this.featureFeatureCounts[feature];
        int[] neighbors = cooccurrences.keys();

        IDSorter[] ranked = new IDSorter[neighbors.length];
        for (int i = 0; i < neighbors.length; i++) {
            int neighbor = neighbors[i];
            double neighborMargin = idf(neighbor, logTotalDocs, logCache) - cutoff;
            double weight = 0.0;
            if (neighborMargin > 0.0) {
                // Ratio of IDF margins times the fraction of this feature's
                // documents that also contain the neighbor.
                weight = neighborMargin / idfMargin
                        * ((double) cooccurrences.get(neighbor) / (double) documentFrequency);
            }
            ranked[i] = new IDSorter(neighbor, weight);
        }

        // NOTE(review): the early exit below relies on IDSorter's natural ordering
        // placing the largest weights first -- confirm against IDSorter.compareTo.
        Arrays.sort(ranked);

        int limit = Math.min(MAX_LINKS_PER_FEATURE, ranked.length);
        for (int i = 0; i < limit; i++) {
            double weight = ranked[i].getWeight();
            if (weight < MIN_LINK_WEIGHT) {
                break;
            }
            line.append("\t").append(alphabet.lookupObject(ranked[i].getID()));
            line.append("\t").append(weight);
        }
    }

    /**
     * Command-line entry point: loads the instance list named by the {@code input}
     * option, counts co-occurrences, and writes the output files.
     *
     * @param args command-line options, parsed by {@link CommandOption}
     * @throws IllegalArgumentException if the {@code input} or
     *         {@code weights-filename} option was not supplied
     * @throws Exception if loading or writing fails
     */
    public static void main(String[] args) throws Exception {
        CommandOption.setSummary(FeatureCooccurrenceCounter.class, "Build a file containing weights between word types");
        CommandOption.process(FeatureCooccurrenceCounter.class, args);

        // Fail fast with a readable message rather than a NullPointerException
        // after the (potentially long) counting pass.
        if (FeatureCooccurrenceCounter.inputFile.value == null) {
            throw new IllegalArgumentException("The 'input' option is required.");
        }
        if (FeatureCooccurrenceCounter.weightsFile.value == null) {
            throw new IllegalArgumentException("The 'weights-filename' option is required.");
        }

        InstanceList training = InstanceList.load(new File(FeatureCooccurrenceCounter.inputFile.value));
        FeatureCooccurrenceCounter counter = new FeatureCooccurrenceCounter(training);
        counter.count();
        counter.printCounts();
    }
}

