package org.apache.mahout.utils.vectors.lucene;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import com.ibm.icu.impl.locale.LanguageTag;
import com.ibm.icu.text.DateFormat;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.utils.vectors.TermEntry;
import org.apache.mahout.utils.vectors.io.DelimitedTermInfoWriter;
import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
import org.apache.mahout.utils.vectors.io.VectorWriter;
import org.apache.mahout.vectorizer.TF;
import org.apache.mahout.vectorizer.TFIDF;
import org.apache.mahout.vectorizer.Weight;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/utils/vectors/lucene/Driver.class */
/**
 * Command-line driver that reads one field of a Lucene index and writes:
 * <ul>
 *   <li>a Hadoop {@link SequenceFile} of ({@link LongWritable}, {@link VectorWritable}) pairs,</li>
 *   <li>a delimited text dictionary of the terms of that field, and</li>
 *   <li>optionally, the same dictionary as a {@link SequenceFile} of
 *       ({@link Text} term, {@link IntWritable} term index) pairs.</li>
 * </ul>
 *
 * <p>Configure an instance through the setters and call {@link #dumpVectors()}, or run
 * {@link #main(String[])} from the command line.
 */
public final class Driver {
    private static final Logger log = LoggerFactory.getLogger(Driver.class);

    /** Delimiter used for the text dictionary when none is given. */
    private static final String DEFAULT_DELIMITER = "\t";

    private String luceneDir;
    private String outFile;
    private String field;
    private String idField;
    private String dictOut;
    private String seqDictOut = "";
    private String weightType = "tfidf";
    private String delimiter = DEFAULT_DELIMITER;
    // -1.0 is the "no norm requested" default; presumably LuceneIterable treats it as
    // "do not normalize" (the --norm help text says the default is not to normalize).
    private double norm = -1.0d;
    private long maxDocs = Long.MAX_VALUE;
    private int minDf = 1;
    private int maxDFPercent = 99;
    private double maxPercentErrorDocs = 0.0d;

    /**
     * Validates the configuration, opens the Lucene index and writes the vectors, the text
     * dictionary and (when a sequence-file dictionary path was set) the sequence-file dictionary.
     *
     * <p>The index directory and reader are closed before this method returns, whether it
     * completes normally or throws. Each output writer is closed exactly once.
     *
     * @throws IllegalArgumentException if the Lucene directory is not a directory, if
     *     {@code maxDocs < 0}, {@code minDf < 1} or {@code maxDFPercent > 99}, or if the weight
     *     type is neither {@code tf} nor {@code tfidf} (case-insensitive)
     * @throws IOException if reading the index or writing any output fails
     */
    public void dumpVectors() throws IOException {
        File indexDir = new File(this.luceneDir);
        Preconditions.checkArgument(indexDir.isDirectory(), "Lucene directory: " + indexDir.getAbsolutePath() + " does not exist or is not a directory");
        Preconditions.checkArgument(this.maxDocs >= 0, "maxDocs must be >= 0");
        Preconditions.checkArgument(this.minDf >= 1, "minDf must be >= 1");
        Preconditions.checkArgument(this.maxDFPercent <= 99, "maxDFPercent must be <= 99");
        // Resolve the weight before touching the index so a bad weight type cannot leak a reader.
        Weight weight = createWeight(this.weightType);

        // Resources are closed in reverse order: reader first, then the directory it reads from.
        try (FSDirectory directory = FSDirectory.open(indexDir);
             DirectoryReader reader = DirectoryReader.open(directory)) {
            CachedTermInfo termInfo = new CachedTermInfo(reader, this.field, this.minDf, this.maxDFPercent);
            LuceneIterable vectors = new LuceneIterable(reader, this.idField, this.field, termInfo, weight, this.norm, this.maxPercentErrorDocs);

            writeVectors(vectors);
            writeDictionary(termInfo);
            // A null or empty path means no sequence-file dictionary was requested.
            if (this.seqDictOut != null && !this.seqDictOut.isEmpty()) {
                writeSequenceFileDictionary(termInfo);
            }
        }
    }

    /** Maps the configured weight name ("tf" or "tfidf", case-insensitive) to a {@link Weight}. */
    private static Weight createWeight(String weightType) {
        if ("tf".equalsIgnoreCase(weightType)) {
            return new TF();
        }
        if ("tfidf".equalsIgnoreCase(weightType)) {
            return new TFIDF();
        }
        throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
    }

    /** Writes at most {@code maxDocs} vectors to the configured output sequence file. */
    private void writeVectors(LuceneIterable vectors) throws IOException {
        log.info("Output File: {}", this.outFile);
        try (VectorWriter vectorWriter = getSeqFileWriter(this.outFile)) {
            long numDocs = vectorWriter.write(vectors, this.maxDocs);
            log.info("Wrote: {} vectors", numDocs);
        }
    }

    /** Writes the term dictionary as UTF-8 delimited text to the configured dictionary file. */
    private void writeDictionary(CachedTermInfo termInfo) throws IOException {
        File dictOutFile = new File(this.dictOut);
        log.info("Dictionary Output file: {}", dictOutFile);
        try (DelimitedTermInfoWriter dictWriter = new DelimitedTermInfoWriter(Files.newWriter(dictOutFile, Charsets.UTF_8), this.delimiter, this.field)) {
            dictWriter.write(termInfo);
        }
    }

    /** Writes the term dictionary as a (Text term, IntWritable term index) sequence file. */
    private void writeSequenceFileDictionary(CachedTermInfo termInfo) throws IOException {
        log.info("SequenceFile Dictionary Output file: {}", this.seqDictOut);
        Path path = new Path(this.seqDictOut);
        Configuration conf = new Configuration();
        try (SequenceFile.Writer seqWriter = SequenceFile.createWriter(FileSystem.get(conf), conf, path, Text.class, IntWritable.class)) {
            // The key/value holders are reused across iterations; append() serializes them each time.
            Text term = new Text();
            IntWritable termIndex = new IntWritable();
            Iterator<TermEntry> entries = termInfo.getAllEntries();
            while (entries.hasNext()) {
                TermEntry entry = entries.next();
                term.set(entry.getTerm());
                termIndex.set(entry.getTermIdx());
                seqWriter.append(term, termIndex);
            }
        }
    }

    /**
     * Command-line entry point. Parses the options, configures a {@link Driver} and calls
     * {@link #dumpVectors()}. On an option parsing error the error is logged and the help text
     * is printed.
     *
     * @param strArr the command-line arguments
     * @throws IOException if dumping the vectors fails
     */
    public static void main(String[] strArr) throws IOException {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        DefaultOption inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).withDescription("The Lucene directory").withShortName("d").create();
        DefaultOption outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file").withShortName("o").create();
        DefaultOption fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription("The field in the index").withShortName("f").create();
        DefaultOption idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription("The field in the index containing the index.  If null, then the Lucene internal doc id is used which is prone to error if the underlying index changes").create();
        DefaultOption dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription("The output of the dictionary").withShortName("t").create();
        DefaultOption seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false).withArgument(abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create()).withDescription("The output of the dictionary as sequence file").withShortName("st").create();
        DefaultOption weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
        DefaultOption delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription("The delimiter for outputting the dictionary").withShortName("l").create();
        DefaultOption powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
        DefaultOption maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
        DefaultOption minDfOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();
        DefaultOption maxDfPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription("The max percentage of docs for the DF.  Can be used to remove really high frequency terms.  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
        DefaultOption maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription("The max percentage of docs that can have a null term vector. These are noise document and can occur if the analyzer used strips out all terms in the target field. This percentage is expressed as a value between 0 and 1. The default is 0.").withShortName("err").create();
        DefaultOption helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

        Group group = gbuilder.withName("Options")
            .withOption(inputOpt)
            .withOption(idFieldOpt)
            .withOption(outputOpt)
            .withOption(delimiterOpt)
            .withOption(helpOpt)
            .withOption(fieldOpt)
            .withOption(maxOpt)
            .withOption(dictOutOpt)
            .withOption(seqDictOutOpt)
            .withOption(powerOpt)
            .withOption(maxDfPercentOpt)
            .withOption(weightOpt)
            .withOption(minDfOpt)
            .withOption(maxPercentErrorDocsOpt)
            .create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(strArr);

            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }
            if (!cmdLine.hasOption(inputOpt)) {
                return;
            }

            Driver driver = new Driver();
            driver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
            if (cmdLine.hasOption(maxOpt)) {
                driver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
            }
            if (cmdLine.hasOption(weightOpt)) {
                driver.setWeightType(cmdLine.getValue(weightOpt).toString());
            }
            driver.setField(cmdLine.getValue(fieldOpt).toString());
            if (cmdLine.hasOption(minDfOpt)) {
                driver.setMinDf(Integer.parseInt(cmdLine.getValue(minDfOpt).toString()));
            }
            if (cmdLine.hasOption(maxDfPercentOpt)) {
                driver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDfPercentOpt).toString()));
            }
            if (cmdLine.hasOption(powerOpt)) {
                String power = cmdLine.getValue(powerOpt).toString();
                // "INF" selects the infinite norm; anything else must parse as a double.
                driver.setNorm("INF".equals(power) ? Double.POSITIVE_INFINITY : Double.parseDouble(power));
            }
            if (cmdLine.hasOption(idFieldOpt)) {
                driver.setIdField(cmdLine.getValue(idFieldOpt).toString());
            }
            if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
                driver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
            }
            driver.setOutFile(cmdLine.getValue(outputOpt).toString());
            driver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : DEFAULT_DELIMITER);
            driver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
            if (cmdLine.hasOption(seqDictOutOpt)) {
                driver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
            }
            driver.dumpVectors();
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
        }
    }

    /**
     * Creates a {@link VectorWriter} backed by a (LongWritable, VectorWritable) sequence file.
     *
     * @param str the path of the sequence file to create
     * @return a writer the caller is responsible for closing
     * @throws IOException if the file system or the sequence file cannot be opened
     */
    private static VectorWriter getSeqFileWriter(String str) throws IOException {
        Path path = new Path(str);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, VectorWritable.class);
        return new SequenceFileVectorWriter(seqWriter);
    }

    /** Sets the path of the Lucene index directory to read. */
    public void setLuceneDir(String str) {
        this.luceneDir = str;
    }

    /** Sets the maximum number of vectors to write; must be {@code >= 0}. */
    public void setMaxDocs(long j) {
        this.maxDocs = j;
    }

    /** Sets the weight type: {@code tf} or {@code tfidf} (case-insensitive). */
    public void setWeightType(String str) {
        this.weightType = str;
    }

    /** Sets the name of the index field to read terms from. */
    public void setField(String str) {
        this.field = str;
    }

    /** Sets the minimum document frequency; must be {@code >= 1}. */
    public void setMinDf(int i) {
        this.minDf = i;
    }

    /** Sets the maximum document-frequency percentage; must be {@code <= 99}. */
    public void setMaxDFPercent(int i) {
        this.maxDFPercent = i;
    }

    /** Sets the norm passed to the vector iterable; the default is {@code -1.0}. */
    public void setNorm(double d) {
        this.norm = d;
    }

    /** Sets the name of the index field holding document ids; may be left unset. */
    public void setIdField(String str) {
        this.idField = str;
    }

    /** Sets the path of the output vector sequence file. */
    public void setOutFile(String str) {
        this.outFile = str;
    }

    /** Sets the delimiter used in the text dictionary; the default is a tab. */
    public void setDelimiter(String str) {
        this.delimiter = str;
    }

    /** Sets the path of the text dictionary output file. */
    public void setDictOut(String str) {
        this.dictOut = str;
    }

    /** Sets the path of the sequence-file dictionary; null or empty disables that output. */
    public void setSeqDictOut(String str) {
        this.seqDictOut = str;
    }

    /** Sets the maximum percentage of error documents passed to the vector iterable. */
    public void setMaxPercentErrorDocs(double d) {
        this.maxPercentErrorDocs = d;
    }
}
