package org.apache.mahout.text;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.text.wikipedia.WikipediaMapper;
import org.apache.mahout.text.wikipedia.XmlInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/text/WikipediaToSequenceFile.class */
/**
 * Command-line driver that configures and runs a Hadoop job turning a Wikipedia XML dump into
 * sequence files.
 *
 * <p>The job is configured with {@link XmlInputFormat} (using {@code <page>} / {@code </page>}
 * as start and end tags), {@link WikipediaMapper} as the mapper, the identity {@link Reducer},
 * and {@link SequenceFileOutputFormat} with {@link Text} keys and values.
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public final class WikipediaToSequenceFile {
    private static final Logger log = LoggerFactory.getLogger(WikipediaToSequenceFile.class);

    private WikipediaToSequenceFile() {
    }

    /**
     * Parses the command line and launches the conversion job.
     *
     * <p>Recognised options: input, output, {@code --categories}, {@code --exactMatch},
     * {@code --all}, {@code --removeLabels} and help. Usage is printed when help is requested,
     * when the arguments cannot be parsed, or when no input/output value was supplied.
     *
     * @param strArr the raw command-line arguments
     * @throws IOException if reading the categories file or running the job fails with an I/O error
     */
    public static void main(String[] strArr) throws IOException {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption inputOpt = DefaultOptionCreator.inputOption().create();
        DefaultOption outputOpt = DefaultOptionCreator.outputOption().create();
        // NOTE(review): the short name is taken from WikipediaTokenizer.CATEGORY; this looks like a
        // decompiler substitution for a plain string literal -- confirm against the original source.
        DefaultOption categoriesOpt = optionBuilder.withLongName("categories").withArgument(argumentBuilder.withName("categories").withMinimum(1).withMaximum(1).create()).withDescription("Location of the categories file.  One entry per line. Will be used to make a string match in Wikipedia Category field").withShortName(WikipediaTokenizer.CATEGORY).create();
        DefaultOption exactMatchOpt = optionBuilder.withLongName("exactMatch").withDescription("If set, then the category name must exactly match the entry in the categories file. Default is false").withShortName("e").create();
        DefaultOption allOpt = optionBuilder.withLongName("all").withDescription("If set, Select all files. Default is false").withShortName("all").create();
        DefaultOption removeLabelsOpt = optionBuilder.withLongName("removeLabels").withDescription("If set, remove [[Category:labels]] from document text after extracting label.Default is false").withShortName("rl").create();
        Option helpOpt = DefaultOptionCreator.helpOption();

        Group options = groupBuilder.withName("Options").withOption(categoriesOpt).withOption(inputOpt).withOption(outputOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt).withOption(removeLabelsOpt).create();

        Parser parser = new Parser();
        parser.setGroup(options);
        parser.setHelpOption(helpOpt);
        try {
            CommandLine parse = parser.parse(strArr);
            if (parse.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(options);
                return;
            }

            String inputPath = (String) parse.getValue(inputOpt);
            String outputPath = (String) parse.getValue(outputOpt);
            // Fail with a usage message here rather than passing a null path into runJob.
            if (inputPath == null || outputPath == null) {
                log.error("Both an input and an output path must be specified");
                CommandLineUtil.printHelp(options);
                return;
            }

            // An empty string tells runJob that no categories file was given.
            String categoriesFile = parse.hasOption(categoriesOpt) ? (String) parse.getValue(categoriesOpt) : "";
            boolean exactMatch = parse.hasOption(exactMatchOpt);
            boolean allFiles = parse.hasOption(allOpt);
            boolean removeLabels = parse.hasOption(removeLabelsOpt);

            runJob(inputPath, outputPath, categoriesFile, exactMatch, allFiles, removeLabels);
        } catch (OptionException e) {
            // Only a command-line parsing problem warrants showing the usage text.
            log.error("Exception", e);
            CommandLineUtil.printHelp(options);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            log.error("Exception", e);
        } catch (ClassNotFoundException e) {
            log.error("Exception", e);
        }
    }

    /**
     * Configures and runs the conversion job, blocking until it finishes.
     *
     * <p>The output path is deleted (via {@link HadoopUtil#delete}) immediately before the job
     * is submitted.
     *
     * @param str  input path handed to {@link FileInputFormat#setInputPaths}
     * @param str2 output path handed to {@link FileOutputFormat#setOutputPath}
     * @param str3 path of a local categories file with one entry per line, or the empty string
     *             for no categories; must not be {@code null}. Each line is trimmed and
     *             lower-cased (English locale) before being stored.
     * @param z    value stored under the {@code exact.match.only} configuration key
     * @param z2   value stored under the {@code all.files} configuration key
     * @param z3   value stored under the {@code remove.labels} configuration key
     * @throws IOException            if the categories file cannot be read or the job hits an I/O error
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     * @throws ClassNotFoundException if a class required by the job cannot be found
     * @throws IllegalStateException  if the job completes unsuccessfully
     */
    public static void runJob(String str, String str2, String str3, boolean z, boolean z2, boolean z3) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration configuration = new Configuration();
        configuration.set(XmlInputFormat.START_TAG_KEY, "<page>");
        configuration.set(XmlInputFormat.END_TAG_KEY, "</page>");
        configuration.setBoolean("exact.match.only", z);
        configuration.setBoolean("all.files", z2);
        configuration.setBoolean("remove.labels", z3);
        // JavaSerialization is listed so the category set below can be stringified into the configuration.
        configuration.set(CommonConfigurationKeysPublic.IO_SERIALIZATIONS_KEY, "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

        Set<String> categories = new HashSet<>();
        if (!str3.isEmpty()) {
            for (String line : new FileLineIterable(new File(str3))) {
                categories.add(line.trim().toLowerCase(Locale.ENGLISH));
            }
        }
        DefaultStringifier<Set<String>> categoryStringifier = new DefaultStringifier<>(configuration, GenericsUtil.getClass(categories));
        configuration.set("wikipedia.categories", categoryStringifier.toString(categories));

        Job job = new Job(configuration);
        log.info("Input: {} Out: {} Categories: {} All Files: {}", str, str2, str3, z2);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(str));
        Path outputPath = new Path(str2);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setMapperClass(WikipediaMapper.class);
        job.setInputFormatClass(XmlInputFormat.class);
        job.setReducerClass(Reducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(WikipediaToSequenceFile.class);

        // Clear any previous output right before submission.
        HadoopUtil.delete(configuration, outputPath);
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }
}
