package org.apache.mahout.text.wikipedia;

import com.google.common.io.Closeables;
import com.ibm.icu.text.DateFormat;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.text.DecimalFormat;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that splits a Wikipedia XML dump into a series of smaller XML files.
 *
 * <p>The dump is read line by line. Every {@code <page>...</page>} element is copied into an in-memory
 * buffer that starts with a fixed {@code <mediawiki>} header. Whenever the buffer grows past the
 * requested chunk size it is terminated with {@code </mediawiki>} and written to
 * {@code <outputDir>/chunk-NNNN.xml}. Lines outside of page elements are not copied.
 */
public final class WikipediaXmlSplitter {
    private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);

    /** Multiplier applied to the {@code chunkSize} option; {@code long} so large values cannot overflow. */
    private static final long ONE_MEGABYTE = 1048576L;

    /** Closing tag appended to every chunk before it is written. */
    private static final String FOOTER = "</mediawiki>";

    /** Prologue placed at the start of every chunk. */
    private static final String HEADER =
        "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" "
            + "version=\"0.3\" "
            + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n"
            + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n"
            + "    <case>first-letter</case>\n"
            + "    <namespaces>\n"
            + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n"
            + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n"
            + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n"
            + "    </namespaces>\n"
            + "  </siteinfo>\n";

    private WikipediaXmlSplitter() {
    }

    /**
     * Parses the command line and splits the given dump file into chunk files.
     *
     * <p>Options: {@code --dumpFile/-d} (required), {@code --outputDir/-o} (required),
     * {@code --chunkSize/-c} (required, megabytes), {@code --numChunks/-n} (optional limit on the
     * number of chunks written), {@code --s3ID/-i} and {@code --s3Secret/-s} (optional S3 credentials).
     *
     * <p>If the options cannot be parsed, the error is logged and the help text is printed. If the
     * dump file does not exist, an error is logged and nothing is written.
     *
     * @param strArr the command-line arguments
     * @throws IOException if reading the dump or writing a chunk fails
     */
    public static void main(String[] strArr) throws IOException {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption dumpFileOpt = optionBuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(argumentBuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)")
            .withShortName("d").create();

        DefaultOption outputDirOpt = optionBuilder.withLongName("outputDir").withRequired(true)
            .withArgument(argumentBuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
            .withShortName("o").create();

        DefaultOption s3IdOpt = optionBuilder.withLongName("s3ID").withRequired(false)
            .withArgument(argumentBuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key")
            .withShortName("i").create();

        DefaultOption s3SecretOpt = optionBuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(argumentBuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key")
            .withShortName("s").create();

        DefaultOption chunkSizeOpt = optionBuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(argumentBuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes")
            .withShortName("c").create();

        DefaultOption numChunksOpt = optionBuilder.withLongName("numChunks").withRequired(false)
            .withArgument(argumentBuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription("The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();

        Group options = groupBuilder.withName("Options")
            .withOption(dumpFileOpt)
            .withOption(outputDirOpt)
            .withOption(chunkSizeOpt)
            .withOption(numChunksOpt)
            .withOption(s3IdOpt)
            .withOption(s3SecretOpt)
            .create();

        Parser parser = new Parser();
        parser.setGroup(options);
        try {
            CommandLine cmdLine = parser.parse(strArr);
            Configuration conf = new Configuration();
            String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
            String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

            // The same credentials are registered for both the s3 and the s3n file system schemes.
            if (cmdLine.hasOption(s3IdOpt)) {
                String id = (String) cmdLine.getValue(s3IdOpt);
                conf.set("fs.s3n.awsAccessKeyId", id);
                conf.set("fs.s3.awsAccessKeyId", id);
            }
            if (cmdLine.hasOption(s3SecretOpt)) {
                String secret = (String) cmdLine.getValue(s3SecretOpt);
                conf.set("fs.s3n.awsSecretAccessKey", secret);
                conf.set("fs.s3.awsSecretAccessKey", secret);
            }
            conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
            FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

            // Multiply as long: an int product wraps around for chunk sizes of 2048 MB and above.
            long chunkSize = ONE_MEGABYTE * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

            int maxChunks = Integer.MAX_VALUE;
            if (cmdLine.hasOption(numChunksOpt)) {
                maxChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
            }

            File dumpFile = new File(dumpFilePath);
            if (!dumpFile.exists()) {
                log.error("Input file path {} doesn't exist", dumpFilePath);
                return;
            }

            FileLineIterator lines = openDump(dumpFilePath, dumpFile);
            try {
                splitIntoChunks(lines, fs, outputDirPath, chunkSize, maxChunks);
            } finally {
                // Release the input on every exit path, including the numChunks early stop and failures.
                Closeables.close(lines, true);
            }
        } catch (OptionException e) {
            log.error("Error while parsing options", e);
            CommandLineUtil.printHelp(options);
        }
    }

    /**
     * Opens the dump for line-wise reading; paths ending in {@code .bz2} are decompressed on the fly.
     *
     * @param dumpFilePath the path exactly as given on the command line, used for the suffix check
     * @param dumpFile the dump file to open
     * @return an iterator over the lines of the (decompressed) dump
     * @throws IOException if the file or the decompression stream cannot be opened
     */
    private static FileLineIterator openDump(String dumpFilePath, File dumpFile) throws IOException {
        if (!dumpFilePath.endsWith(".bz2")) {
            return new FileLineIterator(dumpFile);
        }
        InputStream rawStream = new FileInputStream(dumpFile);
        boolean opened = false;
        try {
            InputStream decompressed = new BZip2Codec().createInputStream(rawStream);
            FileLineIterator lines = new FileLineIterator(decompressed);
            opened = true;
            return lines;
        } finally {
            if (!opened) {
                // Wrapping failed, so nothing else owns the raw stream yet.
                Closeables.close(rawStream, true);
            }
        }
    }

    /**
     * Copies every page element from {@code lines} into chunk files under {@code outputDirPath}.
     *
     * <p>A chunk is written as soon as the buffered text is longer than {@code chunkSize} characters.
     * Processing stops once {@code maxChunks} chunks have been written. Pages still buffered when the
     * input ends are written as a final, smaller chunk.
     *
     * @param lines the lines of the dump
     * @param fs the file system the chunks are created on
     * @param outputDirPath the directory (as a path string) the chunk files are placed in
     * @param chunkSize the buffer length, in characters, above which a chunk is written
     * @param maxChunks the number of written chunks after which processing stops
     * @throws IOException if a chunk cannot be written
     */
    private static void splitIntoChunks(FileLineIterator lines, FileSystem fs, String outputDirPath,
                                        long chunkSize, int maxChunks) throws IOException {
        DecimalFormat chunkNumberFormat = new DecimalFormat("0000");
        StringBuilder content = new StringBuilder(HEADER);
        boolean hasUnwrittenPages = false;
        int chunksWritten = 0;

        while (lines.hasNext()) {
            String line = lines.next();
            if (!line.trim().startsWith("<page>")) {
                continue;
            }

            // Copy the page up to and including its closing tag. If the input ends inside the page,
            // the partial page is kept; each line is appended exactly once.
            while (true) {
                content.append(line).append('\n');
                if (line.trim().startsWith("</page>") || !lines.hasNext()) {
                    break;
                }
                line = lines.next();
            }
            hasUnwrittenPages = true;

            if (content.length() > chunkSize) {
                chunksWritten++;
                writeChunk(fs, outputDirPath, chunkNumberFormat.format(chunksWritten), content);
                if (chunksWritten >= maxChunks) {
                    return;
                }
                content = new StringBuilder(HEADER);
                hasUnwrittenPages = false;
            }
        }

        // Pages collected since the last full chunk would otherwise be lost when the input ends.
        if (hasUnwrittenPages) {
            chunksWritten++;
            writeChunk(fs, outputDirPath, chunkNumberFormat.format(chunksWritten), content);
        }
    }

    /**
     * Terminates {@code content} with the closing tag and writes it, UTF-8 encoded, to
     * {@code <outputDirPath>/chunk-<chunkNumber>.xml}.
     *
     * @param fs the file system the chunk is created on
     * @param outputDirPath the directory (as a path string) the chunk file is placed in
     * @param chunkNumber the already formatted chunk number used in the file name
     * @param content the buffered chunk text; the closing tag is appended to it
     * @throws IOException if the chunk cannot be created, written or closed
     */
    private static void writeChunk(FileSystem fs, String outputDirPath, String chunkNumber,
                                   StringBuilder content) throws IOException {
        content.append(FOOTER);
        Path chunkPath = new Path(outputDirPath + "/chunk-" + chunkNumber + ".xml");
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(chunkPath), "UTF-8"));
        boolean threw = true;
        try {
            writer.write(content.toString(), 0, content.length());
            threw = false;
        } finally {
            // Report a close failure only when the write itself succeeded, so that a write
            // failure is never masked by a secondary close failure.
            Closeables.close(writer, threw);
        }
    }
}
