package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.iterators.IteratorUtil;
import org.apache.accumulo.core.iterators.LongCombiner;
import org.apache.accumulo.core.iterators.user.SummingCombiner;
import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor;
import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
import org.apache.accumulo.examples.wikisearch.iterator.TextIndexCombiner;
import org.apache.accumulo.examples.wikisearch.output.SortingRFileOutputFormat;
import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/* loaded from: input_file:org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedIngester.class */
/**
 * Hadoop {@link Tool} that drives the partitioned Wikipedia ingest.
 *
 * <p>Depending on the flags read through {@code WikipediaConfiguration}, {@link #run(String[])}
 * executes up to three phases in order: a map-only job that partitions the raw dump files into
 * sequence files, a map-only job that turns the partitioned articles into Accumulo mutations
 * (either written live or as bulk-load files), and finally the import of those bulk files.
 */
public class WikipediaPartitionedIngester extends Configured implements Tool {
    public static final String INGEST_LANGUAGE = "wikipedia.ingest_language";
    public static final String SPLIT_FILE = "wikipedia.split_file";
    public static final String TABLE_NAME = "wikipedia.table";
    private static final Logger log = Logger.getLogger(WikipediaPartitionedIngester.class);

    /** Comma-separated list of fields that get a {@link TextIndexCombiner} on the main table. */
    private static final String TEXT_INDEX_FAMILIES = "TEXT";

    /**
     * Column-family prefix of field-index entries: {@code "fi"} followed by a NUL separator.
     *
     * <p>NOTE(review): the separator character was corrupted (rendered as replacement characters)
     * in the previous revision of this file. NUL is restored here; confirm it matches the prefix
     * used by the mapper that writes the field-index column families.
     */
    private static final String FIELD_INDEX_PREFIX = "fi\0";

    /** Accepts paths whose final component starts with {@code "part"}. */
    public static final PathFilter partFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith("part");
        }
    };

    /**
     * Matches dump file names ending in {@code .xml} or {@code .xml.bz2}. Group 1 is the leading
     * run of lowercase letters and underscores, which {@link #listFiles} records as the language.
     * The dots are escaped so that they match a literal {@code '.'} rather than any character.
     */
    protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?");

    /**
     * Runs the tool through {@link ToolRunner} and exits the JVM with its return code.
     *
     * @param strArr command-line arguments, passed to {@link ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WikipediaPartitionedIngester(), strArr));
    }

    /**
     * Creates the main table and its {@code Index}, {@code ReverseIndex} and {@code Metadata}
     * companions, attaching the combiners each one needs. Tables that already exist are left
     * untouched.
     *
     * @param tableOperations table administration handle
     * @param tableName name of the main table; the companion names are derived from it
     */
    private void createTables(TableOperations tableOperations, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException, TableExistsException {
        String indexTableName = tableName + "Index";
        String reverseIndexTableName = tableName + "ReverseIndex";
        String metadataTableName = tableName + "Metadata";

        if (!tableOperations.exists(tableName)) {
            tableOperations.create(tableName);
            if (TEXT_INDEX_FAMILIES.length() > 0) {
                System.out.println("Adding content combiner on the fields: " + TEXT_INDEX_FAMILIES);
                IteratorSetting textCombiner = new IteratorSetting(10, TextIndexCombiner.class);
                List<IteratorSetting.Column> columns = new ArrayList<>();
                for (String family : StringUtils.split(TEXT_INDEX_FAMILIES, ',')) {
                    columns.add(new IteratorSetting.Column(FIELD_INDEX_PREFIX + family));
                }
                TextIndexCombiner.setColumns(textCombiner, columns);
                TextIndexCombiner.setLossyness(textCombiner, true);
                tableOperations.attachIterator(tableName, textCombiner, EnumSet.allOf(IteratorUtil.IteratorScope.class));
            }
            // Keep the "d" column family in its own locality group.
            tableOperations.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text("d"))));
        }

        createUidIndexTable(tableOperations, indexTableName);
        createUidIndexTable(tableOperations, reverseIndexTableName);

        if (tableOperations.exists(metadataTableName)) {
            return;
        }
        tableOperations.create(metadataTableName);
        IteratorSetting summing = new IteratorSetting(10, SummingCombiner.class);
        SummingCombiner.setColumns(summing, Collections.singletonList(new IteratorSetting.Column("f")));
        SummingCombiner.setEncodingType(summing, LongCombiner.Type.VARLEN);
        tableOperations.attachIterator(metadataTableName, summing, EnumSet.allOf(IteratorUtil.IteratorScope.class));
    }

    /**
     * Creates {@code indexTableName} if it does not exist yet and attaches a
     * {@link GlobalIndexUidCombiner} named {@code "UIDAggregator"} over all columns and scopes.
     */
    private void createUidIndexTable(TableOperations tableOperations, String indexTableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException, TableExistsException {
        if (tableOperations.exists(indexTableName)) {
            return;
        }
        tableOperations.create(indexTableName);
        IteratorSetting uidCombiner = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
        GlobalIndexUidCombiner.setCombineAllColumns(uidCombiner, true);
        GlobalIndexUidCombiner.setLossyness(uidCombiner, true);
        tableOperations.attachIterator(indexTableName, uidCombiner, EnumSet.allOf(IteratorUtil.IteratorScope.class));
    }

    /**
     * Runs the configured phases in order and stops at the first one that fails.
     *
     * @param strArr command-line arguments (not used by this method)
     * @return 0 on success, otherwise the non-zero code of the failing phase
     * @throws Exception if a phase throws
     */
    @Override
    public int run(String[] strArr) throws Exception {
        Configuration conf = getConf();
        if (WikipediaConfiguration.runPartitioner(conf)) {
            int partitionerResult = runPartitionerJob();
            if (partitionerResult != 0) {
                return partitionerResult;
            }
        }
        if (!WikipediaConfiguration.runIngest(conf)) {
            return 0;
        }
        int ingestResult = runIngestJob();
        if (ingestResult != 0) {
            return ingestResult;
        }
        if (WikipediaConfiguration.bulkIngest(conf)) {
            return loadBulkFiles();
        }
        return 0;
    }

    /**
     * Runs the map-only job that reads every matching dump file under the {@code wikipedia.input}
     * directory and writes record-compressed sequence files of articles.
     *
     * @return 0 if the job completed successfully, 1 otherwise
     */
    private int runPartitionerJob() throws Exception {
        Job job = new Job(getConf(), "Partition Wikipedia");
        Configuration configuration = job.getConfiguration();
        configuration.set("mapred.map.tasks.speculative.execution", "false");
        configurePartitionerJob(job);

        List<Path> inputFiles = new ArrayList<>();
        Set<String> languages = new TreeSet<>();
        FileSystem fileSystem = FileSystem.get(configuration);
        Path inputDir = new Path(configuration.get("wikipedia.input"));
        listFiles(inputDir, fileSystem, inputFiles, languages);
        System.out.println("Input files in " + inputDir + ":" + inputFiles.size());
        Path[] inputPaths = inputFiles.toArray(new Path[inputFiles.size()]);
        System.out.println("Languages:" + languages.size());
        WikipediaInputFormat.setInputPaths(job, inputPaths);

        job.setMapperClass(WikipediaPartitioner.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ArticleExtractor.Article.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArticleExtractor.Article.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, WikipediaConfiguration.getPartitionedArticlesPath(configuration));
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Creates the tables if needed, then runs the map-only job that converts the partitioned
     * articles into mutations. Output goes to {@link SortingRFileOutputFormat} when bulk ingest is
     * enabled and to {@link AccumuloOutputFormat} otherwise.
     *
     * @return 0 if the job completed successfully; 1 if it failed or the bulk ingest directory
     *     is not configured
     */
    private int runIngestJob() throws Exception {
        Job job = new Job(getConf(), "Ingest Partitioned Wikipedia");
        Configuration configuration = job.getConfiguration();
        configuration.set("mapred.map.tasks.speculative.execution", "false");
        configureIngestJob(job);

        String tableName = WikipediaConfiguration.getTableName(configuration);
        createTables(WikipediaConfiguration.getConnector(configuration).tableOperations(), tableName);

        job.setMapperClass(WikipediaPartitionedMapper.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(job, new Path[]{WikipediaConfiguration.getPartitionedArticlesPath(configuration)});
        SequenceFileInputFormat.setMinInputSplitSize(job, WikipediaConfiguration.getMinInputSplitSize(configuration));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Mutation.class);

        if (WikipediaConfiguration.bulkIngest(configuration)) {
            job.setOutputFormatClass(SortingRFileOutputFormat.class);
            SortingRFileOutputFormat.setMaxBufferSize(configuration, WikipediaConfiguration.bulkIngestBufferSize(configuration));
            String bulkIngestDir = WikipediaConfiguration.bulkIngestDir(configuration);
            if (bulkIngestDir == null) {
                log.error("Bulk ingest dir not set");
                return 1;
            }
            SortingRFileOutputFormat.setPathName(configuration, bulkIngestDir);
        } else {
            job.setOutputFormatClass(AccumuloOutputFormat.class);
            String zookeepers = WikipediaConfiguration.getZookeepers(configuration);
            String instanceName = WikipediaConfiguration.getInstanceName(configuration);
            AccumuloOutputFormat.setOutputInfo(job.getConfiguration(), WikipediaConfiguration.getUser(configuration), WikipediaConfiguration.getPassword(configuration), true, tableName);
            AccumuloOutputFormat.setZooKeeperInstance(job.getConfiguration(), instanceName, zookeepers);
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Imports every sub-directory of the bulk ingest directory into the table that has the same
     * name as the sub-directory, using a same-named directory under the failure directory for
     * files that could not be imported.
     *
     * @return always 0; failures surface as exceptions
     */
    private int loadBulkFiles() throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {
        Configuration conf = getConf();
        Connector connector = WikipediaConfiguration.getConnector(conf);
        FileSystem fileSystem = FileSystem.get(conf);
        String bulkIngestDir = WikipediaConfiguration.bulkIngestDir(conf);
        String bulkIngestFailureDir = WikipediaConfiguration.bulkIngestFailureDir(conf);
        for (FileStatus fileStatus : fileSystem.listStatus(new Path(bulkIngestDir))) {
            if (!fileStatus.isDir()) {
                continue;
            }
            Path tableDir = fileStatus.getPath();
            Path failureDir = new Path(bulkIngestFailureDir + "/" + tableDir.getName());
            if (!fileSystem.mkdirs(failureDir)) {
                // Surface the problem instead of dropping the return value; the import below is
                // still attempted so existing behavior is unchanged.
                log.warn("Could not create bulk ingest failure directory " + failureDir);
            }
            connector.tableOperations().importDirectory(tableDir.getName(), tableDir.toString(), failureDir.toString(), true);
        }
        return 0;
    }

    /**
     * Sets the jar, the input format and the {@code <page>} / {@code </page>} record delimiters
     * for the partitioner job. Subclasses may override to customize the job.
     *
     * @param job the partitioner job being configured
     */
    protected void configurePartitionerJob(Job job) {
        Configuration configuration = job.getConfiguration();
        job.setJarByClass(WikipediaPartitionedIngester.class);
        job.setInputFormatClass(WikipediaInputFormat.class);
        configuration.set(AggregatingRecordReader.START_TOKEN, "<page>");
        configuration.set(AggregatingRecordReader.END_TOKEN, "</page>");
    }

    /**
     * Sets the jar for the ingest job. Subclasses may override to customize the job.
     *
     * @param job the ingest job being configured
     */
    protected void configureIngestJob(Job job) {
        job.setJarByClass(WikipediaPartitionedIngester.class);
    }

    /**
     * Recursively walks {@code path} and collects every file whose name matches
     * {@link #filePattern}.
     *
     * @param path directory to scan
     * @param fileSystem file system that {@code path} lives on
     * @param list receives the matching files
     * @param set receives group 1 of the pattern (the language prefix) for each matching file
     * @throws IOException if a directory cannot be listed
     */
    protected void listFiles(Path path, FileSystem fileSystem, List<Path> list, Set<String> set) throws IOException {
        for (FileStatus fileStatus : fileSystem.listStatus(path)) {
            Path candidate = fileStatus.getPath();
            if (fileStatus.isDir()) {
                listFiles(candidate, fileSystem, list, set);
                continue;
            }
            Matcher matcher = filePattern.matcher(candidate.getName());
            if (matcher.matches()) {
                set.add(matcher.group(1));
                list.add(candidate);
            }
        }
    }
}
