package org.apache.accumulo.examples.wikisearch.ingest;

import com.google.common.collect.HashMultimap;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.IllegalFormatException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor;
import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat;
import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

/* loaded from: input_file:org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.class */
/**
 * Map task that turns one Wikipedia article into Accumulo mutations.
 *
 * <p>For every article that belongs to this mapper's group, {@link #map} writes:
 * <ul>
 *   <li>one mutation to the main table (row = partition id) holding the article's field
 *       values, its field-index entries and the Base64-encoded article text;</li>
 *   <li>one mutation per indexed (field, value) pair to the {@code <table>Index} table;</li>
 *   <li>one mutation per indexed (field, value) pair, with the value reversed, to the
 *       {@code <table>ReverseIndex} table;</li>
 *   <li>metadata mutations to the {@code <table>Metadata} table, at most once per
 *       (field, kind, language) per JVM.</li>
 * </ul>
 */
public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Mutation> {
    public static final String DOCUMENT_COLUMN_FAMILY = "d";
    public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
    public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
    public static final String TOKENS_FIELD_NAME = "TEXT";
    public static final Charset UTF8 = Charset.forName("UTF-8");

    private static final Logger log = Logger.getLogger(WikipediaMapper.class);
    private static final String cvPrefix = "all|";
    private static final Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
    private static final Value NULL_VALUE = new Value(new byte[0]);

    /**
     * Separator between the components of column families and qualifiers: a single NUL
     * character (U+0000). The previous text of this file carried two U+FFFD replacement
     * characters here, which is what a standard UTF-8 decoder produces for the two-byte
     * class-file (modified UTF-8) encoding of U+0000.
     */
    private static final String NULL_BYTE = "\u0000";

    /** Column-family prefix of field-index entries in the main table. */
    private static final String FIELD_INDEX_PREFIX = "fi" + NULL_BYTE;

    /**
     * Keys of the metadata entries already written, shared by every mapper instance in this
     * JVM. This is a plain {@link HashSet} with no synchronization.
     * NOTE(review): assumes map() is never called concurrently within one JVM - confirm.
     */
    static HashSet<String> metadataSent = new HashSet<>();

    private ArticleExtractor extractor;
    private String language;
    private int numPartitions = 0;
    private ColumnVisibility cv = null;
    private int myGroup = -1;
    private int numGroups = -1;
    private Text tablename = null;
    private Text indexTableName = null;
    private Text reverseIndexTableName = null;
    private Text metadataTableName = null;

    /**
     * Reads the table names, group and partition counts from the job configuration and
     * derives the ingest language from the name of the input file.
     *
     * @param context the task context; its input split must be a
     *        {@link WikipediaInputFormat.WikipediaInputSplit}
     * @throws RuntimeException if the input file name does not match the expected
     *         {@code <language>...xml[.bz2]} pattern
     */
    @Override
    public void setup(Mapper<LongWritable, Text, Text, Mutation>.Context context) {
        Configuration configuration = context.getConfiguration();

        String baseTableName = WikipediaConfiguration.getTableName(configuration);
        this.tablename = new Text(baseTableName);
        this.indexTableName = new Text(baseTableName + "Index");
        this.reverseIndexTableName = new Text(baseTableName + "ReverseIndex");
        this.metadataTableName = new Text(baseTableName + "Metadata");

        WikipediaInputFormat.WikipediaInputSplit split =
                (WikipediaInputFormat.WikipediaInputSplit) context.getInputSplit();
        this.myGroup = split.getPartition();
        this.numGroups = WikipediaConfiguration.getNumGroups(configuration);

        String fileName = split.getFileSplit().getPath().getName();
        Matcher matcher = languagePattern.matcher(fileName);
        if (!matcher.matches()) {
            throw new RuntimeException("Unknown ingest language! " + fileName);
        }
        this.language = matcher.group(1).replace('_', '-').toLowerCase();

        this.extractor = new ArticleExtractor();
        this.numPartitions = WikipediaConfiguration.getNumPartitions(configuration);
        this.cv = new ColumnVisibility(cvPrefix + this.language);
    }

    /**
     * Maps an article onto one of {@code i} buckets using its id.
     *
     * <p>The result is {@code article.getId() % i}, so it is negative for a negative id and
     * an {@link ArithmeticException} is raised when {@code i} is zero.
     *
     * @param article the article to place
     * @param i the number of buckets
     * @return the bucket index
     * @throws IllegalFormatException declared for signature compatibility only; nothing in
     *         this method raises it
     */
    public static int getPartitionId(ArticleExtractor.Article article, int i) throws IllegalFormatException {
        return article.getId() % i;
    }

    /**
     * Parses one article and emits its document, index, reverse-index and metadata
     * mutations.
     *
     * <p>Unparseable input only increments the {@code wikipedia / invalid articles} counter.
     * Articles that hash to a different group than this mapper's are skipped silently.
     *
     * @param longWritable the input key (unused)
     * @param text the raw article XML, UTF-8 encoded
     * @param context the task context used for output, counters and progress reporting
     */
    @Override
    protected void map(LongWritable longWritable, Text text, Mapper<LongWritable, Text, Text, Mutation>.Context context) throws IOException, InterruptedException {
        // Text.getBytes() exposes the backing array, which is only valid up to getLength();
        // bounding the stream keeps stale bytes of a previous record out of the parser.
        ArticleExtractor.Article article = this.extractor.extract(
                new InputStreamReader(new ByteArrayInputStream(text.getBytes(), 0, text.getLength()), UTF8));
        if (article == null) {
            context.getCounter("wikipedia", "invalid articles").increment(1L);
            context.progress();
            return;
        }
        if (getPartitionId(article, this.numGroups) != this.myGroup) {
            // Another mapper group owns this article.
            return;
        }

        final String languagePrefix = this.language + NULL_BYTE;
        final String documentId = languagePrefix + article.getId();
        final String partition = Integer.toString(getPartitionId(article, this.numPartitions));
        final String shardAndLanguage = partition + NULL_BYTE + this.language;

        Mutation documentMutation = new Mutation(new Text(partition));

        // Event entries: one per raw field value, plus the field's metadata entry.
        for (Map.Entry<String, Object> field : article.getFieldValues().entrySet()) {
            documentMutation.put(documentId, field.getKey() + NULL_BYTE + field.getValue().toString(),
                    this.cv, article.getTimestamp(), NULL_VALUE);
            writeMetadataOnce(context, field.getKey() + METADATA_EVENT_COLUMN_FAMILY + this.language,
                    field.getKey(), METADATA_EVENT_COLUMN_FAMILY, this.language, article.getTimestamp());
        }

        // Everything that gets indexed: the normalized field values and the normalized tokens.
        HashMultimap<String, String> indexFields = HashMultimap.create();
        for (Map.Entry<String, String> normalized : article.getNormalizedFieldValues().entrySet()) {
            indexFields.put(normalized.getKey(), normalized.getValue());
        }
        LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
        for (String token : getTokens(article)) {
            indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
        }

        // The global-index value is identical for every entry of this article, so build it once.
        Uid.List.Builder uidBuilder = Uid.List.newBuilder();
        uidBuilder.setIGNORE(false);
        uidBuilder.setCOUNT(1L);
        uidBuilder.addUID(Integer.toString(article.getId()));
        // NOTE(review): m68build() looks like a decompiler-assigned alias of build() - confirm.
        Value uidValue = new Value(uidBuilder.m68build().toByteArray());

        for (Map.Entry<String, String> indexed : indexFields.entries()) {
            String fieldName = indexed.getKey();
            String fieldValue = indexed.getValue();

            // Field index, stored in the same row as the document.
            documentMutation.put(FIELD_INDEX_PREFIX + fieldName, fieldValue + NULL_BYTE + documentId,
                    this.cv, article.getTimestamp(), NULL_VALUE);

            // Global index: row is the value itself.
            Mutation indexMutation = new Mutation(fieldValue);
            indexMutation.put(fieldName, shardAndLanguage, this.cv, article.getTimestamp(), uidValue);
            context.write(this.indexTableName, indexMutation);

            // Global reverse index: row is the reversed value.
            Mutation reverseIndexMutation = new Mutation(StringUtils.reverse(fieldValue));
            reverseIndexMutation.put(fieldName, shardAndLanguage, this.cv, article.getTimestamp(), uidValue);
            context.write(this.reverseIndexTableName, reverseIndexMutation);

            writeMetadataOnce(context, fieldName + METADATA_INDEX_COLUMN_FAMILY + this.language,
                    fieldName, METADATA_INDEX_COLUMN_FAMILY,
                    this.language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(),
                    article.getTimestamp());
        }

        // Encode with an explicit charset so the stored document does not depend on the
        // platform default of the task JVM.
        documentMutation.put(DOCUMENT_COLUMN_FAMILY, documentId, this.cv, article.getTimestamp(),
                new Value(Base64.encodeBase64(article.getText().getBytes(UTF8))));
        context.write(this.tablename, documentMutation);
        context.progress();
    }

    /**
     * Writes a metadata entry unless one with the same {@code sentKey} was already written
     * from this JVM. The key is recorded only after the write has succeeded.
     */
    private void writeMetadataOnce(Mapper<LongWritable, Text, Text, Mutation>.Context context, String sentKey,
            String row, String columnFamily, String columnQualifier, long timestamp)
            throws IOException, InterruptedException {
        if (metadataSent.contains(sentKey)) {
            return;
        }
        Mutation metadataMutation = new Mutation(row);
        metadataMutation.put(columnFamily, columnQualifier, this.cv, timestamp, NULL_VALUE);
        context.write(this.metadataTableName, metadataMutation);
        metadataSent.add(sentKey);
    }

    /**
     * Tokenizes the article text and returns the distinct non-empty terms.
     *
     * <p>An {@link IOException} raised while tokenizing, or while ending or closing the
     * tokenizer, is logged and not rethrown; the terms collected up to that point are
     * returned. The tokenizer is always ended and closed exactly once.
     *
     * @param article the article whose text is tokenized
     * @return the distinct terms, in no particular order
     * @throws IOException declared for signature compatibility; I/O failures of the
     *         tokenizer are logged rather than propagated
     */
    public static Set<String> getTokens(ArticleExtractor.Article article) throws IOException {
        Set<String> tokens = new HashSet<>();
        WikipediaTokenizer tokenizer = new WikipediaTokenizer(new StringReader(article.getText()));
        TermAttribute termAttribute = tokenizer.addAttribute(TermAttribute.class);
        try {
            while (tokenizer.incrementToken()) {
                String term = termAttribute.term();
                if (!StringUtils.isEmpty(term)) {
                    tokens.add(term);
                }
            }
        } catch (IOException e) {
            log.error("Error tokenizing text", e);
        } finally {
            try {
                tokenizer.end();
            } catch (IOException e) {
                log.error("Error calling end()", e);
            } finally {
                try {
                    tokenizer.close();
                } catch (IOException e) {
                    log.error("Error closing tokenizer", e);
                }
            }
        }
        return tokens;
    }
}
