package it.unimi.dsi.law.nel.datasets;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.big.mg4j.index.DowncaseTermProcessor;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.law.nel.interfaces.AnnotatedDocument;
import it.unimi.dsi.law.nel.interfaces.ImmutableAnnotatedDocument;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.FrontCodedStringList;
import it.unimi.dsi.util.Interval;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:it/unimi/dsi/law/nel/datasets/YErdDataset.class */
/**
 * Loader and command-line serializer for the Y-ERD dataset.
 *
 * <p>The dataset is read from a tab-separated file whose first line is a header. For each data
 * line, the second column is used as the query identifier, the third as the query text, the
 * fourth as the mention and the fifth as the entity. Consecutive lines sharing the same query
 * identifier are merged into a single {@link AnnotatedDocument}.
 */
public class YErdDataset {
    private static final Logger LOGGER = LoggerFactory.getLogger(YErdDataset.class);

    /** Split limit for a data line; a line yielding fewer fields is treated as carrying no annotation. */
    private static final int ANNOTATED_FIELD_COUNT = 7;

    /** Minimum number of fields (up to and including the query text) a data line must have. */
    private static final int MIN_FIELD_COUNT = 3;

    /**
     * Number of leading characters dropped from the entity column before the lookup (the last
     * character is dropped as well). Presumably the column looks like {@code <dbpedia:Name>};
     * confirm against the dataset.
     */
    private static final int ENTITY_PREFIX_LENGTH = 9;

    /**
     * Turns free text into a single-space-separated sequence of processed terms.
     *
     * <p>The {@link #word} and {@link #nonWord} buffers are reused across calls, so one instance
     * should not be used by several threads at the same time.
     */
    public static class Normalizer {
        MutableString word = new MutableString();
        MutableString nonWord = new MutableString();
        TermProcessor termProcessor = DowncaseTermProcessor.getInstance();

        /**
         * Normalizes a character sequence.
         *
         * @param charSequence the text to normalize.
         * @return the non-empty words of the text accepted by {@link #termProcessor}, in their
         *         processed form, joined by single spaces (the empty string if there are none).
         * @throws RuntimeException wrapping any {@link IOException} raised while scanning.
         */
        public String normalize(CharSequence charSequence) {
            StringBuilder normalized = new StringBuilder();
            // The whole scan (including the loop condition) must be inside the try: next() declares IOException.
            try (FastBufferedReader wordReader = new FastBufferedReader(new MutableString(charSequence))) {
                while (wordReader.next(this.word, this.nonWord)) {
                    if (!this.word.isEmpty() && this.termProcessor.processTerm(this.word)) {
                        normalized.append((CharSequence) this.word).append(' ');
                    }
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            // Drop the separator appended after the last term.
            int length = normalized.length();
            if (length != 0) {
                normalized.setLength(length - 1);
            }
            return normalized.toString();
        }
    }

    /**
     * Loads the dataset from files.
     *
     * <p>Every line of the mapping file is split on tabs: the first field is the key under which
     * entities are looked up, the second is a title that is converted with
     * {@code EnWikiUtils.title2NormalizedUrl}. Lines with fewer than two fields are skipped with
     * a warning.
     *
     * @param str path of the dataset file.
     * @param str2 path of the tab-separated mapping file.
     * @param virtualDocumentResolver resolver passed to {@code EnWikiUtils.title2NormalizedUrl}.
     * @param frontCodedStringList string list passed to {@code EnWikiUtils.title2NormalizedUrl}.
     * @param z whether queries without any usable annotation are also returned.
     * @return the documents parsed from the dataset.
     * @throws IOException if either file cannot be opened or read.
     */
    public static Collection<AnnotatedDocument> load(String str, String str2, VirtualDocumentResolver virtualDocumentResolver, FrontCodedStringList frontCodedStringList, boolean z) throws IOException {
        LOGGER.info("Loading " + str + " as Y-ERD Dataset...");
        Object2ObjectMap<String, String> entityToWikipedia = new Object2ObjectArrayMap<>();
        Collection<AnnotatedDocument> documents;
        // Both files are opened up front (dataset first, as before) and closed even when parsing fails.
        try (FileReader datasetReader = new FileReader(str);
                BufferedReader mappingReader = new BufferedReader(new FileReader(str2))) {
            String mappingLine;
            while ((mappingLine = mappingReader.readLine()) != null) {
                String[] fields = StringUtils.splitByWholeSeparator(mappingLine, "\t");
                if (fields.length < 2) {
                    LOGGER.warn("Line <" + mappingLine + "> of " + str2 + " has fewer than two fields. Ignoring");
                    continue;
                }
                entityToWikipedia.put(fields[0], EnWikiUtils.title2NormalizedUrl(fields[1], virtualDocumentResolver, frontCodedStringList));
            }
            documents = load(datasetReader, entityToWikipedia, z);
        }
        LOGGER.info(str + " was correctly parsed.");
        return documents;
    }

    /**
     * Loads the dataset from a reader.
     *
     * <p>The first line is skipped as a header. Each following line is trimmed and split on tabs
     * into at most seven fields. Lines with the same query identifier (second field) that follow
     * each other contribute to the same document, whose tokens are the normalized query text
     * (third field) split on spaces. For every token-aligned occurrence of the normalized mention
     * (fourth field) in the normalized text, an inclusive token interval is recorded together with
     * the value the entity (fifth field, stripped of its first nine and last characters) maps to.
     *
     * <p>Lines with fewer than three fields, entities that cannot be mapped and mentions that
     * are empty after normalization are skipped with a warning. The reader is not closed.
     *
     * @param reader the source of the dataset.
     * @param object2ObjectMap the map from stripped entity names to the strings stored as annotations.
     * @param z whether a document is produced (exactly once) also for queries that end up with no annotation.
     * @return the parsed documents, in the order in which their queries appear.
     * @throws IOException if reading fails.
     */
    public static Collection<AnnotatedDocument> load(Reader reader, Object2ObjectMap<String, String> object2ObjectMap, boolean z) throws IOException {
        final boolean keepUnannotated = z;
        Normalizer normalizer = new Normalizer();
        ObjectArrayList<AnnotatedDocument> documents = new ObjectArrayList<>();
        ObjectArrayList<String> entities = new ObjectArrayList<>();
        ObjectArrayList<Interval> mentions = new ObjectArrayList<>();
        BufferedReader lineReader = new BufferedReader(reader);
        ProgressLogger progressLogger = new ProgressLogger(LOGGER, "lines");
        progressLogger.start();
        lineReader.readLine(); // Skip the header.
        String currentQueryId = null;
        String currentText = null;
        String rawLine;
        while ((rawLine = lineReader.readLine()) != null) {
            progressLogger.lightUpdate();
            String line = rawLine.trim();
            String[] fields = StringUtils.splitByWholeSeparatorPreserveAllTokens(line, "\t", ANNOTATED_FIELD_COUNT);
            if (fields.length < MIN_FIELD_COUNT) {
                LOGGER.warn("Line <" + line + "> has no query identifier and text. Ignoring");
                continue;
            }
            String queryId = fields[1];
            String text = normalizer.normalize(fields[2]);
            if (currentQueryId != null) {
                if (!queryId.equals(currentQueryId)) {
                    // A new query starts: emit what was accumulated for the previous one.
                    flushQuery(documents, currentText, mentions, entities, keepUnannotated);
                } else if (!text.equals(currentText)) {
                    // Must be checked before the current values are overwritten below.
                    LOGGER.warn("Query " + currentQueryId + " follows with different text: " + currentText + " != " + text);
                }
            }
            currentText = text;
            currentQueryId = queryId;

            if (fields.length < ANNOTATED_FIELD_COUNT) {
                // Nothing to record; if requested, the (empty) document is emitted by the flush of this query.
                LOGGER.debug("Line <" + line + "> contains no annotations");
                continue;
            }
            String mention = normalizer.normalize(fields[3]);
            String entity = fields[4];
            // An entity too short to be stripped cannot be in the map.
            String target = entity.length() > ENTITY_PREFIX_LENGTH
                    ? object2ObjectMap.get(entity.substring(ENTITY_PREFIX_LENGTH, entity.length() - 1))
                    : null;
            if (target == null) {
                LOGGER.warn("Line <" + line + "> contains an entity " + entity + " that can't be mapped. Ignoring");
                continue;
            }
            if (mention.isEmpty()) {
                // String.indexOf("", k) never returns a negative value: searching would not terminate.
                LOGGER.warn("In line <" + line + "> the mention is empty after normalization. Ignoring");
                continue;
            }

            int mentionTokenCount = 1 + StringUtils.countMatches(mention, " ");
            int occurrences = 0;
            for (int at = text.indexOf(mention); at >= 0; at = text.indexOf(mention, at + 1)) {
                int end = at + mention.length();
                boolean startsAtToken = at == 0 || text.charAt(at - 1) == ' ';
                boolean endsAtToken = end == text.length() || text.charAt(end) == ' ';
                if (!startsAtToken || !endsAtToken) {
                    continue; // The match lies inside a longer token.
                }
                // The index of the first token is the number of separators preceding the match.
                int firstToken = StringUtils.countMatches(text.substring(0, at), " ");
                mentions.add(Interval.valueOf(firstToken, firstToken + mentionTokenCount - 1));
                entities.add(target);
                occurrences++;
            }
            if (occurrences == 0) {
                LOGGER.warn("In line <" + line + "> the mention " + mention + " does not appear in the text");
            }
        }
        if (currentQueryId != null) {
            // Emit the last query; with no data lines there is nothing to emit.
            flushQuery(documents, currentText, mentions, entities, keepUnannotated);
        }
        progressLogger.done();
        return documents;
    }

    /** Appends the document of the query just completed (if it has annotations or empty ones are kept) and resets the accumulators. */
    private static void flushQuery(ObjectArrayList<AnnotatedDocument> documents, String text, ObjectArrayList<Interval> mentions, ObjectArrayList<String> entities, boolean keepUnannotated) {
        if (!mentions.isEmpty() || keepUnannotated) {
            documents.add(new ImmutableAnnotatedDocument(
                    StringUtils.splitByWholeSeparatorPreserveAllTokens(text, " "),
                    mentions.toArray(new Interval[0]),
                    entities.toArray(new String[0])));
        }
        entities.clear();
        mentions.clear();
    }

    /**
     * Parses the command line, loads the dataset and stores the resulting collection in
     * serialized form; optionally prints every document to standard output.
     *
     * @param strArr the command-line arguments.
     * @throws Exception if option parsing, loading or storing fails.
     */
    public static void main(String[] strArr) throws Exception {
        SimpleJSAP jsap = new SimpleJSAP(YErdDataset.class.getName(), "Serializes the Y-ERD dataset.", new Parameter[] {
            new Switch("printall", JSAP.NO_SHORTFLAG, "printall", "Print all associations"),
            new Switch("empty", 'e', "empty", "Also add documents with no annotations"),
            new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The original dataset."),
            new UnflaggedOption("db2wp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The tsv-file containing the dbpedia-to-Wikipedia correspondence."),
            new UnflaggedOption("enwikired", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The serialized enwikired Virtual Document Resolver."),
            new UnflaggedOption("id2name", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The serialized FrontCodedStringList that encode the wikipedia titles."),
            new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The serialized dataset.")
        });
        JSAPResult options = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }

        String resolverPath = options.getString("enwikired");
        String id2namePath = options.getString("id2name");
        String outputPath = options.getString("output");

        LOGGER.info("Loading the wikipedia document resolver from " + resolverPath + "...");
        VirtualDocumentResolver resolver = (VirtualDocumentResolver) BinIO.loadObject(resolverPath);
        LOGGER.info("Loading the wikipedia id2name FrontCodedStringList from " + id2namePath + "...");
        FrontCodedStringList id2name = (FrontCodedStringList) BinIO.loadObject(id2namePath);

        Collection<AnnotatedDocument> documents = load(options.getString("dataset"), options.getString("db2wp"), resolver, id2name, options.getBoolean("empty"));
        LOGGER.info("Dataset correctly loaded, saving it to " + outputPath + "...");
        BinIO.storeObject(documents, outputPath);

        if (options.getBoolean("printall")) {
            LOGGER.info("Printing all...");
            for (AnnotatedDocument document : documents) {
                System.out.println(document);
            }
        }
        LOGGER.info("Done.");
    }
}
