package it.unimi.dsi.law.nel.datasets;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.law.nel.interfaces.AnnotatedDocument;
import it.unimi.dsi.law.nel.interfaces.ImmutableAnnotatedDocument;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.FrontCodedStringList;
import it.unimi.dsi.util.Interval;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/* loaded from: input_file:it/unimi/dsi/law/nel/datasets/GerdaqDataset.class */
public class GerdaqDataset implements Iterator<AnnotatedDocument> {
    private static final Logger LOGGER = LoggerFactory.getLogger(GerdaqDataset.class);
    private final boolean ONLY_BEST_ENTITY;
    private final double THRESHOLD;
    private final NodeList instances;
    private final int numInstances;
    private int nextInstance;
    private final VirtualDocumentResolver redirectVdr;
    private final FrontCodedStringList id2name;

    public GerdaqDataset(String str, double d, boolean z, VirtualDocumentResolver virtualDocumentResolver, FrontCodedStringList frontCodedStringList) throws Exception {
        this.redirectVdr = virtualDocumentResolver;
        this.id2name = frontCodedStringList;
        this.THRESHOLD = d;
        this.ONLY_BEST_ENTITY = !z;
        LOGGER.info("Parsing " + str + "...");
        Document parse = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new File(str));
        parse.getDocumentElement().normalize();
        if (!parse.getDocumentElement().getNodeName().equals("dataset")) {
            throw new IOException("File " + str + " is a xml with root name " + parse.getDocumentElement().getNodeName() + ", not dataset.");
        }
        LOGGER.info("Xml file correctly parsed.");
        this.instances = parse.getElementsByTagName("instance");
        this.numInstances = this.instances.getLength();
        this.nextInstance = 0;
    }

    @Override // java.util.Iterator
    public boolean hasNext() {
        return this.nextInstance < this.numInstances;
    }

    private String[] parseAnnotation(Node node) {
        if (!node.getNodeName().equals("annotation")) {
            throw new IllegalArgumentException("Unexpected node " + node.getNodeName());
        }
        NamedNodeMap attributes = node.getAttributes();
        String str = null;
        double d = Double.MIN_VALUE;
        ObjectArrayList objectArrayList = null;
        if (!this.ONLY_BEST_ENTITY) {
            objectArrayList = new ObjectArrayList();
        }
        int i = 0;
        while (true) {
            Node namedItem = attributes.getNamedItem("rank_" + i + "_title");
            if (namedItem == null) {
                break;
            }
            double parseDouble = Double.parseDouble(attributes.getNamedItem("rank_" + i + "_score").getNodeValue());
            if (parseDouble > this.THRESHOLD) {
                if (!this.ONLY_BEST_ENTITY) {
                    objectArrayList.add(EnWikiUtils.title2NormalizedUrl(namedItem.getNodeValue(), this.redirectVdr, this.id2name));
                } else if (parseDouble > d) {
                    d = parseDouble;
                    str = EnWikiUtils.title2NormalizedUrl(namedItem.getNodeValue(), this.redirectVdr, this.id2name);
                }
            }
            i++;
        }
        return this.ONLY_BEST_ENTITY ? new String[]{str} : (String[]) objectArrayList.toArray(new String[0]);
    }

    /* JADX WARN: Can't rename method to resolve collision */
    @Override // java.util.Iterator
    public AnnotatedDocument next() {
        ObjectArrayList objectArrayList = new ObjectArrayList();
        ObjectArrayList objectArrayList2 = new ObjectArrayList();
        ObjectArrayList objectArrayList3 = new ObjectArrayList();
        int i = -1;
        NodeList nodeList = this.instances;
        int i2 = this.nextInstance;
        this.nextInstance = i2 + 1;
        Node firstChild = nodeList.item(i2).getFirstChild();
        while (true) {
            Node node = firstChild;
            if (node == null) {
                return new ImmutableAnnotatedDocument((List<String>) objectArrayList, (List<Interval>) objectArrayList2, (List<String>) objectArrayList3);
            }
            String trim = node.getTextContent().trim();
            if (!trim.isEmpty()) {
                objectArrayList.add(trim);
                i++;
                switch (node.getNodeType()) {
                    case 1:
                        for (String str : parseAnnotation(node)) {
                            objectArrayList3.add(str);
                            objectArrayList2.add(Interval.valueOf(i));
                        }
                        break;
                    case 3:
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected node type: " + node.getNodeType());
                }
            }
            firstChild = node.getNextSibling();
        }
    }

    public static void main(String[] strArr) throws Exception {
        SimpleJSAP simpleJSAP = new SimpleJSAP(GerdaqDataset.class.getName(), "Serializes the AIDA Yago dataset.", new Parameter[]{new FlaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 'd', "dataset", "The original dataset file(s) as xml.").setAllowMultipleDeclarations(true), new UnflaggedOption("enwikired", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The serialized enwikired Virtual Document Resolver."), new UnflaggedOption("id2name", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The serialized FrontCodedStringList that encode the wikipedia titles."), new FlaggedOption("threshold", JSAP.DOUBLE_PARSER, "0", false, 't', "threshold", "Only entities with score above this threshold will be considered."), new Switch("all", 'a', "all", "Repeat mentions if more entities are associated to them.  If false, only the maximum-scored entity for each mention will be annotated."), new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The serialized dataset."), new Switch("printall", (char) 0, "printall", "Print all association")});
        JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            System.exit(1);
        }
        LOGGER.info("Loading the wikipedia document resolver from " + parse.getString("enwikired") + "...");
        VirtualDocumentResolver virtualDocumentResolver = (VirtualDocumentResolver) BinIO.loadObject(parse.getString("enwikired"));
        LOGGER.info("Loading the wikipedia id2name FrontCodedStringList from " + parse.getString("id2name") + "...");
        FrontCodedStringList frontCodedStringList = (FrontCodedStringList) BinIO.loadObject(parse.getString("id2name"));
        ObjectArrayList objectArrayList = new ObjectArrayList();
        for (String str : parse.getStringArray("dataset")) {
            GerdaqDataset gerdaqDataset = new GerdaqDataset(str, parse.getDouble("threshold"), parse.getBoolean("all"), virtualDocumentResolver, frontCodedStringList);
            ProgressLogger progressLogger = new ProgressLogger(LOGGER, "documents");
            progressLogger.expectedUpdates = gerdaqDataset.numInstances;
            progressLogger.start("Parsing documents...");
            while (gerdaqDataset.hasNext()) {
                objectArrayList.add(gerdaqDataset.next());
                progressLogger.update();
            }
            progressLogger.done();
        }
        LOGGER.info("Dataset correctly loaded (" + objectArrayList.size() + " documents), saving it to " + parse.getString("output") + "...");
        BinIO.storeObject(objectArrayList, parse.getString("output"));
        if (parse.getBoolean("printall")) {
            LOGGER.info("Printing all to StdOut...");
            Iterator it2 = objectArrayList.iterator();
            while (it2.hasNext()) {
                System.out.println((AnnotatedDocument) it2.next());
            }
        }
        LOGGER.info("Done.");
    }
}
