package it.unimi.dsi.law.rank.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.law.util.Norm;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:it/unimi/dsi/law/rank/tool/BuildDmozTopicVector.class */
/**
 * Builds, for each requested DMOZ/ODP root topic, a vector indexed by the nodes of a graph
 * that marks the nodes whose URL is listed under that topic in a DMOZ RDF dump.
 *
 * <p>Typical use: construct an instance, optionally call {@link #setAccumulating(boolean)} and
 * {@link #setStrict(boolean)}, call {@link #parseDmoz()} once, then call
 * {@link #getCharacteristicVector(String)} for each topic of interest. {@link #main(String[])}
 * is the command-line driver doing exactly that.
 */
public class BuildDmozTopicVector {
    private static final Logger LOGGER = LoggerFactory.getLogger(BuildDmozTopicVector.class);
    /** Progress reporting for the dump scan and for the vector filling phase. */
    private static final ProgressLogger PLOGGER = new ProgressLogger(LOGGER);
    /**
     * Maps a URL string to a graph node index; a result of -1 is treated as "URL not in the
     * graph". Its size is used as the dimension of the produced vectors.
     */
    public final StringMap<? extends CharSequence> URLnode;
    /** The DMOZ dump that {@link #parseDmoz()} reads. */
    public final InputSource dmozDump;
    /** Topic prefixes; a dump Topic belongs to a root topic when its {@code r:id} starts with it. */
    public String[] rootTopic;
    /** If true, topic nodes are weighted by their number of occurrences instead of mere presence. */
    public boolean accumulating = false;
    /** If true, nodes outside the topic get weight 0; otherwise they get a small positive weight. */
    public boolean strict = true;
    /** True only after {@link #parseDmoz()} completed for the current {@link #rootTopic}. */
    private boolean parsedOK = false;
    /** Per root topic, the graph nodes found under it during the last parse. */
    private final Object2ObjectOpenHashMap<String, TopicNodes> topicMap = new Object2ObjectOpenHashMap<>();

    /**
     * SAX error handler: warnings are logged, errors and fatal errors are logged and then
     * rethrown as {@link SAXException} so that parsing stops.
     */
    public static class MyErrorHandler implements ErrorHandler {
        private MyErrorHandler() {
        }

        /** Formats the system id, line number and message of a parse exception. */
        private String getParseExceptionInfo(SAXParseException sAXParseException) {
            String systemId = sAXParseException.getSystemId();
            return "URI = " + (systemId == null ? "null" : systemId) + ", Line = " + sAXParseException.getLineNumber() + ": " + sAXParseException.getMessage();
        }

        @Override // org.xml.sax.ErrorHandler
        public void warning(SAXParseException sAXParseException) throws SAXException {
            LOGGER.warn(getParseExceptionInfo(sAXParseException));
        }

        @Override // org.xml.sax.ErrorHandler
        public void error(SAXParseException sAXParseException) throws SAXException {
            final String info = getParseExceptionInfo(sAXParseException);
            LOGGER.error(info);
            throw new SAXException(info);
        }

        @Override // org.xml.sax.ErrorHandler
        public void fatalError(SAXParseException sAXParseException) throws SAXException {
            final String info = getParseExceptionInfo(sAXParseException);
            LOGGER.error(info);
            throw new SAXException(info);
        }
    }

    /**
     * SAX content handler that scans the dump: every {@code Topic} element whose {@code r:id}
     * starts with one of the root topics opens a "matching" region, and every {@code link*}
     * element inside such a region whose {@code r:resource} URL is known to {@link #URLnode} is
     * recorded in {@link #topicMap}.
     */
    public class MySAXParser extends DefaultHandler {
        /** Indices into {@link #rootTopic} of the root topics matched by the current Topic element. */
        private final IntOpenHashSet currentTopicId = new IntOpenHashSet();

        private MySAXParser() {
        }

        /**
         * Dispatches on the qualified element name ({@code str3}): {@code Topic} elements update
         * the set of matching root topics, {@code link*} elements are recorded when at least one
         * root topic is currently matching.
         *
         * @throws RuntimeException wrapping a {@link MalformedURLException} if a link's
         *         {@code r:resource} value is not a valid URL.
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startElement(String str, String str2, String str3, Attributes attributes) {
            if (str3.equals("Topic")) {
                markMatchingRootTopics(attributes.getValue("r:id"));
                PLOGGER.update();
                return;
            }
            if (this.currentTopicId.isEmpty() || !str3.startsWith("link")) {
                return;
            }
            recordLink(attributes.getValue("r:resource"));
        }

        /** Adds to {@link #currentTopicId} every root topic that is a prefix of the given topic id. */
        private void markMatchingRootTopics(String topicId) {
            if (topicId == null) {
                return;
            }
            final String[] roots = BuildDmozTopicVector.this.rootTopic;
            for (int i = roots.length; i-- != 0;) {
                // add() returns false when the index was already present, i.e. the previous
                // matching Topic has not been closed yet.
                if (topicId.startsWith(roots[i]) && !this.currentTopicId.add(i)) {
                    LOGGER.error("Nested Topic!");
                }
            }
        }

        /** Counts one occurrence of the linked URL in every currently matching root topic. */
        private void recordLink(String resource) {
            final URL url;
            try {
                url = new URL(resource);
            } catch (MalformedURLException e) {
                throw new RuntimeException(e);
            }
            final String urlString = url.toString();
            final int nodeId = (int) BuildDmozTopicVector.this.URLnode.getLong(urlString);
            if (nodeId == -1) {
                return;
            }
            // This runs for every link of the dump: build debug strings only when they are used.
            final boolean debug = LOGGER.isDebugEnabled();
            if (debug) {
                LOGGER.debug("Found a topic matching URL of the graph: " + urlString);
            }
            final String[] roots = BuildDmozTopicVector.this.rootTopic;
            final Object2ObjectOpenHashMap<String, TopicNodes> topics = BuildDmozTopicVector.this.topicMap;
            for (IntIterator it = this.currentTopicId.iterator(); it.hasNext();) {
                final String topic = roots[it.nextInt()];
                TopicNodes topicNodes = topics.get(topic);
                if (debug) {
                    LOGGER.debug("\t\tput it in " + topic + " map.");
                }
                Node node;
                if (topicNodes == null) {
                    topicNodes = new TopicNodes(new Object2ObjectOpenHashMap<URL, Node>(), 0);
                    topics.put(topic, topicNodes);
                    node = null;
                } else {
                    // NOTE(review): java.net.URL is used as a hash key here; its equals/hashCode
                    // are documented as potentially resolving host names. Changing the key type
                    // would change the public TopicNodes signature, so it is left as is.
                    node = topicNodes.node.get(url);
                }
                if (node == null) {
                    topicNodes.node.put(url, new Node(nodeId, 1));
                } else {
                    node.nOccurrences++;
                }
                topicNodes.totalOccurrences++;
            }
        }

        /** Closes the matching region when a {@code Topic} element ends. */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endElement(String str, String str2, String str3) {
            if (this.currentTopicId.isEmpty() || !str3.equals("Topic")) {
                return;
            }
            LOGGER.debug("Found end of matching topic(s).");
            this.currentTopicId.clear();
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startDocument() {
            // Hard-coded estimate of the number of Topic elements, used only for progress reporting.
            PLOGGER.expectedUpdates = 671987L;
            PLOGGER.start("Reading DMOZ content file...");
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endDocument() {
            PLOGGER.stop("...end.");
        }
    }

    /** A graph node found under a topic, with the number of times it was listed. */
    public static final class Node {
        /** Index of the node in the graph, as returned by the URL-to-node map. */
        int id;
        /** How many links under the topic pointed to this node. */
        int nOccurrences;

        public Node(int i, int i2) {
            this.id = i;
            this.nOccurrences = i2;
        }
    }

    /** The nodes collected for one root topic, keyed by URL, plus the sum of their occurrences. */
    public static final class TopicNodes {
        Object2ObjectOpenHashMap<URL, Node> node;
        public int totalOccurrences;

        public TopicNodes(Object2ObjectOpenHashMap<URL, Node> object2ObjectOpenHashMap, int i) {
            this.node = object2ObjectOpenHashMap;
            this.totalOccurrences = i;
        }
    }

    /** Turns a file system path into the string form of the corresponding {@code file:} URL. */
    private static String convertToFileURL(String str) throws MalformedURLException {
        return new File(str).toURI().toURL().toString();
    }

    /**
     * @param inputSource the DMOZ dump to parse.
     * @param strArr the root topics to collect nodes for.
     * @param stringMap the map from URL strings to graph node indices.
     */
    public BuildDmozTopicVector(InputSource inputSource, String[] strArr, StringMap<? extends CharSequence> stringMap) {
        this.rootTopic = strArr;
        this.URLnode = stringMap;
        this.dmozDump = inputSource;
    }

    /** Sets {@link #accumulating} and logs the new value. */
    public void setAccumulating(boolean z) {
        this.accumulating = z;
        LOGGER.info("Accumulating = " + this.accumulating);
    }

    /** Sets {@link #strict} and logs the new value. */
    public void setStrict(boolean z) {
        this.strict = z;
        LOGGER.info("Strict characteristic = " + this.strict);
    }

    /** Replaces the root topics; the dump must be parsed again before vectors can be requested. */
    public void setRootTopic(String[] strArr) {
        this.rootTopic = strArr;
        this.parsedOK = false;
    }

    /**
     * Parses {@link #dmozDump} and collects, for every root topic, the graph nodes listed under it.
     *
     * <p>Results of a previous parse are discarded first, so that calling this method again
     * (e.g. after {@link #setRootTopic(String[])}) neither double-counts occurrences nor leaves
     * the instance marked as parsed if the new parse fails.
     *
     * @throws IOException if the dump cannot be read.
     * @throws SAXException if the dump is not well-formed (see {@link MyErrorHandler}).
     * @throws ParserConfigurationException if no SAX parser can be created.
     */
    public void parseDmoz() throws IOException, SAXException, ParserConfigurationException {
        this.parsedOK = false;
        this.topicMap.clear();
        // NOTE(review): the parser keeps the JDK default entity handling (the command-line help
        // asks users to raise entityExpansionLimit); harden it if dumps may be untrusted.
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        factory.setValidating(false);
        XMLReader reader = factory.newSAXParser().getXMLReader();
        reader.setContentHandler(new MySAXParser());
        reader.setErrorHandler(new MyErrorHandler());
        reader.parse(this.dmozDump);
        this.parsedOK = true;
    }

    /**
     * Builds the characteristic vector of a root topic from the data collected by
     * {@link #parseDmoz()}.
     *
     * <p>Let N be the graph size, T the number of distinct topic nodes, O the total number of
     * their occurrences and w = O if {@link #accumulating}, T otherwise. In strict mode a topic
     * node gets occurrences/w (accumulating) or 1/w, and every other node gets 0. In non-strict
     * mode, with D = N(1 + w) - T, a topic node gets N*occurrences/D (accumulating) or N/D and
     * every other node gets 1/D.
     *
     * @param str the topic; it must be equal to one of the entries of {@link #rootTopic}.
     * @return a vector of length N, or {@code null} if the topic is not a root topic or no graph
     *         node was found under it.
     * @throws IllegalAccessException if the dump has not been (successfully) parsed yet.
     */
    public double[] getCharacteristicVector(String str) throws IllegalAccessException {
        if (!this.parsedOK) {
            throw new IllegalAccessException("The DMOZ file isn't parsed yet!");
        }
        int topicIndex = 0;
        while (topicIndex < this.rootTopic.length && !this.rootTopic[topicIndex].equals(str)) {
            topicIndex++;
        }
        if (topicIndex == this.rootTopic.length) {
            LOGGER.info("Topic '" + str + "' is not present.");
            return null;
        }
        final String topic = this.rootTopic[topicIndex];
        final TopicNodes topicNodes = this.topicMap.get(topic);
        if (topicNodes == null) {
            LOGGER.info("There are no nodes in the topic '" + str + "'.");
            return null;
        }
        LOGGER.info("Prepare vector for topic \"" + topic + "\".");
        final int graphSize = this.URLnode.size();
        final int topicNodeCount = topicNodes.node.size();
        final int totalOccurrences = topicNodes.totalOccurrences;
        LOGGER.info("Graph dimension = " + graphSize);
        LOGGER.info("Total topic nodes of the graph = " + topicNodeCount);
        LOGGER.info("Total topic nodes occurrences of the graph = " + totalOccurrences);
        final double weight = this.accumulating ? totalOccurrences : topicNodeCount;
        LOGGER.info("Choosen weight to determine the characteristic values = " + weight);
        final double[] characteristic = new double[graphSize];
        final double nonStrictDenominator = (graphSize * (1.0d + weight)) - topicNodeCount;
        if (!this.strict) {
            Arrays.fill(characteristic, 1.0d / nonStrictDenominator);
        }
        LOGGER.info("Not topic node weight = " + characteristic[0]);
        PLOGGER.expectedUpdates = topicNodeCount;
        PLOGGER.start("Start filling the characteristic vector with meaningful values.");
        // Value shared by all topic nodes when occurrences are not taken into account.
        final double presenceValue = this.strict ? 1.0d / weight : graphSize / nonStrictDenominator;
        if (!this.accumulating) {
            LOGGER.info("Topic node weight = " + presenceValue);
        }
        final boolean debug = LOGGER.isDebugEnabled();
        for (Map.Entry<URL, Node> entry : topicNodes.node.entrySet()) {
            final Node node = entry.getValue();
            final double value;
            if (!this.accumulating) {
                value = presenceValue;
            } else if (this.strict) {
                value = node.nOccurrences / weight;
            } else {
                // Multiply in double: graphSize * nOccurrences can exceed the int range.
                value = ((double) graphSize * node.nOccurrences) / nonStrictDenominator;
            }
            characteristic[node.id] = value;
            if (debug) {
                LOGGER.debug("characteristic[" + node.id + "] = " + value + " --> " + entry.getKey().toString());
            }
            PLOGGER.update();
        }
        PLOGGER.stop("Done.");
        LOGGER.info("Characteristic L1 norm = " + Norm.L_1.compute(characteristic));
        return characteristic;
    }

    /**
     * Command-line driver: parses the dump once and stores one binary vector file per topic.
     *
     * <p>Output files are created (or checked for writability) before the dump is parsed, so a
     * bad output location is reported early. A topic for which no vector can be built is skipped
     * with a warning; its pre-created output file is left untouched.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, SAXException, ParserConfigurationException, ClassNotFoundException, IllegalAccessException {
        SimpleJSAP simpleJSAP = new SimpleJSAP(BuildDmozTopicVector.class.getName(),
                "Determine the weighted characteristic vector of 'graphBasename' graph nodes that are also in the 'topic' of ODP/DMOZ 'dump' (usually content.rdf.u8).\n"
                        + "The resulting vector (L1-normalized to 1) is stored in binary form in 'outputBasename'.\n"
                        + "If a graph node is not present in a topic, its weight is a negligible value so that the resulting vector can be used as a preference vector in a NodeReank computation.Topic can be more than one: for each topic, there will be a result file. See the option comments.\n"
                        + "\n"
                        + "IMPORTANT: insert the -DentityExpansionLimit=1000000 option when invoke this class!.",
                new Parameter[]{
                    new Switch("accumulating", 'a', "accumulating", "the number of occurences of a node in a topic and not the only presence is used to determine the weight."),
                    new Switch("noStrict", 'S', "noStrict", "the characteristic vectors contain negligible value for not present nodes."),
                    new FlaggedOption("dump", JSAP.STRING_PARSER, "content.rdf.u8", true, 'd', "dump", "The ODP/dmoz dump filename."),
                    new FlaggedOption("outputBasename", JSAP.STRING_PARSER, "topicVector-<topic>-<dump>-<grapBasename>", true, 'o', "output", "The file basename used to build the output file names: <basename>-<topic>. The default is more complete."),
                    new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The basename of webgraph. It is required to access to the signed minimal perfect hash table of the graph (file <basename>.smph)."),
                    new UnflaggedOption("topic", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, true, "Topic. Topic has always to begin with 'Top/'. Example: 'Top/Arts' 'Top/Games'")
                });
        JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            System.exit(1);
        }
        final boolean accumulating = parse.getBoolean("accumulating", false);
        System.out.println("Parameter accumulating = " + accumulating);
        final boolean noStrict = parse.getBoolean("noStrict", false);
        System.out.println("Parameter noStrict = " + noStrict);
        final String graphBasename = parse.getString("graphBasename");
        System.out.println("Parameter graphBasename = " + graphBasename);
        final String[] topics = parse.getStringArray("topic");
        final String dump = parse.getString("dump");
        System.out.println("Parameter ODP/dmoz dump file = " + dump);
        final String outputBasename = parse.getString("outputBasename");

        final String[] topicFilenames = new String[topics.length];
        for (int i = 0; i < topics.length; i++) {
            final String flatTopic = topics[i].replace('/', '-');
            // The placeholder default is expanded into a name made of topic, dump and graph names.
            final String filename = outputBasename.startsWith("topicVector-<topic>")
                    ? "topicVector-" + flatTopic + "-" + new File(dump).getName() + "-" + new File(graphBasename).getName()
                    : outputBasename + flatTopic;
            LOGGER.debug("topicFilename[" + i + "]=" + filename);
            final File file = new File(filename);
            if (!file.createNewFile() && !file.canWrite()) {
                throw new IOException("It is not possible to create the file " + filename + " to store the topic characteristic vector.");
            }
            topicFilenames[i] = filename;
            System.out.println("The characteristic vector of topic " + topics[i] + " will be saved into the file " + topicFilenames[i]);
        }

        try {
            // The serialized object is expected to be a StringMap; the cast cannot be checked at runtime.
            @SuppressWarnings("unchecked")
            final StringMap<? extends CharSequence> urlToNode = (StringMap<? extends CharSequence>) BinIO.loadObject(graphBasename + ".smph");
            BuildDmozTopicVector buildDmozTopicVector = new BuildDmozTopicVector(new InputSource(convertToFileURL(dump)), topics, urlToNode);
            buildDmozTopicVector.setAccumulating(accumulating);
            buildDmozTopicVector.setStrict(!noStrict);
            System.out.println("Starting to parse DMOZ dump adn to build the caharacteritics vectors...");
            buildDmozTopicVector.parseDmoz();
            System.out.println("Done.\nSaving characteristic vectors...");
            for (int i = 0; i < topics.length; i++) {
                System.out.println(topics[i]);
                final double[] characteristicVector = buildDmozTopicVector.getCharacteristicVector(topics[i]);
                if (characteristicVector == null) {
                    // No graph node was found under this topic: nothing to store.
                    LOGGER.warn("No characteristic vector for topic " + topics[i] + "; nothing is stored into " + topicFilenames[i]);
                    continue;
                }
                LOGGER.info(topicFilenames[i] + "vector has dimension = " + characteristicVector.length);
                System.out.println("Characteristic vector: " + topicFilenames[i]);
                BinIO.storeDoubles(characteristicVector, topicFilenames[i]);
            }
            System.out.println("All operations are done.");
        } catch (FileNotFoundException e) {
            final FileNotFoundException wrapped = new FileNotFoundException("Problem to access to the input file " + new File("").getAbsolutePath() + "/" + e.getMessage());
            // FileNotFoundException has no cause-taking constructor; keep the original stack trace.
            wrapped.initCause(e);
            throw wrapped;
        }
    }
}
