package it.unipi.di.acube.batframework.datasetPlugins;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/YahooWebscopeL24Dataset.class */
public class YahooWebscopeL24Dataset implements A2WDataset {
    List<String> queries = new Vector();
    List<HashSet<Annotation>> annotations = new Vector();

    public YahooWebscopeL24Dataset(String str) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
        NodeList nodeList = (NodeList) XPathFactory.newInstance().newXPath().compile("//query[@cannot-judge=\"false\"]").evaluate(DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new FileInputStream(new File(str))), XPathConstants.NODESET);
        for (int i = 0; i < nodeList.getLength(); i++) {
            String str2 = "";
            HashSet<Annotation> hashSet = new HashSet<>();
            NodeList childNodes = nodeList.item(i).getChildNodes();
            for (int i2 = 0; i2 < childNodes.getLength(); i2++) {
                Node item = childNodes.item(i2);
                item.normalize();
                if (item.getNodeType() != 3) {
                    if (item.getNodeType() != 1) {
                        throw new RuntimeException("Node should be an element" + item.toString());
                    }
                    if (item.getNodeName().equals("text")) {
                        str2 = item.getTextContent();
                    } else {
                        if (!item.getNodeName().equals("annotation")) {
                            throw new RuntimeException("Unrecognized node:" + item);
                        }
                        NodeList childNodes2 = item.getChildNodes();
                        String str3 = "";
                        int i3 = -1;
                        for (int i4 = 0; i4 < childNodes2.getLength(); i4++) {
                            if (childNodes2.item(i4).getNodeName().equals("span")) {
                                str3 = childNodes2.item(i4).getTextContent().replace("/", "");
                            } else if (childNodes2.item(i4).getNodeName().equals("target")) {
                                i3 = Integer.parseInt(childNodes2.item(i4).getAttributes().getNamedItem("wiki-id").getNodeValue());
                            }
                        }
                        if (!str3.isEmpty() && i3 != -1) {
                            int indexOf = str2.toLowerCase().indexOf(str3.toLowerCase());
                            int length = str3.length();
                            if (indexOf >= 0) {
                                hashSet.add(new Annotation(indexOf, length, i3));
                            } else if (str2.toLowerCase().replaceAll("\"", "").indexOf(str3.toLowerCase()) != -1) {
                                String[] split = str2.toLowerCase().replaceAll("\\W", " ").replaceAll("^ +", "").replaceAll(" +$", "").replaceAll(" +", " ").split(" ");
                                String str4 = split[0];
                                String str5 = split[split.length - 1];
                                int indexOf2 = str2.toLowerCase().indexOf(str4);
                                int indexOf3 = (str2.toLowerCase().indexOf(str5) + str5.length()) - indexOf2;
                                hashSet.add(new Annotation(indexOf2, str3.length(), i3));
                            } else {
                                System.err.printf("mention [%s] is not a substring of [%s], skipping.%n", str3, str2);
                            }
                        }
                    }
                }
            }
            this.queries.add(str2);
            this.annotations.add(hashSet);
        }
    }

    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public int getTagsCount() {
        int i = 0;
        Iterator<HashSet<Annotation>> it2 = this.annotations.iterator();
        while (it2.hasNext()) {
            i += it2.next().size();
        }
        return i;
    }

    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return ProblemReduction.A2WToC2WList(this.annotations);
    }

    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public int getSize() {
        return this.queries.size();
    }

    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public String getName() {
        return "Yahoo Webscope L24";
    }

    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public List<String> getTextInstanceList() {
        return this.queries;
    }

    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Mention>> getMentionsInstanceList() {
        return ProblemReduction.A2WToD2WMentionsInstance(this.annotations);
    }

    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Annotation>> getD2WGoldStandardList() {
        return this.annotations;
    }

    @Override // it.unipi.di.acube.batframework.problems.A2WDataset
    public List<HashSet<Annotation>> getA2WGoldStandardList() {
        return this.annotations;
    }
}
