package it.unipi.di.acube.batframework.datasetPlugins;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.CharUtils;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.Utils;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/IITBDataset.class */
/**
 * Dataset plugin that exposes the "IITB" corpus through the {@link A2WDataset} interface.
 *
 * <p>The dataset is built from two inputs:
 * <ul>
 *   <li>a set of text documents, keyed by document name, and</li>
 *   <li>one XML stream whose {@code <annotation>} elements carry the child elements
 *       {@code docName}, {@code offset}, {@code length} and {@code wikiName}.</li>
 * </ul>
 * Every {@code wikiName} is resolved to a numeric id through a {@link WikipediaInterface};
 * annotations whose title cannot be resolved are reported on standard output and dropped.
 *
 * <p>After construction, {@link #getTextInstanceList()} and {@link #getA2WGoldStandardList()}
 * return two lists of equal size in which the element at index {@code i} of one list belongs to
 * the same document as the element at index {@code i} of the other.
 *
 * <p>NOTE(review): the constructors call the public, overridable methods {@link #loadAnns},
 * {@link #loadBody}, {@link #unifyMaps} and {@link #getName}; subclasses overriding them will be
 * invoked before their own constructor has run.
 */
public class IITBDataset implements A2WDataset {
    /** Charset used to decode every text document. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Document bodies; index-aligned with {@link #annList}. */
    private List<String> textList;

    /** Gold-standard annotations per document; index-aligned with {@link #textList}. */
    private List<HashSet<Annotation>> annList;

    /**
     * An annotation exactly as read from the XML file, i.e. still referring to the Wikipedia
     * article by title rather than by id.
     */
    public static class IITBAnnotation implements Comparable<IITBAnnotation> {
        /** Value of the {@code offset} element. */
        public int position;
        /** Value of the {@code length} element. */
        public int length;
        /** Value of the {@code wikiName} element. */
        public String title;

        /**
         * @param position value of the {@code offset} element
         * @param length value of the {@code length} element
         * @param title value of the {@code wikiName} element
         */
        public IITBAnnotation(int position, int length, String title) {
            this.position = position;
            this.length = length;
            this.title = title;
        }

        /**
         * Orders annotations by ascending {@link #position}.
         *
         * <p>Uses {@link Integer#compare(int, int)} rather than subtraction so that positions
         * far apart (or of opposite sign) cannot overflow and invert the ordering.
         */
        @Override // java.lang.Comparable
        public int compareTo(IITBAnnotation other) {
            return Integer.compare(this.position, other.position);
        }
    }

    /**
     * Builds the dataset from paths on disk.
     *
     * <p>The text documents are obtained from
     * {@code Utils.getFilesAndInputStreams(textPath, ".*")}; the annotation file is opened here
     * and closed again before this constructor returns, whether loading succeeded or not.
     *
     * @param textPath location handed to {@code Utils.getFilesAndInputStreams}
     * @param annotationsPath path of the XML annotation file
     * @param wikipediaInterface used to resolve Wikipedia titles to ids
     * @throws IOException if a file cannot be opened or read
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if the annotation file is not well-formed XML
     * @throws AnnotationException if an annotated document has no text
     * @throws XPathExpressionException declared for compatibility with the called helpers
     */
    public IITBDataset(String textPath, String annotationsPath, WikipediaInterface wikipediaInterface) throws IOException, ParserConfigurationException, SAXException, AnnotationException, XPathExpressionException {
        // Keep the original evaluation order: collect the text streams first, then open the XML.
        Map<String, InputStream> texts = Utils.getFilesAndInputStreams(textPath, ".*");
        try (InputStream annotations = new FileInputStream(annotationsPath)) {
            load(texts, annotations, wikipediaInterface);
        }
    }

    /**
     * Builds the dataset from already opened streams.
     *
     * <p>The text streams in {@code map} are consumed and closed by {@link #loadBody}. The
     * annotation stream is read but not explicitly closed here; it remains the caller's
     * responsibility.
     *
     * @param map document name to stream of that document's text
     * @param inputStream XML annotation stream
     * @param wikipediaInterface used to resolve Wikipedia titles to ids
     * @throws IOException if a stream cannot be read
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if the annotation stream is not well-formed XML
     * @throws AnnotationException if an annotated document has no text
     * @throws XPathExpressionException declared for compatibility with the called helpers
     */
    public IITBDataset(Map<String, InputStream> map, InputStream inputStream, WikipediaInterface wikipediaInterface) throws IOException, ParserConfigurationException, SAXException, AnnotationException, XPathExpressionException {
        load(map, inputStream, wikipediaInterface);
    }

    /** Shared constructor logic: load annotations and texts, validate them, fill the lists. */
    private void load(Map<String, InputStream> texts, InputStream annotations, WikipediaInterface wikipediaInterface) throws IOException, ParserConfigurationException, SAXException, AnnotationException, XPathExpressionException {
        HashMap<String, HashSet<Annotation>> annotationsByDoc = loadAnns(annotations, wikipediaInterface);
        HashMap<String, String> bodyByDoc = loadBody(texts, annotationsByDoc.keySet());
        checkConsistency(bodyByDoc, annotationsByDoc);
        unifyMaps(bodyByDoc, annotationsByDoc);
    }

    /**
     * Verifies that every document that has annotations also has a body.
     *
     * @throws AnnotationException naming the first annotated document without a body
     */
    private void checkConsistency(HashMap<String, String> bodyByDoc, HashMap<String, HashSet<Annotation>> annotationsByDoc) throws AnnotationException {
        for (String docName : annotationsByDoc.keySet()) {
            if (!bodyByDoc.containsKey(docName)) {
                throw new AnnotationException("Document " + docName + " cited in annotation not available!");
            }
        }
    }

    /**
     * Reads every stream in {@code map} fully into a string.
     *
     * <p>Each stream is decoded as UTF-8 and read line by line; NUL characters are replaced by
     * spaces and every line, including the last one, is terminated with {@code "\n"}. An empty
     * stream yields the empty string. Each stream is closed once it has been read.
     *
     * @param map document name to stream of that document's text
     * @param set names of the documents of interest; currently unused, every entry of
     *        {@code map} is loaded
     * @return document name to document body
     * @throws IOException if a stream cannot be read
     */
    public HashMap<String, String> loadBody(Map<String, InputStream> map, Set<String> set) throws IOException {
        HashMap<String, String> bodyByDoc = new HashMap<>();
        for (Map.Entry<String, InputStream> entry : map.entrySet()) {
            StringBuilder body = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(entry.getValue(), UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line.replace('\0', ' ')).append('\n');
                }
            }
            bodyByDoc.put(entry.getKey(), body.toString());
        }
        return bodyByDoc;
    }

    /**
     * Parses the XML annotation stream and resolves every Wikipedia title to an id.
     *
     * <p>An {@code <annotation>} element is accepted when it has a {@code docName}, a
     * {@code wikiName}, an {@code offset >= 0} and a {@code length > 0}; otherwise an error line
     * is printed to standard output and the element is skipped. Accepted elements with an empty
     * {@code wikiName} are skipped silently. Titles for which
     * {@link WikipediaInterface#getIdByTitle} returns {@code -1} are reported on standard output
     * and discarded; the document still appears in the result, possibly with an empty set.
     *
     * @param inputStream XML annotation stream
     * @param wikipediaInterface used to resolve Wikipedia titles to ids
     * @return document name to the set of resolved annotations of that document
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if the stream is not well-formed XML
     * @throws IOException if the stream cannot be read or a title lookup fails
     * @throws AnnotationException declared for compatibility with the called helpers
     * @throws XPathExpressionException declared for compatibility with the called helpers
     */
    public HashMap<String, HashSet<Annotation>> loadAnns(InputStream inputStream, WikipediaInterface wikipediaInterface) throws ParserConfigurationException, SAXException, IOException, AnnotationException, XPathExpressionException {
        // NOTE(review): the parser runs with default settings (DTDs/external entities enabled);
        // harden the factory if this is ever fed untrusted XML.
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(inputStream);
        document.getDocumentElement().normalize();

        // Pass 1: read the raw (title-based) annotations and collect the titles to resolve.
        Map<String, HashSet<IITBAnnotation>> rawByDoc = new HashMap<>();
        Vector<String> titlesToPrefetch = new Vector<>();
        NodeList annotationNodes = document.getElementsByTagName("annotation");
        for (int i = 0; i < annotationNodes.getLength(); i++) {
            Node annotationNode = annotationNodes.item(i);
            if (annotationNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }

            int offset = -1;
            int length = -1;
            String wikiName = null;
            String docName = null;
            NodeList fields = annotationNode.getChildNodes();
            for (int j = 0; j < fields.getLength(); j++) {
                Node fieldNode = fields.item(j);
                if (fieldNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element field = (Element) fieldNode;
                switch (field.getTagName()) {
                    case "offset":
                        offset = CharUtils.parseInt(CharUtils.trim(field.getTextContent()));
                        break;
                    case "length":
                        length = CharUtils.parseInt(CharUtils.trim(field.getTextContent()));
                        break;
                    case "wikiName":
                        wikiName = CharUtils.trim(field.getTextContent()).toString();
                        break;
                    case "docName":
                        docName = CharUtils.trim(field.getTextContent()).toString();
                        break;
                    default:
                        // Other child elements are not used.
                        break;
                }
            }

            if (wikiName == null || length <= 0 || offset < 0 || docName == null) {
                // %n terminates the line so that consecutive diagnostics do not run together.
                System.out.printf("ERROR: Dataset %s has an incomplete annotation: file=%s offset=%d length=%d wikiName=%s%n", getName(), docName, offset, length, wikiName);
            } else if (!wikiName.equals("")) {
                HashSet<IITBAnnotation> docAnnotations = rawByDoc.get(docName);
                if (docAnnotations == null) {
                    docAnnotations = new HashSet<>();
                    rawByDoc.put(docName, docAnnotations);
                }
                docAnnotations.add(new IITBAnnotation(offset, length, wikiName));
                titlesToPrefetch.add(wikiName);
            }
        }

        // Hand all titles over in one call before the per-title lookups below
        // (presumably so the interface can batch/cache them -- confirm in WikipediaInterface).
        wikipediaInterface.prefetchTitles(titlesToPrefetch);
        wikipediaInterface.flush();

        // Pass 2: replace titles by ids, dropping annotations whose title is unknown.
        HashMap<String, HashSet<Annotation>> resolvedByDoc = new HashMap<>();
        for (Map.Entry<String, HashSet<IITBAnnotation>> entry : rawByDoc.entrySet()) {
            HashSet<Annotation> resolved = new HashSet<>();
            resolvedByDoc.put(entry.getKey(), resolved);
            for (IITBAnnotation raw : entry.getValue()) {
                int wikipediaId = wikipediaInterface.getIdByTitle(raw.title);
                if (wikipediaId == -1) {
                    System.out.println(getName() + " dataset is malformed: a mention has been annotated with the wikipedia title [" + raw.title + "] but this article does not exist. Discarding annotation.");
                } else {
                    resolved.add(new Annotation(raw.position, raw.length, wikipediaId));
                }
            }
        }
        return resolvedByDoc;
    }

    /**
     * Replaces the content of this dataset with the given documents.
     *
     * <p>One entry is produced per key of {@code annotationsByDoc}, in that map's iteration
     * order; the text and the annotations of a document end up at the same index of their
     * respective lists. Bodies without annotations are ignored.
     *
     * @param bodyByDoc document name to document body
     * @param annotationsByDoc document name to annotations of that document
     */
    public void unifyMaps(HashMap<String, String> bodyByDoc, HashMap<String, HashSet<Annotation>> annotationsByDoc) {
        this.annList = new Vector<>();
        this.textList = new Vector<>();
        for (Map.Entry<String, HashSet<Annotation>> entry : annotationsByDoc.entrySet()) {
            this.textList.add(bodyByDoc.get(entry.getKey()));
            this.annList.add(entry.getValue());
        }
    }

    /** @return the number of documents in this dataset */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public int getSize() {
        return this.textList.size();
    }

    /** @return the total number of annotations, summed over all documents */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public int getTagsCount() {
        int count = 0;
        for (HashSet<Annotation> docAnnotations : this.annList) {
            count += docAnnotations.size();
        }
        return count;
    }

    /** @return the A2W gold standard converted by {@link ProblemReduction#A2WToC2WList} */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return ProblemReduction.A2WToC2WList(getA2WGoldStandardList());
    }

    /** @return the same list as {@link #getA2WGoldStandardList()} */
    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Annotation>> getD2WGoldStandardList() {
        return getA2WGoldStandardList();
    }

    /** @return the document bodies; this is the internal list, not a copy */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public List<String> getTextInstanceList() {
        return this.textList;
    }

    /**
     * @return the A2W gold standard converted by
     *         {@link ProblemReduction#A2WToD2WMentionsInstance}
     */
    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Mention>> getMentionsInstanceList() {
        return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
    }

    /** @return the constant dataset name {@code "IITB"} */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public String getName() {
        return "IITB";
    }

    /** @return the per-document annotation sets; this is the internal list, not a copy */
    @Override // it.unipi.di.acube.batframework.problems.A2WDataset
    public List<HashSet<Annotation>> getA2WGoldStandardList() {
        return this.annList;
    }
}
