package it.unipi.di.acube.batframework.datasetPlugins;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.CharUtils;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/MSNBCDataset.class */
/**
 * {@link A2WDataset} implementation named "MSNBC".
 *
 * <p>The dataset is built from two directories: one holding raw text files (read as UTF-8) and one
 * holding XML annotation files. Files in the two directories are paired by file name. Each XML
 * file is scanned for {@code ReferenceInstance} elements whose {@code Offset}, {@code Length} and
 * {@code ChosenAnnotation} children describe one annotation; the Wikipedia title extracted from
 * {@code ChosenAnnotation} is resolved to a numeric id through a {@link WikipediaApiInterface}.
 */
public class MSNBCDataset implements A2WDataset {
    /**
     * Matches an English Wikipedia article URL, optionally followed by a stray double quote.
     * Group 1 captures the article title.
     */
    private static final Pattern WIKI_URL_PATTERN = Pattern.compile("http://en.wikipedia.org/wiki/(.*?)\"?");

    /** Regex (applied to the lower-cased file name) selecting the files loaded by the constructor. */
    private static final String FILE_NAME_REGEX = ".+\\.txt";

    /** Document texts; element {@code i} corresponds to element {@code i} of {@link #annList}. */
    private List<String> textList;

    /** Gold-standard annotations, one set per document. */
    private List<HashSet<Annotation>> annList;

    /**
     * Intermediate representation of an annotation as read from the XML files, i.e. before the
     * Wikipedia title has been resolved to an id.
     */
    public static class MSNBCAnnotation implements Comparable<MSNBCAnnotation> {
        /** Value of the {@code Offset} element. */
        public int position;
        /** Value of the {@code Length} element. */
        public int length;
        /** Wikipedia title extracted from the {@code ChosenAnnotation} URL. */
        public String title;

        /**
         * @param position value of the {@code Offset} element
         * @param length value of the {@code Length} element
         * @param title Wikipedia title of the annotated entity
         */
        public MSNBCAnnotation(int position, int length, String title) {
            this.position = position;
            this.length = length;
            this.title = title;
        }

        /**
         * Orders annotations by {@link #position}.
         *
         * @return a negative, zero or positive value as this position is less than, equal to or
         *         greater than the other's
         */
        @Override
        public int compareTo(MSNBCAnnotation other) {
            // Integer.compare instead of subtraction: a difference of two ints can overflow and flip sign.
            return Integer.compare(this.position, other.position);
        }
    }

    /** Creates an empty dataset (no documents, no annotations). */
    public MSNBCDataset() {
        // Start from empty lists so that getSize()/getTagsCount() work on an instance that was never loaded.
        this.textList = new Vector<>();
        this.annList = new Vector<>();
    }

    /**
     * Loads the dataset from disk.
     *
     * @param textPath directory containing the raw text files
     * @param annotationsPath directory containing the XML annotation files
     * @param api Wikipedia interface used to resolve article titles to ids
     * @throws IOException if a directory cannot be listed or a file cannot be read
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if an annotation file is not well-formed XML
     * @throws AnnotationException if a text file has no annotation file with the same name, or vice versa
     * @throws XPathExpressionException declared for backward compatibility with existing callers
     */
    public MSNBCDataset(String textPath, String annotationsPath, WikipediaApiInterface api) throws IOException, ParserConfigurationException, SAXException, AnnotationException, XPathExpressionException {
        HashMap<String, String> filenameToBody = loadBody(textPath, FILE_NAME_REGEX);
        HashMap<String, HashSet<Annotation>> filenameToAnnotations = loadTags(annotationsPath, FILE_NAME_REGEX, api);
        checkConsistency(filenameToBody, filenameToAnnotations);
        unifyMaps(filenameToBody, filenameToAnnotations);
    }

    /**
     * Reads every regular file in a directory whose lower-cased name fully matches a regex.
     *
     * <p>Files are decoded as UTF-8. Every line of the file is followed by a single {@code '\n'}
     * in the returned text, whatever line terminator the file used.
     *
     * @param textPath directory to scan (not recursive)
     * @param pattern regex that the lower-cased file name must match entirely
     * @return map from file name (original case) to file content
     * @throws IOException if the directory cannot be listed or a file cannot be read
     */
    public HashMap<String, String> loadBody(String textPath, String pattern) throws IOException {
        Pattern fileNamePattern = Pattern.compile(pattern);
        HashMap<String, String> filenameToBody = new HashMap<>();
        for (File file : listDirectory(textPath)) {
            if (!isMatchingFile(file, fileNamePattern)) {
                continue;
            }
            StringBuilder body = new StringBuilder();
            // try-with-resources closes the reader even when readLine() fails.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line).append('\n');
                }
            }
            filenameToBody.put(file.getName(), body.toString());
        }
        return filenameToBody;
    }

    /**
     * Parses every matching XML annotation file in a directory and resolves the annotated
     * Wikipedia titles to ids.
     *
     * <p>Annotations whose URL does not match the expected Wikipedia URL form, that lack an offset
     * or length, or whose title resolves to id {@code -1} are reported on standard output and
     * discarded. The remaining annotations of each file are passed through
     * {@link Annotation#deleteOverlappingAnnotations}.
     *
     * @param annotationsPath directory to scan (not recursive)
     * @param pattern regex that the lower-cased file name must match entirely
     * @param api Wikipedia interface used to prefetch and resolve titles
     * @return map from file name (original case) to the annotations of that file
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if a file is not well-formed XML
     * @throws IOException if the directory cannot be listed, a file cannot be read, or title resolution fails
     * @throws AnnotationException declared for backward compatibility with existing callers
     * @throws XPathExpressionException declared for backward compatibility with existing callers
     */
    public HashMap<String, HashSet<Annotation>> loadTags(String annotationsPath, String pattern, WikipediaApiInterface api) throws ParserConfigurationException, SAXException, IOException, AnnotationException, XPathExpressionException {
        Pattern fileNamePattern = Pattern.compile(pattern);
        HashMap<String, HashSet<MSNBCAnnotation>> filenameToRawAnnotations = new HashMap<>();
        for (File file : listDirectory(annotationsPath)) {
            if (isMatchingFile(file, fileNamePattern)) {
                filenameToRawAnnotations.put(file.getName(), parseAnnotationFile(file));
            }
        }

        // Hand all titles to the API in one call before resolving them one by one below.
        Vector<String> titlesToPrefetch = new Vector<>();
        for (HashSet<MSNBCAnnotation> rawAnnotations : filenameToRawAnnotations.values()) {
            for (MSNBCAnnotation raw : rawAnnotations) {
                titlesToPrefetch.add(raw.title);
            }
        }
        api.prefetchTitles(titlesToPrefetch);

        HashMap<String, HashSet<Annotation>> filenameToAnnotations = new HashMap<>();
        for (Map.Entry<String, HashSet<MSNBCAnnotation>> entry : filenameToRawAnnotations.entrySet()) {
            HashSet<Annotation> annotations = new HashSet<>();
            for (MSNBCAnnotation raw : entry.getValue()) {
                int wikipediaId = api.getIdByTitle(raw.title);
                if (wikipediaId == -1) {
                    System.out.println(getName() + " dataset is malformed: an entity has been tagged with the wikipedia title [" + raw.title + "] but this article does not exist. Discarding annotation.");
                } else {
                    annotations.add(new Annotation(raw.position, raw.length, wikipediaId));
                }
            }
            filenameToAnnotations.put(entry.getKey(), Annotation.deleteOverlappingAnnotations(annotations));
        }
        return filenameToAnnotations;
    }

    /**
     * Lists the entries of a directory, failing with a descriptive exception instead of the
     * {@code null} that {@link File#listFiles()} returns on error.
     */
    private static File[] listDirectory(String path) throws IOException {
        File[] files = new File(path).listFiles();
        if (files == null) {
            throw new IOException("Cannot list directory " + path + ": it does not exist, is not a directory, or cannot be read.");
        }
        return files;
    }

    /** Tells whether {@code file} is a regular file whose lower-cased name fully matches the pattern. */
    private static boolean isMatchingFile(File file, Pattern fileNamePattern) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        return file.isFile() && fileNamePattern.matcher(file.getName().toLowerCase(Locale.ROOT)).matches();
    }

    /** Parses one XML annotation file into the raw annotations of its {@code ReferenceInstance} elements. */
    private HashSet<MSNBCAnnotation> parseAnnotationFile(File file) throws ParserConfigurationException, SAXException, IOException {
        HashSet<MSNBCAnnotation> rawAnnotations = new HashSet<>();
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(file);
        document.getDocumentElement().normalize();
        NodeList instances = document.getElementsByTagName("ReferenceInstance");
        for (int i = 0; i < instances.getLength(); i++) {
            Node instance = instances.item(i);
            if (instance.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            MSNBCAnnotation raw = parseReferenceInstance((Element) instance);
            if (raw != null) {
                rawAnnotations.add(raw);
            }
        }
        return rawAnnotations;
    }

    /**
     * Builds a raw annotation from the {@code Offset}, {@code Length} and {@code ChosenAnnotation}
     * children of a {@code ReferenceInstance} element.
     *
     * @return the annotation, or {@code null} if it had to be discarded (a message is printed)
     */
    private MSNBCAnnotation parseReferenceInstance(Element instance) throws IOException {
        int position = -1;
        int length = -1;
        String title = null;
        NodeList children = instance.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element element = (Element) child;
            String tagName = element.getTagName();
            if (tagName.equals("Offset")) {
                position = CharUtils.parseInt(CharUtils.trim(element.getTextContent()));
            } else if (tagName.equals("Length")) {
                length = CharUtils.parseInt(CharUtils.trim(element.getTextContent()));
            } else if (tagName.equals("ChosenAnnotation")) {
                // Underscores are turned into spaces first, then the URL is percent-decoded.
                String url = URLDecoder.decode(CharUtils.trim(element.getTextContent()).toString().replace('_', ' '), "UTF-8");
                Matcher matcher = WIKI_URL_PATTERN.matcher(url);
                if (matcher.matches()) {
                    title = matcher.group(1);
                } else {
                    System.out.println(getName() + " dataset is malformed: URL " + ((Object) CharUtils.trim(element.getTextContent())) + " does not match the pattern. Discarding annotation.");
                }
            }
        }
        if (title == null) {
            return null;
        }
        if (position < 0 || length < 0) {
            // Offset/Length element missing (still -1) or negative: the annotation cannot be placed in the text.
            System.out.println(getName() + " dataset is malformed: the annotation for wikipedia title [" + title + "] has no valid offset or length. Discarding annotation.");
            return null;
        }
        return new MSNBCAnnotation(position, length, title);
    }

    /**
     * Verifies that the text files and the annotation files have exactly the same names.
     *
     * @param filenameToBody texts keyed by file name
     * @param filenameToAnnotations annotations keyed by file name
     * @throws AnnotationException if a key is present in one map but not in the other
     */
    public void checkConsistency(HashMap<String, String> filenameToBody, HashMap<String, HashSet<Annotation>> filenameToAnnotations) throws AnnotationException {
        for (String filename : filenameToAnnotations.keySet()) {
            if (!filenameToBody.containsKey(filename)) {
                throw new AnnotationException("In " + getName() + " dataset, there is an annotation file " + filename + " that has no corresponding raw text.");
            }
        }
        for (String filename : filenameToBody.keySet()) {
            if (!filenameToAnnotations.containsKey(filename)) {
                throw new AnnotationException("In " + getName() + " dataset, there is a raw file " + filename + " that has no corresponding annotations.");
            }
        }
    }

    /**
     * Replaces the content of this dataset with the given texts and annotations, stored as two
     * parallel lists that follow the iteration order of the annotation map's keys.
     *
     * @param filenameToBody texts keyed by file name
     * @param filenameToAnnotations annotations keyed by file name
     */
    public void unifyMaps(HashMap<String, String> filenameToBody, HashMap<String, HashSet<Annotation>> filenameToAnnotations) {
        this.annList = new Vector<>();
        this.textList = new Vector<>();
        for (String filename : filenameToAnnotations.keySet()) {
            this.textList.add(filenameToBody.get(filename));
            this.annList.add(filenameToAnnotations.get(filename));
        }
    }

    /** @return the number of documents in the dataset */
    @Override
    public int getSize() {
        return this.textList.size();
    }

    /** @return the total number of annotations over all documents */
    @Override
    public int getTagsCount() {
        int count = 0;
        for (HashSet<Annotation> annotations : this.annList) {
            count += annotations.size();
        }
        return count;
    }

    /** @return the A2W gold standard converted by {@link ProblemReduction#A2WToC2WList} */
    @Override
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return ProblemReduction.A2WToC2WList(getA2WGoldStandardList());
    }

    /** @return the same list as {@link #getA2WGoldStandardList()} */
    @Override
    public List<HashSet<Annotation>> getD2WGoldStandardList() {
        return getA2WGoldStandardList();
    }

    /** @return the document texts, in the same order as the gold-standard lists */
    @Override
    public List<String> getTextInstanceList() {
        return this.textList;
    }

    /** @return the A2W gold standard converted by {@link ProblemReduction#A2WToD2WMentionsInstance} */
    @Override
    public List<HashSet<Mention>> getMentionsInstanceList() {
        return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
    }

    /** @return the constant dataset name, {@code "MSNBC"} */
    @Override
    public String getName() {
        return "MSNBC";
    }

    /** @return the gold-standard annotations, one set per document */
    @Override
    public List<HashSet<Annotation>> getA2WGoldStandardList() {
        return this.annList;
    }
}
