package it.unipi.di.acube.batframework.datasetPlugins;

import it.unimi.dsi.lang.MutableString;
import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.xml.sax.SAXException;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/ConllAidaDataset.class */
/**
 * {@link A2WDataset} backed by a tab-separated, token-per-line annotation file.
 *
 * <p>The file is read once by the constructor. Each line starting with
 * {@code -DOCSTART-} opens a new document; every following line contributes one
 * token (followed by a single space) to the text of the current document, and an
 * empty line contributes a newline. Lines matching the mention pattern carry a
 * {@code B} (mention start) or {@code I} (mention continuation) tag in their second
 * column and, for {@code B} lines, an English Wikipedia URL in their fifth column.
 * Lines ending in {@code --NME--} only contribute their token to the text.
 *
 * <p>Whenever the text about to be appended starts with a non-word character, the
 * trailing whitespace of the document is removed first, so punctuation is attached
 * to the preceding token.
 * NOTE(review): {@code \W} is used without Unicode character classes, so tokens
 * starting with a non-ASCII letter are presumably also glued to the preceding
 * token - confirm whether that is intended.
 */
public class ConllAidaDataset implements A2WDataset {
    /** Prefix of the line that opens a new document. */
    private static final String DOC_START = "-DOCSTART-";

    /** Extracts the page title (group 1) from an English Wikipedia URL. */
    private static final Pattern WIKI_URL_PATTERN = Pattern.compile("http://en.wikipedia.org/wiki/(.*)");

    /** Mention line: token, B/I tag, then at least three further tab-separated columns. */
    private static final Pattern MENTION_PATTERN = Pattern.compile("^(.*?)\t([BI]?)\t(.*?)\t(.*?)\t(.*?)(?:\t(.*))?$");

    /** Tagged line whose last column ends in {@code --NME--}; only its token (group 1) is used. */
    private static final Pattern NME_PATTERN = Pattern.compile("^(.*)\t([BI])\t(.*)\t(.*)--NME--$");

    /** Matches text whose first character is a non-word character. */
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("^\\W.*$");

    /** Gold-standard annotations, one set per document, in file order. */
    private final List<HashSet<Annotation>> annotations = new Vector<>();

    /** Reconstructed document texts, in file order. */
    private final List<MutableString> documents = new Vector<>();

    /**
     * A mention read from the file whose Wikipedia title has not yet been resolved
     * to a page id.
     */
    private static final class AidaAnnotation {
        public final int length;
        public final int position;
        public final String title;

        /**
         * @param i   character offset of the mention in the document text
         * @param i2  length of the mention in characters
         * @param str Wikipedia title taken from the mention's URL
         */
        public AidaAnnotation(int i, int i2, String str) {
            this.length = i2;
            this.position = i;
            this.title = str;
        }
    }

    /**
     * Reads the dataset file, rebuilds the document texts and resolves every
     * mention's Wikipedia title to a page id.
     *
     * <p>Titles for which {@code getIdByTitle} returns {@code -1} are reported on
     * standard output and skipped. The resolved annotations of each document are
     * passed through {@link Annotation#deleteOverlappingAnnotations} before being stored.
     *
     * @param str               path of the dataset file, read as UTF-8
     * @param wikipediaInterface used to prefetch titles and map them to page ids
     * @throws IOException         if the file cannot be read
     * @throws AnnotationException if the file is malformed: content precedes the first
     *                             {@code -DOCSTART-} line, a {@code B} line does not carry a
     *                             Wikipedia URL in its fifth column, or a mention line is
     *                             tagged neither {@code B} nor {@code I}
     */
    public ConllAidaDataset(String str, WikipediaInterface wikipediaInterface) throws IOException, AnnotationException, XPathExpressionException, ParserConfigurationException, SAXException {
        List<HashSet<AidaAnnotation>> unresolvedPerDocument = new Vector<>();
        List<String> titlesToPrefetch = new Vector<>();

        // try-with-resources closes the reader on every exit path, including exceptions.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(str), Charset.forName("UTF-8")))) {
            MutableString currentDoc = null;
            HashSet<AidaAnnotation> currentAnns = null;
            // State of the mention currently being read; mentionStart == -1 means "none open".
            int mentionStart = -1;
            int mentionLength = 0;
            String mentionTitle = null;

            String line;
            while ((line = reader.readLine()) != null) {
                Matcher mention = MENTION_PATTERN.matcher(line);
                Matcher nme = NME_PATTERN.matcher(line);
                boolean isMention = mention.matches();
                boolean isNme = nme.matches();

                // An open mention ends on any non-mention line and on the next "B" line.
                if (mentionStart != -1 && (!isMention || mention.group(2).equals("B"))) {
                    currentAnns.add(new AidaAnnotation(mentionStart, mentionLength, mentionTitle));
                    mentionStart = -1;
                    mentionLength = 0;
                    mentionTitle = null;
                }

                if (line.startsWith(DOC_START)) {
                    currentDoc = new MutableString();
                    this.documents.add(currentDoc);
                    currentAnns = new HashSet<>();
                    unresolvedPerDocument.add(currentAnns);
                    continue; // the marker line contributes no text
                }
                if (currentDoc == null) {
                    // Previously this surfaced as a NullPointerException.
                    throw new AnnotationException("Dataset is malformed: content found before the first " + DOC_START + " line. Line=[" + line + "]");
                }

                String chunk;
                boolean opensMention = false;
                boolean partOfMention = false;
                if (line.equals("")) {
                    chunk = "\n";
                } else if (!isMention && !isNme) {
                    chunk = line + " ";
                } else if (isNme) {
                    chunk = nme.group(1) + " ";
                } else {
                    String tag = mention.group(2);
                    if (tag.equals("B")) {
                        Matcher url = WIKI_URL_PATTERN.matcher(mention.group(5));
                        if (!url.matches()) {
                            throw new AnnotationException("Dataset is malformed: string " + mention.group(5) + " should be a wikipedia URL. Line=[" + line + "]");
                        }
                        mentionTitle = url.group(1);
                        titlesToPrefetch.add(mentionTitle);
                        opensMention = true;
                    } else if (!tag.equals("I")) {
                        throw new AnnotationException("Dataset is malformed: all mention should be marked as B or I. Bad mention: " + line);
                    }
                    partOfMention = true;
                    chunk = mention.group(1) + " ";
                }

                if (PUNCTUATION_PATTERN.matcher(chunk).matches()) {
                    currentDoc.trimRight();
                }
                if (opensMention) {
                    // Taken after trimming so the offset points at the token actually written.
                    mentionStart = currentDoc.length();
                }
                currentDoc.append(chunk);
                if (partOfMention && mentionStart != -1) {
                    // Span from the mention start to the end of this token, excluding the
                    // trailing space; stays correct when the separating space was trimmed.
                    mentionLength = currentDoc.length() - 1 - mentionStart;
                }
            }

            // A mention still open at end of file has no following line to close it.
            if (mentionStart != -1) {
                currentAnns.add(new AidaAnnotation(mentionStart, mentionLength, mentionTitle));
            }
        }

        wikipediaInterface.prefetchTitles(titlesToPrefetch);
        for (HashSet<AidaAnnotation> unresolved : unresolvedPerDocument) {
            HashSet<Annotation> resolved = new HashSet<>();
            for (AidaAnnotation candidate : unresolved) {
                int pageId = wikipediaInterface.getIdByTitle(candidate.title);
                if (pageId == -1) {
                    System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page " + candidate.title);
                } else {
                    resolved.add(new Annotation(candidate.position, candidate.length, pageId));
                }
            }
            this.annotations.add(Annotation.deleteOverlappingAnnotations(resolved));
        }
    }

    /** @return the number of documents for which annotations were stored */
    @Override
    public int getSize() {
        return this.annotations.size();
    }

    /** @return the total number of annotations over all documents */
    @Override
    public int getTagsCount() {
        int total = 0;
        for (HashSet<Annotation> documentAnnotations : this.annotations) {
            total += documentAnnotations.size();
        }
        return total;
    }

    /** @return the stored annotations converted by {@link ProblemReduction#A2WToC2WList} */
    @Override
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return ProblemReduction.A2WToC2WList(this.annotations);
    }

    /** @return the internal per-document annotation list (not a copy) */
    @Override
    public List<HashSet<Annotation>> getA2WGoldStandardList() {
        return this.annotations;
    }

    /** @return the same list as {@link #getA2WGoldStandardList()} */
    @Override
    public List<HashSet<Annotation>> getD2WGoldStandardList() {
        return getA2WGoldStandardList();
    }

    /** @return a new list holding the text of each document, in file order */
    @Override
    public List<String> getTextInstanceList() {
        List<String> texts = new Vector<>();
        for (MutableString document : this.documents) {
            texts.add(document.toString());
        }
        return texts;
    }

    /** @return the stored annotations converted by {@link ProblemReduction#A2WToD2WMentionsInstance} */
    @Override
    public List<HashSet<Mention>> getMentionsInstanceList() {
        return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
    }

    /** @return the fixed dataset name {@code "AIDA/CO-NLL"} */
    @Override
    public String getName() {
        return "AIDA/CO-NLL";
    }
}
