package it.unipi.di.acube.batframework.datasetPlugins;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;
import it.unipi.di.acube.batframework.utils.WikipediaLocalInterface;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/NEEL2016Dataset.class */
/**
 * A2W dataset built from two input streams: a tweets stream holding the
 * texts and a tab-separated stream holding the gold-standard annotations.
 *
 * <p>Tweets are ordered by ascending numeric id; the i-th element of the
 * text list and the i-th element of the gold list refer to the same tweet.
 * Tweets with no (non-NIL) annotation get an empty annotation set.
 */
public class NEEL2016Dataset implements A2WDataset {
    /** One tweet per line, in the form {@code |<numeric id>|,|<text>|}. */
    private static final Pattern tweetsRE = Pattern.compile("^\\|(\\d+)\\|,\\|(.*)\\|$");
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Number of tab-separated fields expected on each annotation line. */
    private static final int ANNOTATION_FIELDS = 6;

    private final List<String> text;
    private final List<HashSet<Annotation>> gold;
    private final String portion;

    /**
     * Reads the tweets and their gold-standard annotations.
     *
     * @param inputStream        tab-separated annotation stream; each line must have exactly six
     *                           fields, of which the first four are read: tweet id, start offset,
     *                           end offset, entity URL (or a value starting with {@code NIL})
     * @param inputStream2       tweets stream, one {@code |id|,|text|} record per line
     * @param wikipediaInterface used to resolve entity titles to dereferenced Wikipedia ids
     * @param str                label of the dataset portion, appended to the dataset name
     * @throws IOException              if a stream cannot be read
     * @throws IllegalArgumentException if a line of either stream is malformed
     */
    public NEEL2016Dataset(InputStream inputStream, InputStream inputStream2, WikipediaInterface wikipediaInterface, String str) throws IOException {
        this.portion = str;

        // Tweets are parsed first, as in the original reading order.
        HashMap<Long, String> tweetsById = readTweets(inputStream2);
        HashMap<Long, HashSet<Annotation>> goldById = readAnnotations(inputStream, wikipediaInterface);

        // The tweet ids drive the instance order; annotations that refer to an
        // id absent from the tweets stream are not part of the dataset.
        Vector<Long> sortedIds = new Vector<Long>(tweetsById.keySet());
        Collections.sort(sortedIds);

        this.text = new Vector<String>(sortedIds.size());
        this.gold = new Vector<HashSet<Annotation>>(sortedIds.size());
        for (Long tweetId : sortedIds) {
            this.text.add(tweetsById.get(tweetId));
            HashSet<Annotation> annotations = goldById.get(tweetId);
            this.gold.add(annotations != null ? annotations : new HashSet<Annotation>());
        }
    }

    /**
     * Parses the tweets stream into an id-to-text map. The line iterator is
     * closed once, after the whole stream has been consumed (or on failure).
     */
    private static HashMap<Long, String> readTweets(InputStream tweetsStream) throws IOException {
        HashMap<Long, String> tweetsById = new HashMap<Long, String>();
        LineIterator lines = IOUtils.lineIterator(tweetsStream, "utf8");
        try {
            while (lines.hasNext()) {
                String line = lines.nextLine();
                Matcher matcher = tweetsRE.matcher(line);
                if (!matcher.matches()) {
                    throw new IllegalArgumentException("Malformed tweet line: " + line);
                }
                long tweetId = Long.parseLong(matcher.group(1));
                // UTF-8 round trip kept from the original code; it leaves well-formed
                // strings unchanged.
                tweetsById.put(Long.valueOf(tweetId), new String(matcher.group(2).getBytes(UTF_8), UTF_8));
            }
        } finally {
            LineIterator.closeQuietly(lines);
        }
        return tweetsById;
    }

    /**
     * Parses the annotation stream into a map from tweet id to its annotations.
     * Entries whose entity field starts with {@code NIL} are skipped.
     *
     * <p>The iterator is closed only after the loop ends: closing it inside the
     * loop body would stop the iteration after the first line.
     */
    private static HashMap<Long, HashSet<Annotation>> readAnnotations(InputStream annotationsStream, WikipediaInterface wikipediaInterface) throws IOException {
        HashMap<Long, HashSet<Annotation>> goldById = new HashMap<Long, HashSet<Annotation>>();
        LineIterator lines = IOUtils.lineIterator(annotationsStream, "utf8");
        try {
            while (lines.hasNext()) {
                String line = lines.nextLine();
                String[] fields = line.split("\t");
                if (fields.length != ANNOTATION_FIELDS) {
                    throw new IllegalArgumentException("Expected " + ANNOTATION_FIELDS
                            + " tab-separated fields but found " + fields.length + ": " + line);
                }
                Long tweetId = Long.valueOf(Long.parseLong(fields[0]));
                int start = Integer.parseInt(fields[1]);
                int end = Integer.parseInt(fields[2]);
                String entity = fields[3];
                if (entity.startsWith("NIL")) {
                    continue;
                }
                int wikipediaId = wikipediaInterface.dereference(
                        wikipediaInterface.getIdByTitle(WikipediaLocalInterface.dbPediaUrlToTitle(entity)));
                HashSet<Annotation> annotations = goldById.get(tweetId);
                if (annotations == null) {
                    annotations = new HashSet<Annotation>();
                    goldById.put(tweetId, annotations);
                }
                // The file stores start/end offsets; Annotation is built from start and length.
                annotations.add(new Annotation(start, end - start, wikipediaId));
            }
        } finally {
            LineIterator.closeQuietly(lines);
        }
        return goldById;
    }

    /** Returns the number of tweets in the dataset. */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public int getSize() {
        return this.text.size();
    }

    /** Returns the total number of gold annotations, summed over all tweets. */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public int getTagsCount() {
        int count = 0;
        for (HashSet<Annotation> annotations : this.gold) {
            count += annotations.size();
        }
        return count;
    }

    /** Returns the gold standard reduced from A2W to C2W via {@link ProblemReduction}. */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return ProblemReduction.A2WToC2WList(getA2WGoldStandardList());
    }

    /** Returns the same list as {@link #getA2WGoldStandardList()}. */
    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Annotation>> getD2WGoldStandardList() {
        return getA2WGoldStandardList();
    }

    /** Returns the tweet texts, ordered by ascending tweet id. */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public List<String> getTextInstanceList() {
        return this.text;
    }

    /** Returns the mentions derived from the A2W gold standard via {@link ProblemReduction}. */
    @Override // it.unipi.di.acube.batframework.problems.D2WDataset
    public List<HashSet<Mention>> getMentionsInstanceList() {
        return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
    }

    /** Returns the dataset name, suffixed with the portion label given at construction. */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public String getName() {
        return "#Microposts2016 NEEL " + this.portion;
    }

    /** Returns the gold annotations, one set per tweet, in the same order as the texts. */
    @Override // it.unipi.di.acube.batframework.problems.A2WDataset
    public List<HashSet<Annotation>> getA2WGoldStandardList() {
        return this.gold;
    }
}
