package it.unipi.di.acube.batframework.datasetPlugins;

import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.Rc2WDataset;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:it/unipi/di/acube/batframework/datasetPlugins/MeijDataset.class */
/**
 * Dataset built from three input files: a tab-separated tweet file, a
 * tab-separated tag file and a space-separated ranked-tag file.
 *
 * <p>After construction the three lists exposed by {@link #getTextInstanceList()},
 * {@link #getC2WGoldStandardList()} and {@link #getRc2WGoldStandardList()} are
 * index-aligned: position {@code i} of each list refers to the same tweet.
 * All input is decoded as UTF-8.
 */
public class MeijDataset implements Rc2WDataset {
    /** Markers whose surrounding token is stripped from tweet text; compiled once. */
    private static final Pattern LINK_MARKER = Pattern.compile("http://|bit|yfrog|tinyurl|twitpic|justgiving|plixi");

    private List<String> texts;
    private List<HashSet<Tag>> tags;
    private List<List<Tag>> rankedTags;

    /** Intermediate per-tweet record filled while the three input files are parsed. */
    public static class MeijDocument implements Serializable {
        private static final long serialVersionUID = 6977622102826151597L;
        /** Cleaned tweet text (column 4 of the tweet file). */
        String text;
        /** Tweet identifier (column 0 of the tweet file). */
        String id;
        /** Non-negative ids read from the tag file for this tweet. */
        HashSet<Integer> tags = new HashSet<>();
        /** Ids read from the ranked-tag file, in file order. */
        Vector<Integer> ranked = new Vector<>();
    }

    /**
     * Loads the dataset from three files. The files are opened by this
     * constructor and are always closed before it returns, including when
     * opening or parsing a later file fails.
     *
     * @param str  path of the tweet file
     * @param str2 path of the tag file
     * @param str3 path of the ranked-tag file
     * @throws FileNotFoundException if any of the files cannot be opened
     * @throws IOException           if reading or closing a file fails
     */
    public MeijDataset(String str, String str2, String str3) throws FileNotFoundException, IOException {
        try (InputStream tweetStream = new FileInputStream(str);
                InputStream tagStream = new FileInputStream(str2);
                InputStream rankedStream = new FileInputStream(str3)) {
            load(tweetStream, tagStream, rankedStream);
        }
    }

    /**
     * Loads the dataset from three streams. The streams are read to their end
     * but are NOT closed here; they remain owned by the caller.
     *
     * @param inputStream  tweet data
     * @param inputStream2 tag data
     * @param inputStream3 ranked-tag data
     * @throws IOException if reading any stream fails
     */
    public MeijDataset(InputStream inputStream, InputStream inputStream2, InputStream inputStream3) throws IOException {
        load(inputStream, inputStream2, inputStream3);
    }

    /** Parses the three inputs and populates the index-aligned result lists. */
    private void load(InputStream tweetStream, InputStream tagStream, InputStream rankedStream) throws IOException {
        Object2ObjectOpenHashMap<String, MeijDocument> documents = readTweetFile(tweetStream);
        readTagFile(tagStream, documents);
        loadRankedTags(rankedStream, documents);

        this.texts = new Vector<>();
        this.tags = new Vector<>();
        this.rankedTags = new Vector<>();
        // A single pass over the map keeps the three lists aligned by construction.
        for (Map.Entry<String, MeijDocument> entry : documents.entrySet()) {
            MeijDocument document = entry.getValue();
            this.texts.add(document.text);

            HashSet<Tag> goldTags = new HashSet<>();
            for (Integer tagId : document.tags) {
                goldTags.add(new Tag(tagId.intValue()));
            }
            this.tags.add(goldTags);

            List<Tag> goldRanking = new Vector<>();
            for (Integer tagId : document.ranked) {
                goldRanking.add(new Tag(tagId.intValue()));
            }
            this.rankedTags.add(goldRanking);
        }
    }

    /**
     * Wraps a stream in a UTF-8 line reader. The reader is intentionally not
     * closed by its users, because closing it would close the underlying
     * stream, which this class does not always own.
     */
    private static BufferedReader newReader(InputStream inputStream) {
        return new BufferedReader(new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Reads the tweet file: one tweet per line, tab-separated, with the tweet
     * id in column 0 and the raw text in column 4. Blank lines are skipped.
     *
     * @return map from tweet id to a document holding the cleaned text
     */
    private static Object2ObjectOpenHashMap<String, MeijDocument> readTweetFile(InputStream inputStream) throws IOException {
        BufferedReader reader = newReader(inputStream);
        Object2ObjectOpenHashMap<String, MeijDocument> documents = new Object2ObjectOpenHashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                continue; // tolerate blank / trailing empty lines
            }
            String[] columns = line.split("\t");
            MeijDocument document = new MeijDocument();
            document.id = columns[0];
            document.text = cleanTweet(columns[4]);
            documents.put(document.id, document);
        }
        return documents;
    }

    /**
     * Reads the tag file: tab-separated lines with the tweet id in column 0 and
     * an integer id in column 1. Negative ids are ignored; blank lines are skipped.
     *
     * <p>NOTE(review): unlike {@code loadRankedTags}, a tweet id that is absent
     * from {@code object2ObjectOpenHashMap} is not tolerated here and results in
     * a {@link NullPointerException}; confirm whether that is intended.
     */
    private static void readTagFile(InputStream inputStream, Object2ObjectOpenHashMap<String, MeijDocument> object2ObjectOpenHashMap) throws NumberFormatException, IOException {
        BufferedReader reader = newReader(inputStream);
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                continue; // tolerate blank / trailing empty lines
            }
            String[] columns = line.split("\t");
            int tagId = Integer.parseInt(columns[1]);
            if (tagId >= 0) {
                object2ObjectOpenHashMap.get(columns[0]).tags.add(Integer.valueOf(tagId));
            }
        }
    }

    /**
     * Removes link-like tokens from a tweet. Whenever one of the markers in
     * {@link #LINK_MARKER} is found, the text from the start of the match up to
     * the next space (or end of string) is taken as a token, and every
     * occurrence of that token in the tweet is replaced by a single space.
     * The scan restarts after each replacement until no marker remains.
     */
    private static String cleanTweet(String str) {
        Matcher matcher = LINK_MARKER.matcher(str);
        while (matcher.find()) {
            int start = matcher.start();
            int end = str.indexOf(' ', start);
            if (end < 0) {
                end = str.length();
            }
            // The token is at least as long as the shortest marker, so each
            // replacement shortens the string and the loop terminates.
            str = str.replace(str.substring(start, end), " ");
            matcher = LINK_MARKER.matcher(str);
        }
        return str;
    }

    /**
     * Reads the ranked-tag file: space-separated lines with the tweet id as
     * token 0 and an integer id as token 2. Lines whose tweet id is not present
     * in the map are ignored; ids are appended in file order.
     */
    private static void loadRankedTags(InputStream inputStream, Object2ObjectOpenHashMap<String, MeijDocument> object2ObjectOpenHashMap) throws NumberFormatException, IOException {
        BufferedReader reader = newReader(inputStream);
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split(" ");
            MeijDocument document = object2ObjectOpenHashMap.get(tokens[0]);
            if (document != null) {
                document.ranked.add(Integer.valueOf(Integer.parseInt(tokens[2])));
            }
        }
    }

    /** @return the number of tweets in the dataset */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public int getSize() {
        return this.texts.size();
    }

    /** @return the total number of tags, summed over all tweets */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public int getTagsCount() {
        int total = 0;
        for (HashSet<Tag> tweetTags : this.tags) {
            total += tweetTags.size();
        }
        return total;
    }

    /** @return the per-tweet tag sets (the internal list, not a copy) */
    @Override // it.unipi.di.acube.batframework.problems.C2WDataset
    public List<HashSet<Tag>> getC2WGoldStandardList() {
        return this.tags;
    }

    /** @return the cleaned tweet texts (the internal list, not a copy) */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public List<String> getTextInstanceList() {
        return this.texts;
    }

    /** @return the dataset name, {@code "Meij"} */
    @Override // it.unipi.di.acube.batframework.problems.TopicDataset
    public String getName() {
        return "Meij";
    }

    /** @return the per-tweet ranked tag lists (the internal list, not a copy) */
    @Override // it.unipi.di.acube.batframework.problems.Rc2WDataset
    public List<List<Tag>> getRc2WGoldStandardList() {
        return this.rankedTags;
    }
}
