package org.apache.mahout.classifier.df.data;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.data.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/classifier/df/data/DataLoader.class */
/**
 * Static helpers that build {@link Data} and {@link Dataset} objects from text lines, read either
 * from a file on a Hadoop {@link FileSystem} or from an in-memory array of strings.
 *
 * <p>Each line holds one value per attribute; values are separated by a single comma or a single
 * space. A value of {@code "?"} is treated as a marker that makes the whole line unusable when it
 * appears in a non-ignored attribute.
 */
public final class DataLoader {
    private static final Logger log = LoggerFactory.getLogger(DataLoader.class);

    /** Separator between attribute values: exactly one comma or one space. */
    private static final Pattern COMMA_SPACE = Pattern.compile("[, ]");

    private DataLoader() {
    }

    /**
     * Checks one line against the attribute description and, if the line is acceptable, records
     * the values it contains for every categorical attribute (and for the label when
     * {@code numericLabel} is {@code false}).
     *
     * <p>The line is validated completely before any value is recorded, so a rejected line never
     * contributes values to {@code values}.
     *
     * @param attributes   description of every column of the line
     * @param values       per-attribute sets of observed values; entries are created on demand
     * @param line         the text line to parse
     * @param numericLabel when {@code true} the label attribute must parse as a number; when
     *                     {@code false} its values are collected like a categorical attribute
     * @return {@code true} if the line is usable; {@code false} if a non-ignored attribute holds
     *         {@code "?"} or a numeric attribute cannot be parsed as a {@code double}
     * @throws IllegalArgumentException if the number of tokens differs from the number of attributes
     */
    private static boolean parseString(Dataset.Attribute[] attributes, Set<String>[] values, CharSequence line, boolean numericLabel) {
        String[] tokens = COMMA_SPACE.split(line);
        // Template form defers building the message until the check actually fails.
        Preconditions.checkArgument(tokens.length == attributes.length,
                "Wrong number of attributes in the string: %s. Must be: %s", tokens.length, attributes.length);

        // Pass 1: validation only, no side effects.
        for (int attr = 0; attr < attributes.length; attr++) {
            if (attributes[attr].isIgnored()) {
                continue;
            }
            String token = tokens[attr];
            if ("?".equals(token)) {
                return false;
            }
            if (!collectsValues(attributes[attr], numericLabel)) {
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException ignored) {
                    // Not an error: an unparsable numeric value simply makes the line unusable.
                    return false;
                }
            }
        }

        // Pass 2: the line is accepted, so its categorical values may now be recorded.
        for (int attr = 0; attr < attributes.length; attr++) {
            if (attributes[attr].isIgnored() || !collectsValues(attributes[attr], numericLabel)) {
                continue;
            }
            if (values[attr] == null) {
                values[attr] = Sets.newHashSet();
            }
            values[attr].add(tokens[attr]);
        }
        return true;
    }

    /** Tells whether the distinct values of {@code attribute} are collected rather than parsed as numbers. */
    private static boolean collectsValues(Dataset.Attribute attribute, boolean numericLabel) {
        return attribute.isCategorical() || (!numericLabel && attribute.isLabel());
    }

    /**
     * Converts one line and appends the resulting instance to {@code instances}. Empty lines and
     * lines for which the converter returns {@code null} are skipped with a warning that carries
     * the number of instances loaded so far.
     */
    private static void addInstance(DataConverter converter, String line, List<Instance> instances) {
        if (line.isEmpty()) {
            log.warn("{}: empty string", Integer.valueOf(instances.size()));
            return;
        }
        Instance instance = converter.convert(line);
        if (instance == null) {
            log.warn("{}: missing values", Integer.valueOf(instances.size()));
            return;
        }
        instances.add(instance);
    }

    /**
     * Rethrows the I/O failure, if any, that made {@code scanner} stop producing lines.
     * {@link Scanner} swallows such failures and just reports the end of input, which would
     * otherwise yield silently truncated results.
     */
    private static void throwIfReadFailed(Scanner scanner) throws IOException {
        IOException failure = scanner.ioException();
        if (failure != null) {
            throw failure;
        }
    }

    /** Creates the per-attribute array of value sets; every entry starts out {@code null}. */
    @SuppressWarnings("unchecked") // generic array creation; only Set<String> instances are ever stored
    private static Set<String>[] newValueSets(int size) {
        return new Set[size];
    }

    /** Copies each non-null value set into a list, leaving {@code null} where no values were collected. */
    @SuppressWarnings("unchecked") // generic array creation; only List<String> instances are ever stored
    private static List<String>[] toValueLists(Set<String>[] valueSets) {
        List<String>[] valueLists = new List[valueSets.length];
        for (int attr = 0; attr < valueSets.length; attr++) {
            if (valueSets[attr] != null) {
                valueLists[attr] = Lists.newArrayList(valueSets[attr]);
            }
        }
        return valueLists;
    }

    /**
     * Loads the data stored in a file, one instance per line, decoding the file as UTF-8.
     * Empty lines and lines the converter cannot turn into an instance are skipped with a warning.
     *
     * @param dataset    description of the data
     * @param fileSystem file system holding the file
     * @param path       location of the data file
     * @return the instances that could be converted, in file order
     * @throws IOException if the file cannot be opened or reading it fails part-way
     */
    public static Data loadData(Dataset dataset, FileSystem fileSystem, Path path) throws IOException {
        List<Instance> instances = Lists.newArrayList();
        DataConverter converter = new DataConverter(dataset);
        Scanner scanner = new Scanner((InputStream) fileSystem.open(path), "UTF-8");
        try {
            while (scanner.hasNextLine()) {
                addInstance(converter, scanner.nextLine(), instances);
            }
            throwIfReadFailed(scanner);
        } finally {
            // Closing the scanner also closes the underlying stream, even when a line fails to convert.
            scanner.close();
        }
        return new Data(dataset, instances);
    }

    /**
     * Loads the data held in an array of lines, one instance per line.
     * Empty lines and lines the converter cannot turn into an instance are skipped with a warning.
     *
     * @param dataset description of the data
     * @param strArr  the lines to convert
     * @return the instances that could be converted, in array order
     */
    public static Data loadData(Dataset dataset, String[] strArr) {
        List<Instance> instances = Lists.newArrayList();
        DataConverter converter = new DataConverter(dataset);
        for (String line : strArr) {
            addInstance(converter, line, instances);
        }
        return new Data(dataset, instances);
    }

    /**
     * Builds a {@link Dataset} by scanning a UTF-8 file: collects the distinct values of every
     * categorical attribute and counts the usable lines.
     *
     * @param charSequence attribute descriptor, parsed by {@link DescriptorUtils#parseDescriptor}
     * @param z            when {@code true} the label must parse as a number; when {@code false}
     *                     its values are collected like a categorical attribute. Passed on
     *                     unchanged to the {@link Dataset} constructor.
     * @param fileSystem   file system holding the file
     * @param path         location of the data file
     * @return the dataset description derived from the file
     * @throws DescriptorException      if the descriptor cannot be parsed
     * @throws IOException              if the file cannot be opened or reading it fails part-way
     * @throws IllegalArgumentException if a non-empty line has the wrong number of values
     */
    public static Dataset generateDataset(CharSequence charSequence, boolean z, FileSystem fileSystem, Path path) throws DescriptorException, IOException {
        Dataset.Attribute[] attributes = DescriptorUtils.parseDescriptor(charSequence);
        Set<String>[] valueSets = newValueSets(attributes.length);
        int usableLines = 0;
        Scanner scanner = new Scanner((InputStream) fileSystem.open(path), "UTF-8");
        try {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                if (!line.isEmpty() && parseString(attributes, valueSets, line, z)) {
                    usableLines++;
                }
            }
            throwIfReadFailed(scanner);
        } finally {
            // Closing the scanner also closes the underlying stream, even when a line is malformed.
            scanner.close();
        }
        return new Dataset(attributes, toValueLists(valueSets), usableLines, z);
    }

    /**
     * Builds a {@link Dataset} by scanning an array of lines: collects the distinct values of
     * every categorical attribute and counts the usable lines.
     *
     * @param charSequence attribute descriptor, parsed by {@link DescriptorUtils#parseDescriptor}
     * @param z            when {@code true} the label must parse as a number; when {@code false}
     *                     its values are collected like a categorical attribute. Passed on
     *                     unchanged to the {@link Dataset} constructor.
     * @param strArr       the lines to scan
     * @return the dataset description derived from the lines
     * @throws DescriptorException      if the descriptor cannot be parsed
     * @throws IllegalArgumentException if a non-empty line has the wrong number of values
     */
    public static Dataset generateDataset(CharSequence charSequence, boolean z, String[] strArr) throws DescriptorException {
        Dataset.Attribute[] attributes = DescriptorUtils.parseDescriptor(charSequence);
        Set<String>[] valueSets = newValueSets(attributes.length);
        int usableLines = 0;
        for (String line : strArr) {
            if (!line.isEmpty() && parseString(attributes, valueSets, line, z)) {
                usableLines++;
            }
        }
        return new Dataset(attributes, toValueLists(valueSets), usableLines, z);
    }
}
