package org.apache.gobblin.source;

import com.google.common.base.Preconditions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.converter.json.JsonSchema;
import org.apache.gobblin.source.PartitionAwareFileRetriever;
import org.apache.gobblin.source.extractor.filebased.FileBasedHelperException;
import org.apache.gobblin.source.extractor.hadoop.HadoopFsHelper;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.joda.time.DateTime;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/gobblin/source/RegexBasedPartitionedRetriever.class */
/**
 * A {@link PartitionAwareFileRetriever} that discovers partition directories directly beneath a
 * configured source directory by matching each directory name against a regular expression.
 *
 * <p>The first capture group of the regex is parsed as a {@code long} and used as the directory's
 * watermark. Only directories whose watermark lies strictly between the caller-supplied minimum
 * watermark and {@code (now - leadTime)} (in epoch milliseconds) are considered, and only files
 * ending in the expected extension are returned from them.
 *
 * <p>{@link #init(SourceState)} must be called before any of the retrieval methods, since it
 * populates the pattern, filesystem helper and source directory used by them.
 */
public class RegexBasedPartitionedRetriever implements PartitionAwareFileRetriever {
    private static final Logger LOGGER = LoggerFactory.getLogger(RegexBasedPartitionedRetriever.class);
    private Pattern pattern;
    private HadoopFsHelper helper;
    private Path sourceDir;
    private final String expectedExtension;
    private Duration leadTime;
    private boolean schemaInSourceDir;
    private String schemaFile;

    /**
     * @param expectedExtension extension that files must end with to be picked up; a leading
     *     {@code "."} is optional and is added when the file filter is built
     */
    public RegexBasedPartitionedRetriever(String expectedExtension) {
        this.expectedExtension = expectedExtension;
    }

    /**
     * Reads the retriever configuration from {@code sourceState}.
     *
     * @param sourceState job state providing the partition regex, the lead time, the source data
     *     directory and the optional schema-file settings
     * @throws NullPointerException if the partition pattern property is not set
     * @throws IllegalArgumentException if the source data directory property is not set
     */
    @Override // org.apache.gobblin.source.PartitionAwareFileRetriever
    public void init(SourceState sourceState) {
        String regex = sourceState.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
        Preconditions.checkNotNull(regex, "Must specify a regex pattern in date.partitioned.source.partition.pattern");
        this.leadTime = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(sourceState);
        this.pattern = Pattern.compile(regex);
        this.helper = new HadoopFsHelper(sourceState);

        // Fail with a message that names the missing property instead of relying on the
        // generic error raised by Path when handed a null string.
        String dataDirectory = sourceState.getProp("source.filebased.data.directory");
        Preconditions.checkArgument(dataDirectory != null,
                "Must specify a source directory in source.filebased.data.directory");
        this.sourceDir = new Path(dataDirectory);

        this.schemaInSourceDir = sourceState.getPropAsBoolean("schema.in.source.dir", false);
        // schemaFile is only consulted by the file filter when schemaInSourceDir is true.
        this.schemaFile = this.schemaInSourceDir
                ? sourceState.getProp("schema.filename", "metadata.json")
                : JsonSchema.DEFAULT_VALUE_FOR_OPTIONAL_PROPERTY;
    }

    /**
     * Parses a watermark from its string form.
     *
     * @param watermark decimal representation of the watermark
     * @return the parsed value
     * @throws NumberFormatException if {@code watermark} is not a parsable {@code long}
     */
    @Override // org.apache.gobblin.source.PartitionAwareFileRetriever
    public long getWatermarkFromString(String watermark) {
        return Long.parseLong(watermark);
    }

    /**
     * Extracts the watermark text from a directory name using the configured regex.
     *
     * @param directoryName name of the directory; the whole name must match the pattern
     * @return the contents of the pattern's first capture group
     * @throws IllegalArgumentException if the name does not match the pattern, or the pattern
     *     declares no capture group
     */
    protected String extractWatermarkFromDirectory(String directoryName) {
        Matcher matcher = this.pattern.matcher(directoryName);
        if (!matcher.matches() || matcher.groupCount() < 1) {
            throw new IllegalArgumentException(directoryName + " does not match regex " + this.pattern.toString());
        }
        return matcher.group(1);
    }

    /**
     * @return the watermark increment used by this retriever, which is always {@code 1}
     */
    @Override // org.apache.gobblin.source.PartitionAwareFileRetriever
    public long getWatermarkIncrementMs() {
        return 1L;
    }

    /**
     * Lists the files found in eligible partition directories.
     *
     * <p>Directories are visited in their sorted order and each visited directory is listed in
     * full. {@code maxFilesToReturn} is a soft limit: no further directory is visited once more
     * than that many files have been collected, so the returned list can be larger than the limit.
     *
     * @param minWatermark exclusive lower bound for a directory's watermark
     * @param maxFilesToReturn file count after which no further directories are listed
     * @return one {@code FileInfo} per matching file, carrying the watermark of its directory
     * @throws IOException if the filesystem connection cannot be established or a listing fails
     */
    @Override // org.apache.gobblin.source.PartitionAwareFileRetriever
    public List<PartitionAwareFileRetriever.FileInfo> getFilesToProcess(long minWatermark, int maxFilesToReturn) throws IOException {
        // Exclusive upper bound: directories newer than (now - leadTime) are not picked up yet.
        long maxWatermark = new DateTime().minus(this.leadTime).getMillis();
        try {
            this.helper.connect();
            FileSystem fs = this.helper.getFileSystem();
            List<PartitionAwareFileRetriever.FileInfo> filesToProcess = new ArrayList<>();
            for (PartitionAwareFileRetriever.FileInfo directory : getOuterDirectories(fs, minWatermark, maxWatermark)) {
                FileStatus[] files = fs.listStatus(new Path(directory.getFilePath()), getFileFilter());
                for (FileStatus file : files) {
                    filesToProcess.add(new PartitionAwareFileRetriever.FileInfo(
                            file.getPath().toString(), file.getLen(), directory.getWatermarkMsSinceEpoch()));
                }
                // Checked per directory, so a directory is never returned partially.
                if (filesToProcess.size() > maxFilesToReturn) {
                    break;
                }
            }
            return filesToProcess;
        } catch (FileBasedHelperException e) {
            throw new IOException("Error initializing Hadoop connection", e);
        }
    }

    /**
     * Finds the sub-directories of the source directory whose names match the pattern and whose
     * watermark lies strictly between {@code minWatermark} and {@code maxWatermark}.
     *
     * @param fs filesystem used for the listing
     * @param minWatermark exclusive lower bound
     * @param maxWatermark exclusive upper bound
     * @return the matching directories as {@code FileInfo} entries of size 0, sorted by the
     *     natural ordering of {@code FileInfo}
     * @throws IOException if the source directory cannot be listed
     */
    private List<PartitionAwareFileRetriever.FileInfo> getOuterDirectories(FileSystem fs, long minWatermark, long maxWatermark) throws IOException {
        LOGGER.debug("Listing contents of {}", this.sourceDir);
        FileStatus[] children = fs.listStatus(this.sourceDir);
        List<PartitionAwareFileRetriever.FileInfo> directories = new ArrayList<>();
        for (FileStatus child : children) {
            if (!child.isDirectory()) {
                LOGGER.debug("Skipping non-directory {}", child.getPath().toUri());
                continue;
            }
            try {
                long watermark = getWatermarkFromString(extractWatermarkFromDirectory(child.getPath().getName()));
                if (watermark > minWatermark && watermark < maxWatermark) {
                    LOGGER.info("Processing directory {} with watermark {}", child.getPath(), watermark);
                    directories.add(new PartitionAwareFileRetriever.FileInfo(child.getPath().toString(), 0L, watermark));
                } else {
                    LOGGER.info("Ignoring directory {} - watermark {} is not between minWatermark {} and (now-leadTime) {}",
                            child.getPath(), watermark, minWatermark, maxWatermark);
                }
            } catch (IllegalArgumentException e) {
                // Covers both a non-matching name and a capture group that is not a valid long
                // (NumberFormatException is an IllegalArgumentException); such directories are skipped.
                LOGGER.info("Directory {} ({}) does not match pattern {}; skipping",
                        child.getPath().getName(), child.getPath(), this.pattern.toString());
            }
        }
        Collections.sort(directories);
        return directories;
    }

    /**
     * Builds the filter applied when listing a partition directory.
     *
     * @return a filter accepting paths whose name ends with the expected extension, except the
     *     schema file when the schema is configured to live in the source directory
     */
    private PathFilter getFileFilter() {
        final String extension = this.expectedExtension.startsWith(".") ? this.expectedExtension : "." + this.expectedExtension;
        return new PathFilter() { // from class: org.apache.gobblin.source.RegexBasedPartitionedRetriever.1
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                if (!name.endsWith(extension)) {
                    return false;
                }
                boolean isSchemaFile = RegexBasedPartitionedRetriever.this.schemaInSourceDir
                        && name.equals(RegexBasedPartitionedRetriever.this.schemaFile);
                return !isSchemaFile;
            }
        };
    }
}
