package edu.isi.nlp.corpora.gigaword;

import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import com.google.common.io.ByteSource;
import com.google.common.io.Files;
import edu.isi.nlp.files.FileUtils;
import edu.isi.nlp.io.OffsetIndex;
import edu.isi.nlp.io.OffsetIndices;
import edu.isi.nlp.parameters.Parameters;
import edu.isi.nlp.strings.offsets.OffsetRange;
import edu.isi.nlp.symbols.Symbol;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/isi/nlp/corpora/gigaword/IndexFlatGigaword.class */
/**
 * Command-line tool that builds a byte-offset index for each corpus file found exactly one
 * directory level below a root directory.
 *
 * <p>The single command-line argument is a parameter file, which must define
 * {@code rawGigawordRoot} (an existing directory whose sub-directories hold the corpus files) and
 * {@code rawGigawordOffsetIndexDir} (where the indices are written). For an input file
 * {@code <root>/<dir>/<name>} the index is written to {@code <indexDir>/<dir>/<name>.index}.
 *
 * <p>Within a corpus file, a document is taken to start at the byte sequence {@code <DOC id="}
 * and to end with the next {@code </DOC>}; the text between {@code <DOC id="} and the following
 * double quote is used as the document ID.
 */
public final class IndexFlatGigaword {
    private static final Logger log = LoggerFactory.getLogger(IndexFlatGigaword.class);
    private static final String beginProbeString = "<DOC id=\"";
    private static final byte[] beginProbe = beginProbeString.getBytes(Charsets.UTF_8);
    private static final String docIdEndString = "\"";
    private static final byte[] docIdEnd = docIdEndString.getBytes(Charsets.UTF_8);
    private static final String endProbeString = "</DOC>";
    private static final byte[] endProbe = endProbeString.getBytes(Charsets.UTF_8);

    /**
     * Sentinel returned by {@link #findBytes} when the probe does not occur. It must be negative
     * so that it can never be confused with a real offset: with the previous value of {@code 1},
     * a match at byte offset 1 was indistinguishable from "no match".
     */
    private static final int NOT_FOUND = -1;

    /** Not instantiable: this class only has static members. */
    private IndexFlatGigaword() {
        throw new UnsupportedOperationException();
    }

    /**
     * Walks the two-level corpus directory tree and writes one offset index per corpus file.
     *
     * @param strArr command-line arguments; element 0 is the path of the parameter file
     * @throws IOException if a directory cannot be listed or created, or if reading a corpus file
     *     or writing an index fails
     */
    private static void trueMain(String[] strArr) throws IOException {
        final Parameters params = Parameters.loadSerifStyle(new File(strArr[0]));
        final File corpusRoot = params.getExistingDirectory("rawGigawordRoot");
        final File indexRoot = params.getCreatableDirectory("rawGigawordOffsetIndexDir");

        for (final File sourceDir : listFilesOrFail(corpusRoot)) {
            final File[] sourceFiles = listFilesOrFail(sourceDir);

            final File indexDir = new File(indexRoot, sourceDir.getName());
            // mkdir() returns false both on failure and when the directory already exists, so
            // check for the directory itself rather than the return value.
            indexDir.mkdir();
            if (!indexDir.isDirectory()) {
                throw new IOException("Could not create index directory " + indexDir);
            }

            for (final File sourceFile : sourceFiles) {
                log.info("Building offset map for {}", sourceFile);
                final OffsetIndex index = buildOffsetIndex(Files.asByteSource(sourceFile));
                final File indexFile = new File(indexDir, sourceFile.getName() + ".index");
                log.info("Writing {} offsets for {} to {}",
                        new Object[]{Integer.valueOf(index.keySet().size()), sourceFile, indexFile});
                OffsetIndices.writeBinary(index, FileUtils.asCompressedByteSink(indexFile));
            }
        }
    }

    /**
     * Lists the entries of {@code dir}, turning the {@code null} that
     * {@link File#listFiles()} returns for a non-directory or an I/O error into an exception
     * that names the offending path.
     */
    private static File[] listFilesOrFail(File dir) throws IOException {
        final File[] entries = dir.listFiles();
        if (entries == null) {
            throw new IOException("Cannot list contents of " + dir
                    + " (not a directory, or an I/O error occurred)");
        }
        return entries;
    }

    /**
     * Scans the full content of {@code byteSource} and maps every document ID to the byte range
     * of its document.
     *
     * <p>Each range starts at the first byte of {@code <DOC id="} and its end offset is the last
     * byte of the matching {@code </DOC>}. If an ID occurs more than once a warning is logged and
     * the last occurrence wins. The whole source is read into memory.
     *
     * @param byteSource the corpus file content
     * @return an index from document ID to byte range; empty if no document start is found
     * @throws IOException if the source cannot be read, or if a document start is found without
     *     a terminating quote for its ID or without a closing {@code </DOC>}
     */
    static OffsetIndex buildOffsetIndex(ByteSource byteSource) throws IOException {
        // NOTE(review): left as a raw type because the value type's parameter is not imported in
        // this file; parameterize once the exact signature of OffsetIndices.forMap is confirmed.
        final HashMap offsetsByDocId = Maps.newHashMap();
        final byte[] content = byteSource.read();

        int searchFrom = 0;
        int docStart = findBytes(content, beginProbe, searchFrom);
        while (docStart != NOT_FOUND) {
            final int idStart = docStart + beginProbe.length;
            // Search from idStart itself (not idStart + 1) so that an empty ID's closing quote is
            // not skipped in favour of some later quote.
            final int idEnd = findBytes(content, docIdEnd, idStart);
            if (idEnd == NOT_FOUND) {
                throw new IOException("Failed to find end of document ID in " + byteSource);
            }
            final String docId = bytesAsString(content, idStart, idEnd);

            final int closeTagStart = findBytes(content, endProbe, idEnd + 1);
            if (closeTagStart == NOT_FOUND) {
                throw new IOException("Failed to find closing document tag in " + byteSource);
            }
            // Offset of the last byte of "</DOC>".
            final int docEnd = closeTagStart + endProbe.length - 1;

            final Symbol docIdSymbol = Symbol.from(docId);
            if (offsetsByDocId.containsKey(docIdSymbol)) {
                log.warn("Document ID {} occurs more than once; using latest version", docId);
            }
            offsetsByDocId.put(docIdSymbol, OffsetRange.byteOffsetRange(docStart, docEnd));

            searchFrom = docEnd + 1;
            docStart = findBytes(content, beginProbe, searchFrom);
        }
        return OffsetIndices.forMap(offsetsByDocId);
    }

    /** Decodes the bytes {@code bArr[i]} (inclusive) to {@code bArr[i2]} (exclusive) as UTF-8. */
    private static String bytesAsString(byte[] bArr, int i, int i2) {
        return new String(Arrays.copyOfRange(bArr, i, i2), Charsets.UTF_8);
    }

    /**
     * Finds the first occurrence of {@code bArr2} in {@code bArr} at or after offset {@code i}.
     *
     * @return the offset of the first matching byte, or {@link #NOT_FOUND} if there is none
     */
    private static int findBytes(byte[] bArr, byte[] bArr2, int i) {
        // Positions past this one cannot hold the whole probe.
        final int lastCandidate = bArr.length - bArr2.length;
        for (int pos = i; pos <= lastCandidate; pos++) {
            if (matches(bArr, bArr2, pos)) {
                return pos;
            }
        }
        return NOT_FOUND;
    }

    /** Returns whether {@code bArr2} occurs in {@code bArr} starting exactly at offset {@code i}. */
    private static boolean matches(byte[] bArr, byte[] bArr2, int i) {
        if (i + bArr2.length > bArr.length) {
            return false;
        }
        for (int k = 0; k < bArr2.length; k++) {
            if (bArr[i + k] != bArr2[k]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Program entry point. Prints a usage line and exits with status 1 if no parameter file is
     * given; prints the stack trace and exits with status 1 if indexing fails.
     *
     * @param strArr command-line arguments; element 0 is the path of the parameter file
     */
    public static void main(String[] strArr) {
        if (strArr.length < 1) {
            System.err.println("usage: IndexFlatGigaword <parameter file>");
            System.exit(1);
        }
        try {
            trueMain(strArr);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
