package org.apache.joshua.subsample;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.joshua.corpus.BasicPhrase;
import org.apache.joshua.corpus.Phrase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:joshua-incubating-6.1.jar:org/apache/joshua/subsample/Subsampler.class */
/**
 * Selects a subset of sentence pairs from training corpora, guided by the
 * n-grams found in a set of test files.
 *
 * <p>At construction time every sub-phrase (up to {@code maxN} tokens) of every
 * phrase in the test files is registered with a count of zero. While
 * subsampling, a training pair is kept when its F side contains at least one
 * registered n-gram whose count is still below {@code targetCount}; the counts
 * of those n-grams are then incremented.
 *
 * <p>The instance mutates {@link #ngramCounts} while subsampling and performs no
 * synchronization of its own.
 */
public class Subsampler {
    private static final Logger LOG = LoggerFactory.getLogger(Subsampler.class);

    /** Per n-gram count of how many times it has been credited to a selected pair. */
    protected Map<Phrase, Integer> ngramCounts;

    /** Maximum sub-phrase length requested from {@code getSubPhrases}. */
    protected final int maxN;

    /** An n-gram stops triggering selection once its count reaches this value. */
    protected final int targetCount;

    /** Bin iteration for a corpus stops once the running total exceeds this. */
    protected final int maxSubsample = 1500000;

    /** Pairs whose E or F side has more tokens than this are never selected. */
    protected static final int MAX_SENTENCE_LENGTH = 100;

    /** The F/E ratio filter is applied only when the F side is longer than this. */
    protected static final int MIN_RATIO_LENGTH = 10;

    /** Width of the F-length bins that are scanned from shortest to longest. */
    private static final int LENGTH_BIN_SIZE = 10;

    /** Allowed multiplicative deviation of a pair's F/E ratio from the target. */
    private static final float RATIO_TOLERANCE = 1.3f;

    /**
     * Creates a subsampler and loads the test-set n-grams.
     *
     * @param strArr paths of the test files to read phrases from
     * @param i      maximum n-gram length ({@code maxN})
     * @param i2     target count per n-gram ({@code targetCount})
     * @throws IOException if a test file cannot be opened or read
     */
    public Subsampler(String[] strArr, int i, int i2) throws IOException {
        this.maxN = i;
        this.targetCount = i2;
        this.ngramCounts = loadNgrams(strArr);
    }

    /**
     * Reads every phrase of every test file and registers each of its
     * sub-phrases (up to {@link #maxN} tokens) with a count of zero.
     *
     * @param testFiles paths of the files to read
     * @return map from n-gram to its initial count of zero
     * @throws IOException if a file cannot be opened or read
     */
    private HashMap<Phrase, Integer> loadNgrams(String[] testFiles) throws IOException {
        HashMap<Phrase, Integer> counts = new HashMap<>();
        for (String testFile : testFiles) {
            LOG.debug("Loading test set from {}", testFile);
            int lineCount = 0;
            // try-with-resources closes the reader on both the normal and the
            // exceptional path, replacing the hand-expanded close logic.
            try (PhraseReader reader = new PhraseReader(new FileReader(testFile), (byte) 1)) {
                BasicPhrase phrase;
                while ((phrase = reader.readPhrase()) != null) {
                    lineCount++;
                    for (Phrase ngram : phrase.getSubPhrases(this.maxN)) {
                        counts.put(ngram, 0);
                    }
                }
            }
            LOG.debug("Processed {} lines in {}", lineCount, testFile);
        }
        LOG.debug("Test set: {} ngrams", counts.size());
        return counts;
    }

    /**
     * Subsamples the corpora named in a list file and writes the selected pairs
     * to two UTF-8 output files, {@code str6 + sep + str2} and
     * {@code str6 + sep + str3}.
     *
     * @param str  path of a text file with one corpus name per line
     * @param f    target F/E length ratio; {@code 0} disables the ratio filter
     * @param str2 suffix of the first output file, also passed to the corpus factory
     * @param str3 suffix of the second output file, also passed to the corpus factory
     * @param str4 first argument of the {@code BiCorpusFactory} constructor
     *             (presumably the F-side location; verify against that class)
     * @param str5 second argument of the {@code BiCorpusFactory} constructor
     *             (presumably the E-side location; verify against that class)
     * @param str6 output path prefix
     * @throws IOException if any file cannot be opened, read or written
     */
    public void subsample(String str, float f, String str2, String str3, String str4, String str5, String str6) throws IOException {
        // NOTE(review): this constant comes from an unrelated library and was most
        // likely substituted by the decompiler for a string literal; confirm its
        // value and replace it with the literal to drop the dependency.
        String separator = FulltextIndexConstants.EXCERPT_NODE_FIELD_NAME;
        BufferedWriter firstOut = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(str6 + separator + str2), "UTF8"));
        BufferedWriter secondOut = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(str6 + separator + str3), "UTF8"));
        PhraseWriter writer = new PhraseWriter(firstOut, secondOut);
        BiCorpusFactory factory = new BiCorpusFactory(str4, str5, null, str2, str3, null);
        subsample(str, f, writer, factory);
    }

    /**
     * Subsamples the corpora named in a list file and writes the selected pairs
     * to {@code phraseWriter}, which is always closed before this method returns.
     *
     * <p>Corpora are processed in list order and, within a corpus, F-length bins
     * are scanned from shortest to longest, so earlier corpora and shorter
     * sentences get the first chance to satisfy the n-gram counts.
     *
     * @param str             path of a text file with one corpus name per line
     * @param f               target F/E length ratio; {@code 0} disables the ratio filter
     * @param phraseWriter    destination for the selected pairs; closed by this method
     * @param biCorpusFactory factory used to load each named corpus
     * @throws IOException if the list file cannot be read or writing fails
     */
    public void subsample(String str, float f, PhraseWriter phraseWriter, BiCorpusFactory biCorpusFactory) throws IOException {
        try {
            int totalSelected = 0;
            for (String corpusName : readLines(str)) {
                LOG.info("Loading training data: {}", corpusName);
                BiCorpus corpus = biCorpusFactory.fromFiles(corpusName);
                // Key: lower-cased pair used for de-duplication; value: original pair.
                HashMap<PhrasePair, PhrasePair> selected = new HashMap<>();

                int binCount = MAX_SENTENCE_LENGTH / LENGTH_BIN_SIZE;
                LOG.debug("Looking in length range");
                for (int bin = 0; bin < binCount; bin++) {
                    int minLength = bin * LENGTH_BIN_SIZE + 1;
                    int maxLength = (bin + 1) * LENGTH_BIN_SIZE;
                    LOG.debug(" [{}, {}]", minLength, maxLength);
                    subsample(selected, corpus, minLength, maxLength, f);
                    if (selected.size() + totalSelected > this.maxSubsample) {
                        break;
                    }
                }

                float fTokens = 0.0f;
                float eTokens = 0.0f;
                for (Map.Entry<PhrasePair, PhrasePair> entry : selected.entrySet()) {
                    // Token totals are taken from the key of the entry being written.
                    PhrasePair key = entry.getKey();
                    fTokens += key.getF().size();
                    eTokens += key.getE().size();
                    phraseWriter.write(entry.getValue());
                    phraseWriter.newLine();
                }
                phraseWriter.flush();

                totalSelected += selected.size();
                // The ratio is NaN when nothing was selected (0 / 0 in float arithmetic).
                LOG.info("current={} [total={}] currentRatio={}", selected.size(), totalSelected, fTokens / eTokens);
                System.gc();
            }
        } finally {
            phraseWriter.close();
        }
    }

    /**
     * Reads all lines of a text file.
     *
     * @param path file to read
     * @return the lines in file order
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readLines(String path) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /**
     * Adds to {@code selected} every pair of {@code corpus} that passes the length
     * and ratio filters, is not already selected, and contains at least one
     * tracked n-gram whose count is still below {@link #targetCount}.
     *
     * @param selected        accumulator keyed by the lower-cased pair
     * @param corpus          corpus to scan
     * @param minLength       minimum F-side token count (inclusive)
     * @param maxLength       maximum F-side token count (inclusive)
     * @param targetFtoERatio target F/E ratio; {@code 0} disables the ratio filter
     */
    private void subsample(HashMap<PhrasePair, PhrasePair> selected, BiCorpus corpus, int minLength, int maxLength, float targetFtoERatio) {
        Iterator<PhrasePair> pairs = corpus.iterator();
        while (pairs.hasNext()) {
            PhrasePair pair = pairs.next();
            PhrasePair lowercased = new PhrasePair(
                    new BasicPhrase((byte) 1, pair.getF().toString().toLowerCase()),
                    new BasicPhrase((byte) 1, pair.getE().toString().toLowerCase()),
                    pair.getAlignment());

            int eLength = pair.getE().size();
            if (eLength == 0 || eLength > MAX_SENTENCE_LENGTH) {
                continue;
            }
            int fLength = pair.getF().size();
            if (fLength == 0 || fLength < minLength || fLength > maxLength || fLength > MAX_SENTENCE_LENGTH) {
                continue;
            }

            // Skip pairs whose F/E ratio is more than RATIO_TOLERANCE away from the
            // target in either direction; only applied to longer sentences.
            if (fLength > MIN_RATIO_LENGTH && targetFtoERatio != 0.0f) {
                float ratio = pair.ratioFtoE();
                if (ratio > RATIO_TOLERANCE * targetFtoERatio || ratio * RATIO_TOLERANCE < targetFtoERatio) {
                    continue;
                }
            }

            if (selected.containsKey(lowercased)) {
                continue;
            }

            boolean useSentence = false;
            for (Phrase ngram : pair.getF().getSubPhrases(this.maxN)) {
                Integer count = this.ngramCounts.get(ngram);
                // A null count means the n-gram was not registered at construction.
                if (count != null && count.intValue() < this.targetCount) {
                    useSentence = true;
                    this.ngramCounts.put(ngram, Integer.valueOf(count.intValue() + 1));
                }
            }
            if (useSentence) {
                selected.put(lowercased, pair);
            }
        }
    }

    /**
     * Command-line entry point; delegates to {@code SubsamplerCLI}.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        new SubsamplerCLI().runMain(strArr);
    }
}
