package dragon.config;

import dragon.nlp.extract.EngDocumentParser;
import dragon.nlp.tool.Lemmatiser;
import dragon.nlp.tool.Tagger;
import dragon.nlp.tool.xtract.SimpleXtract;
import dragon.onlinedb.CollectionReader;
import dragon.util.FileUtil;
import dragon.util.SortedArray;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.Date;
import org.apache.hadoop.fs.CommonConfigurationKeys;

/* loaded from: input_file:dragon/config/PhraseExtractAppConfig.class */
/**
 * Runs phrase extraction with {@link SimpleXtract} as described by a
 * "phraseextractapp" configuration node and, when a vocabulary file is
 * configured, writes a vocabulary of the post-processed multi-word phrases.
 */
public class PhraseExtractAppConfig {
    /**
     * Command-line entry point.
     *
     * @param strArr exactly two arguments: the configuration XML file and the
     *               integer id of the phrase extraction application to run
     */
    public static void main(String[] strArr) {
        if (strArr.length != 2) {
            System.out.println("Please input two parameters: configuration xml file and phrase extraction applicaiton id");
            return;
        }
        int appId;
        try {
            appId = Integer.parseInt(strArr[1]);
        } catch (NumberFormatException e) {
            // Report a bad id instead of dying with a raw stack trace.
            System.out.println("The phrase extraction application id must be an integer: " + strArr[1]);
            return;
        }
        ConfigureNode appNode = new ConfigUtil().getConfigureNode(new BasicConfigureNode(strArr[0]), "phraseextractapp", appId);
        if (appNode == null) {
            return;
        }
        new PhraseExtractAppConfig().phraseExtract(appNode);
    }

    /**
     * Reads the extraction settings from the given node and runs the extraction.
     * When "indexing" is false only the extraction step runs; otherwise the
     * configured collection readers are indexed first.
     *
     * @param configureNode node supplying the keys maxspan, indexfolder,
     *                      phrasefile, indexing, strength, peakzscore, spread,
     *                      expandratio, vobfile, maxphraselength and, when
     *                      indexing, lemmatiser, tagger, collectionreader and
     *                      notworddelimitor
     */
    public void phraseExtract(ConfigureNode configureNode) {
        int maxSpan = configureNode.getInt("maxspan", 4);
        String indexFolder = configureNode.getString("indexfolder");
        String phraseFile = configureNode.getString("phrasefile");
        boolean indexing = configureNode.getBoolean("indexing", true);
        double strength = configureNode.getDouble("strength", 1.0d);
        double peakZScore = configureNode.getDouble("peakzscore", 1.0d);
        // The spread threshold defaults to the maximum span.
        double spread = configureNode.getDouble("spread", maxSpan);
        double expandRatio = configureNode.getDouble("expandratio", 0.75d);
        String vobFile = configureNode.getString("vobfile", null);
        int maxPhraseLength = configureNode.getInt("maxphraselength", 4);

        if (!indexing) {
            phraseExtract(indexFolder, maxSpan, strength, peakZScore, spread, expandRatio, maxPhraseLength, phraseFile, vobFile);
            return;
        }

        Lemmatiser lemmatiser = new LemmatiserConfig().getLemmatiser(configureNode, configureNode.getInt("lemmatiser", 0));
        Tagger tagger = new TaggerConfig().getTagger(configureNode, configureNode.getInt("tagger", 0));

        // NOTE(review): a Hadoop constant used as the id separator looks like a
        // decompiler artifact (a literal replaced by an equal-valued constant) -
        // confirm its value and consider restoring the literal.
        String[] readerIds = configureNode.getString("collectionreader").split(CommonConfigurationKeys.NFS_EXPORTS_ALLOWED_HOSTS_SEPARATOR);
        CollectionReaderConfig readerConfig = new CollectionReaderConfig();
        CollectionReader[] readers = new CollectionReader[readerIds.length];
        for (int k = 0; k < readerIds.length; k++) {
            // trim() tolerates spaces around the separator, e.g. "1; 2".
            readers[k] = readerConfig.getCollectionReader(configureNode, Integer.parseInt(readerIds[k].trim()));
        }

        String wordDelimitor = getWordDelimitor(configureNode.getString("notworddelimitor", "."));
        phraseExtract(indexFolder, maxSpan, readers, lemmatiser, tagger, wordDelimitor, strength, peakZScore, spread, expandRatio, maxPhraseLength, phraseFile, vobFile);
    }

    /**
     * Extracts phrases from an existing index and optionally writes a vocabulary.
     *
     * @param indexFolder     folder handed to {@link SimpleXtract}
     * @param maxSpan         maximum span handed to {@link SimpleXtract}
     * @param strength        strength threshold
     * @param peakZScore      peak z-score threshold
     * @param spread          spread threshold
     * @param expandRatio     expand ratio
     * @param maxPhraseLength maximum number of tokens of a vocabulary phrase
     * @param phraseFile      file the extracted phrases are written to
     * @param vobFile         vocabulary file to write, or null to skip it
     */
    public void phraseExtract(String indexFolder, int maxSpan, double strength, double peakZScore, double spread, double expandRatio, int maxPhraseLength, String phraseFile, String vobFile) {
        // SimpleXtract.extract takes spread before the peak z-score.
        new SimpleXtract(maxSpan, indexFolder).extract(strength, spread, peakZScore, expandRatio, phraseFile);
        if (vobFile != null) {
            generateVocabulary(phraseFile, maxPhraseLength, vobFile);
        }
    }

    /**
     * Indexes the given collections, extracts phrases and optionally writes a
     * vocabulary.
     *
     * @param indexFolder         folder handed to {@link SimpleXtract}
     * @param maxSpan             maximum span handed to {@link SimpleXtract}
     * @param collectionReaderArr collections to index
     * @param lemmatiser          lemmatiser used while indexing
     * @param tagger              tagger used while indexing
     * @param wordDelimitor       word delimiter characters used while indexing
     * @param strength            strength threshold
     * @param peakZScore          peak z-score threshold
     * @param spread              spread threshold
     * @param expandRatio         expand ratio
     * @param maxPhraseLength     maximum number of tokens of a vocabulary phrase
     * @param phraseFile          file the extracted phrases are written to
     * @param vobFile             vocabulary file to write, or null to skip it
     */
    public void phraseExtract(String indexFolder, int maxSpan, CollectionReader[] collectionReaderArr, Lemmatiser lemmatiser, Tagger tagger, String wordDelimitor, double strength, double peakZScore, double spread, double expandRatio, int maxPhraseLength, String phraseFile, String vobFile) {
        SimpleXtract xtract = new SimpleXtract(maxSpan, indexFolder);
        xtract.index(collectionReaderArr, tagger, lemmatiser, wordDelimitor);
        // SimpleXtract.extract takes spread before the peak z-score.
        xtract.extract(strength, spread, peakZScore, expandRatio, phraseFile);
        if (vobFile != null) {
            generateVocabulary(phraseFile, maxPhraseLength, vobFile);
        }
    }

    /**
     * Post-processes the phrases in a phrase file and writes those with at most
     * {@code maxPhraseLength} tokens to a vocabulary file.
     *
     * @param phraseFile      phrase file to read
     * @param maxPhraseLength maximum number of tokens of a vocabulary phrase
     * @param vobFile         vocabulary file to write
     */
    public void generateVocabulary(String phraseFile, int maxPhraseLength, String vobFile) {
        generateVocabulary(postProcessExtractedPhrase(phraseFile), maxPhraseLength, vobFile);
    }

    /**
     * Writes the vocabulary file: a header line "count TAB minTokens TAB
     * maxTokens" followed by one "phrase TAB index" line per kept phrase, where
     * index is the phrase's position among the kept phrases. Errors are printed
     * and not propagated.
     */
    private void generateVocabulary(ArrayList phrases, int maxPhraseLength, String vobFile) {
        if (phrases == null) {
            // Post-processing failed and already printed its error; do not
            // create or truncate the vocabulary file for nothing.
            System.err.println("Vocabulary file not written because the phrase list is unavailable: " + vobFile);
            return;
        }
        BufferedWriter writer = null;
        try {
            System.out.println(new Date().toString() + " Printing vocabulary file...");
            writer = FileUtil.getTextWriter(vobFile);

            // When no phrase qualifies, the minimum stays at Integer.MAX_VALUE
            // and is written as such in the header.
            int minTokens = Integer.MAX_VALUE;
            int maxTokens = 0;
            ArrayList kept = new ArrayList(phrases.size());
            for (int k = 0; k < phrases.size(); k++) {
                int tokens = getTokenNum((String) phrases.get(k));
                if (tokens > maxPhraseLength) {
                    continue;
                }
                kept.add(phrases.get(k));
                maxTokens = Math.max(maxTokens, tokens);
                minTokens = Math.min(minTokens, tokens);
            }

            writer.write(kept.size() + "\t" + minTokens + "\t" + maxTokens + "\n");
            for (int k = 0; k < kept.size(); k++) {
                writer.write((String) kept.get(k));
                writer.write('\t');
                writer.write(String.valueOf(k));
                writer.write('\n');
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failure while writing cannot leak the handle.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads a phrase file, discards its first line, and collects the
     * post-processed text before the first tab of every remaining line, keeping
     * only phrases that still contain a space.
     *
     * @return the collected phrases, or null when reading failed
     */
    private SortedArray postProcessExtractedPhrase(String phraseFile) {
        BufferedReader reader = null;
        try {
            System.out.println(new Date().toString() + " Postprocessing Extracted Phrases...");
            SortedArray phrases = new SortedArray();
            reader = FileUtil.getTextReader(phraseFile);
            // The first line is skipped - presumably a header; confirm against
            // the format SimpleXtract writes.
            reader.readLine();
            String line;
            while ((line = reader.readLine()) != null) {
                int tab = line.indexOf('\t');
                String phrase = postProcessPhrase(tab >= 0 ? line.substring(0, tab) : line);
                // Only multi-word phrases are kept.
                if (phrase.indexOf(' ') > 0) {
                    phrases.add(phrase);
                }
            }
            return phrases;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // The reader was previously never closed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Normalises a phrase: hyphens, underscores and apostrophes become spaces,
     * runs of spaces are collapsed in three passes (very long runs may not
     * collapse fully), leading person titles are removed and the result is
     * lower-cased.
     */
    private String postProcessPhrase(String str) {
        String spaced = str.replace('-', ' ').replace('_', ' ').replace('\'', ' ');
        String collapsed = spaced.replaceAll("   ", " ").replaceAll("  ", " ").replaceAll("  ", " ");
        return removePersonTitle(collapsed).toLowerCase();
    }

    /**
     * Trims the phrase and repeatedly drops a leading token that ends with a
     * period and contains no other period (for example "dr."), so tokens such
     * as "u.s." are left alone.
     */
    private String removePersonTitle(String str) {
        String phrase = str.trim();
        while (true) {
            int space = phrase.indexOf(' ');
            boolean leadingTitle = space > 0
                    && phrase.charAt(space - 1) == '.'
                    && phrase.lastIndexOf('.', space - 2) < 0;
            if (!leadingTitle) {
                return phrase;
            }
            phrase = phrase.substring(space + 1).trim();
        }
    }

    /** Returns the number of whitespace characters in the phrase plus one. */
    private int getTokenNum(String str) {
        int whitespace = 0;
        for (int k = 0; k < str.length(); k++) {
            if (Character.isWhitespace(str.charAt(k))) {
                whitespace++;
            }
        }
        return whitespace + 1;
    }

    /**
     * Returns the default word delimiters of {@link EngDocumentParser} without
     * the characters listed in {@code str}.
     *
     * @param str characters that must not act as word delimiters; null or empty
     *            keeps the full default set
     */
    private String getWordDelimitor(String str) {
        // Was "&&", which dereferenced a null argument and never matched otherwise.
        if (str == null || str.length() == 0) {
            return EngDocumentParser.defWordDelimitor;
        }
        String defaults = EngDocumentParser.defWordDelimitor;
        StringBuffer remaining = new StringBuffer();
        for (int k = 0; k < defaults.length(); k++) {
            char c = defaults.charAt(k);
            if (str.indexOf(c) < 0) {
                remaining.append(c);
            }
        }
        return remaining.toString();
    }
}
