package uk.ac.shef.dcs.jate.util;

import dragon.nlp.tool.lemmatiser.EngLemmatiser;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.metrics2.sink.ganglia.AbstractGangliaSink;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.json.simple.parser.ParseException;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.app.App;
import uk.ac.shef.dcs.jate.eval.ATEResultLoader;
import uk.ac.shef.dcs.jate.eval.Scorer;
import uk.ac.shef.dcs.jate.nlp.Lemmatiser;

/* loaded from: input_file:uk/ac/shef/dcs/jate/util/ScienceIECorpusParser.class */
/**
 * Helpers for working with a folder of tab-separated {@code .ann} annotation files
 * together with ranked term lists produced by JATE.
 *
 * <p>An annotation line is only considered when it has at least three tab-separated
 * columns; the third column is treated as the annotated text.
 */
public class ScienceIECorpusParser {
    /** File-name suffix of the annotation files that are processed. */
    private static final String ANN_SUFFIX = ".ann";

    /** Charset used for reading and writing annotation files (instead of the platform default). */
    private static final String ENCODING = "UTF-8";

    /**
     * Collects the distinct annotated texts of every {@code .ann} file in a folder and
     * writes them, sorted in natural string order, one per line.
     *
     * @param str  folder containing the {@code .ann} files
     * @param str2 path of the file to write the merged, sorted list to
     * @throws IOException if the folder cannot be listed, a file cannot be read, or the
     *                     output cannot be written
     */
    public static void mergeGS(String str, String str2) throws IOException {
        Set<String> distinctTerms = new HashSet<>();
        for (File annFile : listAnnFiles(str)) {
            for (String line : FileUtils.readLines(annFile, ENCODING)) {
                String text = annotatedText(line);
                if (text != null) {
                    distinctTerms.add(text);
                }
            }
        }
        List<String> sortedTerms = new ArrayList<>(distinctTerms);
        Collections.sort(sortedTerms);

        // try-with-resources: the writer is closed even if writing fails part-way.
        try (PrintWriter out = new PrintWriter(str2, ENCODING)) {
            for (String term : sortedTerms) {
                out.println(term);
            }
            failOnWriteError(out, str2);
        }
    }

    /**
     * Writes, for every {@code .ann} file in {@code str2}, a file of the same name into
     * {@code str3} that contains:
     * <ol>
     *   <li>each original annotation line whose normalised text is among the top-N
     *       ranked terms, and</li>
     *   <li>one generated line {@code T0<TAB>Process <start> <end><TAB><term>} per
     *       occurrence of every other top-N term found in the term vector of the
     *       matching Solr document.</li>
     * </ol>
     *
     * <p>A Solr document matches an annotation file when the last path segment of its
     * {@code id} field, with the extension replaced by {@code .ann}, equals the file name.
     *
     * @param str            JSON file with the ranked terms (read via {@code ATEResultLoader.loadFromJSON})
     * @param i              number of top-ranked terms to use; when smaller than 1 or larger than
     *                       the ranked list, it is replaced by the smaller of the number of distinct
     *                       normalised annotated terms and the ranked list size
     * @param str2           folder containing the source {@code .ann} files
     * @param str3           output folder (created if necessary)
     * @param lemmatiser     lemmatiser passed to {@code Scorer.normaliseTerm}
     * @param solrCore       core whose searcher supplies the documents and term vectors
     * @param jATEProperties supplies the name of the n-gram field holding the term vectors
     * @throws IOException           if a folder cannot be listed or a file cannot be read or written
     * @throws ParseException        if the ranked-term JSON cannot be parsed
     * @throws IllegalStateException if an annotation file has no matching Solr document
     */
    public static void transformToKEAOutput(String str, int i, String str2, String str3, Lemmatiser lemmatiser, SolrCore solrCore, JATEProperties jATEProperties) throws IOException, ParseException {
        List<String> rankedTerms = ATEResultLoader.loadFromJSON(str);
        List<File> annFiles = listAnnFiles(str2);

        int topN = i;
        if (topN > rankedTerms.size() || topN < 1) {
            Set<String> annotatedTerms = new HashSet<>();
            for (File annFile : annFiles) {
                for (String line : FileUtils.readLines(annFile, ENCODING)) {
                    String term = normalisedTerm(line, lemmatiser);
                    if (term != null) {
                        annotatedTerms.add(term);
                    }
                }
            }
            topN = Math.min(annotatedTerms.size(), rankedTerms.size());
            System.out.println("topn=" + topN);
        }
        // A hash set keeps the many membership checks below constant-time.
        Set<String> selectedTerms = new HashSet<>(rankedTerms.subList(0, topN));

        // getSearcher() hands out a reference-counted searcher; it must be released again.
        RefCounted<SolrIndexSearcher> searcherRef = solrCore.getSearcher();
        try {
            SolrIndexSearcher searcher = searcherRef.get();
            Map<String, Integer> docIdByAnnName = indexDocIdsByAnnName(searcher);
            new File(str3).mkdirs();
            for (File annFile : annFiles) {
                Integer docId = docIdByAnnName.get(annFile.getName());
                if (docId == null) {
                    throw new IllegalStateException("No Solr document found for annotation file " + annFile.getName());
                }
                writeAnnotations(annFile, str3, docId.intValue(), selectedTerms, lemmatiser, searcher, jATEProperties);
            }
        } finally {
            searcherRef.decref();
        }
    }

    /**
     * Command-line entry point: runs {@link #transformToKEAOutput} once per file found
     * in the ranked-term folder, using an embedded Solr server.
     *
     * <p>Arguments: {@code [0]} argument for the {@code EngLemmatiser} constructor,
     * {@code [1]} folder of ranked-term JSON files, {@code [2]} output root (one sub-folder
     * per JSON file is created), {@code [3]} folder of {@code .ann} files,
     * {@code [4]} Solr home, {@code [5]} Solr core name, {@code [6]} JATE properties file.
     *
     * @param strArr the seven command-line arguments described above
     * @throws IOException    on I/O failure
     * @throws ParseException if a ranked-term JSON file cannot be parsed
     * @throws JATEException  if the JATE properties cannot be loaded
     */
    public static void main(String[] strArr) throws IOException, ParseException, JATEException {
        if (strArr == null || strArr.length < 7) {
            System.err.println("Usage: ScienceIECorpusParser <lemmatiserResource> <rankedTermsDir> <outputDir> <annDir> <solrHome> <coreName> <jateProperties>");
            System.exit(1);
            return;
        }
        Lemmatiser lemmatiser = new Lemmatiser(new EngLemmatiser(strArr[0], false, false));
        String solrHome = strArr[4];
        String coreName = strArr[5];

        try (EmbeddedSolrServer server = new EmbeddedSolrServer(Paths.get(solrHome), coreName)) {
            JATEProperties jateProperties = App.getJateProperties(strArr[6]);
            File[] rankedTermFiles = new File(strArr[1]).listFiles();
            if (rankedTermFiles == null) {
                throw new IOException("Cannot list directory: " + strArr[1]);
            }
            // getCore() increments the core's reference count; acquire it once and close it afterwards.
            try (SolrCore core = server.getCoreContainer().getCore(coreName)) {
                if (core == null) {
                    throw new IllegalArgumentException("Solr core not found: " + coreName);
                }
                for (File rankedTermFile : rankedTermFiles) {
                    File outputDir = new File(strArr[2] + "/" + rankedTermFile.getName() + "/");
                    outputDir.mkdirs();
                    System.out.println(outputDir);
                    transformToKEAOutput(rankedTermFile.toString(), 0, strArr[3], outputDir.toString(), lemmatiser, core, jateProperties);
                }
            }
        }
        System.exit(0);
    }

    /** Lists the {@code .ann} files directly inside {@code dir}; fails if the folder cannot be listed. */
    private static List<File> listAnnFiles(String dir) throws IOException {
        File[] entries = new File(dir).listFiles();
        if (entries == null) {
            throw new IOException("Cannot list directory: " + dir);
        }
        List<File> annFiles = new ArrayList<>();
        for (File entry : entries) {
            if (entry.getName().endsWith(ANN_SUFFIX)) {
                annFiles.add(entry);
            }
        }
        return annFiles;
    }

    /** Returns the trimmed third tab-separated column of {@code line}, or {@code null} if there are fewer than three columns. */
    private static String annotatedText(String line) {
        String[] columns = line.split("\t");
        return columns.length >= 3 ? columns[2].trim() : null;
    }

    /** Returns the normalised annotated text of {@code line}, or {@code null} if the line has no text or the result is shorter than two characters. */
    private static String normalisedTerm(String line, Lemmatiser lemmatiser) {
        String text = annotatedText(line);
        if (text == null) {
            return null;
        }
        String normalised = Scorer.normaliseTerm(text, lemmatiser);
        return normalised.length() > 1 ? normalised : null;
    }

    /** Maps "last path segment of the document id, extension replaced by .ann" to the Lucene document number. */
    private static Map<String, Integer> indexDocIdsByAnnName(SolrIndexSearcher searcher) throws IOException {
        Map<String, Integer> docIdByAnnName = new HashMap<>();
        int maxDoc = searcher.maxDoc();
        for (int docId = 0; docId < maxDoc; docId++) {
            String id = searcher.doc(docId).get("id");
            if (id == null) {
                continue; // no id stored: cannot be matched to a file
            }
            int slash = id.lastIndexOf('/');
            int dot = id.lastIndexOf('.');
            // Only strip an extension that belongs to the last path segment.
            String baseName = dot > slash ? id.substring(slash + 1, dot) : id.substring(slash + 1);
            docIdByAnnName.put(baseName + ANN_SUFFIX, Integer.valueOf(docId));
        }
        return docIdByAnnName;
    }

    /** Writes the output annotation file for one source {@code .ann} file. */
    private static void writeAnnotations(File annFile, String outputDir, int docId, Set<String> selectedTerms,
            Lemmatiser lemmatiser, SolrIndexSearcher searcher, JATEProperties properties) throws IOException {
        String outputPath = outputDir + File.separator + annFile.getName();
        try (PrintWriter out = new PrintWriter(outputPath, ENCODING)) {
            // Terms already covered by an original annotation line are not generated again.
            Set<String> alreadyAnnotated = new HashSet<>();
            for (String line : FileUtils.readLines(annFile, ENCODING)) {
                String term = normalisedTerm(line, lemmatiser);
                if (term != null && selectedTerms.contains(term)) {
                    alreadyAnnotated.add(term);
                    out.println(line);
                }
            }
            System.out.println("\t" + docId + "=" + annFile.getName());
            try {
                appendIndexedTerms(out, docId, selectedTerms, alreadyAnnotated, searcher, properties);
            } catch (JATEException e) {
                // Best effort: keep the lines written so far and continue with the next file.
                System.err.println("Failed to read term vector for " + annFile.getName());
                e.printStackTrace();
            }
            failOnWriteError(out, outputPath);
        }
    }

    /** Appends one generated annotation line per occurrence of each selected, not yet annotated term in the document's term vector. */
    private static void appendIndexedTerms(PrintWriter out, int docId, Set<String> selectedTerms, Set<String> alreadyAnnotated,
            SolrIndexSearcher searcher, JATEProperties properties) throws IOException, JATEException {
        Terms termVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), searcher);
        if (termVector == null) {
            return;
        }
        TermsEnum termsEnum = termVector.iterator();
        for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
            if (termBytes.length == 0) {
                continue;
            }
            String term = termBytes.utf8ToString();
            if (alreadyAnnotated.contains(term) || !selectedTerms.contains(term)) {
                continue;
            }
            // OFFSETS (value 56) requests frequencies, positions and character offsets.
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
            postings.nextDoc();
            int occurrences = postings.freq();
            for (int occurrence = 0; occurrence < occurrences; occurrence++) {
                postings.nextPosition(); // must be called before reading the offsets
                out.println("T0\t" + "Process " + postings.startOffset() + " " + postings.endOffset() + "\t" + term);
            }
        }
    }

    /** {@link PrintWriter} swallows I/O errors; turn a recorded error into an exception. */
    private static void failOnWriteError(PrintWriter out, String path) throws IOException {
        if (out.checkError()) {
            throw new IOException("Failed to write " + path);
        }
    }
}
