package uk.ac.shef.dcs.sti.TODO.gs;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.simmetrics.StringMetric;
import org.simmetrics.metrics.StringMetrics;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.kbsearch.freebase.FreebaseQueryProxy;
import uk.ac.shef.dcs.sti.core.model.TCellAnnotation;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.core.subjectcol.TColumnFeatureGenerator;
import uk.ac.shef.dcs.sti.parser.table.TableParserLimayeDataset;
import uk.ac.shef.dcs.sti.parser.table.TableParserWikipedia;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorWikipedia;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerDiscardIrregularRows;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.util.DataTypeClassifier;
import uk.ac.shef.dcs.sti.util.FileUtils;
import uk.ac.shef.dcs.util.SolrCache;
import uk.ac.shef.dcs.websearch.WebSearchFactory;
import uk.ac.shef.dcs.websearch.bing.v2.BingSearchResultParser;

/* loaded from: input_file:uk/ac/shef/dcs/sti/TODO/gs/GSBuilder_Limaye_Wikitables_with_Ref.class */
public class GSBuilder_Limaye_Wikitables_with_Ref extends GSBuilder_Limaye_Wikitables {
    private StringMetric stringSim = StringMetrics.levenshtein();

    public GSBuilder_Limaye_Wikitables_with_Ref(FreebaseQueryProxy freebaseQueryProxy, SolrCache solrCache, TableParserWikipedia tableParserWikipedia, String str) {
        this.queryHelper = freebaseQueryProxy;
        this.solrCache = solrCache;
        this.xtractor = tableParserWikipedia;
        try {
            this.searcher = new WebSearchFactory().createInstance(str);
        } catch (Exception e) {
            e.printStackTrace();
        }
        this.parser = new BingSearchResultParser();
    }

    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        String str = strArr[0];
        String str2 = strArr[1];
        String str3 = strArr[3];
        int intValue = new Integer(strArr[4]).intValue();
        HashMap hashMap = new HashMap();
        if (strArr.length == 6) {
            Iterator<String> it = FileUtils.readList(strArr[5], false).iterator();
            while (it.hasNext()) {
                String[] split = it.next().split("\t\t\t");
                hashMap.put(split[0].trim(), split[1].trim());
            }
        }
        EmbeddedSolrServer embeddedSolrServer = null;
        GSBuilder_Limaye_Wikitables_with_Ref gSBuilder_Limaye_Wikitables_with_Ref = new GSBuilder_Limaye_Wikitables_with_Ref(null, new SolrCache((EmbeddedSolrServer) null), new TableParserWikipedia(new TableNormalizerDiscardIrregularRows(true), new TableHODetectorByHTMLTag(), new TableObjCreatorWikipedia(false, false), new TableValidatorGeneric()), "nKegYOqCMXV0rjUHzKADJinbJ9NrkMyBMqm9h3X9vAo");
        int i = 0;
        File[] listFiles = new File(str).listFiles();
        ArrayList<File> arrayList = new ArrayList(Arrays.asList(listFiles));
        Collections.sort(arrayList);
        System.out.println(listFiles.length);
        for (File file : arrayList) {
            try {
                i++;
                if (intValue <= i) {
                    if (file.getName().startsWith("file")) {
                        System.err.println("ERROR:SKIPPED_NON_WIKI:" + file.getName());
                    } else {
                        System.out.println(i + "_" + file);
                        Table table = new TableParserLimayeDataset().extract(file.toString(), null).get(0);
                        String str4 = null;
                        if (hashMap.size() > 0) {
                            Iterator it2 = hashMap.keySet().iterator();
                            while (true) {
                                if (!it2.hasNext()) {
                                    break;
                                }
                                String str5 = (String) it2.next();
                                if (file.toString().endsWith(str5)) {
                                    str4 = fetchWikipediaWebpage((String) hashMap.get(str5));
                                    break;
                                }
                            }
                            if (str4 == null) {
                            }
                        } else {
                            int indexOf = file.getName().indexOf(".htm");
                            if (indexOf == -1) {
                                indexOf = file.getName().length();
                            }
                            String substring = file.getName().substring(0, indexOf);
                            if (substring.indexOf("~") != -1) {
                                System.err.println("ERROR:~DETECTED:" + file.getName());
                            } else {
                                try {
                                    str4 = fetchWikipediaWebpage(wikipediaURL + substring);
                                } catch (Exception e) {
                                    substring = parseToWikipediaTitle(file.toString());
                                    try {
                                        str4 = fetchWikipediaWebpage(wikipediaURL + substring);
                                    } catch (Exception e2) {
                                    }
                                }
                                if (str4 == null || str4.length() == 0) {
                                    str4 = gSBuilder_Limaye_Wikitables_with_Ref.tryWebSearch(substring);
                                    if (str4 == null) {
                                        System.err.println("ERROR:NO_WIKIPAGE:" + file.getName());
                                    }
                                }
                            }
                        }
                        Node findMatchingTable = findMatchingTable(table, extractWikiTables(StringUtils.stripAccents(str4), file.toURI().toString()));
                        if (findMatchingTable == null) {
                            System.err.println("ERROR:NO_TABLE:" + file.getName());
                        } else {
                            Table process_wikitable = gSBuilder_Limaye_Wikitables_with_Ref.process_wikitable(findMatchingTable, file.toURI().toString(), file.toURI().toString(), (TContext[]) table.getContexts().toArray(new TContext[0]));
                            if (process_wikitable == null) {
                                System.err.println("ERROR:IRREGULAR_TABLE:" + file.getName());
                            } else {
                                gSBuilder_Limaye_Wikitables_with_Ref.annotateTable(process_wikitable, table, str2 + "/" + file.getName() + ".cell.keys");
                                gSBuilder_Limaye_Wikitables_with_Ref.saveAsLimaye(process_wikitable, str2 + "/" + file.getName());
                            }
                        }
                    }
                }
            } catch (Exception e3) {
                System.err.println("ERROR:UNKNOWN:" + file.getName());
                e3.printStackTrace();
            }
        }
        embeddedSolrServer.close();
        System.exit(0);
    }

    private void annotateTable(Table table, Table table2, String str) throws IOException, TransformerException, ParserConfigurationException {
        String createCellAnnotation;
        StringBuilder sb = new StringBuilder();
        TColumnFeatureGenerator.setColumnDataTypes(table2);
        for (int i = 0; i < table2.getNumCols(); i++) {
            DataTypeClassifier.DataType type = table2.getColumnHeader(i).getTypes().get(0).getType();
            if (!type.equals(DataTypeClassifier.DataType.NUMBER) && !type.equals(DataTypeClassifier.DataType.DATE) && !type.equals(DataTypeClassifier.DataType.ORDERED_NUMBER) && !type.equals(DataTypeClassifier.DataType.LONG_TEXT) && !type.equals(DataTypeClassifier.DataType.LONG_STRING)) {
                int findMatchingColumn = findMatchingColumn(table2, i, table);
                if (findMatchingColumn == -1) {
                    System.err.println("\tERROR:no matching column=" + i + "," + table2.getColumnHeader(i).getHeaderText());
                } else {
                    for (int i2 = 0; i2 < table2.getNumRows(); i2++) {
                        String stripAccents = StringUtils.stripAccents(table2.getContentCell(i2, i).getText());
                        double d = 0.0d;
                        TCellAnnotation tCellAnnotation = null;
                        for (int i3 = 0; i3 < table.getNumRows(); i3++) {
                            for (TCellAnnotation tCellAnnotation2 : table.getTableAnnotations().getContentCellAnnotations(i3, findMatchingColumn)) {
                                double compare = this.stringSim.compare(stripAccents, tCellAnnotation2.getTerm());
                                if (compare > d) {
                                    d = compare;
                                    tCellAnnotation = tCellAnnotation2;
                                }
                            }
                        }
                        if (d > 0.9d) {
                            if (d != 1.0d) {
                                System.out.println("\t\t\tNoPerfectMatch:" + stripAccents + "(limaye)," + tCellAnnotation.getTerm() + "(wiki)");
                            }
                            String id = tCellAnnotation.getAnnotation().getId();
                            if (id.startsWith("/wiki/")) {
                                id = id.substring(6).trim();
                            }
                            String queryWikipediaPageid = queryWikipediaPageid(id, this.solrCache);
                            if (queryWikipediaPageid != null && (createCellAnnotation = createCellAnnotation(queryWikipediaPageid, this.solrCache)) != null && createCellAnnotation.length() > 0) {
                                sb.append(i2 + "," + i + "=").append(createCellAnnotation).append("\n");
                            }
                        }
                    }
                }
            }
        }
        PrintWriter printWriter = new PrintWriter(str);
        printWriter.println(sb);
        printWriter.close();
    }

    private int findMatchingColumn(Table table, int i, Table table2) {
        double d = 0.0d;
        int i2 = -1;
        String stripAccents = StringUtils.stripAccents(toString_Column(table, i));
        for (int i3 = 0; i3 < table2.getNumCols(); i3++) {
            double computeOverlap = computeOverlap(stripAccents, toString_Column_Annotation(table2, i3));
            if (computeOverlap > d) {
                d = computeOverlap;
                i2 = i3;
            }
        }
        return i2;
    }

    private String toString_Column(Table table, int i) {
        String str = "";
        for (int i2 = 0; i2 < table.getNumRows(); i2++) {
            str = str + table.getContentCell(i2, i).getText() + " ";
        }
        return str;
    }

    private String toString_Column_Annotation(Table table, int i) {
        String str = "";
        for (int i2 = 0; i2 < table.getNumRows(); i2++) {
            for (TCellAnnotation tCellAnnotation : table.getTableAnnotations().getContentCellAnnotations(i2, i)) {
                str = str + tCellAnnotation.getTerm() + " ";
            }
        }
        return str;
    }
}
