package uk.ac.shef.dcs.sti.TODO.gs;

import com.ctc.wstx.cfg.XmlConsts;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.jena.sparql.ARQConstants;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.tika.io.IOUtils;
import org.apache.xml.serializer.SerializerConstants;
import org.openrdf.http.protocol.Protocol;
import org.restlet.engine.Engine;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.kbsearch.freebase.FreebaseQueryProxy;
import uk.ac.shef.dcs.kbsearch.model.Entity;
import uk.ac.shef.dcs.sti.STIEnum;
import uk.ac.shef.dcs.sti.core.model.TCell;
import uk.ac.shef.dcs.sti.core.model.TCellAnnotation;
import uk.ac.shef.dcs.sti.core.model.TColumnHeader;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.core.subjectcol.TColumnFeatureGenerator;
import uk.ac.shef.dcs.sti.parser.table.TableParserLimayeDataset;
import uk.ac.shef.dcs.sti.parser.table.TableParserWikipedia;
import uk.ac.shef.dcs.sti.parser.table.creator.TableObjCreatorWikipedia;
import uk.ac.shef.dcs.sti.parser.table.hodetector.TableHODetectorByHTMLTag;
import uk.ac.shef.dcs.sti.parser.table.normalizer.TableNormalizerDiscardIrregularRows;
import uk.ac.shef.dcs.sti.parser.table.validator.TableValidatorGeneric;
import uk.ac.shef.dcs.sti.util.CollectionUtils;
import uk.ac.shef.dcs.sti.util.DataTypeClassifier;
import uk.ac.shef.dcs.sti.util.FileUtils;
import uk.ac.shef.dcs.util.SolrCache;
import uk.ac.shef.dcs.websearch.WebSearch;
import uk.ac.shef.dcs.websearch.WebSearchFactory;
import uk.ac.shef.dcs.websearch.WebSearchResultDoc;
import uk.ac.shef.dcs.websearch.bing.v2.BingSearchResultParser;

/* loaded from: input_file:uk/ac/shef/dcs/sti/TODO/gs/GSBuilder_Limaye_Wikitables.class */
public class GSBuilder_Limaye_Wikitables {
    // Knowledge-base proxy; createCellAnnotation asks it for the ids associated with a Wikipedia page id.
    protected FreebaseQueryProxy queryHelper;
    // Cache consulted before, and written after, the web-search, Wikipedia and knowledge-base lookups.
    protected SolrCache solrCache;
    // Wikipedia table parser; process_wikitable delegates to its extractTable method.
    protected TableParserWikipedia xtractor;
    // Web search client created in the constructor and used by tryWebSearch.
    protected WebSearch searcher;
    // Turns the raw output of searcher.search(...) into a list of result documents.
    protected BingSearchResultParser parser;
    // Prefix that main prepends to a page title to build the URL it fetches.
    protected static String wikipediaURL = "http://en.wikipedia.org/wiki/";
    // Row limit: main only calls annotateTable on parsed tables with at most this many rows.
    protected static int maxRows = 200;
    // Class-wide java.util.logging logger.
    protected static Logger log = Logger.getLogger(GSBuilder_Limaye_Wikitables.class.getName());

    /**
     * Creates a builder wired to its collaborators.
     *
     * @param freebaseQueryProxy   knowledge-base proxy stored in {@code queryHelper} (may be null; it is
     *                             only dereferenced by {@code createCellAnnotation})
     * @param solrCache            cache used by the lookup methods
     * @param tableParserWikipedia parser used by {@code process_wikitable}
     * @param str                  argument handed to {@code WebSearchFactory.createInstance} to build the
     *                             web search client
     * @throws IOException declared for callers; not thrown by the visible statements
     */
    public GSBuilder_Limaye_Wikitables(FreebaseQueryProxy freebaseQueryProxy, SolrCache solrCache, TableParserWikipedia tableParserWikipedia, String str) throws IOException {
        this.queryHelper = freebaseQueryProxy;
        this.solrCache = solrCache;
        this.xtractor = tableParserWikipedia;
        try {
            this.searcher = new WebSearchFactory().createInstance(str);
        } catch (Exception e) {
            // Best effort: the builder stays usable without a searcher; only tryWebSearch needs it.
            e.printStackTrace();
        }
        this.parser = new BingSearchResultParser();
    }

    /**
     * Creates a builder without assigning any collaborator; the instance fields
     * (queryHelper, solrCache, xtractor, searcher, parser) are left at their default null values.
     */
    public GSBuilder_Limaye_Wikitables() {
    }

    /**
     * Command-line entry point: for every file in an input folder, fetches a Wikipedia page, looks for
     * the table on that page that best matches the file's table, and writes the annotated table plus a
     * {@code .cell.keys} file into the output folder. Progress goes to stdout; per-file problems are
     * reported on stderr as {@code ERROR:<reason>:<file>} or {@code WARNING:<reason>:<file>} lines.
     *
     * <p>Arguments:
     * <ul>
     *   <li>{@code [0]} input folder whose files are processed in sorted order</li>
     *   <li>{@code [1]} prefix joined with {@code "/" + fileName} and passed as the second argument of
     *       {@code TableParserLimayeDataset.extract}</li>
     *   <li>{@code [2]} output folder</li>
     *   <li>{@code [3]}, {@code [4]} not used</li>
     *   <li>{@code [5]} 1-based index of the first file to process (earlier files are skipped)</li>
     *   <li>{@code [6]} optional file with lines of the form {@code fileSuffix\t\t\tpageUrl}; when it
     *       yields at least one entry, pages are fetched from the mapped URLs instead of being derived
     *       from the file name</li>
     * </ul>
     *
     * <p>Fixes over the previous version: the trailing {@code close()} call on a variable that was
     * always null (a guaranteed NullPointerException before {@code System.exit}) is gone, and an
     * unreadable input folder is reported as an {@link IOException}.
     *
     * @throws IOException if the mapping file cannot be read or the input folder cannot be listed
     */
    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        String inputDir = strArr[0];
        String sourceIdPrefix = strArr[1];
        String outputDir = strArr[2];
        int startIndex = Integer.parseInt(strArr[5]);

        Map<String, String> pageUrlByFileSuffix = new HashMap<String, String>();
        if (strArr.length == 7) {
            for (String line : FileUtils.readList(strArr[6], false)) {
                String[] split = line.split("\t\t\t");
                // ':' is swapped for the same token that parseToWikipediaTitle maps back to ':'.
                pageUrlByFileSuffix.put(split[0].trim().replaceAll(":", ARQConstants.allocVarBNodeToVar), split[1].trim());
            }
        }

        // NOTE(review): the last argument is a hard-coded credential-looking string; consider moving it to configuration.
        GSBuilder_Limaye_Wikitables builder = new GSBuilder_Limaye_Wikitables(null, new SolrCache(null), new TableParserWikipedia(new TableNormalizerDiscardIrregularRows(true), new TableHODetectorByHTMLTag(), new TableObjCreatorWikipedia(true, false), new TableValidatorGeneric()), "8Yr8amTvrm5SM4XK3vM3KrLqOCT/ZhkwCfLEDtslE7o=");

        File[] listFiles = new File(inputDir).listFiles();
        if (listFiles == null) {
            // File.listFiles() returns null for a missing or unreadable directory.
            throw new IOException("Cannot list input folder: " + inputDir);
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listFiles));
        Collections.sort(files);
        System.out.println(listFiles.length);

        int index = 0;
        for (File file : files) {
            try {
                index++;
                if (index < startIndex) {
                    continue;
                }
                String fileName = file.getName();
                if (fileName.startsWith("file")) {
                    System.err.println("ERROR:SKIPPED_NON_WIKI:" + fileName);
                    continue;
                }
                // NOTE(review): the previous code searched the mapping for this file here and then did
                // nothing with the result (empty branch). A "skip files that are not in the mapping"
                // was probably intended; behaviour is left unchanged, so such files continue with a
                // null page and end up in the generic handler below.
                System.out.println(index + "_" + file);
                Table table = new TableParserLimayeDataset().extract(file.toString(), sourceIdPrefix + "/" + fileName).get(0);

                String page = null;
                if (pageUrlByFileSuffix.size() > 0) {
                    for (Map.Entry<String, String> mapping : pageUrlByFileSuffix.entrySet()) {
                        if (file.toString().endsWith(mapping.getKey())) {
                            page = fetchWikipediaWebpage(mapping.getValue());
                            break;
                        }
                    }
                } else {
                    int extensionAt = fileName.indexOf(".htm");
                    if (extensionAt == -1) {
                        extensionAt = fileName.length();
                    }
                    String title = fileName.substring(0, extensionAt);
                    try {
                        page = fetchWikipediaWebpage(wikipediaURL + title);
                    } catch (Exception e) {
                        // Second attempt with the title derived from the full path.
                        title = parseToWikipediaTitle(file.toString());
                        try {
                            page = fetchWikipediaWebpage(wikipediaURL + title);
                        } catch (Exception e2) {
                            if (title.indexOf(ARQConstants.allocVarBNodeToVar) != -1) {
                                System.err.println("ERROR:~REPLACED_NO_MATCH:" + fileName);
                            }
                        }
                    }
                    if (page == null || page.length() == 0) {
                        // Last resort: look the title up through the web search client.
                        page = builder.tryWebSearch(title);
                        if (page == null) {
                            System.err.println("ERROR:NO_WIKIPAGE:" + fileName);
                        }
                    }
                }

                String fileUri = file.toURI().toString();
                String outputPath = outputDir + "/" + fileName;
                Map<String, Set<String>> links = extractLinksFromWikipediaPage(page, fileUri);
                Node matched = findMatchingTable(table, extractWikiTables(page, fileUri));
                if (matched == null) {
                    annotateFuzzyOrReport(builder, table, links, outputPath, fileName, "NO_TABLE");
                    continue;
                }

                Table wikiTable = builder.process_wikitable(matched, fileUri, fileUri, (TContext[]) table.getContexts().toArray(new TContext[0]));
                if (wikiTable == null) {
                    annotateFuzzyOrReport(builder, table, links, outputPath, fileName, "IRREGULAR_TABLE");
                } else if (wikiTable.getNumCols() == 1 && table.getNumCols() > 0) {
                    annotateFuzzyOrReport(builder, table, links, outputPath, fileName, "NO_TABLE(only 1 column)");
                } else if (wikiTable.getNumRows() > maxRows) {
                    annotateFuzzyOrReport(builder, table, links, outputPath, fileName, "TOO_LARGE");
                } else {
                    boolean hasUnknownHeader = false;
                    for (int col = 0; col < wikiTable.getNumCols(); col++) {
                        if (wikiTable.getColumnHeader(col).getHeaderText().equals(STIEnum.TABLE_HEADER_UNKNOWN.getValue())) {
                            hasUnknownHeader = true;
                        }
                    }
                    if (hasUnknownHeader) {
                        annotateFuzzyOrReport(builder, table, links, outputPath, fileName, "NO_HEADER");
                    } else {
                        builder.annotateTable(wikiTable, outputPath + ".cell.keys", maxRows);
                        builder.saveAsLimaye(wikiTable, outputPath);
                    }
                }
            } catch (Exception e3) {
                // Per-file boundary: report and carry on with the next file.
                System.err.println("ERROR:UNKNOWN:" + file.getName());
                e3.printStackTrace();
            }
        }
        System.exit(0);
    }

    /**
     * Fallback used when the matched Wikipedia table cannot be used directly: annotates the original
     * table with {@code annotateTable_fuzzy}; on success prints {@code WARNING:<reason>:<file>} and
     * saves the table to {@code outputPath}, otherwise prints {@code ERROR:<reason>:<file>}.
     */
    private static void annotateFuzzyOrReport(GSBuilder_Limaye_Wikitables builder, Table table, Map<String, Set<String>> links, String outputPath, String fileName, String reason) throws IOException, TransformerException, ParserConfigurationException {
        if (builder.annotateTable_fuzzy(table, links, outputPath + ".cell.keys")) {
            System.err.println("WARNING:" + reason + ":" + fileName);
            builder.saveAsLimaye(table, outputPath);
        } else {
            System.err.println("ERROR:" + reason + ":" + fileName);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Tries to locate and download the English Wikipedia page for a title via web search.
     *
     * <p>The title is reduced to letters, digits and spaces. A URL cached under
     * {@code "websearch_" + title} is reused when present; otherwise the search client is queried with
     * {@code title + " wikipedia"} and a result is picked in two passes:
     * <ol>
     *   <li>the first result whose URL contains {@code wikipedia.org/wiki} and whose normalised title
     *       starts with the query;</li>
     *   <li>failing that, the result whose title contains {@code "Wikipedia"} and shares the most
     *       tokens with the query.</li>
     * </ol>
     * The chosen URL (possibly empty) is written back to the cache.
     *
     * @param str raw title to search for
     * @return the fetched page content, or {@code null} when no URL containing {@code en.wikipedia}
     *         was found
     * @throws Exception propagated from the search client, the result parser or the page download
     */
    public String tryWebSearch(String str) throws Exception {
        // The replacement leaves only letters, digits and spaces, so no further prefix stripping is needed.
        String query = str.replaceAll("[^a-zA-Z0-9]", " ").trim();
        String cacheKey = "websearch_" + query;

        String url = "";
        try {
            Object cached = this.solrCache.retrieve(cacheKey);
            if (cached != null) {
                url = cached.toString();
            }
        } catch (Exception e) {
            // A failing cache must not stop the lookup, but it should not be invisible either.
            log.warning("Cache lookup failed for '" + cacheKey + "': " + e);
        }

        if (url.equals("")) {
            List<WebSearchResultDoc> results = this.parser.parse(this.searcher.search(query + " wikipedia"));
            for (WebSearchResultDoc result : results) {
                String title = result.getTitle().replaceAll("[^a-zA-Z0-9]", " ").trim();
                if (result.getUrl().indexOf("wikipedia.org/wiki") != -1 && title.startsWith(query)) {
                    url = result.getUrl();
                    break;
                }
            }
            if (url.equals("")) {
                List<String> queryTokens = nonBlankTokens(query);
                int bestShared = 0;
                int bestIndex = -1;
                for (int k = 0; k < results.size(); k++) {
                    String title = results.get(k).getTitle().replaceAll("_", " ").trim();
                    if (title.indexOf("Wikipedia") == -1) {
                        continue;
                    }
                    List<String> shared = nonBlankTokens(title);
                    shared.retainAll(queryTokens);
                    // Strictly greater: on ties the earlier result wins.
                    if (shared.size() > bestShared) {
                        bestShared = shared.size();
                        bestIndex = k;
                    }
                }
                if (bestIndex != -1) {
                    url = results.get(bestIndex).getUrl();
                }
            }
            try {
                this.solrCache.cache(cacheKey, url, true);
            } catch (SolrServerException e2) {
                e2.printStackTrace();
            }
        }

        if (!url.equals("") && url.contains("en.wikipedia")) {
            return fetchWikipediaWebpage(url);
        }
        return null;
    }

    /** Splits {@code text} on whitespace and returns the non-empty, trimmed tokens as a mutable list. */
    private static List<String> nonBlankTokens(String text) {
        List<String> tokens = new ArrayList<String>();
        for (String piece : text.split("\\s+")) {
            String token = piece.trim();
            if (token.length() > 0) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Derives a page title from a file path: takes the text between the last path separator and the
     * first {@code ".htm"}, drops everything from the last underscore onwards, trims it, and replaces
     * the {@code ARQConstants.allocVarBNodeToVar} token with {@code ':'}.
     *
     * @param str file path; backslashes are treated as forward slashes
     * @return the derived title, or {@code null} when the path contains no {@code ".htm"}
     */
    public static String parseToWikipediaTitle(String str) {
        final String path = str.replaceAll("\\\\", "/");
        final int extensionAt = path.indexOf(".htm");
        if (extensionAt < 0) {
            return null;
        }

        // lastIndexOf yields -1 when there is no separator, which makes the start index 0.
        final int nameStart = path.lastIndexOf("/") + 1;
        String stem = path.substring(nameStart, extensionAt);

        final int suffixAt = stem.lastIndexOf("_");
        if (suffixAt >= 0) {
            stem = stem.substring(0, suffixAt);
        }

        final String title = stem.trim();
        return title.contains(ARQConstants.allocVarBNodeToVar)
                ? title.replaceAll(ARQConstants.allocVarBNodeToVar, ":")
                : title;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Downloads the resource at the given URL and returns its text with all lines concatenated.
     *
     * <p>Line terminators are dropped and nothing is inserted in their place, so text on adjacent
     * lines is joined directly. The bytes are decoded as UTF-8, and the reader is closed even when
     * reading fails.
     *
     * @param str URL to open
     * @return the content of the resource without line terminators
     * @throws IOException        if the URL is malformed, cannot be opened or cannot be read
     * @throws URISyntaxException declared for callers; not thrown by this implementation
     */
    public static String fetchWikipediaWebpage(String str) throws IOException, URISyntaxException {
        StringBuilder page = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(str).openConnection().getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /**
     * Flattens a table into one space-separated string: header texts first, then every content cell
     * row by row. Used as the text that candidate page tables are compared against.
     *
     * <p>Headers whose text equals the value of {@code STIEnum.TABLE_HEADER_UNKNOWN} are left out.
     * The previous comparison was between a {@code String} and the enum constant itself, which is
     * never equal, so the placeholder text was always included.
     *
     * @param table table to flatten
     * @return the trimmed concatenation of header and cell texts
     */
    protected static String toString_LTable(Table table) {
        StringBuilder text = new StringBuilder();
        String unknownHeader = STIEnum.TABLE_HEADER_UNKNOWN.getValue();
        for (int col = 0; col < table.getNumCols(); col++) {
            TColumnHeader header = table.getColumnHeader(col);
            if (header == null) {
                continue;
            }
            String headerText = header.getHeaderText();
            if (headerText != null && !headerText.equals(unknownHeader)) {
                text.append(headerText).append(" ");
            }
        }
        for (int row = 0; row < table.getNumRows(); row++) {
            for (int col = 0; col < table.getNumCols(); col++) {
                TCell cell = table.getContentCell(row, col);
                if (cell != null && cell.getText() != null) {
                    text.append(cell.getText()).append(" ");
                }
            }
        }
        return text.toString().trim();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Writes a table as an indented XML document with the structure
     * {@code entity / logicalTable / (content, tableContext)}:
     * {@code content} holds one {@code header} element and one {@code row} element per table row,
     * each made of {@code cell} elements with an {@code html} child (cell text) and a
     * {@code wikipedia} child (annotation id, with a leading {@code /wiki/} removed);
     * {@code tableContext} holds one element per table context with its score and text.
     *
     * <p>Header cells whose text is missing or equals the value of
     * {@code STIEnum.TABLE_HEADER_UNKNOWN} are written with empty text. The previous code compared
     * the text with the enum constant itself (never equal) and failed on a null header text.
     *
     * @param table table to serialise
     * @param str   path of the XML file to write
     * @throws TransformerException         if the document cannot be serialised
     * @throws ParserConfigurationException if no DOM builder can be created
     */
    public void saveAsLimaye(Table table, String str) throws TransformerException, ParserConfigurationException {
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        Element entity = doc.createElement("entity");
        doc.appendChild(entity);
        Element logicalTable = doc.createElement("logicalTable");
        Element content = doc.createElement("content");
        Element tableContext = doc.createElement("tableContext");

        Element header = doc.createElement("header");
        String unknownHeader = STIEnum.TABLE_HEADER_UNKNOWN.getValue();
        for (int col = 0; col < table.getNumCols(); col++) {
            String headerText = "";
            TColumnHeader columnHeader = table.getColumnHeader(col);
            if (columnHeader != null && columnHeader.getHeaderText() != null && !columnHeader.getHeaderText().equals(unknownHeader)) {
                headerText = columnHeader.getHeaderText();
            }
            header.appendChild(newLimayeCell(doc, headerText, ""));
        }
        content.appendChild(header);

        for (int row = 0; row < table.getNumRows(); row++) {
            Element rowElement = doc.createElement("row");
            for (int col = 0; col < table.getNumCols(); col++) {
                String cellText = "";
                TCell cell = table.getContentCell(row, col);
                if (cell != null && cell.getText() != null) {
                    cellText = cell.getText();
                }
                String wikiId = "";
                TCellAnnotation[] annotations = table.getTableAnnotations().getContentCellAnnotations(row, col);
                if (annotations != null && annotations.length > 0) {
                    // Only the first annotation of a cell is written out.
                    wikiId = annotations[0].getAnnotation().getId();
                    if (wikiId.startsWith("/wiki/")) {
                        wikiId = wikiId.substring(6).trim();
                    }
                }
                rowElement.appendChild(newLimayeCell(doc, cellText, wikiId));
            }
            content.appendChild(rowElement);
        }

        for (TContext context : table.getContexts()) {
            Element contextElement = doc.createElement(Protocol.CONTEXT_PARAM_NAME);
            Element score = doc.createElement("computeElementScores");
            score.appendChild(doc.createTextNode(String.valueOf(context.getImportanceScore())));
            Element text = doc.createElement("text");
            text.appendChild(doc.createTextNode(context.getText()));
            contextElement.appendChild(score);
            contextElement.appendChild(text);
            tableContext.appendChild(contextElement);
        }

        logicalTable.appendChild(content);
        logicalTable.appendChild(tableContext);
        entity.appendChild(logicalTable);

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty("indent", XmlConsts.XML_SA_YES);
        transformer.setOutputProperty("method", "xml");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", Engine.MAJOR_NUMBER);
        transformer.transform(new DOMSource(doc), new StreamResult(new File(str)));
    }

    /**
     * Builds one {@code cell} element with an {@code html} child (HTML-escaped text with
     * {@code &nbsp;} turned into a space, trimmed) and a {@code wikipedia} child.
     */
    private static Element newLimayeCell(Document doc, String htmlText, String wikipediaId) {
        Element cell = doc.createElement("cell");
        Element html = doc.createElement("html");
        html.appendChild(doc.createTextNode(StringEscapeUtils.escapeHtml4(htmlText).replaceAll("&nbsp;", " ").trim()));
        Element wikipedia = doc.createElement("wikipedia");
        wikipedia.appendChild(doc.createTextNode(wikipediaId));
        cell.appendChild(html);
        cell.appendChild(wikipedia);
        return cell;
    }

    /**
     * Resolves the first annotation of each eligible cell to a knowledge-base id and writes the
     * results, one {@code row,col=id} line per resolved cell, to the given file.
     *
     * <p>Only the first {@code i} rows are visited. Columns whose first detected data type is NUMBER,
     * DATE, ORDERED_NUMBER, LONG_TEXT or LONG_STRING are skipped.
     *
     * @param table table whose cell annotations are resolved
     * @param str   path of the keys file to write
     * @param i     maximum number of rows to process
     * @return always {@code true}
     */
    private boolean annotateTable(Table table, String str, int i) throws IOException, TransformerException, ParserConfigurationException {
        TColumnFeatureGenerator.setColumnDataTypes(table);
        StringBuilder keys = new StringBuilder();
        int wikiLinkedCells = 0;

        for (int row = 0; row < table.getNumRows() && row < i; row++) {
            for (int col = 0; col < table.getNumCols(); col++) {
                DataTypeClassifier.DataType type = table.getColumnHeader(col).getTypes().get(0).getType();
                boolean skippedType = type.equals(DataTypeClassifier.DataType.NUMBER)
                        || type.equals(DataTypeClassifier.DataType.DATE)
                        || type.equals(DataTypeClassifier.DataType.ORDERED_NUMBER)
                        || type.equals(DataTypeClassifier.DataType.LONG_TEXT)
                        || type.equals(DataTypeClassifier.DataType.LONG_STRING);
                if (skippedType) {
                    continue;
                }
                System.out.println("\t\tr=" + row + ",c=" + col);
                if (table.getContentCell(row, col) == null) {
                    continue;
                }
                TCellAnnotation[] annotations = table.getTableAnnotations().getContentCellAnnotations(row, col);
                if (annotations == null || annotations.length == 0) {
                    continue;
                }
                String wikiTitle = annotations[0].getAnnotation().getId();
                if (wikiTitle.startsWith("/wiki/")) {
                    wikiTitle = wikiTitle.substring(6).trim();
                    wikiLinkedCells++;
                }
                if (wikiTitle.length() == 0) {
                    continue;
                }
                String pageId = queryWikipediaPageid(wikiTitle, this.solrCache);
                if (pageId == null) {
                    continue;
                }
                String annotation = createCellAnnotation(pageId, this.solrCache);
                if (annotation != null && annotation.length() > 0) {
                    keys.append(row + StringUtils.COMMA_STR + col + "=").append(annotation).append("\n");
                }
            }
        }

        // NOTE(review): the tally below is compared with wikiLinkedCells nowhere that has an effect;
        // it is kept because the accessors' side effects are not visible from here.
        int annotatedCells = 0;
        if (table.getTableAnnotations() != null) {
            for (int row = 0; row < table.getNumRows(); row++) {
                for (int col = 0; col < table.getNumCols(); col++) {
                    TCellAnnotation[] annotations = table.getTableAnnotations().getContentCellAnnotations(row, col);
                    if (annotations != null && annotations.length > 0) {
                        annotatedCells++;
                    }
                }
            }
        }

        PrintWriter out = new PrintWriter(str);
        out.println(keys.toString());
        out.close();
        return true;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Parses a DOM table node into a {@link Table} by delegating to the configured
     * {@code xtractor}; all arguments are forwarded unchanged.
     *
     * @return whatever {@code xtractor.extractTable} returns (main treats {@code null} as an
     *         irregular table)
     */
    public Table process_wikitable(Node node, String str, String str2, TContext... tContextArr) {
        final Table extracted = this.xtractor.extractTable(node, str, str2, tContextArr);
        return extracted;
    }

    /**
     * Returns the text of all leaf nodes below (or equal to) {@code node}, in document order, each
     * followed by a single space.
     *
     * <p>The text is collected in one {@link StringBuilder} instead of concatenating strings at every
     * recursion level, which copied the accumulated text repeatedly.
     */
    private static String toString_Node(Node node) {
        StringBuilder text = new StringBuilder();
        appendLeafText(node, text);
        return text.toString();
    }

    /** Appends the text content of every leaf under {@code node}, each followed by a space. */
    private static void appendLeafText(Node node, StringBuilder out) {
        if (!node.hasChildNodes()) {
            out.append(node.getTextContent()).append(" ");
            return;
        }
        for (int k = 0; k < node.getChildNodes().getLength(); k++) {
            appendLeafText(node.getChildNodes().item(k), out);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Scores the similarity of two texts: both are split on whitespace and the token lists are
     * handed to {@code CollectionUtils.computeFrequencyWeightedDice}.
     *
     * @return the score computed by {@code computeFrequencyWeightedDice}
     */
    public static double computeOverlap(String str, String str2) {
        ArrayList<String> leftTokens = new ArrayList<String>(Arrays.asList(str.split("\\s+")));
        ArrayList<String> rightTokens = new ArrayList<String>(Arrays.asList(str2.split("\\s+")));
        return CollectionUtils.computeFrequencyWeightedDice(leftTokens, rightTokens);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Picks the candidate DOM table whose text overlaps most with the given table and accepts it only
     * if it looks large enough.
     *
     * <p>The best candidate is the one with the highest positive {@code computeOverlap} score. It is
     * accepted when (score &gt;= 1 or the number of {@code TR} nodes found is at least the table's
     * row count) and it is not the case that the widest row has exactly one child while the table
     * has more than one column.
     *
     * <p>The row counter is now a declared local (it previously referred to an undeclared name), and
     * a {@code null} result from the row lookup is treated as "no rows" instead of being iterated.
     *
     * @param table table to find a counterpart for
     * @param list  candidate table nodes
     * @return the accepted candidate, or {@code null} (after a message on stderr) when none qualifies
     */
    public static Node findMatchingTable(Table table, List<Node> list) {
        double bestOverlap = 0.0d;
        Node best = null;
        String tableText = toString_LTable(table);
        for (Node candidate : list) {
            double overlap = computeOverlap(tableText, toString_Node(candidate));
            if (overlap > bestOverlap) {
                bestOverlap = overlap;
                best = candidate;
            }
        }

        int rowCount = 0;
        int widestRow = 0;
        if (best != null) {
            // NOTE(review): "//TR" is an absolute XPath; depending on how DomUtils evaluates it this
            // may count rows of the whole document rather than of this table only — confirm.
            List<Node> rows = DomUtils.findAll(best, "//TR");
            if (rows != null) {
                rowCount = rows.size();
                for (Node row : rows) {
                    widestRow = Math.max(widestRow, row.getChildNodes().getLength());
                }
            }
        }

        if (best != null
                && (bestOverlap >= 1.0d || rowCount >= table.getNumRows())
                && (widestRow != 1 || table.getNumCols() <= 1)) {
            return best;
        }
        System.err.println("(candidate table too small, likely to be incorrect so skipped)");
        return null;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Parses page markup and returns every node matched by the XPath {@code //TABLE}.
     *
     * @param str  page markup
     * @param str2 document identifier handed to the parser
     * @return the matched nodes; an empty list when parsing fails with an {@link IOException}
     */
    public static List<Node> extractWikiTables(String str, String str2) {
        List<Node> tables = new ArrayList<Node>();
        try {
            tables.addAll(DomUtils.findAll(new TagSoupParser(IOUtils.toInputStream(str), str2, "UTF-8").getDOM(), "//TABLE"));
        } catch (IOException ignored) {
            // Unparseable page: report "no tables" by returning the empty list.
        }
        return tables;
    }

    /**
     * Looks up the numeric page id of a Wikipedia title through the MediaWiki query API, using the
     * cache keyed by the request URL.
     *
     * <p>On a cache miss the API response is downloaded and the id is taken from the text between the
     * first two {@code SerializerConstants.ENTITY_QUOT} tokens that follow {@code "page pageid="}.
     * A value that is not a number is stored as the empty string. Any non-null result is written
     * back to the cache. The elapsed time in milliseconds is logged.
     *
     * <p>The response reader is now closed even when reading fails, and the response is decoded as
     * UTF-8 rather than with the platform default charset.
     *
     * @param str       page title, appended as-is to the request URL
     * @param solrCache cache to consult and update
     * @return the page id as a decimal string, or {@code null} when none was found
     * @throws IOException if the API request fails
     */
    public static String queryWikipediaPageid(String str, SolrCache solrCache) throws IOException {
        long startedAt = System.currentTimeMillis();
        String requestUrl = "https://en.wikipedia.org/w/api.php?action=query&titles=" + str;

        String pageId = null;
        try {
            Object cached = solrCache.retrieve(requestUrl);
            if (cached != null) {
                pageId = cached.toString();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        if (pageId == null) {
            StringBuilder response = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(requestUrl).openConnection().getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    response.append(line);
                }
            }

            String body = response.toString();
            int attributeAt = body.indexOf("page pageid=");
            if (attributeAt != -1) {
                String fromAttribute = body.substring(attributeAt);
                int openQuote = fromAttribute.indexOf(SerializerConstants.ENTITY_QUOT);
                if (openQuote != -1) {
                    // 6 skips the opening quote token (presumably the six characters of "&quot;").
                    String afterQuote = fromAttribute.substring(openQuote + 6);
                    int closeQuote = afterQuote.indexOf(SerializerConstants.ENTITY_QUOT);
                    if (closeQuote != -1) {
                        try {
                            pageId = String.valueOf(Long.valueOf(afterQuote.substring(0, closeQuote).trim()));
                        } catch (NumberFormatException notANumber) {
                            pageId = "";
                        }
                    }
                }
            }

            if (pageId != null) {
                try {
                    solrCache.cache(requestUrl, pageId, true);
                } catch (SolrServerException e3) {
                    e3.printStackTrace();
                }
            }
        }

        log.info("queryWikipedia:" + (System.currentTimeMillis() - startedAt));
        if (pageId == null || pageId.length() <= 0) {
            return null;
        }
        return pageId;
    }

    /**
     * Maps a Wikipedia page id to an annotation id, consulting the cache first.
     *
     * <p>On a cache miss the first entry returned by
     * {@code queryHelper.mqlapi_topic_mids_with_wikipedia_pageid} is used (the empty string when
     * nothing is returned) and stored in the cache under the page id.
     *
     * @param str       Wikipedia page id, also used as the cache key
     * @param solrCache cache to consult and update
     * @return the annotation id, or {@code null} when it is missing or empty
     */
    public String createCellAnnotation(String str, SolrCache solrCache) throws IOException {
        String annotation = null;
        try {
            Object cached = solrCache.retrieve(str);
            annotation = (cached == null) ? null : cached.toString();
        } catch (ClassNotFoundException | SolrServerException lookupFailure) {
            lookupFailure.printStackTrace();
        }

        if (annotation == null) {
            List<String> candidates = this.queryHelper.mqlapi_topic_mids_with_wikipedia_pageid(str);
            boolean nothingFound = candidates == null || candidates.size() == 0;
            annotation = nothingFound ? "" : candidates.get(0);
            try {
                solrCache.cache(str, annotation, true);
            } catch (SolrServerException storeFailure) {
                storeFailure.printStackTrace();
            }
        }

        boolean missing = annotation == null || annotation.equals("");
        return missing ? null : annotation;
    }

    /**
     * Copies, for every line of a log file that starts with {@code "ERROR:"}, the trimmed text after
     * the line's first colon into an output file, one entry per line.
     *
     * <p>The writer is managed with try-with-resources so it is closed even when reading the log
     * fails part-way.
     *
     * @param str  path of the log file to scan
     * @param str2 path of the file to write (created or truncated before the log is read)
     * @throws IOException if the log cannot be read or the output file cannot be opened
     */
    public static void find_missed_files(String str, String str2) throws IOException {
        try (PrintWriter out = new PrintWriter(str2)) {
            for (String line : FileUtils.readList(str, false)) {
                if (line.startsWith("ERROR:")) {
                    out.println(line.substring(line.indexOf(":") + 1).trim());
                }
            }
        }
    }

    /**
     * Lists the entries of one directory that have no matching {@code .cell.keys} entry in another.
     *
     * <p>An entry {@code X} of directory {@code str2} counts as missed when directory {@code str}
     * does not contain an entry named {@code X.cell.keys}. The names of the missed entries are
     * written to {@code str3}, one per line, in the order {@link File#listFiles()} returns them.
     *
     * @param str  directory expected to contain the {@code *.cell.keys} entries
     * @param str2 directory whose entries are checked
     * @param str3 path of the output file; it is created or truncated
     * @throws IOException if the output file cannot be opened, or if either directory cannot be
     *                     listed (it does not exist, is not a directory, or an I/O error occurs)
     */
    public static void find_missed_files_by_folder(String str, String str2, String str3) throws IOException {
        // try-with-resources guarantees the writer is closed even when a listing fails.
        try (PrintWriter printWriter = new PrintWriter(str3)) {
            // A hash set keeps each membership test constant-time instead of scanning a list.
            Set<String> presentNames = new HashSet<String>();
            for (File present : listDirectoryOrFail(str)) {
                presentNames.add(present.getName());
            }
            for (File candidate : listDirectoryOrFail(str2)) {
                if (!presentNames.contains(candidate.getName() + ".cell.keys")) {
                    printWriter.println(candidate.getName());
                }
            }
        }
    }

    /** Returns the entries of {@code dir}, failing with an IOException instead of returning null. */
    private static File[] listDirectoryOrFail(String dir) throws IOException {
        File[] entries = new File(dir).listFiles();
        if (entries == null) {
            // File.listFiles() signals "not a directory" and I/O errors by returning null.
            throw new IOException("Cannot list directory: " + dir);
        }
        return entries;
    }

    /**
     * Parses an HTML page and collects, for every anchor text, the set of {@code href} values it
     * links to.
     *
     * <p>The markup is parsed with {@link TagSoupParser} and all nodes matching {@code //A} are
     * inspected. Anchors whose text is blank, or that have no non-empty {@code href} attribute, are
     * skipped. Map keys are the anchor texts exactly as returned by the DOM (not trimmed).
     *
     * @param str  the HTML source of the page
     * @param str2 passed to {@link TagSoupParser} as its second constructor argument
     * @return anchor text mapped to the distinct {@code href} values seen for it, or {@code null}
     *         if parsing fails with an {@link IOException}
     */
    public static Map<String, Set<String>> extractLinksFromWikipediaPage(String str, String str2) {
        try {
            // Encode explicitly as UTF-8: the parser is told the stream is UTF-8, so relying on the
            // platform default charset would corrupt non-ASCII text on non-UTF-8 systems.
            List<Node> anchors = DomUtils.findAll(
                    new TagSoupParser(IOUtils.toInputStream(str, "UTF-8"), str2, "UTF-8").getDOM(), "//A");
            Map<String, Set<String>> linksByText = new HashMap<String, Set<String>>();
            for (Node anchor : anchors) {
                String text = anchor.getTextContent();
                if (text == null || text.trim().length() < 1) {
                    continue;
                }
                // Anchors without an href (e.g. named anchors) are skipped via explicit null checks.
                Node hrefAttribute = anchor.getAttributes() == null
                        ? null
                        : anchor.getAttributes().getNamedItem("href");
                String href = hrefAttribute == null ? null : hrefAttribute.getTextContent();
                if (href == null || href.length() == 0) {
                    continue;
                }
                Set<String> targets = linksByText.get(text);
                if (targets == null) {
                    targets = new HashSet<String>();
                    linksByText.put(text, targets);
                }
                targets.add(href);
            }
            return linksByText;
        } catch (IOException e) {
            // Existing contract: a page that cannot be parsed yields null rather than an exception.
            return null;
        }
    }

    /**
     * Re-annotates the cells of {@code table} from a link map and optionally writes the resulting
     * cell keys to a file.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>Column data types are (re)computed via {@code TColumnFeatureGenerator}.</li>
     *   <li>If the table has annotations, every cell that currently carries at least one cell
     *       annotation is counted and its annotations are replaced by an empty array.</li>
     *   <li>For every cell in a column whose first header type is not NUMBER, DATE,
     *       ORDERED_NUMBER, LONG_TEXT or LONG_STRING, the trimmed cell text is looked up in
     *       {@code map}. Only texts mapped to exactly one link are used; a leading
     *       {@code /wiki/} is stripped and such cells are counted. The link is resolved through
     *       {@code queryWikipediaPageid} and {@code createCellAnnotation}; a non-empty result is
     *       appended to the key buffer as {@code row + COMMA_STR + col + "=" + annotation}, and a
     *       new cell annotation is stored on the table.</li>
     *   <li>The key buffer is written to {@code str} only when the number of {@code /wiki/}
     *       matches is at least the number of previously annotated cells.</li>
     * </ol>
     *
     * @param table the table to annotate; its cell annotations are modified in place
     * @param map   anchor text to link targets; must not be {@code null}
     * @param str   path of the key file to write when the result is accepted
     * @return {@code true} if the key file was written, {@code false} otherwise
     * @throws IOException                  if the key file cannot be opened, or a lookup fails
     * @throws TransformerException         declared for callers; not raised by the visible code
     * @throws ParserConfigurationException declared for callers; not raised by the visible code
     */
    private boolean annotateTable_fuzzy(Table table, Map<String, Set<String>> map, String str) throws IOException, TransformerException, ParserConfigurationException {
        TColumnFeatureGenerator.setColumnDataTypes(table);

        // Pass 1: count the cells that were annotated before and wipe those annotations.
        int previouslyAnnotated = 0;
        if (table.getTableAnnotations() != null) {
            for (int row = 0; row < table.getNumRows(); row++) {
                for (int col = 0; col < table.getNumCols(); col++) {
                    TCellAnnotation[] existing = table.getTableAnnotations().getContentCellAnnotations(row, col);
                    if (existing != null && existing.length > 0) {
                        previouslyAnnotated++;
                        table.getTableAnnotations().setContentCellAnnotations(row, col, new TCellAnnotation[0]);
                    }
                }
            }
        }

        // Pass 2: derive new annotations from the link map.
        StringBuilder keys = new StringBuilder();
        int wikiLinked = 0;
        for (int row = 0; row < table.getNumRows(); row++) {
            for (int col = 0; col < table.getNumCols(); col++) {
                DataTypeClassifier.DataType type = table.getColumnHeader(col).getTypes().get(0).getType();
                boolean skippedType = type.equals(DataTypeClassifier.DataType.NUMBER)
                        || type.equals(DataTypeClassifier.DataType.DATE)
                        || type.equals(DataTypeClassifier.DataType.ORDERED_NUMBER)
                        || type.equals(DataTypeClassifier.DataType.LONG_TEXT)
                        || type.equals(DataTypeClassifier.DataType.LONG_STRING);
                if (skippedType) {
                    continue;
                }
                System.out.println("\t\tr=" + row + ",c=" + col);
                TCell contentCell = table.getContentCell(row, col);
                if (contentCell == null) {
                    continue;
                }

                // Only unambiguous texts (exactly one link) are considered.
                Set<String> links = map.get(contentCell.getText().trim());
                String target = "";
                if (links != null && links.size() == 1) {
                    target = links.iterator().next();
                    if (target.startsWith("/wiki/")) {
                        target = target.substring(6).trim();
                        wikiLinked++;
                    }
                }
                if (target.length() == 0) {
                    continue;
                }

                String pageId = queryWikipediaPageid(target, this.solrCache);
                if (pageId == null) {
                    continue;
                }
                String annotation = createCellAnnotation(pageId, this.solrCache);
                if (annotation != null && annotation.length() > 0) {
                    keys.append(row + StringUtils.COMMA_STR + col + "=").append(annotation).append("\n");
                }
                // Pass 1 tolerates a table without annotations, so do the same here instead of
                // failing with a NullPointerException on the first matched cell.
                if (table.getTableAnnotations() != null) {
                    table.getTableAnnotations().setContentCellAnnotations(row, col, new TCellAnnotation[]{new TCellAnnotation(contentCell.getText(), new Entity(target, target), 1.0d, new HashMap())});
                }
            }
        }

        // Accept the result only if at least as many cells matched a /wiki/ link as were annotated before.
        boolean accepted = wikiLinked >= previouslyAnnotated;
        if (accepted) {
            try (PrintWriter printWriter = new PrintWriter(str)) {
                printWriter.println(keys);
            }
        }
        return accepted;
    }
}
