package uk.ac.shef.dcs.sti.TODO.evaluation;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.util.FileUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.lang3.StringEscapeUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.kbsearch.model.Clazz;
import uk.ac.shef.dcs.kbsearch.model.Entity;
import uk.ac.shef.dcs.sti.STIEnum;
import uk.ac.shef.dcs.sti.core.model.TCell;
import uk.ac.shef.dcs.sti.core.model.TCellAnnotation;
import uk.ac.shef.dcs.sti.core.model.TColumnHeader;
import uk.ac.shef.dcs.sti.core.model.TColumnHeaderAnnotation;
import uk.ac.shef.dcs.sti.core.model.TContext;
import uk.ac.shef.dcs.sti.core.model.Table;

/* loaded from: input_file:uk/ac/shef/dcs/sti/TODO/evaluation/DataStats_TableSize_NameLength_Anaylsis_LimayeOld.class */
/**
 * One-off statistics tool over a directory of raw table XML files and their ground-truth
 * annotation XML files.
 *
 * <p>For every raw table that has a matching ground-truth file it writes:
 * <ul>
 *   <li>one line per annotated cell to {@code nl.csv} holding the number of tokens in the
 *       cell text (see {@link #countNameTokens(String)});</li>
 *   <li>one line per table to {@code rows_cols.csv} holding
 *       {@code <annotated row elements>,<distinct annotated column indices>}.</li>
 * </ul>
 *
 * <p>All input and output locations are hard-coded Windows paths.
 */
public class DataStats_TableSize_NameLength_Anaylsis_LimayeOld {
    /** Root directory scanned recursively for raw table {@code .xml} files. */
    private static final String RAW_TABLES_DIR = "E:\\Data\\table_annotation\\limaye\\all_tables_raw(limaye_original)";

    /** Root directory holding the ground-truth files, mirroring the layout of {@link #RAW_TABLES_DIR}. */
    private static final String GROUND_TRUTH_DIR = "E:\\Data\\table_annotation\\limaye\\all_tables_groundtruth_xml(limaye_original)";

    /** Opening cell tags, in the priority order in which they are searched for. */
    private static final String[] CELL_OPEN_TAGS = {"<td>", "<th>", "<thead>"};

    /** Closing cell tags, in the priority order in which they are searched for. */
    private static final String[] CELL_CLOSE_TAGS = {"</td>", "</th>", "</thead>"};

    /**
     * Walks every {@code .xml} file under {@link #RAW_TABLES_DIR}, derives the path of its
     * ground-truth counterpart under {@link #GROUND_TRUTH_DIR}, and collects statistics for each
     * pair whose ground-truth file exists. Missing ground-truth files are reported on stderr.
     *
     * @param strArr command-line arguments; not used
     * @throws IOException if an output file cannot be opened or an input file cannot be read
     * @throws SAXException if an input file is not well-formed XML
     * @throws ParserConfigurationException if no XML parser can be created
     */
    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        // try-with-resources: both writers are flushed and closed even if processing a table fails.
        try (PrintWriter nameLengthOut = new PrintWriter("D:\\Work\\lodie\\tmp_result/nl.csv");
             PrintWriter tableSizeOut = new PrintWriter("D:\\Work\\lodie\\tmp_result/rows_cols.csv")) {
            for (File rawFile : FileUtils.listFilesRecursively(new File(RAW_TABLES_DIR), new SuffixFileFilter(".xml"))) {
                // The part of the path below the raw root is re-rooted under the ground-truth root.
                String relativePath = rawFile.getAbsolutePath().replaceAll("\\\\", "/").substring(RAW_TABLES_DIR.length());
                String groundTruthPath = GROUND_TRUTH_DIR + relativePath;
                if (new File(groundTruthPath).exists()) {
                    System.out.println(groundTruthPath);
                    checkGroundTruthLimaye(rawFile.toString(), groundTruthPath, nameLengthOut, tableSizeOut);
                } else {
                    System.err.println("annotated file does not exist: " + groundTruthPath);
                }
            }
            System.out.println();
        }
    }

    /**
     * Parses one raw table file into a {@link Table}, attaches its context entries and, when a
     * ground-truth file is given and exists, attaches the ground-truth annotations while writing
     * statistics to the two writers.
     *
     * @param str path of the raw table XML file
     * @param str2 path of the ground-truth XML file; when {@code null} or non-existent, no
     *             annotations are loaded and nothing is written to either writer
     * @param printWriter receives one token count per annotated cell
     * @param printWriter2 receives one {@code rows,columns} line per annotated table
     * @return the populated table, or {@code null} if the raw file has no
     *         {@code //logicalTable/content} node
     * @throws IOException if either file cannot be read
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if either file is not well-formed XML
     */
    public static Table checkGroundTruthLimaye(String str, String str2, PrintWriter printWriter, PrintWriter printWriter2) throws IOException, ParserConfigurationException, SAXException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document rawDocument = builder.parse(str);
        List<Node> contentNodes = DomUtils.findAll(rawDocument, "//logicalTable/content");
        if (contentNodes == null || contentNodes.isEmpty()) {
            return null;
        }

        // Every non-text child of <content> is one table row; a child named "header" marks the
        // table as having a header row.
        boolean hasHeader = false;
        List<String[]> rows = new ArrayList<>();
        NodeList rowNodes = contentNodes.get(0).getChildNodes();
        for (int r = 0; r < rowNodes.getLength(); r++) {
            Node rowNode = rowNodes.item(r);
            if (rowNode.getNodeName().equals("#text")) {
                continue;
            }
            if (rowNode.getNodeName().equals("header")) {
                hasHeader = true;
            }
            rows.add(readCells(rowNode));
        }

        // The table is as wide as its widest row.
        int columnCount = 0;
        for (String[] row : rows) {
            columnCount = Math.max(columnCount, row.length);
        }

        Table table = buildTable(str, rows, hasHeader, columnCount);
        addContexts(table, rawDocument);

        if (str2 == null) {
            return table;
        }
        if (new File(str2).exists()) {
            addGroundTruthAnnotations(table, builder.parse(str2), printWriter, printWriter2);
        }
        return table;
    }

    /** Reads the text of every {@code cell} under a row node, preferring its {@code html} child over {@code text}. */
    private static String[] readCells(Node rowNode) {
        List<Node> cellNodes = DomUtils.findAll(rowNode, "cell");
        String[] cells = new String[cellNodes.size()];
        for (int c = 0; c < cells.length; c++) {
            Node cellNode = cellNodes.get(c);
            String value = "";
            List<Node> htmlNodes = DomUtils.findAll(cellNode, "html");
            if (htmlNodes != null && !htmlNodes.isEmpty()) {
                value = extract_text_content_from_html(htmlNodes);
            }
            if (value.equals("")) {
                // Fall back to the plain-text child when the html child is absent or yields nothing.
                List<Node> textNodes = DomUtils.findAll(cellNode, "text");
                if (textNodes != null && !textNodes.isEmpty()) {
                    value = textNodes.get(0).getTextContent();
                }
            }
            cells[c] = value;
        }
        return cells;
    }

    /**
     * Creates the table, sets its column headers and fills its content cells. When a header row
     * is present the first row supplies the headers (padded with the "unknown" header value if
     * it is narrower than the table); otherwise every header is the "unknown" value.
     */
    private static Table buildTable(String sourcePath, List<String[]> rows, boolean hasHeader, int columnCount) {
        Table table;
        int firstContentRow = 0;
        if (hasHeader) {
            table = new Table(String.valueOf(sourcePath.hashCode()), sourcePath, rows.size() - 1, columnCount);
            firstContentRow = 1;
            String[] headerRow = rows.get(0);
            if (headerRow.length < columnCount) {
                System.err.println("WARNING:Artificial header added, check manually. " + sourcePath);
                String[] padded = new String[columnCount];
                System.arraycopy(headerRow, 0, padded, 0, headerRow.length);
                for (int c = headerRow.length; c < columnCount; c++) {
                    padded[c] = STIEnum.TABLE_HEADER_UNKNOWN.getValue();
                }
                rows.set(0, padded);
                headerRow = padded;
            }
            for (int c = 0; c < columnCount; c++) {
                table.setColumnHeader(c, new TColumnHeader(headerRow[c]));
            }
        } else {
            table = new Table(String.valueOf(sourcePath.hashCode()), sourcePath, rows.size(), columnCount);
            for (int c = 0; c < columnCount; c++) {
                table.setColumnHeader(c, new TColumnHeader(STIEnum.TABLE_HEADER_UNKNOWN.getValue()));
            }
        }
        for (int r = firstContentRow; r < rows.size(); r++) {
            String[] row = rows.get(r);
            for (int c = 0; c < row.length; c++) {
                table.setContentCell(r - firstContentRow, c, new TCell(row[c]));
            }
        }
        return table;
    }

    /**
     * Adds one context per non-text child of {@code //logicalTable/tableContext} that has a
     * {@code text} descendant, then drops the context at list position 1 if more than one was
     * collected.
     */
    private static void addContexts(Table table, Document rawDocument) {
        List<Node> contextRoots = DomUtils.findAll(rawDocument, "//logicalTable/tableContext");
        // Both conditions must hold before get(0); the previous "||" dereferenced a null list and
        // indexed into an empty one.
        if (contextRoots != null && !contextRoots.isEmpty()) {
            NodeList contextNodes = contextRoots.get(0).getChildNodes();
            for (int i = 0; i < contextNodes.getLength(); i++) {
                Node contextNode = contextNodes.item(i);
                if (contextNode.getNodeName().equals("#text")) {
                    continue;
                }
                List<Node> textNodes = DomUtils.findAllByTag(contextNode, "text");
                if (textNodes == null || textNodes.isEmpty()) {
                    continue;
                }
                String text = textNodes.get(0).getTextContent();
                if (text == null) {
                    continue;
                }
                // The child at raw index 1 (text nodes included in the count) is the page title;
                // every other child is treated as a preceding paragraph.
                TContext.TableContextType type = i == 1
                        ? TContext.TableContextType.PAGETITLE
                        : TContext.TableContextType.PARAGRAPH_BEFORE;
                table.addContext(new TContext(text, type, 1.0d));
            }
        }
        if (table.getContexts().size() > 1) {
            table.getContexts().remove(1);
        }
    }

    /**
     * Loads column-header and cell annotations from the ground-truth document into the table
     * and writes the per-cell and per-table statistics.
     */
    private static void addGroundTruthAnnotations(Table table, Document groundTruth, PrintWriter nameLengthOut, PrintWriter tableSizeOut) {
        List<Node> columnNodes = DomUtils.findAll(groundTruth, "//columnAnnotations/colAnnos");
        for (Node columnNode : columnNodes) {
            int column = Integer.parseInt(columnNode.getAttributes().getNamedItem("col").getTextContent());
            List<TColumnHeaderAnnotation> headerAnnotations = new ArrayList<>();
            NodeList children = columnNode.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                Node child = children.item(i);
                if (!child.getNodeName().equals("anno")) {
                    continue;
                }
                // The "name" attribute is passed as both constructor arguments of Clazz.
                String className = child.getAttributes().getNamedItem("name").getTextContent();
                double score = Double.parseDouble(child.getAttributes().getNamedItem("value").getTextContent().trim());
                headerAnnotations.add(new TColumnHeaderAnnotation(
                        table.getColumnHeader(column).getHeaderText(), new Clazz(className, className), score));
            }
            table.getTableAnnotations().setHeaderAnnotation(column, headerAnnotations.toArray(new TColumnHeaderAnnotation[0]));
        }

        // NOTE(review): "cellAnnotatoons" is reproduced verbatim from the original query; it
        // presumably matches the element name used in the ground-truth files. Confirm against
        // the data before "correcting" the spelling.
        List<Node> rowNodes = DomUtils.findAll(groundTruth, "//cellAnnotatoons/row");
        HashSet<Integer> annotatedColumns = new HashSet<>();
        for (int row = 0; row < rowNodes.size(); row++) {
            List<Node> entityNodes = DomUtils.findAll(rowNodes.get(row), "entity");
            for (int column = 0; column < entityNodes.size(); column++) {
                String entityId = entityNodes.get(column).getTextContent();
                if (entityId == null || entityId.length() == 0) {
                    continue;
                }
                String cellText = table.getContentCell(row, column).getText();
                // The entity text is passed as both constructor arguments of Entity.
                TCellAnnotation annotation = new TCellAnnotation(cellText, new Entity(entityId, entityId), 1.0d, new HashMap<>());
                nameLengthOut.println(countNameTokens(cellText));
                table.getTableAnnotations().setContentCellAnnotations(row, column, new TCellAnnotation[]{annotation});
                annotatedColumns.add(column);
            }
        }
        tableSizeOut.println(rowNodes.size() + "," + annotatedColumns.size());
    }

    /**
     * Counts whitespace-separated tokens in a cell text after replacing {@code -} and {@code _}
     * with spaces. Returns 0 for texts longer than 20 characters that contain a {@code /} and
     * no space (NOTE(review): presumably meant to skip URL-like values; confirm).
     */
    private static int countNameTokens(String text) {
        boolean skipped = text.length() > 20 && !text.contains(" ") && text.contains("/");
        if (skipped) {
            return 0;
        }
        return text.replaceAll("[\\-_]", " ").trim().split("\\s+").length;
    }

    /**
     * Extracts the text between the first opening cell tag ({@code <td>}, {@code <th>} or
     * {@code <thead>}, tried in that order) and the first closing cell tag that follows it, then
     * HTML-unescapes the result. If no opening tag is found the whole text content is
     * unescaped and returned; if no closing tag is found everything after the opening tag is
     * kept untrimmed.
     *
     * @param list nodes whose first element supplies the text content; must be non-empty
     * @return the unescaped cell text, or an empty string if the node has no text content
     */
    private static String extract_text_content_from_html(List<Node> list) {
        String text = list.get(0).getTextContent();
        if (text == null) {
            // The caller compares the result with equals(""), so never hand back null.
            return "";
        }
        int openIndex = -1;
        int openLength = 0;
        for (String tag : CELL_OPEN_TAGS) {
            int index = text.indexOf(tag);
            if (index != -1) {
                openIndex = index;
                // Skip the tag that actually matched; a fixed offset of 4 left "ad>" behind
                // whenever the match was "<thead>".
                openLength = tag.length();
                break;
            }
        }
        if (openIndex != -1) {
            text = text.substring(openIndex + openLength);
            int closeIndex = -1;
            for (String tag : CELL_CLOSE_TAGS) {
                closeIndex = text.indexOf(tag);
                if (closeIndex != -1) {
                    break;
                }
            }
            if (closeIndex != -1) {
                text = text.substring(0, closeIndex).trim();
            }
        }
        return StringEscapeUtils.unescapeHtml4(text);
    }

    /**
     * Writes the text content of a node, stripped of a surrounding {@code CDATA[ ... ]]>}
     * wrapper when present, into a small HTML file named after {@code str2} inside directory
     * {@code str}.
     *
     * @param node node whose text content is dumped
     * @param str output directory
     * @param str2 source path; its file name and hash code form the output file name, and the
     *             full value is printed as the first paragraph
     * @throws FileNotFoundException if the output file cannot be created
     */
    private static void dumpHTMLContent(Node node, String str, String str2) throws FileNotFoundException {
        String content = node.getTextContent();
        int cdataIndex = content.indexOf("CDATA[");
        // NOTE(review): "CDATA[" is 6 characters but the offset is 7, so one extra character
        // after the marker is skipped. Kept as-is because the intent cannot be confirmed here.
        int start = cdataIndex == -1 ? 0 : cdataIndex + 7;
        int cdataEnd = content.lastIndexOf("]]>");
        int end = cdataEnd == -1 ? content.length() : cdataEnd;
        String html = content.substring(start, end).trim();
        String outputPath = str + File.separator + new File(str2).getName() + "_" + str2.hashCode() + ".html";
        try (PrintWriter out = new PrintWriter(outputPath)) {
            out.println("<html><body><p>");
            out.println(str2);
            out.println("</p>");
            out.println(html);
            out.println("</body></html>");
        }
    }
}
