package uk.ac.shef.dcs.sti.TODO.gs;

import info.bliki.wiki.dump.IArticleFilter;
import info.bliki.wiki.dump.Siteinfo;
import info.bliki.wiki.dump.WikiArticle;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.sti.STIException;
import uk.ac.shef.dcs.sti.core.model.Table;
import uk.ac.shef.dcs.sti.parser.list.ListXtractor;
import uk.ac.shef.dcs.sti.parser.table.TableParser;

/* loaded from: input_file:uk/ac/shef/dcs/sti/TODO/gs/WikipediaTableListPageFilter.class */
/**
 * Article filter that pulls tables and lists out of main-namespace wiki articles and hands
 * each one to the corresponding parser's {@code serialize} method.
 *
 * <p>Output is spread over numbered sub-directories ({@code <targetDir>/0}, {@code <targetDir>/1},
 * ...): the directory index advances every time another {@link #TABLES_PER_DIR} tables (or
 * {@link #LISTS_PER_DIR} lists) have been serialized successfully.
 *
 * <p>The counters are plain instance fields with no synchronization, so an instance is meant to
 * be driven from a single thread.
 */
public class WikipediaTableListPageFilter implements IArticleFilter {
    private static final Logger logger = Logger.getLogger(WikipediaTableListPageFilter.class.getName());

    /** A progress line is logged each time this many main pages have been counted. */
    private static final int PROGRESS_LOG_INTERVAL = 200;
    /** Number of successfully serialized tables after which the table directory index advances. */
    private static final int TABLES_PER_DIR = 5000;
    /** Number of successfully serialized lists after which the list directory index advances. */
    private static final int LISTS_PER_DIR = 5000;

    private final TableParser tXtractor;
    private final String targetTableDir;
    private final ListXtractor lXtractor;
    private final String targetListDir;

    private int countMainPages;
    private int countTables;
    private int countLists;
    private int countTableDirs;
    private int countListDirs;

    /**
     * @param tableParser    parser used to extract tables from article text
     * @param targetTableDir base directory under which numbered table sub-directories are addressed
     * @param listXtractor   extractor used to extract lists from article text
     * @param targetListDir  base directory under which numbered list sub-directories are addressed
     */
    public WikipediaTableListPageFilter(TableParser tableParser, String targetTableDir,
                                        ListXtractor listXtractor, String targetListDir) {
        this.tXtractor = tableParser;
        this.targetTableDir = targetTableDir;
        this.lXtractor = listXtractor;
        this.targetListDir = targetListDir;
    }

    /**
     * Processes one article: articles that are not in the main namespace or have no text are
     * skipped; otherwise tables are extracted when the text mentions {@code wikitable}
     * (case-insensitively) and lists are extracted when the text contains {@code "* "}.
     *
     * <p>Extraction and serialization failures are logged and do not abort processing.
     *
     * @param wikiArticle the article to inspect
     * @param siteinfo    site information supplied by the dump parser (not used here)
     * @throws SAXException declared by the filter contract; not thrown by this implementation itself
     */
    public void process(WikiArticle wikiArticle, Siteinfo siteinfo) throws SAXException {
        if (!wikiArticle.isMain()) {
            return;
        }
        String text = wikiArticle.getText();
        if (text == null) {
            return;
        }

        this.countMainPages++;
        // Log only after a main page has actually been counted; logging before the filter above
        // would repeat the same line for every skipped page while the counter sits on a multiple.
        if (this.countMainPages % PROGRESS_LOG_INTERVAL == 0) {
            logger.info("Pages processed: " + this.countMainPages + ", tables " + this.countTables + ", lists " + this.countLists);
        }

        String sourceId = wikiArticle.getTitle() + "_" + wikiArticle.getId();

        // Locale.ROOT keeps the match independent of the JVM default locale
        // (e.g. Turkish lower-casing of 'I' would otherwise break "WIKITABLE").
        if (text.toLowerCase(Locale.ROOT).contains("wikitable")) {
            extractTables(text, sourceId);
        }
        if (text.contains("* ")) {
            extractLists(text, sourceId);
        }
    }

    /** Extracts the tables of one article and serializes each into the current table directory. */
    private void extractTables(String text, String sourceId) {
        List<Table> tables;
        try {
            tables = this.tXtractor.extract(text, sourceId);
        } catch (STIException e) {
            // Nothing to serialize for this page; carry on with the rest of the article.
            logger.log(Level.WARNING, "Table extraction failed for page " + sourceId, e);
            return;
        }
        if (tables == null) {
            return;
        }
        for (Table table : tables) {
            try {
                TableParser.serialize(table, this.targetTableDir + File.separator + this.countTableDirs);
                this.countTables++;
                // Advance the directory only on a successful write, so a failure that happens
                // while the count sits on a multiple does not skip directory indices.
                if (this.countTables % TABLES_PER_DIR == 0) {
                    this.countTableDirs++;
                }
            } catch (IOException e) {
                logger.log(Level.WARNING, "Serialization failed for table " + table, e);
            }
        }
    }

    /** Extracts the lists of one article and serializes each into the current list directory. */
    private void extractLists(String text, String sourceId) {
        List<uk.ac.shef.dcs.sti.core.model.List> lists = this.lXtractor.extract(text, sourceId);
        if (lists == null) {
            return;
        }
        for (uk.ac.shef.dcs.sti.core.model.List list : lists) {
            try {
                ListXtractor.serialize(list, this.targetListDir + File.separator + this.countListDirs);
                this.countLists++;
                // Same rule as for tables: roll over only after a successful write.
                if (this.countLists % LISTS_PER_DIR == 0) {
                    this.countListDirs++;
                }
            } catch (IOException e) {
                logger.log(Level.WARNING, "Serialization failed for list " + list, e);
            }
        }
    }
}
