/*
 * Decompiled with CFR 0.152.
 */
package de.jungblut.crawl.extraction;

import de.jungblut.crawl.ConsoleResultWriter;
import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.SequentialCrawler;
import de.jungblut.crawl.extraction.Extractor;
import de.jungblut.crawl.extraction.OutlinkExtractor;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.concurrent.ExecutionException;
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * {@link Extractor} that fetches a page and returns its outlinks, its title
 * and the main article text as determined by boilerpipe's
 * {@link ArticleExtractor}.
 */
public final class ArticleContentExtrator
implements Extractor<ContentFetchResult> {
    /** Matches {@code <title>} tags when scanning a parsed document. */
    private static final NodeFilter TITLE_FILTER = new NodeClassFilter(TitleTag.class);

    /** Boilerpipe extractor used to pull the article text out of the HTML. */
    private final BoilerpipeExtractor extractor = ArticleExtractor.getInstance();

    /**
     * Fetches {@code site} and extracts outlinks, title and article text.
     *
     * @param site the URL to fetch; must be non-null, start with
     *        {@code "http"} and be at most 500 characters long
     * @return the extracted content, or {@code null} if the URL was rejected
     *         or anything went wrong while fetching or parsing (failures are
     *         reported on {@code System.err}, never thrown)
     */
    @Override
    public ContentFetchResult extract(String site) {
        if (site == null || !site.startsWith("http") || site.length() > 500) {
            return null;
        }
        try {
            String html;
            // Always release the connection's stream, even when reading fails.
            // Closing an already closed stream is a no-op, so this is safe
            // regardless of whether consumeStream closes it itself.
            try (InputStream connection = OutlinkExtractor.getConnection(site)) {
                html = OutlinkExtractor.consumeStream(connection);
            }
            // NOTE(review): entities are unescaped *before* parsing, so escaped
            // markup inside the page (e.g. "&lt;a&gt;") becomes real markup for
            // the steps below — confirm this is intended.
            html = StringEscapeUtils.unescapeHtml(html);
            HashSet<String> outlinkSet = OutlinkExtractor.extractOutlinks(html, site);
            String title = ArticleContentExtrator.extractTitle(html);
            String extractedLargestText = this.extractor.getText(html);
            return new ContentFetchResult(site, outlinkSet, title, extractedLargestText);
        }
        catch (ParserException pEx) {
            // Best effort: a page that cannot be parsed yields no result, but
            // the failure is reported instead of being dropped silently.
            System.err.println(pEx.toString().replace("\n", "; ") + " >>> URL was: \"" + site + "\"");
        }
        catch (RuntimeException rEx) {
            rEx.printStackTrace();
        }
        catch (Exception e) {
            System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \"" + site + "\"");
        }
        return null;
    }

    /**
     * Extracts the trimmed content of the {@code <title>} tag from the given
     * HTML. If several title tags are present, the last one wins.
     *
     * @param html the HTML markup to scan; may be {@code null} or empty
     * @return the trimmed title, or the empty string if {@code html} is
     *         {@code null}/empty or contains no title tag
     * @throws ParserException if the markup cannot be parsed
     */
    public static String extractTitle(String html) throws ParserException {
        String title = "";
        if (html == null || html.isEmpty()) {
            return title;
        }
        // createParser always treats its argument as markup. The
        // Parser(String) constructor instead interprets any string that does
        // not start with '<' as a URL/file location and tries to open it.
        Parser parser = Parser.createParser(html, null);
        NodeList matches = parser.extractAllNodesThatMatch(TITLE_FILTER);
        SimpleNodeIterator it = matches.elements();
        while (it.hasMoreNodes()) {
            TitleTag node = (TitleTag)it.nextNode();
            title = node.getTitle().trim();
        }
        return title;
    }

    /**
     * Small manual smoke test: runs a {@link SequentialCrawler} with this
     * extractor on a fixed article URL and writes results to the console.
     */
    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
        String start = "http://www.spiegel.de/wissenschaft/natur/erbgut-entziffert-austern-haben-viele-anti-stress-gene-a-856902.html";
        SequentialCrawler<ContentFetchResult> crawler = new SequentialCrawler<ContentFetchResult>(1, new ArticleContentExtrator(), new ConsoleResultWriter());
        crawler.process(start);
    }

    /**
     * Fetch result that additionally carries the page title and the extracted
     * article text.
     */
    public static class ContentFetchResult
    extends FetchResult {
        /** Page title; {@code null} when created without content. */
        private final String title;
        /** Extracted article text; {@code null} when created without content. */
        private final String text;

        /**
         * Creates a result without title and text (both are {@code null}).
         *
         * @param url the fetched URL
         * @param outlinks the outlinks found on the page
         */
        public ContentFetchResult(String url, HashSet<String> outlinks) {
            super(url, outlinks);
            this.title = null;
            this.text = null;
        }

        /**
         * Creates a result with title and text.
         *
         * @param url the fetched URL
         * @param outlinks the outlinks found on the page
         * @param title the page title
         * @param text the extracted article text
         */
        public ContentFetchResult(String url, HashSet<String> outlinks, String title, String text) {
            super(url, outlinks);
            this.title = title;
            this.text = text;
        }

        /** @return the page title, possibly {@code null} */
        public String getTitle() {
            return this.title;
        }

        /** @return the extracted article text, possibly {@code null} */
        public String getText() {
            return this.text;
        }

        /** @return the title and the text separated by a blank line */
        @Override
        public String toString() {
            return this.title + "\n\n" + this.text;
        }
    }
}

