/*
 * Decompiled with CFR 0.152.
 */
package de.jungblut.crawl.extraction;

import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.extraction.Extractor;
import de.jungblut.crawl.extraction.OutlinkExtractor;
import java.io.InputStream;
import java.util.HashSet;
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.util.ParserException;

/**
 * {@link Extractor} that downloads a page and returns its HTML-unescaped
 * content together with the outlinks extracted from that content.
 *
 * <p>Fetching, stream consumption and outlink extraction are delegated to
 * {@link OutlinkExtractor}; this class only wires those steps together and
 * turns every failure into a {@code null} result.
 */
public final class HtmlExtrator
implements Extractor<HtmlFetchResult> {
    /** URLs longer than this many characters are rejected without fetching. */
    private static final int MAX_URL_LENGTH = 500;

    /** Error messages printed to stderr are truncated to this many characters. */
    private static final int MAX_ERROR_MESSAGE_LENGTH = 150;

    /**
     * Fetches {@code site} and extracts its HTML and outlinks.
     *
     * @param site the URL to fetch; must be non-null, start with {@code "http"}
     *             and be at most {@value #MAX_URL_LENGTH} characters long
     * @return the fetch result, or {@code null} if {@code site} was rejected
     *         or any exception occurred while fetching or parsing
     */
    @Override
    public final HtmlFetchResult extract(String site) {
        if (site == null || !site.startsWith("http") || site.length() > MAX_URL_LENGTH) {
            return null;
        }
        InputStream connection = null;
        try {
            connection = OutlinkExtractor.getConnection(site);
            // Unescape entities first; the outlinks are extracted from the
            // unescaped text and that same text is stored in the result.
            String html = StringEscapeUtils.unescapeHtml(OutlinkExtractor.consumeStream(connection));
            HashSet<String> outlinkSet = OutlinkExtractor.extractOutlinks(html, site);
            return new HtmlFetchResult(site, outlinkSet, html);
        }
        catch (ParserException ignored) {
            // Intentionally silent: parser failures are not reported, the page
            // is simply skipped (presumably because malformed HTML is common
            // while crawling -- confirm before adding logging here).
        }
        catch (Exception e) {
            // getMessage() may be null (e.g. for a bare NullPointerException);
            // fall back to toString() so reporting the error cannot itself throw.
            String message = e.getMessage();
            if (message == null) {
                message = e.toString();
            }
            String errMsg = message.length() > MAX_ERROR_MESSAGE_LENGTH
                    ? message.substring(0, MAX_ERROR_MESSAGE_LENGTH)
                    : message;
            System.err.println(errMsg.replace("\n", "") + " >>> URL was: \"" + site + "\"");
        }
        finally {
            // Release the stream on every path. Closing an already closed
            // stream has no effect, so this is safe even if consumeStream
            // closed it itself.
            closeQuietly(connection);
        }
        return null;
    }

    /** Closes {@code stream} if it is non-null, ignoring any I/O error raised by the close. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        }
        catch (java.io.IOException ignored) {
            // Nothing useful can be done about a failed close; the fetch
            // outcome has already been decided by the caller.
        }
    }

    /**
     * {@link FetchResult} that additionally carries the HTML of the fetched page.
     */
    public static class HtmlFetchResult
    extends FetchResult {
        /** HTML of the page; {@code null} when created without HTML. */
        private final String html;

        /**
         * Creates a result without HTML; {@link #getHtml()} will return {@code null}.
         *
         * @param url      the fetched URL
         * @param outlinks the outlinks of the page
         */
        public HtmlFetchResult(String url, HashSet<String> outlinks) {
            super(url, outlinks);
            this.html = null;
        }

        /**
         * Creates a result carrying the page's HTML.
         *
         * @param url      the fetched URL
         * @param outlinks the outlinks of the page
         * @param html     the HTML of the page
         */
        public HtmlFetchResult(String url, HashSet<String> outlinks, String html) {
            super(url, outlinks);
            this.html = html;
        }

        /**
         * @return the HTML passed at construction, or {@code null} if the
         *         two-argument constructor was used
         */
        public String getHtml() {
            return this.html;
        }

        /**
         * Returns the stored HTML itself.
         *
         * <p>NOTE(review): this returns {@code null} when no HTML was supplied;
         * kept as-is because callers may rely on it.
         */
        @Override
        public String toString() {
            return this.html;
        }
    }
}

