/*
 * Decompiled with CFR 0.152.
 */
package de.jungblut.crawl.extraction;

import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.extraction.Extractor;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.mozilla.universalchardet.UniversalDetector;

/**
 * Extractor that downloads a page and collects the absolute URLs of the links
 * ({@link LinkTag} nodes) found in its HTML.
 *
 * <p>A link is kept only if {@link #isValid(String)} accepts it: it must start
 * with a base URL, must not end in one of the ignored file suffixes (style
 * sheets, scripts, images, media, archives, ...) and must consist solely of
 * the characters permitted by the general URL pattern.
 *
 * <p>NOTE(review): the base URL pattern only matches lower-case host names and
 * does not include a port, so root-relative links on pages served from a
 * non-default port are resolved without that port.
 */
public final class OutlinkExtractor
implements Extractor<FetchResult> {
    /** Initial capacity, in bytes, of the buffer the page is downloaded into. */
    private static final int BUFFER_SIZE = 0x100000;
    /** URLs longer than this are rejected by {@link #extract(String)}. */
    private static final int MAX_URL_LENGTH = 500;
    /** Charset used when detection yields nothing or a name this JVM cannot decode. */
    private static final String FALLBACK_CHARSET = "UTF-8";
    private static final String USER_AGENT_KEY = "User-Agent";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";
    private static final NodeFilter LINK_FILTER = new NodeClassFilter(LinkTag.class);
    private static final Pattern IGNORE_SUFFIX_PATTERN = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|iso|rm|smil|wmv|swf|wma|zip|rar|gz))$");
    private static final Pattern BASE_URL_PATTERN = Pattern.compile("(http[s]*://[a-z0-9.-]+)");
    private static final Pattern GENERAL_URL_PATTERN = Pattern.compile("\\bhttps?://[-a-zA-Z0-9+&#/%?=~_|!:,.;]*[-a-zA-Z0-9+&#/%=~_|]");
    /** Matches a leading URI scheme such as {@code ftp:} or {@code mailto:}. */
    private static final Pattern SCHEME_PATTERN = Pattern.compile("[a-zA-Z][a-zA-Z0-9+.-]*:");

    /**
     * Fetches the given URL and extracts its outgoing links.
     *
     * @param realUrl the URL to fetch; must be non-null, start with
     *        {@code "http"} and be at most {@value #MAX_URL_LENGTH} characters long
     * @return the fetch result holding the URL and its outlinks, or
     *         {@code null} if the URL was rejected or fetching/parsing failed
     */
    @Override
    public FetchResult extract(String realUrl) {
        if (realUrl == null || !realUrl.startsWith("http") || realUrl.length() > MAX_URL_LENGTH) {
            return null;
        }
        try {
            InputStream connection = OutlinkExtractor.getConnection(realUrl);
            String html = OutlinkExtractor.consumeStream(connection);
            HashSet<String> outlinks = OutlinkExtractor.extractOutlinks(html, realUrl);
            return new FetchResult(realUrl, outlinks);
        }
        catch (ParserException ignored) {
            // Deliberately silent: a page the HTML parser cannot handle simply
            // yields no result, exactly like any other failed fetch.
        }
        catch (RuntimeException rEx) {
            rEx.printStackTrace();
        }
        catch (Exception e) {
            System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \"" + realUrl + "\"");
        }
        return null;
    }

    /**
     * Opens a connection to the given URL, sending a browser-like
     * {@code User-Agent} header, and returns the response stream.
     *
     * @param realUrl the URL to connect to
     * @return the input stream of the connection; the caller must close it
     * @throws IOException if the URL is malformed or the connection fails
     */
    public static InputStream getConnection(String realUrl) throws IOException {
        URLConnection con = new URL(realUrl).openConnection();
        con.addRequestProperty(USER_AGENT_KEY, USER_AGENT);
        return con.getInputStream();
    }

    /**
     * Removes, in place, every element of the set that the pattern does not
     * fully match.
     *
     * @param set the set to filter; modified in place
     * @param matcher the pattern elements must match to be kept; if
     *        {@code null}, nothing is removed
     * @return the same set instance that was passed in
     */
    public static HashSet<String> filter(HashSet<String> set, Pattern matcher) {
        if (set == null || matcher == null) {
            return set;
        }
        Iterator<String> it = set.iterator();
        while (it.hasNext()) {
            if (!matcher.matcher(it.next()).matches()) {
                it.remove();
            }
        }
        return set;
    }

    /**
     * Parses the HTML and returns the set of valid, absolute outlinks.
     *
     * <p>Each link has its fragment stripped (everything from the first
     * {@code '#'}). Links that are not already valid absolute URLs are
     * resolved exactly once against the page URL (protocol-relative,
     * root-relative or directory-relative) and kept only if the result is
     * valid.
     *
     * @param html the HTML source of the page
     * @param url the URL the page was fetched from
     * @return the extracted links, or {@code null} if no base URL could be
     *         extracted from {@code url}
     * @throws ParserException if the HTML parser fails
     */
    public static HashSet<String> extractOutlinks(String html, String url) throws ParserException {
        String baseUrl = OutlinkExtractor.extractBaseUrl(url);
        if (baseUrl == null) {
            return null;
        }
        HashSet<String> set = new HashSet<String>();
        Parser parser = new Parser(html);
        NodeList matches = parser.extractAllNodesThatMatch(LINK_FILTER);
        SimpleNodeIterator it = matches.elements();
        while (it.hasMoreNodes()) {
            LinkTag node = (LinkTag)it.nextNode();
            String link = node.getLink();
            // Check for null before touching the string, not afterwards.
            if (link == null) {
                continue;
            }
            link = link.trim();
            // The fragment starts at the FIRST '#'; cutting at the last one
            // would leave part of the fragment in the stored URL.
            int fragmentStart = link.indexOf('#');
            if (fragmentStart >= 0) {
                link = link.substring(0, fragmentStart);
            }
            if (link.isEmpty()) {
                continue;
            }
            if (!OutlinkExtractor.isValid(link)) {
                link = OutlinkExtractor.toAbsolute(link, url, baseUrl);
                if (link == null || !OutlinkExtractor.isValid(link)) {
                    continue;
                }
            }
            set.add(link);
        }
        return set;
    }

    /**
     * Resolves a non-empty link that is not itself a valid absolute URL
     * against the page it was found on. Exactly one resolution strategy is
     * applied so that a rejected candidate is never re-resolved as a relative
     * path.
     *
     * @return the absolute candidate (still to be validated), or {@code null}
     *         if the link carries its own scheme and therefore is not relative
     */
    private static String toAbsolute(String link, String url, String baseUrl) {
        if (link.startsWith("//")) {
            // Protocol-relative: inherit the scheme of the page ("http:" or "https:").
            return baseUrl.substring(0, baseUrl.indexOf("//")) + link;
        }
        if (link.charAt(0) == '/') {
            // Root-relative: resolve against scheme and host.
            return baseUrl + link;
        }
        if (SCHEME_PATTERN.matcher(link).lookingAt()) {
            // e.g. "ftp://...", "mailto:...", or an http(s) URL that failed
            // validation: appending it to the page directory would produce garbage.
            return null;
        }
        return OutlinkExtractor.parentDirectory(url) + link;
    }

    /**
     * Returns the "directory" part of the URL including the trailing slash,
     * ignoring any query string or fragment. For a host-only URL such as
     * {@code http://example.com} this is {@code http://example.com/}.
     */
    private static String parentDirectory(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String location = url.substring(0, end);
        int schemeSeparator = location.indexOf("://");
        int pathSearchStart = schemeSeparator < 0 ? 0 : schemeSeparator + 3;
        int lastSlash = location.lastIndexOf('/');
        if (lastSlash < pathSearchStart) {
            // No path at all: the only slashes belong to "://".
            return location + "/";
        }
        return location.substring(0, lastSlash + 1);
    }

    /**
     * Reads the stream to its end, detects its character encoding and returns
     * the content decoded as a string. UTF-8 is used when no encoding is
     * detected or the detected one is not supported by this JVM. The stream
     * is always closed.
     *
     * @param stream the stream to consume
     * @return the decoded content of the stream
     * @throws IOException if reading or closing the stream fails
     */
    public static String consumeStream(InputStream stream) throws IOException {
        try {
            UniversalDetector detector = new UniversalDetector(null);
            ReadableByteChannel channel = Channels.newChannel(stream);
            ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);
            int read;
            while ((read = channel.read(buffer)) != -1) {
                // Feed only the bytes appended by this read to the detector.
                detector.handleData(buffer.array(), buffer.position() - read, read);
                buffer = OutlinkExtractor.resizeBuffer(buffer);
            }
            detector.dataEnd();
            String charsetName = OutlinkExtractor.decodableCharset(detector.getDetectedCharset());
            return new String(buffer.array(), 0, buffer.position(), charsetName);
        }
        finally {
            if (stream != null) {
                stream.close();
            }
        }
    }

    /**
     * Returns the detected charset name if this JVM can decode it, otherwise
     * the fallback charset, so that an exotic detection result does not make
     * the whole page fail with an unsupported-encoding error.
     */
    private static String decodableCharset(String detected) {
        if (detected == null) {
            return FALLBACK_CHARSET;
        }
        try {
            return Charset.isSupported(detected) ? detected : FALLBACK_CHARSET;
        }
        catch (IllegalArgumentException ignored) {
            // Thrown for syntactically illegal charset names.
            return FALLBACK_CHARSET;
        }
    }

    /**
     * Returns a buffer with room for further reads: the given buffer itself
     * while at least 10% of its capacity remains free, otherwise a new buffer
     * of twice the capacity holding the same content.
     */
    private static ByteBuffer resizeBuffer(ByteBuffer buffer) {
        int lowWaterMark = (int)((float)buffer.capacity() * 0.1f);
        if (buffer.remaining() >= lowWaterMark) {
            return buffer;
        }
        ByteBuffer grown = ByteBuffer.allocate(buffer.capacity() * 2);
        buffer.flip();
        grown.put(buffer);
        return grown;
    }

    /**
     * Extracts the first base URL (scheme plus host) found in the given URL.
     *
     * @param url the URL to inspect
     * @return the base URL, or {@code null} if {@code url} is {@code null} or
     *         contains no match
     */
    public static String extractBaseUrl(String url) {
        if (url == null) {
            return null;
        }
        Matcher matcher = BASE_URL_PATTERN.matcher(url);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Checks whether the string is an acceptable outlink: it starts with a
     * base URL, does not end in an ignored file suffix and fully matches the
     * general URL pattern.
     *
     * @param s the candidate URL
     * @return {@code true} if the candidate is acceptable; {@code false}
     *         otherwise, including for {@code null}
     */
    public static boolean isValid(String s) {
        if (s == null) {
            return false;
        }
        Matcher baseMatcher = BASE_URL_PATTERN.matcher(s);
        if (!baseMatcher.find() || baseMatcher.start() != 0) {
            return false;
        }
        if (IGNORE_SUFFIX_PATTERN.matcher(s).matches()) {
            return false;
        }
        return GENERAL_URL_PATTERN.matcher(s).matches();
    }
}

