package cn.dolphin.core.html;

import cn.dolphin.core.consts.StringConstant;
import cn.dolphin.core.regex.CoreRegex;
import cn.dolphin.core.util.StrUtil;
import cn.dolphin.core.util.StrFormatter;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

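/**
 * HTML helper utilities: strip tags, escape and unescape special characters,
 * and extract element or attribute values from HTML text.
 */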
@SuppressWarnings("all")
public class HtmlUtil {

    /**
     * Escape lookup table indexed by character code (0-63). Each slot holds the
     * characters appended by {@code encode} for that code.
     */
    private static final char[][] TEXT = new char[64][];

    /** Regex source for a complete script element. */
    private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>";
    /** Regex source for a complete style element. */
    private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>";
    /** Regex source for any single tag. */
    private static final String regEx_html = "<[^>]+>";


    static {
        // By default every character in the table maps to itself.
        int code = 0;
        while (code < TEXT.length) {
            TEXT[code] = new char[] { (char) code };
            code++;
        }

        // Overrides for the special HTML characters.
        // Single quote ('&apos;' doesn't work - it is not by the w3 specs).
        TEXT['\''] = "&#039;".toCharArray();
        // Double quote.
        TEXT['"'] = StringConstant.HTML_QUOTE.toCharArray();
        // Ampersand.
        TEXT['&'] = StringConstant.HTML_AMP.toCharArray();
        // Less-than sign.
        TEXT['<'] = StringConstant.HTML_LT.toCharArray();
        // Greater-than sign.
        TEXT['>'] = StringConstant.HTML_GT.toCharArray();
    }

    /** Matches a whole script element, including its body (case-insensitive). */
    private static final Pattern SCRIPT_ELEMENT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole style element, including its body (case-insensitive). */
    private static final Pattern STYLE_ELEMENT_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Matches any single opening, closing or self-closing tag. */
    private static final Pattern ANY_TAG_PATTERN =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Strips HTML markup from a string.
     * <p>
     * Script and style elements are removed together with their bodies, then
     * every remaining tag is removed, and the result is trimmed.
     *
     * @param htmlStr the HTML text; may be {@code null}
     * @return the text with all tags removed and surrounding whitespace trimmed,
     *         or {@code null} if {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        // Order matters: script/style bodies must go before the generic tag pass,
        // otherwise their contents would survive as plain text.
        String text = SCRIPT_ELEMENT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_ELEMENT_PATTERN.matcher(text).replaceAll("");
        text = ANY_TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Escapes the characters that are significant in HTML.
     * <p>
     * The following replacements are made in a single pass:
     * {@code &} to {@code &amp;}, {@code <} to {@code &lt;}, {@code >} to
     * {@code &gt;}, {@code "} to {@code &quot;}, {@code '} to {@code &#x27;}
     * and {@code /} to {@code &#x2f;}.
     *
     * @param content the text to escape; may be {@code null}
     * @return the escaped text, or {@code null} if {@code content} is {@code null}
     */
    public static String changeTag(String content) {
        if (content == null) {
            return null;
        }
        final int length = content.length();
        StringBuilder escaped = new StringBuilder(length + 16);
        for (int i = 0; i < length; i++) {
            char c = content.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#x27;");
                    break;
                case '/':
                    escaped.append("&#x2f;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }


    /**
     * Restores escaped HTML special characters to their literal form.
     * <p>
     * Handles the single-quote references {@code &#39;}, {@code &#039;} and
     * {@code &#x27;}, the slash reference {@code &#x2f;}, and the entities held
     * in {@code StringConstant.HTML_LT}, {@code HTML_GT}, {@code HTML_QUOTE},
     * {@code HTML_NBSP} and {@code HTML_AMP} (presumably {@code &lt;},
     * {@code &gt;}, {@code &quot;}, {@code &nbsp;} and {@code &amp;} - confirm
     * against {@code StringConstant}).
     *
     * @param htmlStr HTML content containing escape sequences
     * @return the unescaped string; blank or {@code null} input is returned unchanged
     */
    public static String restoreEscaped(String htmlStr) {
        if (StrUtil.isBlank(htmlStr)) {
            return htmlStr;
        }
        return htmlStr
                .replace("&#39;", "'")
                .replace("&#039;", "'")
                .replace("&#x27;", "'")
                .replace("&#x2f;", "/")
                .replace(StringConstant.HTML_LT, "<")
                .replace(StringConstant.HTML_GT, ">")
                .replace(StringConstant.HTML_QUOTE, "\"")
                .replace(StringConstant.HTML_NBSP, " ")
                // The ampersand must be restored last: doing it earlier turns an
                // escaped entity such as "&amp;quot;" into "&quot;", which a later
                // step would then wrongly unescape a second time.
                .replace(StringConstant.HTML_AMP, "&");
    }


    /**
     * Collects the values of one attribute from every occurrence of one element.
     * <p>
     * Element and attribute names are matched literally and case-insensitively.
     * The element name must be followed by whitespace, so {@code "a"} does not
     * match {@code <abbr ...>}. Values may be double-quoted, single-quoted or
     * unquoted; quoted values are returned in full even when they contain
     * whitespace.
     *
     * @param source  the text to search; may be {@code null}
     * @param element the tag name
     * @param attr    the attribute name
     * @return the attribute values in document order; empty (never {@code null})
     *         when nothing matches or any argument is {@code null}
     */
    public static List<String> matchAll(String source, String element, String attr) {
        List<String> result = new ArrayList<String>();
        if (source == null || element == null || attr == null) {
            return result;
        }
        // Group 1: double-quoted value, group 2: single-quoted value,
        // group 3: unquoted value (ends at whitespace, ">" or "/>").
        Pattern pattern = Pattern.compile(
                "<" + Pattern.quote(element) + "(?=\\s)[^<>]*?\\s" + Pattern.quote(attr)
                        + "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'<>]*?)(?=\\s|/?>))",
                Pattern.CASE_INSENSITIVE);
        Matcher m = pattern.matcher(source);
        while (m.find()) {
            String value = m.group(1);
            if (value == null) {
                value = m.group(2);
            }
            if (value == null) {
                value = m.group(3);
            }
            result.add(value);
        }
        return result;
    }


    /**
     * Returns the value of one attribute from the first matching element.
     * <p>
     * Element and attribute names are matched literally and case-insensitively.
     * The element name must be followed by whitespace, so {@code "a"} does not
     * match {@code <abbr ...>}. Values may be double-quoted, single-quoted or
     * unquoted; quoted values are returned in full even when they contain
     * whitespace.
     *
     * @param source  the text to search; may be {@code null}
     * @param element the tag name
     * @param attr    the attribute name
     * @return the first attribute value found, or {@code null} when nothing
     *         matches or any argument is {@code null}
     */
    public static String matchOne(String source, String element, String attr) {
        if (source == null || element == null || attr == null) {
            return null;
        }
        // Group 1: double-quoted value, group 2: single-quoted value,
        // group 3: unquoted value (ends at whitespace, ">" or "/>").
        Pattern pattern = Pattern.compile(
                "<" + Pattern.quote(element) + "(?=\\s)[^<>]*?\\s" + Pattern.quote(attr)
                        + "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'<>]*?)(?=\\s|/?>))",
                Pattern.CASE_INSENSITIVE);
        Matcher m = pattern.matcher(source);
        if (!m.find()) {
            return null;
        }
        if (m.group(1) != null) {
            return m.group(1);
        }
        if (m.group(2) != null) {
            return m.group(2);
        }
        return m.group(3);
    }

    /**
     * Escapes HTML special characters in a text using the class-level
     * {@code TEXT} lookup table. The table overrides these characters:
     * <ul>
     * <li>single quote, replaced with {@code &#039;}</li>
     * <li>double quote, replaced with {@code StringConstant.HTML_QUOTE}</li>
     * <li>ampersand, replaced with {@code StringConstant.HTML_AMP}</li>
     * <li>less-than sign, replaced with {@code StringConstant.HTML_LT}</li>
     * <li>greater-than sign, replaced with {@code StringConstant.HTML_GT}</li>
     * </ul>
     *
     * @param text the text to escape
     * @return the escaped text
     */
    public static String encode(String text) {
        final String escaped = encode(text, TEXT);
        return escaped;
    }

    /**
     * Removes every substring of the content that matches
     * {@code CoreRegex.RE_HTML_MARK}.
     *
     * @param content the text to clean
     * @return the text with all matches removed
     */
    public static String cleanHtmlTag(String content) {
        final Pattern markPattern = Pattern.compile(CoreRegex.RE_HTML_MARK);
        final Matcher marks = markPattern.matcher(content);
        return marks.replaceAll("");
    }


    /**
     * Removes the given HTML tags together with the content they enclose.
     * Delegates to the three-argument overload with content removal enabled.
     *
     * @param content  the text to process
     * @param tagNames the tags to remove
     * @return the text after removal
     */
    public static String removeHtmlTag(String content, String... tagNames) {
        final boolean dropEnclosedContent = true;
        return removeHtmlTag(content, dropEnclosedContent, tagNames);
    }

    /**
     * Removes the given HTML tags but keeps the content they enclose.
     * Delegates to the three-argument {@code removeHtmlTag} with content
     * removal disabled.
     *
     * @param content  the text to process
     * @param tagNames the tags to remove
     * @return the text after removal
     */
    public static String unwrapHtmlTag(String content, String... tagNames) {
        final boolean dropEnclosedContent = false;
        return removeHtmlTag(content, dropEnclosedContent, tagNames);
    }



    /**
     * Removes the given HTML tags, matching tag names case-insensitively.
     * <p>
     * For each tag name, self-closing forms ({@code <tag ... />}) are removed
     * first. Then, depending on {@code withTagContent}, either the whole
     * element including its content is removed, or only the opening and closing
     * tags are removed and the content is kept.
     * <p>
     * Tag names are matched literally and as whole names: removing {@code "b"}
     * leaves {@code <br/>} and {@code <body>} untouched. Blank or {@code null}
     * tag names are skipped.
     *
     * @param content        the text to process; may be {@code null}
     * @param withTagContent {@code true} to also remove the content enclosed by
     *                       the tags, {@code false} to keep it
     * @param tagNames       the tags to remove
     * @return the text after removal; {@code content} unchanged when it or
     *         {@code tagNames} is {@code null}
     */
    public static String removeHtmlTag(String content, boolean withTagContent, String... tagNames) {
        if (content == null || tagNames == null) {
            return content;
        }
        final int flags = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
        String result = content;
        for (String tagName : tagNames) {
            if (tagName == null || tagName.trim().isEmpty()) {
                continue;
            }
            String quotedName = Pattern.quote(tagName.trim());
            // The lookahead ends the tag name, so "b" cannot match "<br" or "<body".
            String openStart = "<" + quotedName + "(?=[\\s/>])";
            String closeTag = "</" + quotedName + "\\s*>";

            Pattern selfClosing = Pattern.compile(openStart + "[^>]*?/>", flags);
            Pattern paired;
            if (withTagContent) {
                // Opening tag, enclosed content (may span lines) and closing tag.
                paired = Pattern.compile(openStart + "[^>]*?>.*?" + closeTag, flags);
            } else {
                // Opening tag or closing tag only; the content is kept.
                paired = Pattern.compile(openStart + "[^>]*?>|" + closeTag, flags);
            }

            result = selfClosing.matcher(result).replaceAll("");
            result = paired.matcher(result).replaceAll("");
        }
        return result;
    }

    /**
     * Removes quoted attributes with the given names from the content.
     * <p>
     * Names are matched literally, case-insensitively and as whole names:
     * removing {@code "src"} leaves {@code data-src="..."} untouched. Only
     * attributes whose value is wrapped in single or double quotes are removed;
     * whitespace directly before the attribute is removed with it. Blank or
     * {@code null} names are skipped. Matching is textual, so an identical
     * {@code name="value"} sequence outside a tag is removed as well.
     *
     * @param content the text to process; may be {@code null}
     * @param attrs   the attribute names (case-insensitive)
     * @return the processed text; {@code content} unchanged when it or
     *         {@code attrs} is {@code null}
     */
    public static String removeHtmlAttr(String content, String... attrs) {
        if (content == null || attrs == null) {
            return content;
        }
        String result = content;
        for (String attr : attrs) {
            if (attr == null || attr.trim().isEmpty()) {
                // An empty name would strip the ="value" part of every attribute.
                continue;
            }
            // The negative lookbehind rejects matches in the middle of a longer
            // attribute name such as "data-src" or "valid".
            Pattern attrPattern = Pattern.compile(
                    "\\s*(?<![\\w:.-])" + Pattern.quote(attr.trim()) + "\\s*=\\s*([\"']).*?\\1",
                    Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
            result = attrPattern.matcher(result).replaceAll("");
        }
        return result;
    }

    /**
     * Removes every attribute from the opening tags of the given elements.
     * <p>
     * Tag names are matched literally, case-insensitively and as whole names:
     * processing {@code "b"} leaves {@code <br ...>} untouched. The tag name is
     * kept exactly as written in the content, and a trailing {@code /} of a
     * self-closing tag is preserved. Blank or {@code null} tag names are
     * skipped.
     *
     * @param content  the text to process; may be {@code null}
     * @param tagNames the tags whose attributes are removed
     * @return the processed text; {@code content} unchanged when it or
     *         {@code tagNames} is {@code null}
     */
    public static String removeAllHtmlAttr(String content, String... tagNames) {
        if (content == null || tagNames == null) {
            return content;
        }
        String result = content;
        for (String tagName : tagNames) {
            if (tagName == null || tagName.trim().isEmpty()) {
                continue;
            }
            // Group 1: the tag name as written; group 2: optional self-closing slash.
            Pattern openTag = Pattern.compile(
                    "<(" + Pattern.quote(tagName.trim()) + ")(?=[\\s/>])[^>]*?(/?)>",
                    Pattern.CASE_INSENSITIVE);
            // Strings are immutable, so the replaced text must be assigned back.
            result = openTag.matcher(result).replaceAll("<$1$2>");
        }
        return result;
    }


    /**
     * Escapes a text using a per-character lookup table.
     * <p>
     * Characters with a code below 64 are replaced by the table entry at that
     * index; all other characters are copied unchanged.
     *
     * @param text  the text to encode
     * @param array the lookup table, indexed by character code
     * @return the encoded text, or the empty string for {@code null} or empty input
     */
    private static String encode(String text, char[][] array) {
        if (text == null) {
            return StrUtil.EMPTY;
        }
        final int len = text.length();
        if (len == 0) {
            return StrUtil.EMPTY;
        }
        // Reserve 25% extra room for the expanded entities.
        final StringBuilder out = new StringBuilder(len + (len >> 2));
        for (char c : text.toCharArray()) {
            if (c >= 64) {
                out.append(c);
            } else {
                out.append(array[c]);
            }
        }
        return out.toString();
    }

    /**
     * Matches the first {@code src="..."} value that is an http(s) URL whose
     * last path segment is a run of digits followed by an image extension.
     * The URL may not cross the closing double quote.
     */
    private static final Pattern FIRST_IMAGE_SRC_PATTERN = Pattern.compile(
            " src=\"(https?://[^\"]*?/[0-9]+\\.(?:jpg|png|gif|bmp|jpeg))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Returns the URL of the first image referenced in the HTML content.
     * <p>
     * Only {@code src} attributes holding an {@code http://} or
     * {@code https://} URL whose file name consists of digits plus one of the
     * extensions jpg, png, gif, bmp or jpeg are recognised (case-insensitive).
     * Spaces inside the matched URL are removed.
     *
     * @param content the HTML content; may be {@code null}
     * @return the image URL, or the empty string when none is found
     */
    public static String getHtmlPic(String content) {
        if (content == null) {
            return "";
        }
        Matcher m = FIRST_IMAGE_SRC_PATTERN.matcher(content);
        if (m.find()) {
            return m.group(1).replace(" ", "");
        }
        return "";
    }



    /**
     * Reads an HTML file and returns its content as one string.
     * <p>
     * The file is read line by line with the platform default charset and the
     * lines are concatenated without their line terminators. The reader and the
     * underlying streams are always closed.
     *
     * @param html the file to read
     * @return the concatenated lines of the file
     * @throws Exception if the file cannot be opened or read (an
     *                   {@link IOException}); failures are reported to the
     *                   caller rather than returned as an empty string
     */
    public static String getHtmlContent(File html) throws Exception {
        StringBuilder content = new StringBuilder();
        // Closing the outermost reader also closes the wrapped streams.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(html)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }


}
