package itez.plat.site.service.impl;

import itez.core.runtime.service.EModelService;
import itez.core.wrapper.dbo.model.Query;
import itez.core.wrapper.dbo.model.Querys;
import itez.kit.EArr;
import itez.kit.EClean;
import itez.kit.EDate;
import itez.kit.EHttp;
import itez.kit.EProp;
import itez.kit.ERegex;
import itez.kit.EStr;
import itez.kit.EUid;
import itez.kit.fileup.EFileKit;
import itez.kit.restful.EMap;
import itez.core.runtime.service.Define;
import itez.plat.site.ModuleConfig;
import itez.plat.site.model.Channel;
import itez.plat.site.model.CollectorItem;
import itez.plat.site.model.CollectorSrc;
import itez.plat.site.model.CollectorTask;
import itez.plat.site.model.Content;
import itez.plat.site.service.ChannelService;
import itez.plat.site.service.CollectorItemService;
import itez.plat.site.service.CollectorSrcService;
import itez.plat.site.service.CollectorTaskService;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Set;

import javax.imageio.ImageIO;

import org.apache.commons.lang3.ArrayUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import com.beust.jcommander.internal.Lists;
import com.beust.jcommander.internal.Sets;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.jfinal.plugin.activerecord.SqlPara;

/**
 * Collector task service implementation.
 * Generated automatically by the JWinner Service Generator.
 */
@Define
@Singleton
public class CollectorTaskServiceImpl extends EModelService<CollectorTask> implements CollectorTaskService {

	// Channel service (injected); collItem uses it to resolve a task's channel by code.
	@Inject
	ChannelService chnSer;
	
	// Collector source service (injected); supplies the CollectorSrc (selectors and charsets) of a task.
	@Inject
	CollectorSrcService srcSer;

	// Collector item service (injected); reads the item records that belong to a task.
	@Inject
	CollectorItemService itemSer;
	
	// jsoup whitelist passed to EClean.clean for collected article HTML; assigned once in the constructor.
	private Whitelist whitelist;
	
	/**
	 * Builds the HTML whitelist used when cleaning collected article content.
	 * Starts from jsoup's basic whitelist, adds common block, list, table and
	 * image tags, and allows {@code style}/{@code class} (plus a few tag-specific
	 * attributes) on them.
	 */
	public CollectorTaskServiceImpl() {
		Whitelist allowed = Whitelist.basic();
		allowed.addTags(
				"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
				"colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
				"i", "img", "li", "ol", "p", "pre", "small", "span", "strike", "strong",
				"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
				"ul");

		// Tags that only need the presentational attributes.
		for (String tag : new String[] { "div", "p", "span", "i" }) {
			allowed.addAttributes(tag, "style", "class");
		}

		// Tags that carry additional, tag-specific attributes.
		allowed.addAttributes("a", "href", "title", "style", "class");
		allowed.addAttributes("img", "src", "title", "style", "class");
		allowed.addAttributes("ul", "type", "style", "class");
		allowed.addAttributes("ol", "start", "type", "style", "class");
		allowed.addAttributes("table", "summary", "style", "class");
		allowed.addAttributes("td", "abbr", "axis", "colspan", "rowspan", "style", "class");
		allowed.addAttributes("th", "abbr", "axis", "colspan", "rowspan", "scope", "style", "class");

		this.whitelist = allowed;
	}
		
	/**
	 * Selects the collector tasks whose {@code channelCode} equals the given code.
	 *
	 * @param channelCode channel code to filter by
	 * @return the matching tasks, as returned by {@code select}
	 */
	@Override
	public List<CollectorTask> getTasks(String channelCode) {
		return select(Querys.and(Query.eq("channelCode", channelCode)));
	}
	
	/**
	 * Selects the first collector task matching channel code, source id and URL.
	 *
	 * @param channelCode channel code of the task
	 * @param srcId       id of the collector source
	 * @param url         list URL configured on the task
	 * @return the first matching task, as returned by {@code selectFirst}
	 */
	@Override
	public CollectorTask getTask(String channelCode, String srcId, String url) {
		return selectFirst(
				Querys.and(Query.eq("channelCode", channelCode))
						.add(Query.eq("srcId", srcId))
						.add(Query.eq("url", url)));
	}

	/**
	 * Crawls the list pages of a task, starting at the task URL and following the
	 * configured previous/next pager links, and saves every article link whose URL
	 * hash is not yet recorded for the task.
	 *
	 * @param task the collector task; its id, source id and URL are read
	 * @return the number of newly discovered items
	 */
	@Override
	public int collList(CollectorTask task) {
		// Hashes of article URLs that must not be queued again: seeded with the hashes
		// already stored for this task and extended while crawling, so a link that
		// shows up on several list pages is only queued once.
		Set<Integer> knownHashes = Sets.newHashSet();
		String itemUrls = itemSer.getItemsHash(task.getId());
		if(EStr.notEmpty(itemUrls)){
			for(String part : itemUrls.split(",")){
				String trimmed = part.trim();
				if(!trimmed.isEmpty()) knownHashes.add(Integer.parseInt(trimmed));
			}
		}
		// Newly discovered articles
		List<CollectorItem> items = Lists.newArrayList();
		// Hashes of list pages already visited, to avoid crawling a page twice
		Set<Integer> listUrls = Sets.newHashSet();
		// Collector source (selectors and charset)
		CollectorSrc src = srcSer.findById(task.getSrcId());
		
		// Crawl the start page; dir = 0 follows the pager in both directions
		collListPage(task, src, listUrls, knownHashes, items, task.getUrl(), 0);
		// Persist the result
		if(items.size() > 0){
			dbo().tx(() -> {
				int[] ret1 = dbo().batchSave(items, 50);
				return EArr.vali(ret1);
			});
		}
		return items.size();
	}
	
	/**
	 * Collects the article links of one list page and recurses into the pager links.
	 *
	 * @param task        task the items are attached to
	 * @param src         collector source providing selectors and list charset
	 * @param listUrls    hashes of list pages already visited (updated here)
	 * @param knownHashes hashes of article URLs already stored or queued (updated here)
	 * @param items       receives the newly created items
	 * @param url         list page URL; anything after '#' is ignored
	 * @param dir         0 = follow both pager directions, -1 = only previous, 1 = only next
	 */
	private void collListPage(CollectorTask task, CollectorSrc src, Set<Integer> listUrls, Set<Integer> knownHashes, List<CollectorItem> items, String url, int dir){
		String domain = $domain();
		url = getUrlContent(url);
		if(EStr.isEmpty(url)) return;
		
		// Skip list pages that were already visited
		int hash = url.hashCode();
		if(!listUrls.add(hash)) return;

		// Fetch and parse the list page; a failed request yields no bytes
		byte[] bytes = EHttp.me.getByte(url);
		if(bytes == null) return;
		String html = new String(bytes, "UTF8".equals(src.getCharsetList()) ? EStr.UTF_8 : EStr.GBK);
		Document doc = Jsoup.parse(html, url);
		Elements elLists = doc.select(src.getElList());
		if(elLists.size() > 0){
			for(Element elLink : elLists.select(src.getElLink())){
				String link = getUrlContent(elLink.absUrl("href"));
				String caption = elLink.text();
				if(EStr.isEmpty(caption) || EStr.isEmpty(link) || link.startsWith("javascript")) continue;
				int ha = link.hashCode();
				// add() returns false when the hash is already known, i.e. a duplicate
				if(!knownHashes.add(ha)) continue;
				CollectorItem item = new CollectorItem();
				item.setId(EUid.generator()).setDomain(domain).setTaskId(task.getId());
				item.setUrl(link).setUrlHash(ha).setCaption(caption);
				item.setState(false).setCdate(EDate.getDate());
				items.add(item);
			}
		}
		
		// Previous page
		if(EStr.notEmpty(src.getElPrev())){
			Elements elPrevs = doc.select(src.getElPrev());
			if((dir == 0 || dir == -1) && elPrevs.size() > 0){
				String link = getUrlContent(elPrevs.get(0).absUrl("href"));
				if(EStr.notEmpty(link) && !link.startsWith("javascript")) collListPage(task, src, listUrls, knownHashes, items, link, -1);
			}
		}
		// Next page
		if(EStr.notEmpty(src.getElNext())){
			Elements elNexts = doc.select(src.getElNext());
			if((dir == 0 || dir == 1) && elNexts.size() > 0){
				String link = getUrlContent(elNexts.get(0).absUrl("href"));
				if(EStr.notEmpty(link) && !link.startsWith("javascript")) collListPage(task, src, listUrls, knownHashes, items, link, 1);
			}
		}
	}

	/**
	 * Collects the article pages of a task's items and saves the resulting
	 * contents together with the updated item records in one transaction.
	 *
	 * @param task   the collector task; its id, source id and channel code are read
	 * @param repeat {@code true} passes {@code null} as the second argument of
	 *               {@code itemSer.getItems}; {@code false} or {@code null} passes
	 *               {@code false} (presumably a collected-state filter — confirm
	 *               against CollectorItemService)
	 * @param odate  forwarded to {@link #collItemDetail} as {@code origDate}
	 * @return the number of contents that were collected
	 */
	@Override
	public int collItem(CollectorTask task, Boolean repeat, Boolean odate) {
		// Target channel of the task
		Channel channel = chnSer.findByCode(task.getChannelCode());
		
		String domain = $domain();
		String author = $comp().getCaption();
		CollectorSrc src = srcSer.findById(task.getSrcId());
		List<Content> conts = Lists.newArrayList();
		// Items that were actually collected in this run, i.e. the only ones modified
		List<CollectorItem> collected = Lists.newArrayList();
		List<CollectorItem> items = itemSer.getItems(task.getId(), Boolean.TRUE.equals(repeat) ? null : Boolean.FALSE);
		
		// Walk the item list; collItemDetail returns null (and leaves the item untouched) on failure
		for(CollectorItem item : items){
			Content cont = collItemDetail(domain, channel, src, odate, item, false);
			if(null == cont) continue;
			cont.setAuthor(author);
			conts.add(cont);
			collected.add(item);
		}
		// Persist the result. Only the modified items are sent to batchUpdate:
		// unmodified records have nothing to write, and a batch update that derives
		// its column set from the first record could otherwise skip the real changes.
		if(conts.size() > 0){
			dbo().tx(() -> {
				int[] ret1 = dbo().batchSave(conts, 50);
				int[] ret2 = dbo().batchUpdate(collected, 50);
				return EArr.vali(ret1, ret2);
			});
		}
		return conts.size();
	}

	/**
	 * Downloads one article page, extracts caption, body, dates and images with
	 * the selectors of the collector source, and builds a new {@link Content}.
	 * On success the item is updated in memory (content id, caption, state, dates);
	 * nothing is written to the database here.
	 *
	 * @param domain         domain stored on the content and used for the image upload path
	 * @param channel        channel the content is assigned to
	 * @param src            collector source providing selectors and item charset
	 * @param origDate       {@code true} to use the date found on the page as create/modify
	 *                       date; {@code false} or {@code null} to use the current date
	 * @param item           item whose URL is collected
	 * @param throwException {@code true} to signal failures with a RuntimeException,
	 *                       {@code false} to return {@code null} instead
	 * @return the new content, or {@code null} on failure when {@code throwException} is false
	 */
	@Override
	public Content collItemDetail(String domain, Channel channel, CollectorSrc src, Boolean origDate, CollectorItem item, boolean throwException){
		String url = item.getUrl();
		byte[] bytes = EHttp.me.getByte(url);
		if(bytes == null) {
			if(throwException) throw new RuntimeException("访问URL失败！");
			else return null;
		}
		String html = new String(bytes, "UTF8".equals(src.getCharsetItem()) ? EStr.UTF_8 : EStr.GBK);
		Document doc = Jsoup.parse(html, url);
		Elements elCaption = doc.select(src.getElCaption());
		Elements elContent = doc.select(src.getElContent());
		
		if(elCaption.size() == 0) {
			if(throwException) throw new RuntimeException("无法匹配文章标题区域！");
			else return null;
		}
		if(elContent.size() == 0) {
			if(throwException) throw new RuntimeException("无法匹配文章正文区域！");
			else return null;
		}
		
		// Dates: fall back to the current date when the page yields none
		Date cdate = EDate.getDate();
		Date odate = findOrigDate(doc, src);
		if(null == odate) odate = cdate;

		// Caption, images and body
		String caption = elCaption.text();
		String firstImgUrl = localizeImages(domain, elContent);
		formatContent(elContent); // un-hide the body container before cleaning
		String content = EClean.clean(elContent.outerHtml(), whitelist);
		
		// Optional sub caption and summary
		String subCaption = selectText(doc, src.getElSubCaption());
		String summary = selectText(doc, src.getElSummary());
		// Optional cover image
		String cover = null;
		if(EStr.notEmpty(src.getElCover())){
			Elements elCover = doc.select(src.getElCover());
			if(elCover.size() > 0){
				cover = EStr.findUseful(elCover.get(0).absUrl("src"), elCover.get(0).absUrl("data-src"));
				if(EStr.notEmpty(cover)) cover = collImg(domain, cover);
			}
		}
		
		// Without a dedicated cover, use the first image of the body
		if(EStr.isEmpty(cover) && EStr.notEmpty(firstImgUrl)) cover = firstImgUrl;
		
		// Build the new content; a null origDate is treated as false instead of failing on unboxing
		Date stamp = Boolean.TRUE.equals(origDate) ? odate : cdate;
		String contId = EUid.generator();
		Content cont = new Content();
		cont.setId(contId).setDomain(domain).setChannelId(channel.getId()).setChannelCaption(channel.getCaption());
		cont.setCaption(caption).setContent(content);
		cont.setSubCaption(subCaption).setSummary(summary).setPic(cover).setThum(cover);
		cont.setCdate(stamp).setMdate(stamp);
		cont.setCaptionColor("").setLink("").setSort(0).setUsed(1);
		
		// Keep the collector item in sync with the new content
		item.setContId(contId).setCaption(caption).setState(true).setOdate(odate).setMdate(cdate);
		
		return cont;
	}
	
	/** Returns the first date matched in the elements selected by the source's date selector, or null. */
	private Date findOrigDate(Document doc, CollectorSrc src){
		if(EStr.isEmpty(src.getElDate())) return null;
		for(Element el : doc.select(src.getElDate())){
			String strDate = el.text();
			if(EStr.isEmpty(strDate)) continue;
			Date found = matchDate(strDate);
			if(null != found) return found;
		}
		return null;
	}
	
	/** Stores every http(s) image of the body via collImg, rewrites its src, and returns the first stored path ("" if none). */
	private String localizeImages(String domain, Elements elContent){
		String firstImgUrl = "";
		for(Element img : elContent.select("img")){
			String imgUrl = EStr.findUseful(img.absUrl("src"), img.absUrl("data-src"));
			if(EStr.isEmpty(imgUrl) || !imgUrl.startsWith("http")) continue;
			String imgPath = collImg(domain, imgUrl);
			if(EStr.isEmpty(imgPath)) continue;
			if(EStr.isEmpty(firstImgUrl)) firstImgUrl = imgPath;
			img.attr("src", imgPath);
		}
		return firstImgUrl;
	}
	
	/** Returns the combined text of the elements matching the selector, or null when the selector is empty or matches nothing. */
	private String selectText(Document doc, String selector){
		if(EStr.isEmpty(selector)) return null;
		Elements found = doc.select(selector);
		return found.size() > 0 ? found.text() : null;
	}
	
	/**
	 * Makes hidden content containers visible again: in the inline {@code style}
	 * of each given element, {@code visibility: hidden} is replaced with
	 * {@code visibility: visible}. Only the elements in {@code content} themselves
	 * are processed, not their descendants.
	 *
	 * @param content the matched body elements; modified in place
	 */
	private void formatContent(Elements content){
		for(Element el : content){
			String styleStr = el.attr("style");
			if(EStr.isEmpty(styleStr)) continue;
			// Tolerate any spacing around ':' and any letter case, e.g. "visibility:hidden"
			el.attr("style", styleStr.replaceAll("(?i)visibility\\s*:\\s*hidden", "visibility: visible"));
		}
	}
	
	/**
	 * Downloads an image, re-encodes it into a temporary file and hands that file
	 * to {@code EFileKit.upload}.
	 *
	 * @param domain domain used as first segment of the upload root path
	 * @param imgUrl URL of the image to download
	 * @return the path returned by {@code EFileKit.upload}, or {@code null} when the
	 *         download, decoding, encoding or upload fails
	 */
	private String collImg(String domain, String imgUrl){
		byte[] imgByte = EHttp.me.getByte(imgUrl);
		if(imgByte == null) return null;
		String imgType = ERegex.findFirst(imgUrl, "[\\.\\=](jpg|png|gif|bmp)");
		if(EStr.isEmpty(imgType)) imgType = "jpg";
		String rootPath = String.join(EStr.FileSep, domain, ModuleConfig.MODULE_CODE);
		// One temp file per call: a fixed name would let concurrent downloads overwrite each other.
		File imgTemp = new File(EProp.FileUploadTemp.concat("thu_").concat(EUid.generator()).concat(".").concat(imgType));
		try (ByteArrayInputStream imgIn = new ByteArrayInputStream(imgByte)) {
			BufferedImage imgBuff = ImageIO.read(imgIn);
			// ImageIO.read returns null when no reader can decode the data
			if(imgBuff == null) return null;
			// ImageIO.write returns false when no writer exists for the format; nothing usable to upload then
			if(!ImageIO.write(imgBuff, imgType, imgTemp)) return null;
			return EFileKit.upload(imgTemp, rootPath);
		} catch (Exception e) {
			// Best effort: a broken image must not abort the article
			if(EProp.DevMode) e.printStackTrace();
			return null;
		} finally {
			// NOTE(review): assumes EFileKit.upload is done with the temp file once it returns — confirm.
			if(imgTemp.exists()) imgTemp.delete();
		}
	}
	
	/**
	 * Extracts the first date of the form {@code yyyy-M-d}, {@code yyyy/M/d} or
	 * {@code yyyy年M月d日} (trailing {@code 日} optional) from a text and parses it.
	 * The match is normalised to zero-padded {@code yyyy-MM-dd} before parsing, so
	 * that every variant accepted by the pattern — including mixed separators and
	 * a missing {@code 日} — goes through the same parse format.
	 *
	 * @param dateStr text that may contain a date
	 * @return the result of {@code EDate.parse} for the normalised date, or
	 *         {@code null} when the text is empty or contains no date
	 */
	private Date matchDate(String dateStr){
		if(EStr.isEmpty(dateStr)) return null;
		String dt = ERegex.find(dateStr, "\\d{4}[-/年]\\d{1,2}[-/月]\\d{1,2}日?");
		if(EStr.isEmpty(dt)) return null;
		// Split on any of the accepted separators; a trailing 日 leaves no extra part
		String[] parts = dt.split("[-/年月日]");
		if(parts.length < 3) return null;
		String normalized = String.format("%s-%2s-%2s", parts[0], parts[1], parts[2]).replace(' ', '0');
		return EDate.parse(normalized, "yyyy-MM-dd");
	}
	
	/**
	 * Runs the {@code site.DelCollectorItem} statement for a task (and, when
	 * requested, {@code site.DelCollectorContent} first), then resets the list
	 * counters — and, with content deletion, the item counters — on the task.
	 *
	 * @param taskId     id of the collector task
	 * @param delContent {@code true} to also run the content deletion and reset the
	 *                   item counters; {@code false} or {@code null} to skip both
	 */
	@Override
	public void delItem(String taskId, Boolean delContent) {
		// A null flag is treated as false instead of failing on unboxing
		boolean withContent = Boolean.TRUE.equals(delContent);
		EMap paras = EMap.by("taskId", taskId);
		SqlPara sql;
		if(withContent){
			sql = dbo().getSqlPara("site.DelCollectorContent", paras);
			dbo().update(sql);
		}
		sql = dbo().getSqlPara("site.DelCollectorItem", paras);
		dbo().update(sql);
		CollectorTask task = findById(taskId);
		// The task may no longer exist; there are no counters to reset then
		if(task == null) return;
		task.setListState(0).setListSize(0);
		if(withContent) task.setItemState(0).setItemSize(0);
		update(task);
	}
	
	/**
	 * Returns the effective part of a URL, dropping the first {@code #} and
	 * everything after it.
	 * 
	 * @param url the URL to trim
	 * @return the URL up to (not including) the first {@code #}; the unchanged URL
	 *         when it contains none; the empty string when {@code EStr.isEmpty(url)}
	 */
	private String getUrlContent(String url){
		if(EStr.isEmpty(url)){
			return "";
		}
		int hashAt = url.indexOf('#');
		if(hashAt < 0){
			return url;
		}
		return url.substring(0, hashAt);
	}

}