package com.xdja.spider.robot.grab;

import com.xdja.spider.core.bean.Article;
import com.xdja.spider.core.grab.GrabUtil;
import com.xdja.spider.core.grab.HtmlClear;
import com.xdja.spider.core.util.HtmlGenerator;
import com.xdja.spider.robot.service.ISpiderRobotService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

import java.util.ArrayList;
import java.util.List;

/**
 * Post-processing of grabbed articles for a single column:
 * <ol>
 *   <li>content clean-up,</li>
 *   <li>image collection,</li>
 *   <li>static page generation.</li>
 * </ol>
 *
 * @author hsun
 * @version 1.0
 * @since 2017/9/20 7:19 PM
 */
public class SpecialHandle {

    private static final Logger logger = LoggerFactory.getLogger(SpecialHandle.class);

    /** Maximum number of characters kept for the generated article description. */
    private static final int DESCRIPTION_MAX_LENGTH = 100;

    private final long colId;
    private final ISpiderRobotService spiderRobotService;

    /**
     * @param spiderRobotService service used to fetch pending articles and persist the results
     * @param colId              id of the column whose articles are handled
     */
    public SpecialHandle(ISpiderRobotService spiderRobotService, long colId) {
        this.spiderRobotService = spiderRobotService;
        this.colId = colId;
    }

    /**
     * Repeatedly fetches the articles waiting for static generation for this column, cleans
     * their HTML, collects their image references, generates the static page and persists
     * the results, until the service returns no more articles.
     *
     * <p>Batches are processed in a loop rather than by recursion so the call stack does not
     * grow with the number of batches.
     */
    public void handle() {
        while (true) {
            List<Article> list = this.spiderRobotService.wait4Static(this.colId);
            if (CollectionUtils.isEmpty(list)) {
                return;
            }

            List<Article> successResults = new ArrayList<>();
            List<String> images = new ArrayList<>();
            for (Article article : list) {
                if (StringUtils.isEmpty(article.getContent())) {
                    continue;
                }
                processArticle(article, images);
                successResults.add(article);
            }

            if (successResults.isEmpty()) {
                // Nothing in this batch could be processed, so nothing would be updated.
                // NOTE(review): presumably wait4Static would hand back the same batch again,
                // which previously led to endless recursion - confirm against the service.
                logger.warn("No processable article in batch of {} for column {}, stopping",
                        list.size(), this.colId);
                return;
            }

            this.spiderRobotService.updateArticle(successResults);
            logger.info("修改数据");
            this.spiderRobotService.saveArticleImgs(images);
            logger.info("保存图片");
            this.spiderRobotService.updateStaticTime(successResults);
            logger.info("修改静态化时间");
        }
    }

    /**
     * Cleans the article content, appends its image references to {@code images}, fills in
     * description / content / release fields and attempts static page generation.
     *
     * @param article article with non-empty content; modified in place
     * @param images  accumulator receiving one {@code "<articleId>#<img>"} entry per image
     */
    private void processArticle(Article article, List<String> images) {
        Document parse = Jsoup.parse(article.getContent(), article.getSourceUrl());

        Elements elements = parse.body().children();
        for (Element ele : elements) {
            HtmlClear.clear(ele);
        }

        List<String> imgs = GrabUtil.grabImgs(elements);
        if (!CollectionUtils.isEmpty(imgs)) {
            for (String img : imgs) {
                images.add(String.format("%s#%s", article.getId(), img));
            }
        }

        String description = elements.text();
        if (null != description) {
            article.setDescription(toDescription(description));
        }

        article.setContent(elements.html());
        article.setReleaseTime(System.currentTimeMillis());
        article.setReleaseStatus(Article.ReleaseStatus.RELEASE.value);

        try {
            HtmlGenerator.generateDefault(article, article.getViewUrl());
            article.setStaticTime(System.currentTimeMillis());
        } catch (Exception e) {
            // Best effort: the article is still persisted by the caller, as before, but the
            // failure is logged at WARN so it is visible in normal operation.
            logger.warn("静态化失败", e);
        }
    }

    /**
     * Trims {@code text} and cuts it to at most {@link #DESCRIPTION_MAX_LENGTH} characters.
     * The length is checked after trimming; checking the untrimmed length could make
     * {@code substring} run past the end of the trimmed string.
     */
    private static String toDescription(String text) {
        String trimmed = text.trim();
        return trimmed.length() > DESCRIPTION_MAX_LENGTH
                ? trimmed.substring(0, DESCRIPTION_MAX_LENGTH)
                : trimmed;
    }
}
