抓取网站:
lib包:
代码
import java.util.List;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.pipeline.JsonFilePipeline;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.selector.Selectable;public class CrawlAliyun implements PageProcessor{ //入口 public static final String URL_START ="https://yq.aliyun.com/articles/type_all"; //分页 public static final String URL_PAGE = "https://yq.aliyun.com/articles/type_all-order_createtime-page_[0-9]+"; //标签 public static final String URL_TAGS = "https://yq.aliyun.com/tags/type_blog-tagid_[0-9]+"; //博文 public static final String URL_CONTENT = "https://yq.aliyun.com/articles/[0-9]+"; // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setCharset("utf-8"); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { // 部分二:定义如何抽取页面信息,并保存下来 Selectable select=null; Listurls=null; if (page.getUrl().regex(URL_PAGE).match()||page.getUrl().regex(URL_START).match()) { System.out.println("=====pageurl====="+page.getUrl()); select = page.getHtml().xpath("//section[@class='yq-new-list yq-n-l-blog']"); urls = select.links().regex(URL_PAGE).all(); page.addTargetRequests(urls); urls = select.links().regex(URL_CONTENT).all(); page.addTargetRequests(urls); } if (page.getUrl().regex(URL_CONTENT).match()) { System.out.println("=====arturl====="+page.getUrl()); select = page.getHtml().xpath("//p[@class='blog-tags']"); urls = select.links().regex(URL_TAGS).all(); page.addTargetRequests(urls); page.putField("url", page.getUrl().toString()); page.putField("title", page.getHtml().xpath("//h2[@class='blog-title']/text()").toString() ); page.putField("author", page.getHtml().xpath("//a[@class='b-author']/text()").toString()); page.putField("authorUrl", page.getHtml().xpath("//a[@class='b-author']").$("a","href").toString()); page.putField("createtime", page.getHtml().xpath("//span[@class='b-time']/text()").toString()); page.putField("watched", page.getHtml().xpath("//span[@class='b-watch']/text()").toString().replace("浏览", "")); page.putField("tags",page.getHtml().xpath("//p[@class='blog-tags']/a/text()").all() ); page.putField("summary", page.getHtml().xpath("//p[@class='blog-summary']/text()").toString()); page.putField("content", page.getHtml().xpath("//div[@class='content-detail']/html()").toString()); } } @Override public Site getSite() { site.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); return site; } public static void main(String args[]) { Spider.create(new CrawlAliyun()) //从"https://github.com/code4craft"开始抓 .addUrl(URL_START) .addPipeline(new JsonFilePipeline("F:\\webmagic\\")) //开启5个线程抓取 .thread(5) //启动爬虫 .run(); } }
结果