1. Add the Maven dependency (xxl-crawler)

<dependency>
    <groupId>com.xuxueli</groupId>
    <artifactId>xxl-crawler</artifactId>
    <version>1.0.0</version>
</dependency>
2. Define the proxy VO

The VO below maps one table row of the proxy-list page to an ip/port pair.

@PageSelect(cssQuery = ".row table tr")
public static class PageVo {

    @PageFieldSelect(cssQuery = "td:eq(0)", selectType = XxlCrawlerConf.SelectType.TEXT)
    private String ip;

    @PageFieldSelect(cssQuery = "td:eq(1)", selectType = XxlCrawlerConf.SelectType.TEXT)
    private int port;

    public String getIp() { return ip; }
    public void setIp(String ip) { this.ip = ip; }
    public int getPort() { return port; }
    public void setPort(int port) { this.port = port; }

    @Override
    public String toString() {
        return "PageVo{" +
                "ip='" + ip + '\'' +
                ", port=" + port +
                '}';
    }
}
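xxl-crawler parses pages with Jsoup (note the org.jsoup.nodes.Document in the parser below), so each cssQuery is a Jsoup selector: @PageSelect picks the repeating element (here, each table row) and @PageFieldSelect picks one field inside it. As a minimal standalone sketch of what those selectors do, using Jsoup directly (the HTML fragment is invented for illustration):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        // invented fragment mimicking the target page's table layout
        Document doc = Jsoup.parse(
                "<div class='row'><table>"
              + "<tr><td>1.2.3.4</td><td>8080</td></tr>"
              + "</table></div>");
        // ".row table tr" selects each row, like @PageSelect
        for (Element tr : doc.select(".row table tr")) {
            // "td:eq(0)" / "td:eq(1)" pick individual cells, like @PageFieldSelect
            String ip = tr.select("td:eq(0)").text();
            String port = tr.select("td:eq(1)").text();
            System.out.println(ip + ":" + port);
        }
    }
}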
3. Create and start the crawler

// proxy pool
final List<PageVo> proxyPool = new ArrayList<PageVo>();

// build the crawler (for learning and testing only; contact us for removal in case of infringement)
XxlCrawler crawler = new XxlCrawler.Builder()
        .setUrls(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/1.html")))
        .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/\\b[1-2].html")))  // first 2 pages
        //.setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/\\d+.html")))    // all pages
        .setThreadCount(10)
        .setPageParser(new PageParser<PageVo>() {
            @Override
            public void parse(Document html, PageVo pageVo) {
                // skip rows that did not yield a valid port
                if (pageVo.getPort() == 0) {
                    return;
                }
                //Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(pageVo.getIp(), pageVo.getPort()));
                //if (ProxyIpUtil.checkProxy(proxy, null) == 200) {
                    proxyPool.add(pageVo);
                    // logger: a logger field (e.g. SLF4J) on the enclosing class
                    logger.info("proxy pool size : " + proxyPool.size());
                //}
            }
        })
        .build();

// start (true = synchronous: block until crawling finishes)
crawler.start(true);

// dump the collected proxy pool
logger.info("----------- proxy pool total size : {} -----------", proxyPool.size());
logger.info(proxyPool.toString());
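The commented-out lines hint at validating each proxy before it enters the pool (ProxyIpUtil itself is not shown in this article). Below is a minimal sketch of such a check using only the JDK; the test URL, timeouts, and sample address are arbitrary placeholders:

import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;

public class ProxyCheck {

    // Returns the HTTP status code reached through the proxy, or -1 on failure
    public static int checkProxy(String ip, int port) {
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
            HttpURLConnection conn =
                    (HttpURLConnection) new URL("http://www.baidu.com").openConnection(proxy);
            conn.setConnectTimeout(3000);
            conn.setReadTimeout(3000);
            return conn.getResponseCode();
        } catch (Exception e) {
            return -1;
        }
    }

    public static void main(String[] args) {
        System.out.println(checkProxy("1.2.3.4", 8080)); // hypothetical proxy address
    }
}

Plugging a check like this into parse(...) keeps dead proxies out of the pool, at the cost of one extra request per candidate.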