JS 爬虫源码:采集糗事百科上的所有段子。代码可以直接拷贝到神箭手云爬虫(http://www.shenjianshou.cn/)上运行。对爬虫感兴趣的可以加 QQ 群讨论:342953471。
1. [代码][JavaScript]代码
// Crawl configuration for qiushibaike.com, assembled key by key and then
// handed to the Crawler constructor below.
// NOTE(review): the per-key remarks describe the values only; how the host
// Crawler interprets each key is assumed from the key names — confirm against
// the platform documentation.
var configs = {};

// Host name(s) listed for the crawl.
configs.domains = ["www.qiushibaike.com"];

// URL(s) listed as the crawl entry point(s).
configs.scanUrls = ["http://www.qiushibaike.com/"];

// Pattern matching single-article URLs of the form /article/<digits>.
configs.contentUrlRegexes = ["http://www\\.qiushibaike\\.com/article/\\d+"];

// Fields to extract; each entry pairs a field name with an XPath selector.
configs.fields = [
  {
    name: "content",
    // Selects the element whose id is 'single-next-link'.
    selector: "//*[@id='single-next-link']",
    required: true
  },
  {
    name: "author",
    // Selects any <h2> nested under a <div> whose class contains 'author'.
    selector: "//div[contains(@class,'author')]//h2"
  }
];

// Crawler is not defined in this file; it is expected to be provided by the
// host environment that runs the script.
var crawler = new Crawler(configs);
crawler.start();
