当前位置 : 主页 > 网络编程 > JavaScript >

runoob 爬虫

来源:互联网 收集:自由互联 发布时间:2021-06-28
runoob.js var cheerio = require('cheerio');var request = require('sync-request');var fs = require('fs');var process = require('process');var url = process.argv[2];if(!url){console.log('请指定页面。');process.exit(0);}var ofile = fs.op
runoob.js
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');

// Entry script: takes the start page URL from the command line, extracts the
// sidebar table of contents from that page, then downloads every listed page
// and appends its main content to out.html.
var url = process.argv[2];
if(!url)
{
	console.log('请指定页面。');
	process.exit(0);
}

var ofile = fs.openSync('out.html', 'w');

try {
    var html = request('GET', url).getBody().toString();
    var toc = getToc(html);

    // Index loop instead of for...in: for...in yields string keys and would
    // also visit enumerable properties added to Array.prototype.
    for(var i = 0; i < toc.length; i++) {
        // Dedicated name so the start URL held in `url` is not overwritten.
        var pageUrl = toc[i];
        try {
            console.log('page: ' + pageUrl);
            var pageHtml = request('GET', pageUrl).getBody().toString();
            var content = getContent(pageHtml);
            fs.writeSync(ofile, content, null, 'utf-8');
            // The original literal was broken across two physical lines (a
            // syntax error); the raw line break is now spelled as an escape.
            // NOTE(review): the separator may originally have held an HTML
            // tag that was lost when the listing was published - confirm.
            fs.writeSync(ofile, '\n\n \n', null, 'utf-8');
        } catch(ex) {
            // Best effort: a page that fails is logged and skipped so the
            // remaining pages are still processed.
            console.log(ex);
        }
    }
} finally {
    // Release the descriptor even when the start page cannot be fetched or parsed.
    fs.closeSync(ofile);
}
console.log('Done..');


/**
 * Collects the links found inside the `#leftcolumn` element of a page.
 *
 * Each href is resolved against `baseUrl`, so relative, root-relative and
 * already-absolute links all produce a well-formed URL. Anchors without an
 * href, hrefs that cannot be parsed, and links that are not http(s) are
 * skipped.
 *
 * @param {string} html - HTML source of the page holding the link list.
 * @param {string} [baseUrl] - Base for resolving relative hrefs; defaults to
 *     'http://www.runoob.com/'.
 * @returns {string[]} Absolute URLs, in the order the anchors were matched.
 */
function getToc(html, baseUrl)  {
	var base = baseUrl === undefined ? 'http://www.runoob.com/' : baseUrl;

	var $ = cheerio.load(html);

	var $list = $('#leftcolumn').find('a');
	var res = [];
	for(var i = 0; i < $list.length; i++)
	{
		var href = $list.eq(i).attr('href');
		if(!href)
		{
			// Plain concatenation used to turn this into '.../undefined'.
			continue;
		}

		var resolved;
		try {
			resolved = new URL(href, base);
		} catch(ex) {
			// Unparsable href: nothing that could be fetched.
			continue;
		}

		// Only http(s) targets can be downloaded (drops javascript:, mailto:, ...).
		if(resolved.protocol !== 'http:' && resolved.protocol !== 'https:')
		{
			continue;
		}
		res.push(resolved.href);
	}
	return res;
}


/**
 * Returns the inner HTML of the `#content` element of a page, after removing
 * every element matching `.tryitbtn` from the parsed document.
 *
 * @param {string} html - HTML source of the page.
 * @returns {*} Whatever cheerio's `.html()` yields for the `#content` selection.
 */
function getContent(html) {
    var page = cheerio.load(html);

    // Strip the matching elements first so they are absent from the result.
    page('.tryitbtn').remove();

    return page('#content').html();
}
网友评论