当前位置 : 主页 > 网络编程 > JavaScript >

cppreference.com 爬虫

来源:互联网 收集:自由互联 发布时间:2021-06-28
摘要：cpp_toc.js 从 zh.cppreference.com 的根页面出发，逐层抓取子页面链接并把目录写入 out.txt；cpp.js 读取目录文件，逐页下载正文与版本历史并追加到 out.html。完整代码见下文。
cpp_toc.js
var cheerio = require('cheerio')
var request = require('sync-request')
var fs = require('fs')

/**
 * Returns a new array holding the distinct values of `arr`, keeping the
 * position of each value's first occurrence.
 *
 * @param {Iterable<*>} arr - Values to deduplicate.
 * @returns {Array<*>} A fresh array without repeated values.
 */
function arrayUnique(arr) {
    // A Set iterates in insertion order, so first occurrences keep their order.
    return Array.from(new Set(arr));
}

/**
 * Downloads `url` and collects the links on that page that sit exactly one
 * path segment below it.
 *
 * A link is kept when, after site-relative hrefs are prefixed with the host,
 * it starts with `url`, is not `url` itself, has a remainder of the form
 * `/segment`, and contains neither `#` nor `/experimental`.
 *
 * @param {string} url - Absolute URL of the page to scan.
 * @returns {string[]} Distinct child URLs in the order they appear on the page.
 */
function getNext(url) {
    const page = request('GET', url).getBody().toString();
    const $ = cheerio.load(page);

    // Gather the raw href of every anchor; anchors without one yield undefined.
    const hrefs = [];
    $('a').each((idx, el) => {
        hrefs.push($(el).attr('href'));
    });

    const children = [];
    for (const href of hrefs) {
        if (!href) {
            continue;
        }

        // Site-relative links are made absolute before being compared with `url`.
        const absolute = href.startsWith('/') ? 'http://zh.cppreference.com' + href : href;

        if (!absolute.startsWith(url) || absolute === url) {
            continue;
        }
        // Only a remainder of exactly one "/segment" counts as a direct child.
        if (!/^\/[^\/]+$/.test(absolute.replace(url, ''))) {
            continue;
        }
        if (absolute.indexOf('#') !== -1 || absolute.indexOf('/experimental') !== -1) {
            continue;
        }

        children.push(absolute);
    }

    const unique = arrayUnique(children);
    console.log(unique);
    return unique;
}

// Root page of the crawl. Swap in the commented-out line to crawl the C++ tree instead.
//var url = 'http://zh.cppreference.com/w/cpp';
var url = 'http://zh.cppreference.com/w/c';
var ofs = fs.openSync('out.txt', 'w')

// An explicit stack drives a depth-first, pre-order walk of the page tree.
var stk = [url]

try {
    while(stk.length !== 0) {

        url = stk.pop();
        // Each visited URL becomes one line of the table-of-contents file.
        fs.writeSync(ofs, url + '\n')
        console.log(url)

        // Push children reversed so the first link on the page is popped first.
        for(var nxt of getNext(url).reverse()) {
            stk.push(nxt)
        }

    }
} finally {
    // Release the descriptor even when a request throws part-way through the crawl.
    fs.closeSync(ofs)
}

console.log('done')
cpp.js
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');



// Upper bound on consecutive failed attempts for one page before the run stops.
var MAX_ATTEMPTS = 5;

// Opened in append mode so a resumed run adds to the output of earlier runs.
var ofile = fs.openSync('out.html', 'a');

try {
    // out.idx holds the index of the last page that was written completely;
    // resume with the page after it, or from the beginning when it is absent.
    var start = 0;
    if (fs.existsSync('out.idx')) {
        start = Number.parseInt(fs.readFileSync('out.idx', 'utf-8'), 10) + 1;
        if (Number.isNaN(start)) {
            throw new Error('out.idx does not contain a valid page index');
        }
    }

    var toc = getToc();
    var attempts = 0;
    var i = start;
    while (i < toc.length) {
        var url = toc[i];
        console.log('page: ' + url);
        try {
            var html = request('GET', url).getBody().toString();
            var content = getContent(html);

            var hisLink = getHisLink(html);
            var hisHtml = request('GET', hisLink).getBody().toString();
            var hisContent = getHisContent(hisHtml);

            // Write only after both downloads succeeded, so a retry can never
            // append the page body a second time.
            // NOTE(review): the separator literal was garbled in the source this
            // file was recovered from; '<hr />' is a presumed reconstruction.
            fs.writeSync(ofile, content + '\n' + hisContent + '\n<hr />\n', null, 'utf-8');

            // Checkpoint the finished page before moving on.
            fs.writeFileSync('out.idx', i.toString());
            attempts = 0;
            i++;
        } catch (ex) {
            console.log(ex);
            attempts++;
            if (attempts >= MAX_ATTEMPTS) {
                // Stop instead of retrying forever; out.idx still points at the
                // last finished page, so the next run resumes at this one.
                console.log('giving up on ' + url + ' after ' + attempts + ' attempts');
                throw ex;
            }
        }
    }
} finally {
    fs.closeSync(ofile);
}

console.log('Done..');


/**
 * Loads the list of page URLs from `assets/cpp_toc.txt`, one URL per line.
 *
 * Both LF and CRLF line endings are accepted; surrounding whitespace is
 * stripped and blank lines are dropped, so no entry carries a stray `\r`.
 *
 * @returns {string[]} The non-empty lines of the file, in file order.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function getToc() {
    return fs.readFileSync('assets/cpp_toc.txt', 'utf-8')
        .split(/\r?\n/)
        .map(line => line.trim())
        .filter(line => line.length > 0);
}


/**
 * Extracts the heading and main content of a downloaded page.
 *
 * Elements matching `.t-navbar` or `.editsection` are removed first; the
 * result is the serialised `#firstHeading` selection followed by the inner
 * HTML of `#mw-content-text`.
 *
 * @param {string} html - Full HTML of the page.
 * @returns {string} Heading markup concatenated with the content markup.
 */
function getContent(html) {
    const $ = cheerio.load(html);

    $('.t-navbar, .editsection').remove();

    const heading = $('#firstHeading').toString();
    const body = $('#mw-content-text').html();
    return heading + body;
}

/**
 * Builds the absolute URL of a page's history view.
 *
 * @param {string} html - Full HTML of the page.
 * @returns {string} The site host followed by the `href` of the link inside `#ca-history`.
 */
function getHisLink(html) {
    const $ = cheerio.load(html);
    const href = $('#ca-history a').attr('href');
    return 'http://zh.cppreference.com' + href;
}

/**
 * Builds the version-history section from a downloaded history page.
 *
 * `input` elements and elements with class `editsection` or `external` are
 * removed, and every `.mw-history-undo` element is replaced by the text
 * '撤销'. The inner HTML of `#pagehistory` is then returned under a heading.
 *
 * @param {string} html - Full HTML of the history page.
 * @returns {string} Heading line followed by the inner HTML of `#pagehistory`.
 */
function getHisContent(html) {
    var $ = cheerio.load(html);
    $('input, .editsection, .external').remove()
    $('.mw-history-undo').replaceWith('撤销');
    // NOTE(review): the original literal was split across lines (a syntax error)
    // with its markup lost; the <h2> wrapper is a presumed reconstruction.
    return '<h2>版本历史</h2>\n' + $('#pagehistory').html();
}
网友评论