// cpp_toc.js
var cheerio = require('cheerio')
var request = require('sync-request')
var fs = require('fs')
/**
 * Returns the distinct elements of an iterable, keeping each element at the
 * position of its first occurrence.
 *
 * Membership is tracked with a Set, so elements are compared the way a Set
 * compares them (NaN matches NaN, objects match by identity).
 *
 * @param {Iterable<*>} arr - Values to de-duplicate; not modified.
 * @returns {Array<*>} A new array holding each distinct value once.
 */
function arrayUnique(arr) {
    const seen = new Set();
    const distinct = [];
    for (const item of arr) {
        if (seen.has(item)) {
            continue;
        }
        seen.add(item);
        distinct.push(item);
    }
    return distinct;
}
/**
 * Downloads a page and returns the links on it that point exactly one path
 * segment below that page.
 *
 * Every <a href> on the page is collected; hrefs beginning with "/" are made
 * absolute by prefixing http://zh.cppreference.com. A link is kept only when
 * it starts with `url`, differs from `url`, leaves a remainder of the form
 * "/segment" (one non-empty segment) once `url` is removed, and contains
 * neither "#" nor "/experimental". The de-duplicated result is also printed
 * to the console.
 *
 * @param {string} url - Absolute URL of the page to scan.
 * @returns {string[]} Distinct child URLs, in the order first seen.
 */
function getNext(url) {
    const body = request('GET', url).getBody().toString();
    const $ = cheerio.load(body);
    const anchors = $('a');

    const hrefs = [];
    for (let idx = 0; idx < anchors.length; idx += 1) {
        hrefs.push(anchors.eq(idx).attr('href'));
    }

    // Same checks, in the same order, as a chain of separate filters would apply.
    const isDirectChild = (link) =>
        link.startsWith(url) &&
        link != url &&
        /^\/[^\/]+$/.test(link.replace(url, '')) &&
        link.indexOf('#') == -1 &&
        link.indexOf('/experimental') == -1;

    const children = arrayUnique(
        hrefs
            .filter((href) => href) // anchors without an href yield undefined
            .map((href) => (href.startsWith('/') ? 'http://zh.cppreference.com' + href : href))
            .filter(isDirectChild)
    );

    console.log(children);
    return children;
}
// Table-of-contents crawler: walks the site depth-first from a root URL and
// writes every visited URL to out.txt, one per line.
//
// Root of the crawl. Alternative root: 'http://zh.cppreference.com/w/cpp'.
// `var` is kept for `url` because the same name is declared again with `var`
// further down in this file.
var url = 'http://zh.cppreference.com/w/c';
var ofs = fs.openSync('out.txt', 'w');
try {
    // Explicit stack instead of recursion. Children are pushed in reverse so
    // that popping visits them in the order they appear on the page.
    var stk = [url];
    while (stk.length !== 0) {
        url = stk.pop();
        fs.writeSync(ofs, url + '\n');
        console.log(url);
        for (var nxt of getNext(url).reverse()) {
            stk.push(nxt);
        }
    }
} finally {
    // Release the descriptor even when a request or parse step throws.
    fs.closeSync(ofs);
}
console.log('done');
// cpp.js
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
// Page downloader: for every URL listed by getToc(), appends the page content
// followed by its revision-history content to out.html. The index of the last
// completed page is stored in out.idx so an interrupted run can resume.

// Attempts allowed per page before the run is aborted. Aborting leaves out.idx
// at the last completed page, so a later run resumes from the failing page.
const MAX_ATTEMPTS = 5;

// Resume after the last completed page, if a checkpoint exists.
let start = 0;
if (fs.existsSync('out.idx')) {
    const saved = Number.parseInt(fs.readFileSync('out.idx', 'utf-8'), 10);
    if (Number.isNaN(saved)) {
        // A NaN start index would make the loop below silently do nothing.
        throw new Error('out.idx does not contain a valid page index');
    }
    start = saved + 1;
}

const toc = getToc();
const ofile = fs.openSync('out.html', 'a');
try {
    let attempts = 0;
    for (let i = start; i < toc.length; i++) {
        try {
            const url = toc[i];
            console.log('page: ' + url);
            let html = request('GET', url).getBody().toString();
            let page = getContent(html) + '\n';

            const hisLink = getHisLink(html);
            html = request('GET', hisLink).getBody().toString();
            // NOTE(review): in the source this separator literal was split
            // across two physical lines ('\n', a line break, '\n') and did not
            // parse. It is reproduced as three newlines; if markup such as a
            // separator tag was lost here, restore it.
            page += getHisContent(html) + '\n\n\n';

            // Write the whole page in one call, only after both requests
            // succeeded, so a retry never leaves duplicated partial output.
            fs.writeSync(ofile, page, null, 'utf-8');
            fs.writeFileSync('out.idx', i.toString());
            attempts = 0;
        } catch (ex) {
            console.log(ex);
            attempts += 1;
            if (attempts >= MAX_ATTEMPTS) {
                throw ex;
            }
            i--; // retry the same page
        }
    }
} finally {
    fs.closeSync(ofile);
}
console.log('Done..');
/**
 * Loads the list of page URLs from assets/cpp_toc.txt.
 *
 * The file is read as UTF-8 and split on "\n"; empty lines are dropped.
 *
 * @returns {string[]} The non-empty lines of the file, in file order.
 */
function getToc() {
    const raw = fs.readFileSync('assets/cpp_toc.txt', 'utf-8');
    const entries = [];
    for (const line of raw.split(/\n/g)) {
        if (line) {
            entries.push(line);
        }
    }
    return entries;
}
/**
 * Builds the article portion of a page as an HTML string.
 *
 * Elements matching `.t-navbar` and `.editsection` are removed first. The
 * result is the serialized `#firstHeading` element followed by the inner HTML
 * of `#mw-content-text`.
 *
 * @param {string} html - Full HTML of the page.
 * @returns {string} Heading markup concatenated with the content markup.
 */
function getContent(html) {
    const $ = cheerio.load(html);
    $('.t-navbar, .editsection').remove();

    const heading = $('#firstHeading').toString();
    const body = $('#mw-content-text').html();
    return `${heading}${body}`;
}
/**
 * Returns the absolute URL of the page's history view.
 *
 * The href of the first `#ca-history a` element is read and prefixed with
 * http://zh.cppreference.com.
 *
 * @param {string} html - Full HTML of the page.
 * @returns {string} Absolute history URL.
 * @throws {Error} When the page has no `#ca-history a` link with an href;
 *     concatenating a missing href would otherwise yield the invalid URL
 *     "http://zh.cppreference.comundefined".
 */
function getHisLink(html) {
    const $ = cheerio.load(html);
    const href = $('#ca-history a').attr('href');
    if (!href) {
        throw new Error('history link (#ca-history a) not found in page');
    }
    return 'http://zh.cppreference.com' + href;
}
/**
 * Builds the revision-history portion of a history page as an HTML string.
 *
 * `input`, `.editsection` and `.external` elements are removed, and every
 * `.mw-history-undo` element is replaced by the plain text '撤销' ("undo").
 * The result is the title text '版本历史' ("revision history") followed by
 * the inner HTML of `#pagehistory`.
 *
 * @param {string} html - Full HTML of the history page.
 * @returns {string} Title text followed by the history list markup.
 */
function getHisContent(html) {
    const $ = cheerio.load(html);
    $('input, .editsection, .external').remove();
    $('.mw-history-undo').replaceWith('撤销');
    // NOTE(review): in the source this title literal was split across two
    // physical lines ('版本历史', a line break, '\n') and did not parse. It is
    // reproduced with escaped newlines; if heading markup around the title
    // was lost, restore it.
    return '版本历史\n\n' + $('#pagehistory').html();
}
