// msdoc_toc.js
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
// Output file: receives one URL per visited TOC node.
var ofile = fs.openSync('out.txt', 'w');
var root = getToc();
// Depth-first walk over the TOC tree, driven by an explicit stack.
var stk = [root];
while (stk.length > 0) {
    const node = stk.pop();
    const url = 'https://docs.microsoft.com/zh-cn/dotnet/' + node.href;
    console.log(url);
    fs.writeSync(ofile, url + '\n');
    // Page-body download is currently disabled; kept for reference:
    // var html = request('GET', url).getBody().toString();
    // var content = getContent(html);
    // fs.writeSync(ofile, content, null, 'utf-8');
    // fs.writeSync(ofile, '\n
    // \n', null, 'utf-8');

    // Inline children are pushed in reverse so they pop in document order.
    if (node.children) {
        node.children.reverse().forEach(function (child) {
            stk.push(child);
        });
    }
    // Children that live in a separate toc.json are fetched on demand;
    // entries without an href, or whose href contains '../', are skipped.
    if (node.tocHref) {
        getChildrenByToc(node.tocHref).reverse().forEach(function (child) {
            if (child.href && child.href.indexOf('../') == -1) {
                stk.push(child);
            }
        });
    }
}
/**
 * Fetches a nested toc.json (synchronously, via `request`) and returns the
 * children of its first item, with every href/tocHref rewritten to be
 * relative to the docs root instead of to the nested TOC's directory.
 *
 * @param {string} tocHref - path of the toc.json, relative to the docs root.
 * @returns {Array} the first item's children, or [] when it has none.
 */
function getChildrenByToc(tocHref) {
    const body = request('GET', 'https://docs.microsoft.com/zh-cn/dotnet/' + tocHref)
        .getBody()
        .toString();
    const entries = JSON.parse(body).items[0].children;
    if (!entries) {
        return [];
    }
    // Directory part of the TOC path: everything except the 'toc.json' file name.
    const prefix = tocHref.replace('toc.json', '');
    entries.forEach(function (entry) {
        addUrlPrefix(entry, prefix);
    });
    return entries;
}
/**
 * Recursively prepends `prefix` to the `href` and `tocHref` of a TOC node
 * and of all of its descendants. The node is modified in place; empty or
 * missing links are left untouched.
 *
 * @param {object} obj - TOC node (may have href, tocHref, children).
 * @param {string} prefix - text to put in front of each link.
 */
function addUrlPrefix(obj, prefix) {
    ['href', 'tocHref'].forEach(function (key) {
        if (obj[key]) {
            obj[key] = prefix + obj[key];
        }
    });
    if (!obj.children) {
        return;
    }
    for (const child of obj.children) {
        addUrlPrefix(child, prefix);
    }
}
/*
for(var i in toc) {
try {
var url = toc[i];
console.log('page: ' + url);
html = request('GET', url).getBody().toString();
var content = getContent(html);
fs.writeSync(ofile, content, null, 'utf-8');
fs.writeSync(ofile, '\n
\n', null, 'utf-8');
} catch(ex) {
console.log(ex);
}
}
*/
// Runs once the TOC walk above has finished: release the output file
// descriptor and report completion on the console.
fs.closeSync(ofile);
console.log('Done..');
/**
 * Loads the locally cached top-level TOC (assets/msdoc_toc.json) and
 * returns the second entry of its `items` array, which serves as the root
 * of the walk.
 *
 * @returns {object} the TOC node at items[1].
 */
function getToc() {
    // Live alternative (disabled):
    // request('GET', 'https://docs.microsoft.com/zh-cn/dotnet/toc.json').getBody().toString()
    const parsed = JSON.parse(fs.readFileSync('assets/msdoc_toc.json', 'utf-8'));
    return parsed.items[1];
}
/**
 * Extracts the inner HTML of the `#content` element from a page, after
 * removing every element with class `tryitbtn`.
 *
 * @param {string} html - full page markup.
 * @returns {string|null} inner HTML of #content as reported by cheerio.
 */
function getContent(html) {
    const $ = cheerio.load(html);
    $('.tryitbtn').remove();
    return $('#content').html();
}
// msdoc.js
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');
// Append mode: a resumed run keeps the pages written by earlier runs.
var ofile = fs.openSync('out.html', 'a');
// out.idx stores the index of the last page written successfully, so a
// restarted run continues with the page right after it.
var start = 0;
if (fs.existsSync('out.idx')) {
    start = Number.parseInt(fs.readFileSync('out.idx', 'utf-8'), 10) + 1;
}
var toc = getToc();
// Upper bound on consecutive attempts for a single page. Without a bound, a
// URL that always fails would be retried forever.
const MAX_ATTEMPTS = 5;
let completed = true;
try {
    for (let i = start; i < toc.length; i++) {
        const url = toc[i];
        let written = false;
        for (let attempt = 1; attempt <= MAX_ATTEMPTS && !written; attempt++) {
            try {
                console.log('page: ' + url);
                const html = request('GET', url).getBody().toString();
                const content = getContent(html, url);
                fs.writeSync(ofile, content, null, 'utf-8');
                // Separator between pages. The original literal was split
                // across two source lines (a syntax error); its raw line break
                // is kept here as an escape.
                // TODO(review): an HTML separator may have been intended.
                fs.writeSync(ofile, '\n\n\n', null, 'utf-8');
                // Record progress only after the page is fully written.
                fs.writeFileSync('out.idx', i.toString());
                written = true;
            } catch (ex) {
                console.log(ex);
            }
        }
        if (!written) {
            // Stop instead of skipping: out.idx still points at the last good
            // page, so the next run resumes exactly here.
            console.log('giving up after ' + MAX_ATTEMPTS + ' attempts: ' + url);
            completed = false;
            break;
        }
    }
} finally {
    // Release the descriptor even if something unexpected throws.
    fs.closeSync(ofile);
}
if (completed) {
    console.log('Done..');
}
/**
 * Reads the list of page URLs to download from assets/dnet_std_toc.txt,
 * one URL per line.
 *
 * Lines are trimmed so that CRLF line endings or stray spaces do not end
 * up inside the URLs, and blank lines are dropped.
 *
 * @returns {string[]} the non-empty, trimmed lines in file order.
 */
function getToc() {
    return fs.readFileSync('assets/dnet_std_toc.txt', 'utf-8')
        .split(/\r?\n/)
        .map(line => line.trim())
        .filter(line => line !== '');
}
/**
 * Returns `url` cut off right after its last '/', i.e. the "directory"
 * part including the trailing slash. A string without any '/' is returned
 * unchanged.
 *
 * @param {string} url
 * @returns {string}
 */
function baseUrl(url) {
    const lastSlash = url.lastIndexOf('/');
    return lastSlash === -1 ? url : url.substring(0, lastSlash + 1);
}
/**
 * Builds the HTML fragment stored for one page: the `main h1` title
 * markup, the metadata summary from getMeta, and the inner HTML of
 * `main>div`, with image sources rewritten to absolute URLs.
 *
 * Image sources are resolved against the page URL, so relative paths
 * become absolute while already-absolute, protocol-relative, root-relative
 * and data: sources are preserved rather than being blindly prefixed.
 *
 * @param {string} html - full page markup.
 * @param {string} url - URL the page was fetched from (base for images).
 * @returns {string} title + metadata + content markup.
 */
function getContent(html, url) {
    const $ = cheerio.load(html);
    $('.sxs-lookup').remove();
    const title = $('main h1').toString();
    const meta = getMeta($('ul.metadata'));
    const $content = $('main>div');
    const $imgs = $content.find('img');
    for (let i = 0; i < $imgs.length; i++) {
        const $img = $imgs.eq(i);
        const src = $img.attr('src');
        if (!src) {
            // No src attribute: nothing to resolve.
            continue;
        }
        let resolved;
        try {
            resolved = new URL(src, url).href;
        } catch (ex) {
            // The page URL is not usable as a base; fall back to plain
            // prefixing with the page's directory.
            resolved = baseUrl(url) + src;
        }
        $img.attr('src', resolved);
    }
    return title + meta + $content.html();
}
/**
 * Renders the page metadata as text: the content of the `time` element
 * and the `title` attribute of each `.contributors a` link, joined by
 * spaces.
 *
 * @param {object} $meta - cheerio selection of the metadata list.
 * @returns {string} a date line, an empty line, and an authors line.
 */
function getMeta($meta) {
    const publishedAt = $meta.find('time').text();
    const $links = $meta.find('.contributors a');
    const names = Array.from({ length: $links.length }, function (_, idx) {
        return String($links.eq(idx).attr('title'));
    });
    return `日期:${publishedAt}\n\n作者:${names.join(' ')}\n`;
}
