msdoc_toc.js var cheerio = require('cheerio');var request = require('sync-request');var fs = require('fs');var process = require('process');var ofile = fs.openSync('out.txt', 'w');var root = getToc();var stk = [root]while(stk.length != 0) {
/**
 * Walks the .NET documentation table of contents depth-first and writes the
 * URL of every entry that has an `href` to out.txt (one URL per line), echoing
 * each URL to stdout as well.
 *
 * The root node comes from a local JSON file (see getToc). Nested tables of
 * contents referenced through a node's `tocHref` are downloaded on demand.
 */
const cheerio = require('cheerio');
const request = require('sync-request');
const fs = require('fs');
const process = require('process');

// Every href / tocHref in the table of contents is relative to this root.
const DOCS_ROOT = 'https://docs.microsoft.com/zh-cn/dotnet/';

const ofile = fs.openSync('out.txt', 'w');
try {
  // Explicit stack instead of recursion: depth-first, pre-order traversal.
  const stk = [getToc()];
  while (stk.length !== 0) {
    const n = stk.pop();

    // Some nodes carry no href; emitting them would produce a bogus
    // ".../dotnet/undefined" line, so only nodes with an href are written.
    if (n.href) {
      const url = DOCS_ROOT + n.href;
      console.log(url);
      fs.writeSync(ofile, url + '\n');
    }

    // Children are pushed in reverse so they are popped in document order.
    // A copy is reversed so the node's own array is left untouched.
    if (n.children) {
      for (const ch of n.children.slice().reverse()) {
        stk.push(ch);
      }
    }

    // Entries of a nested TOC are pushed last and therefore visited first.
    // Entries without an href, or pointing outside the nested TOC's folder
    // ('../'), are skipped.
    if (n.tocHref) {
      for (const ch of getChildrenByToc(n.tocHref).slice().reverse()) {
        if (ch.href && !ch.href.includes('../')) {
          stk.push(ch);
        }
      }
    }
  }
} finally {
  // Release the descriptor even when a download or JSON parse throws.
  fs.closeSync(ofile);
}
console.log('Done..');

/**
 * Downloads a nested toc.json and returns the children of its first item,
 * with every href / tocHref rewritten to be relative to DOCS_ROOT.
 *
 * @param {string} tocHref - Path of the nested toc.json, relative to DOCS_ROOT.
 * @returns {Array<Object>} The child nodes, or an empty array when the
 *   downloaded document has no first item or that item has no children.
 */
function getChildrenByToc(tocHref) {
  const tocUrl = DOCS_ROOT + tocHref;
  const toc = request('GET', tocUrl).getBody().toString();
  const items = JSON.parse(toc).items;
  const children = items && items[0] && items[0].children;
  if (!children) {
    return [];
  }
  // Links inside the nested TOC are relative to the folder that holds it.
  const prefix = tocHref.replace('toc.json', '');
  for (const ch of children) {
    addUrlPrefix(ch, prefix);
  }
  return children;
}

/**
 * Recursively prepends `prefix` to the href and tocHref of `obj` and of all
 * of its descendants. The nodes are modified in place.
 *
 * @param {Object} obj - TOC node to rewrite.
 * @param {string} prefix - Path prefix to prepend.
 */
function addUrlPrefix(obj, prefix) {
  if (obj.href) {
    obj.href = prefix + obj.href;
  }
  if (obj.tocHref) {
    obj.tocHref = prefix + obj.tocHref;
  }
  if (obj.children) {
    for (const ch of obj.children) {
      addUrlPrefix(ch, prefix);
    }
  }
}

/**
 * Loads the root TOC node from assets/msdoc_toc.json.
 *
 * NOTE(review): the file is presumably a saved copy of
 * https://docs.microsoft.com/zh-cn/dotnet/toc.json - confirm.
 *
 * @returns {Object} The second entry (`items[1]`) of the parsed document.
 */
function getToc() {
  const jsonStr = fs.readFileSync('assets/msdoc_toc.json', 'utf-8');
  const jsonObj = JSON.parse(jsonStr);
  return jsonObj.items[1];
}

/**
 * Extracts the inner HTML of the `#content` element of a page after removing
 * every `.tryitbtn` element. Not called by the traversal above; kept as a
 * helper for dumping page content instead of URLs.
 *
 * @param {string} html - Full page HTML.
 * @returns {string|null} Inner HTML of `#content` as returned by cheerio.
 */
function getContent(html) {
  const $ = cheerio.load(html);
  $('.tryitbtn').remove();
  return $('#content').html();
}

// File: msdoc.js
/**
 * Downloads every page listed in assets/dnet_std_toc.txt (one URL per line),
 * extracts title, metadata and main content, and appends the result to
 * out.html.
 *
 * The index of the last page written is stored in out.idx, so an interrupted
 * run resumes with the following page when the script is started again.
 */
const cheerio = require('cheerio');
const request = require('sync-request');
const fs = require('fs');
const process = require('process');

// Upper bound on download attempts per page. Without a bound, a permanently
// failing URL (for example a 404) would be retried forever.
const MAX_ATTEMPTS = 10;

const ofile = fs.openSync('out.html', 'a');
try {
  const toc = getToc();
  for (let i = readStartIndex(); i < toc.length; i++) {
    const url = toc[i];
    console.log('page: ' + url);
    const content = fetchPage(url);
    fs.writeSync(ofile, content, null, 'utf-8');
    fs.writeSync(ofile, '\n \n', null, 'utf-8');
    // Record progress only after the page has been written completely.
    fs.writeFileSync('out.idx', i.toString());
  }
} finally {
  // Release the descriptor even when the run is aborted by an error.
  fs.closeSync(ofile);
}
console.log('Done..');

/**
 * Determines the index of the first page to download.
 *
 * @returns {number} 0 when out.idx does not exist, otherwise the stored index
 *   plus one.
 * @throws {Error} When out.idx exists but does not start with an integer.
 */
function readStartIndex() {
  if (!fs.existsSync('out.idx')) {
    return 0;
  }
  const last = Number.parseInt(fs.readFileSync('out.idx', 'utf-8'), 10);
  if (Number.isNaN(last)) {
    // A NaN start index would make the download loop silently do nothing.
    throw new Error('out.idx does not contain a page index');
  }
  return last + 1;
}

/**
 * Downloads one page and extracts its content, retrying on failure.
 *
 * @param {string} url - Absolute URL of the page.
 * @returns {string} HTML fragment produced by getContent.
 * @throws {Error} When all MAX_ATTEMPTS attempts fail. Progress up to the
 *   previous page is already in out.idx, so a later run resumes at this page.
 */
function fetchPage(url) {
  let lastError;
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      const html = request('GET', url).getBody().toString();
      return getContent(html, url);
    } catch (ex) {
      console.log(ex);
      lastError = ex;
    }
  }
  throw new Error(`giving up on ${url} after ${MAX_ATTEMPTS} attempts: ${lastError}`);
}

/**
 * Reads the list of page URLs.
 *
 * @returns {string[]} Non-empty lines of assets/dnet_std_toc.txt, with
 *   surrounding whitespace (including a trailing '\r') removed.
 */
function getToc() {
  return fs.readFileSync('assets/dnet_std_toc.txt', 'utf-8')
    .split('\n')
    .map(s => s.trim())
    .filter(s => s);
}

/**
 * Returns `url` up to and including its last '/'.
 *
 * @param {string} url
 * @returns {string} The truncated URL, or `url` unchanged when it has no '/'.
 */
function baseUrl(url) {
  const pos = url.lastIndexOf('/');
  if (pos === -1) {
    return url;
  }
  return url.slice(0, pos + 1);
}

/**
 * Resolves an image source against the page's base URL.
 *
 * @param {string} src - Value of the img `src` attribute.
 * @param {string} base - Base URL of the page.
 * @returns {string} Absolute URL, or `src` unchanged when it cannot be parsed.
 */
function absoluteUrl(src, base) {
  try {
    // Handles relative, root-relative and already-absolute sources alike.
    return new URL(src, base).href;
  } catch (ex) {
    // One malformed image source should not fail the whole page.
    return src;
  }
}

/**
 * Builds the HTML fragment stored for one page: the `main h1` markup, the
 * metadata text from getMeta, and the inner HTML of `main>div`.
 * All `.sxs-lookup` elements are removed first, and image sources inside the
 * content are rewritten to absolute URLs.
 *
 * @param {string} html - Full page HTML.
 * @param {string} url - URL the page was downloaded from.
 * @returns {string} Concatenation of title, metadata and content.
 */
function getContent(html, url) {
  const $ = cheerio.load(html);
  $('.sxs-lookup').remove();
  const title = $('main h1').toString();
  const meta = getMeta($('ul.metadata'));
  const $content = $('main>div');
  const base = baseUrl(url);
  const $imgs = $content.find('img');
  for (let i = 0; i < $imgs.length; i++) {
    const $img = $imgs.eq(i);
    const src = $img.attr('src');
    // Images without a src are left alone instead of becoming ".../undefined".
    if (src) {
      $img.attr('src', absoluteUrl(src, base));
    }
  }
  const content = $content.html();
  return title + meta + content;
}

/**
 * Formats the date and contributor names found in the metadata list.
 *
 * @param {Object} $meta - cheerio selection of the `ul.metadata` element.
 * @returns {string} Text containing the `time` element's text and the `title`
 *   attributes of all `.contributors a` links, joined by spaces.
 */
function getMeta($meta) {
  const time = $meta.find('time').text();
  const $authors = $meta.find('.contributors a');
  const authors = [];
  for (let i = 0; i < $authors.length; i++) {
    const $au = $authors.eq(i);
    authors.push(`${$au.attr('title')}`);
  }
  const author = authors.join(' ');
  return `日期:${time}\n\n作者:${author}\n`;
}