
Microsoft Docs Crawler

msdoc_toc.js: walks the Microsoft Docs (zh-cn/dotnet) table of contents and writes every page URL to out.txt.
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');


var ofile = fs.openSync('out.txt', 'w');


var root = getToc();
var stk = [root];

// Iterative depth-first walk of the TOC tree with an explicit stack.
while(stk.length != 0) {
    
    // Record the absolute URL of the current TOC node.
    var n = stk.pop();
    var url = 'https://docs.microsoft.com/zh-cn/dotnet/' + n.href;
    console.log(url);
    fs.writeSync(ofile, url + '\n');
    
    /*
    var html = request('GET', url).getBody().toString();
    var content = getContent(html);
    fs.writeSync(ofile, content, null, 'utf-8');
    fs.writeSync(ofile, '\n\n', null, 'utf-8');
    */
    
    // Push inline children, reversed so they come off the stack in document order.
    if(n.children) {
        for(var ch of n.children.reverse())
            stk.push(ch);
    }

    // Some nodes reference a nested toc.json; expand it, skipping links
    // that point outside the /dotnet/ tree ('../').
    if(n.tocHref) {
        for(var ch of getChildrenByToc(n.tocHref).reverse())
            if(ch.href && ch.href.indexOf('../') == -1)
                stk.push(ch);
    }
    
}

// Fetch a nested toc.json and return its child entries, with hrefs rewritten
// relative to the directory that contains the toc.json.
function getChildrenByToc(tocHref) {

    var tocUrl = 'https://docs.microsoft.com/zh-cn/dotnet/' + tocHref;
    //console.log(tocUrl)
    var toc = request('GET', tocUrl).getBody().toString();
    var jsonObj = JSON.parse(toc);
    var children = jsonObj.items[0].children;
    if(!children) return [];
    for(var ch of children) {
        addUrlPrefix(ch, tocHref.replace('toc.json', ''));
    }
    return children;
}

// Recursively prepend the toc.json directory to every href/tocHref in a subtree.
function addUrlPrefix(obj, prefix) {
    if(obj.href)
        obj.href = prefix + obj.href;
    if(obj.tocHref)
        obj.tocHref = prefix + obj.tocHref;
    if(obj.children) {
        for(var ch of obj.children)
            addUrlPrefix(ch, prefix);
    }
}

/*
for(var i in toc) {
    try {
        var url = toc[i];
        console.log('page: ' + url);
        html = request('GET', url).getBody().toString();
        var content = getContent(html);
        fs.writeSync(ofile, content, null, 'utf-8');
        fs.writeSync(ofile, '\n\n', null, 'utf-8');
    } catch(ex) {
        console.log(ex);
    }
}
*/

fs.closeSync(ofile);
console.log('Done..');


// Load the root TOC. A local copy of toc.json is read here; the commented-out
// line shows how to fetch it live from docs.microsoft.com instead.
function getToc() {

    // var jsonStr = request('GET', 'https://docs.microsoft.com/zh-cn/dotnet/toc.json').getBody().toString()
    var jsonStr = fs.readFileSync('assets/msdoc_toc.json', 'utf-8');
    var jsonObj = JSON.parse(jsonStr);

    // items[1] is the subtree this crawl starts from.
    return jsonObj.items[1];
}


// Extract the article body of a docs page, dropping the "Try it" buttons.
function getContent(html) {
    var $ = cheerio.load(html);
    $('.tryitbtn').remove();
    var content = $('#content').html();
    return content;
}
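For reference, here is a minimal sketch of how the local copy read by getToc could be produced, along with the only JSON shape the traversal actually relies on (items, children, href, tocHref). The file name fetch_toc.js and the sample values are illustrative assumptions, not part of the original scripts.

// fetch_toc.js: minimal sketch; the file name and sample values are assumptions.
// Saves the root toc.json locally so that getToc() in msdoc_toc.js can read it.
var request = require('sync-request');
var fs = require('fs');

var body = request('GET', 'https://docs.microsoft.com/zh-cn/dotnet/toc.json')
    .getBody()
    .toString();
fs.writeFileSync('assets/msdoc_toc.json', body, 'utf-8');

// The traversal above only relies on this shape (values illustrative):
// {
//   "items": [
//     { "href": "welcome" },
//     { "href": "standard/index", "children": [
//         { "href": "standard/whats-new", "tocHref": "standard/toc.json" }
//     ] }
//   ]
// }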
msdoc.js: reads the URL list, downloads each page, and appends the extracted article HTML to out.html, checkpointing its position in out.idx so an interrupted crawl can resume.
var cheerio = require('cheerio');
var request = require('sync-request');
var fs = require('fs');
var process = require('process');

var ofile = fs.openSync('out.html', 'a');

// Resume support: out.idx holds the index of the last page that was written,
// so a restart continues from the next one.
var start;
if(fs.existsSync('out.idx')) {
    start = fs.readFileSync('out.idx', 'utf-8');
    start = Number.parseInt(start) + 1;
}
else
    start = 0;

var toc = getToc();
for(var i = start; i < toc.length; i++) {
    try {
        var url = toc[i];
        console.log('page: ' + url);
        var html = request('GET', url).getBody().toString();
        var content = getContent(html, url);
        fs.writeSync(ofile, content, null, 'utf-8');
        fs.writeSync(ofile, '\n\n', null, 'utf-8');

        // Checkpoint the last successfully written index for resuming.
        fs.writeFileSync('out.idx', i.toString());
    } catch(ex) {
        // On a network or parse error, log it and retry the same page.
        console.log(ex);
        i--;
    }
}

fs.closeSync(ofile);
console.log('Done..');


// The URL list: one URL per line, with empty lines dropped.
function getToc() {
    return fs.readFileSync('assets/dnet_std_toc.txt', 'utf-8')
        .split('\n')
        .filter(s => s);
}

// Directory part of a URL, used to absolutize relative image paths.
function baseUrl(url) {
    var pos = url.lastIndexOf('/');
    if(pos == -1)
        return url;
    else
        return url.slice(0, pos + 1);
}

// Extract the title, metadata, and article body of a docs page, and rewrite
// relative <img> src attributes to absolute URLs so the saved HTML keeps its images.
function getContent(html, url) {
    var $ = cheerio.load(html);
    $('.sxs-lookup').remove();

    var title = $('main h1').toString();
    var meta = getMeta($('ul.metadata'));
    var $content = $('main>div');

    var base = baseUrl(url);
    var $imgs = $content.find('img');
    for(var i = 0; i < $imgs.length; i++) {
        var $img = $imgs.eq(i);
        $img.attr('src', base + $img.attr('src'));
    }

    var content = $content.html();
    return title + meta + content;
}

// Build a small header with the page's last-updated time and contributor names,
// taken from the <ul class="metadata"> block.
function getMeta($meta) {

    var time = $meta.find('time').text();
    var $authors = $meta.find('.contributors a');
    var authors = [];
    for(var i = 0; i < $authors.length; i++) {
        var $au = $authors.eq(i);
        var s = `${$au.attr('title')}`;
        authors.push(s);
    }

    var author = authors.join(' ');

    return `
日期:${time}
作者:${author}
`;
}
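Taken together, the two scripts form a two-step pipeline: msdoc_toc.js flattens the TOC into a list of page URLs, and msdoc.js downloads those pages and appends the extracted HTML to out.html. Note that msdoc.js reads assets/dnet_std_toc.txt rather than out.txt, so the URL list presumably has to be copied there (and, judging by the file name, filtered to the .NET Standard section) between the two runs. Below is a minimal sketch of that glue step under those assumptions; the name prepare_toc.js and the '/dotnet/standard/' filter are illustrative, not part of the original scripts.

// prepare_toc.js: minimal sketch of the glue step; the file name and the
// '/dotnet/standard/' filter are assumptions, not part of the original scripts.
var fs = require('fs');

var urls = fs.readFileSync('out.txt', 'utf-8')
    .split('\n')
    .filter(s => s);                          // drop empty lines

// Keep only the .NET Standard pages (illustrative; drop the filter to keep everything).
var stdUrls = urls.filter(u => u.indexOf('/dotnet/standard/') != -1);

fs.writeFileSync('assets/dnet_std_toc.txt', stdUrls.join('\n') + '\n', 'utf-8');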