当前位置 : 主页 > 网络编程 > JavaScript >

抓取 HTML 图片(img.js)

来源:互联网 收集:自由互联 发布时间:2021-06-28
img.js：遍历指定目录中的 .html 文件，将其中以 http 开头的 <img> 图片下载到该目录的 img 子目录，并改写对应的 src 属性。完整代码如下。
img.js
var fs = require('fs');
var request = require('sync-request');
var crypto = require('crypto');
var cheerio = require('cheerio');

// Directory holding the .html files to process; taken from the first CLI argument.
const dirname = process.argv[2];
if (!dirname) {
    console.log('请指定目录。');
    // The exit status stays 0, as before, so existing callers see no change.
    process.exit(0);
}

// Make sure the download directory exists. "Already exists" is the expected
// outcome on re-runs and is ignored; any other failure is reported instead of
// being dropped silently, because every later image write depends on it.
try {
    fs.mkdirSync(`${dirname}/img`);
} catch (ex) {
    if (ex.code !== 'EEXIST') {
        console.log(ex.toString());
    }
}
const dir = fs.readdirSync(dirname);

// File names of the images already saved during this run; read and updated
// by dealWithHtml so the same URL is not downloaded twice.
const imgs = new Set();

// Rewrite every .html file of the directory in place.
for (const fname of dir) {
    if (!fname.endsWith('.html')) {
        continue;
    }
    console.log('file: ' + fname);

    const filePath = `${dirname}/${fname}`;
    const content = fs.readFileSync(filePath, 'utf-8');
    fs.writeFileSync(filePath, dealWithHtml(content));
}

/**
 * Downloads the remote images referenced by an HTML document and rewrites
 * their `src` attributes.
 *
 * For every `<img>` whose `src` starts with `http`, the image is fetched
 * synchronously and written to `<dirname>/img/<md5 of the URL>.jpg` (the
 * `.jpg` extension is used regardless of the actual image type). The
 * module-level `imgs` set records the file names already saved, so a URL is
 * fetched only once. The `src` attribute is then set to
 * `../Images/<file name>`.
 *
 * NOTE(review): files are written under `<dirname>/img/` while `src` points
 * at `../Images/`; presumably the files are relocated afterwards — confirm.
 *
 * A failure on one image is logged and does not stop the remaining images;
 * that image keeps its original `src`.
 *
 * @param {string} html - HTML source to process.
 * @returns {string} The HTML serialized by cheerio after the rewrites.
 */
function dealWithHtml(html) {
    const $ = cheerio.load(html);
    const $imgs = $('img');

    for (let i = 0; i < $imgs.length; i++) {
        try {
            const $img = $imgs.eq(i);
            const url = $img.attr('src');

            // An <img> without a src yields undefined here; skip it explicitly
            // rather than letting startsWith throw and log a spurious error.
            if (typeof url !== 'string' || !url.startsWith('http')) {
                continue;
            }

            const picname = crypto.createHash('md5').update(url).digest('hex') + '.jpg';
            console.log(picname);

            // Fetch each distinct URL only once per run.
            if (!imgs.has(picname)) {
                const data = request('GET', url).getBody();
                fs.writeFileSync(dirname + '/img/' + picname, data);
                imgs.add(picname);
            }

            $img.attr('src', '../Images/' + picname);
        } catch (ex) {
            console.log(ex.toString());
        }
    }

    return $.html();
}
网友评论