当前位置 : 主页 > 编程语言 > java >

python抓网页资源小脚本

来源:互联网 收集:自由互联 发布时间:2023-02-04
#!/usr/bin/env python# coding: utf-8import urllibdef filter_src(file_name): resource_list = [] f_obj = open(file_name) for f_line in f_obj: if '404' in f_line: str_goal = f_line.strip().split(' ')[7] if not str_goal in resource_list: print

#!/usr/bin/env python
# coding: utf-8
"""Fetch resources that a log file lists on lines containing "404".

``filter_src`` pulls resource paths out of the log and ``down_src`` downloads
each of them from a base URL into a local directory tree.
"""
import os
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

BASE_URL = "http://www.ttcrm.com"
DOWN_PATH = "src"

# Zero-based index of the space-separated field that holds the resource path.
_PATH_FIELD = 7


def filter_src(file_name):
    """Return the unique resource paths found on "404" lines of *file_name*.

    For every line containing the text ``404`` the eighth space-separated
    field is taken, every ``/static`` occurrence is removed from it and its
    last character is dropped.  Lines with fewer than eight fields are
    skipped.  Each newly found field and the final list are printed.

    Args:
        file_name: Path of the text file to scan.

    Returns:
        A list of cleaned resource paths in first-seen order, without
        duplicates.
    """
    resource_list = []
    seen = set()
    with open(file_name) as f_obj:
        for f_line in f_obj:
            if '404' not in f_line:
                continue
            fields = f_line.strip().split(' ')
            if len(fields) <= _PATH_FIELD:
                # Not enough fields to contain a path; ignore the line.
                continue
            str_goal = fields[_PATH_FIELD]
            # NOTE(review): the trailing character is dropped exactly as the
            # original script did - presumably a delimiter such as ',' or '"'
            # in the log format; confirm against a real log line.
            resource = str_goal.replace('/static', '')[:-1]
            # De-duplicate on the cleaned value, i.e. on what is stored.
            if not resource or resource in seen:
                continue
            print(str_goal)
            seen.add(resource)
            resource_list.append(resource)
    print(resource_list)
    return resource_list


def down_src(source_list, base_url=BASE_URL, down_path=DOWN_PATH):
    """Download every path in *source_list* and store it under *down_path*.

    The URL is ``base_url + source`` and the destination file is
    ``down_path + source``; missing destination directories are created.
    Files are written in binary mode so the payload is stored unchanged.
    Errors raised while opening the URL or writing the file are not caught.

    Args:
        source_list: Iterable of resource paths (each is appended verbatim
            to both *base_url* and *down_path*).
        base_url: Prefix prepended to each path to build the download URL.
        down_path: Prefix prepended to each path to build the local file
            path.
    """
    for source in source_list:
        source_url = base_url + source
        source_path = down_path + source
        print(source_url)

        target_dir = os.path.dirname(source_path)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        # closing() releases the connection on both Python 2 and Python 3.
        with closing(urlopen(source_url)) as source_stream:
            payload = source_stream.read()
        with open(source_path, 'wb') as f_obj:
            f_obj.write(payload)


if __name__ == '__main__':
    file_name = 'src.txt'
    source_list = filter_src(file_name)
    down_src(source_list)

关键点在于:写入文件时必须以二进制方式('wb')打开,否则图片等非文本资源在保存时可能被损坏!

# Open the destination in binary mode so the downloaded bytes are written
# unchanged; the with-block closes the file even if the write fails.
with open(source_path, 'wb') as f_obj:
    f_obj.write(source_stram.read())

网友评论