
Python: Simple Image Scraping



Table of Contents

  • overview
  • version1:
  • version2:

overview

Tested on Python 3.9+.
The regex pattern should be adjusted to match the HTML source of the specific target site.
Version 2 replaces the regular expressions with BeautifulSoup; note that you may need to adjust the file save path.
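To see why the regex needs per-site tuning while BeautifulSoup usually does not, here is a minimal, self-contained comparison of the two extraction approaches used below (the HTML snippet and hostnames are invented for illustration):

import re
from bs4 import BeautifulSoup

html = '<div><img class="pic" src="//img.example.com/a.jpg"><img src="/b.png"></div>'

# Regex: captures only the .jpg link, and the pattern must mirror the site's
# exact markup (quoting style, attribute layout, file extension).
print(re.findall(r'<img [^>]*src="([^"]+\.jpg)"', html))
# -> ['//img.example.com/a.jpg']

# BeautifulSoup: tolerant of attribute order and markup quirks.
soup = BeautifulSoup(html, "html.parser")
print([tag.get("src") for tag in soup.find_all("img")])
# -> ['//img.example.com/a.jpg', '/b.png']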

version1:

# -*- coding: utf-8 -*-

import os
import re
import urllib.error    # for error handling
import urllib.request  # mainly for opening and reading URLs

prefix_path = r"image1"


def picCraw(url, topic, img_pattern_str):
    count = 1
    file_name = os.path.join(prefix_path, topic + ".html")
    print("Saving: " + file_name)
    read_result_bytes = urllib.request.urlopen(url).read()
    # set save path
    to_save_path = os.path.join(prefix_path, topic)
    if not os.path.isdir(to_save_path):
        os.makedirs(to_save_path)  # create the directory that holds the images
    # Decode the bytes into text. Note that calling decode() without the
    # "ignore" error handler may raise UnicodeDecodeError on pages that are
    # not valid UTF-8.
    page_data_str = read_result_bytes.decode("utf8", "ignore")

    matched_image_links = re.findall(img_pattern_str, page_data_str)  # find all matches
    print("Matched images:", matched_image_links)
    jpg_pattern = re.compile(r'//.*\.jpg$')  # match files in .jpg format
    for image in matched_image_links:
        if jpg_pattern.search(image):  # on a match fetch the image, otherwise skip it
            try:
                if "http" not in image:
                    image = "http:" + image  # complete protocol-relative links
                image_data_bytes = urllib.request.urlopen(image).read()  # fetch the image bytes
                image_path = os.path.join(prefix_path, topic, str(count) + ".jpg")  # name the image
                count += 1
                with open(image_path, "wb") as image_file:
                    image_file.write(image_data_bytes)  # write the bytes to a .jpg file
            except urllib.error.URLError:
                print("Download failed:", image)
    # with open(file_name, "wb") as file:  # save the page itself
    #     file.write(read_result_bytes)


if __name__ == "__main__":
    # https://sc.chinaz.com/tupian/fengjingtupian.html
    url = 'http://www.jituwang.com/tuku/biology/'
    # The image-matching pattern; derive it by inspecting the page source.
    # re.findall() returns a list of the parts matched by the capture group.
    img_pattern_str = r'<img .*src=\"(.+\.jpg)\"'
    picCraw(url, "scene", img_pattern_str)
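One common failure mode for version 1: if the urlopen() call in picCraw comes back with HTTP 403, the site may be rejecting urllib's default Python-urllib User-Agent. A minimal sketch of the fix, reusing the browser User-Agent string that version 2 sends (whether this particular site requires it is an assumption):

import urllib.request

req = urllib.request.Request(
    url,
    headers={
        # browser-like User-Agent, copied from version 2 below
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.198 Safari/537.36"
    },
)
read_result_bytes = urllib.request.urlopen(req).read()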

version2:

# -*- coding: utf-8 -*-
"""
Created on Fri May 21 22:17:08 2021

@author: zero
"""

# bs4.pics.py
'''1. Fetch the source of the main page'''
import os

import requests
from bs4 import BeautifulSoup

# url = "https://sc.chinaz.com/tupian/"
url = "http://www.jituwang.com/tuku/naturally/"
url = "http://www.jituwang.com/tuku/biology"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/86.0.4240.198 Safari/537.36"
}
response = requests.get(url=url, headers=headers)
response.encoding = "utf-8"
# From the requests docs on Response.text: the content of the response, in
# unicode. If Response.encoding is None, the encoding is guessed from the
# HTTP headers. When non-HTTP knowledge allows a better guess, set
# r.encoding explicitly (as above) before accessing the property.
resp_text = response.text

'''2. Collect the links from the <img> tags in the page source'''
soup = BeautifulSoup(resp_text, 'html.parser')
# print(soup)
img_tags = soup.find_all("img")
# print(img_tags)

'''3. Download the image behind each link and write it to a file'''
os.makedirs("image2", exist_ok=True)  # make sure the save directory exists
for img_tag in img_tags:
    src = img_tag.get('src')
    print(src)  # inspect the extracted src attribute value
    if not src:  # skip <img> tags that carry no src attribute
        continue
    # Disconnect any proxy first, or the downloads may fail.
    # From here on it is the usual requests workflow:
    #   get the Response instance, read its bytes,
    #   save the bytes to a file (with open() as ...)
    img_response = requests.get(src, headers=headers)
    img_name = src.split('/')[-1]
    with open(os.path.join("image2", img_name), mode='wb') as fos:
        fos.write(img_response.content)

response.close()
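Unlike version 1, this version does not complete protocol-relative links ("//host/pic.jpg") or site-relative links ("/pic.jpg") before downloading, so requests.get(src) raises MissingSchema for them. If the target site serves such src values (an assumption; this site apparently returned absolute URLs), urllib.parse.urljoin resolves them against the page URL:

from urllib.parse import urljoin

base = "http://www.jituwang.com/tuku/biology"

# A protocol-relative src inherits the scheme of the page URL.
print(urljoin(base, "//img.example.com/a.jpg"))  # http://img.example.com/a.jpg

# A site-relative src is resolved against the site root.
print(urljoin(base, "/thumb/b.jpg"))             # http://www.jituwang.com/thumb/b.jpg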

