摘要:本文介绍爬虫基本流程、requests 的使用与 base64 解密,用于批量保存漫画图片数据;开发环境为 Python 3.8 + PyCharm(requests 通过 pip install requests 安装);详细知识点、思路与完整代码见下文。
一、[知识点]:
爬虫基本流程
保存海量漫画数据
requests的使用
base64解密
二、[开发环境]:
版本:python 3.8
编辑器:pycharm
requests: pip install requests
三、思路
爬虫:
分析网页数据来源 所有图片链接
https://ac.qq.com/ComicView/index/id/505430/cid/1
多个页面采集
四、代码实现:
1. 发送请求
2. 获取数据
3. 解析数据
4. 保存数据
五、完整代码
import requests # 发送请求(访问网站) 第三方
import re # 解析数据
import base64
import json
import os
import parsel
# HTTP request headers sent with every request to ac.qq.com.
# NOTE(review): 'cookie' is a captured, account-bound session value — it will
# expire and should be replaced with a fresh one before running this script.
# 'referer' and 'user-agent' make the requests look like normal browser traffic.
headers = {
'cookie': '__AC__=1; tvfe_boss_uuid=bb88930a5ac8406d; iip=0; _txjk_whl_uuid_aa5wayli=55a33622e35c40e987c810022a8c40c6; pgv_pvid=6990680204; ptui_loginuin=1321228067; RK=Kj3JwrkEZn; ptcz=42d9e016607f032705abd9792c4348479e6108da38fd5426d9ecaeff1088aa19; fqm_pvqid=d77fc224-90eb-4654-befc-ab7b6d275fb4; psrf_qqopenid=4F37937E43ECA9EAB02F9E89BE1860E2; psrf_qqaccess_token=2B1977379A78742A0B826B173FB09E92; wxunionid=; tmeLoginType=2; psrf_access_token_expiresAt=1664978634; psrf_qqrefresh_token=03721D80236524B49062B95719F2F8B4; psrf_qqunionid=FAEE1B5B10434CF5562642FABE749AB9; wxrefresh_token=; wxopenid=; euin=oKoAoK-ANens7z**; pac_uid=1_321228067; o_cookie=3421355804; luin=o3421355804; lskey=00010000cc21c5247a7b57cfa49ce4837f31c5b209104ee0097255fb83272e5526f2ebedd04b546e5739897b; nav_userinfo_cookie=; ac_wx_user=; ts_refer=www.baidu.com/link; ts_uid=6545534402; theme=white; roastState=2; readLastRecord=%5B%5D; _qpsvr_localtk=0.918720523577107; Hm_lvt_f179d8d1a7d9619f10734edb75d482c4=1660113387,1660132056; pgv_info=ssid=s9159912060; readRecord=%5B%5B505430%2C%22%E8%88%AA%E6%B5%B7%E7%8E%8B%22%2C2%2C%22%E7%AC%AC2%E8%AF%9D%20%E6%88%B4%E8%8D%89%E5%B8%BD%E7%9A%84%E8%B7%AF%E9%A3%9E%22%2C2%5D%2C%5B530876%2C%22%E6%8E%92%E7%90%83%E5%B0%91%E5%B9%B4%EF%BC%81%EF%BC%81%22%2C2%2C%22%E7%AC%AC1%E8%AF%9D%20%E7%BB%93%E6%9D%9F%E4%B8%8E%E5%BC%80%E5%A7%8B%22%2C1%5D%2C%5B17114%2C%22%E5%B0%B8%E5%85%84%EF%BC%88%E6%88%91%E5%8F%AB%E7%99%BD%E5%B0%8F%E9%A3%9E%EF%BC%89%22%2C3%2C%22%E7%AC%AC1%E9%9B%86%22%2C1%5D%2C%5B650998%2C%22%E5%A4%A7%E7%8C%BF%E9%AD%82%EF%BC%88%E8%A5%BF%E8%A1%8C%E7%BA%AA%E7%B3%BB%E5%88%97%EF%BC%89%22%2C1011%2C%22%E3%80%8A%E5%A4%A7%E7%8C%BF%E9%AD%82%E3%80%8B%E5%BA%8F%E7%AB%A0%22%2C1%5D%5D; Hm_lpvt_f179d8d1a7d9619f10734edb75d482c4=1660138229; ts_last=ac.qq.com/ComicView/index/id/505430/cid/2',
'referer': 'https://ac.qq.com/Comic/comicInfo/id/644270',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
# Scrape every chapter of the comic (id 644270) and save each page image to
# disk under 航海王/第{page}话/{n}.jpg.
# Flow: fetch the comic index page -> collect per-chapter links -> for each
# chapter, extract the base64-packed `DATA` blob, brute-force its junk prefix,
# and download every image URL found in the decoded "picture" list.
hhw_url = 'https://ac.qq.com/Comic/comicInfo/id/644270'
htmlData = requests.get(url=hhw_url, headers=headers).text
select = parsel.Selector(htmlData)
# Chapter links appear in the ordered chapter list on the index page.
subUrlList = select.css('ol > li > p > span > a::attr(href)').getall()
print(subUrlList)
page = 1
# [150:] skips the first 150 chapters — presumably already downloaded in a
# previous run; TODO confirm / parameterize the starting offset.
for subUrl in subUrlList[150:]:
    # makedirs creates the missing parent directory 航海王 as well (the
    # original os.mkdir would raise FileNotFoundError without it), and
    # exist_ok=True removes the exists()/mkdir() race.
    os.makedirs(f'航海王/第{page}话', exist_ok=True)
    url = 'https://ac.qq.com' + subUrl
    # 1. request the chapter page
    response = requests.get(url=url, headers=headers)
    # 2. raw HTML of the chapter viewer
    html_data = response.text
    # 3. the page data is embedded as: var DATA = '<obfuscated base64>',
    b4_str = re.findall(r"var DATA = '(.*?)',", html_data)[0]
    # The blob carries a junk prefix of unknown length; slide the start
    # offset one character at a time until the remainder decodes into JSON
    # containing a "picture" array.
    for i in range(len(b4_str)):
        try:
            json_str = base64.b64decode(b4_str[i:].encode('utf-8')).decode('utf-8')
            pictures = re.findall(r'"picture":(\[.*?\])', json_str)[0]
            json_list = json.loads(pictures)
            for j, pic in enumerate(json_list, start=1):
                imgUrl = pic['url']
                print(f'第{page}话', imgUrl)
                imgData = requests.get(imgUrl).content
                # 4. save the page image
                with open(f'航海王/第{page}话/{j}.jpg', mode='wb') as f:
                    f.write(imgData)
            # Decoded and saved successfully — stop sliding the offset.
            break
        except (ValueError, IndexError, UnicodeDecodeError):
            # Narrowed from a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit. binascii.Error (bad base64) and
            # json.JSONDecodeError are ValueError subclasses; IndexError
            # covers the "picture" pattern not matching this offset.
            continue
    page += 1