================================
工具准备:
================================
下载与 chrome 浏览器版本一致的 chromedriver, chromedriver 国内下载镜像
https://npm.taobao.org/mirrors/chromedriver
将 chromedriver.exe 复制到 python 的scripts目录中, 比如 C:\Anaconda3\Scripts\
并将C:\Anaconda3\Scripts\加到Windows 环境变量PATH 中.
================================
安装 selenium python 包
================================
pip install selenium
================================
selenium 的更多信息
================================
selenium 不仅支持Python, 还支持Java/C#
https://www.selenium.dev/documentation/zh-cn/webdriver/driver_requirements/
https://www.selenium.dev/documentation/zh-cn/selenium_installation/installing_webdriver_binaries/
本文共有好多个下载脚本, 是一个不断完善的过程, 所以, 最后一个下载脚本是最通用, 最完美的.
================================
根据章节序号推算单章url地址, 然后下载
================================
from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are numbered sequentially:
# chapter i is fetched from <home_url><i + start_page_id>.html.
home_url = "https://www.jingcaiyuedu6.com/novel/CW8MY3/"
# web.get('https://www.jingcaiyuedu6.com/novel/CW8MY3/1.html')
chapter_start = 1
chapter_end = 39
start_page_id = 0  # offset added to the chapter number to get the page number in the URL

# Pieces of the final text; joined once at the end instead of repeatedly
# concatenating an ever-growing string.
text_parts = ["小说:穿越种田之将门妻", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        page_id = i + start_page_id
        url = home_url + str(page_id) + ".html"
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is the element <div id="content">.
        # find_element(By.ID, ...) works on Selenium 3 and 4; the older
        # find_element_by_id() helper was removed in Selenium 4.3.
        content_tag = web.find_element(By.ID, "content")
        text_parts.append(content_tag.text)
finally:
    # quit() ends the whole driver session (browser and chromedriver process),
    # even when a page fails to load.
    web.quit()

full_text = "".join(text_parts)
print(full_text)
================================
从列表页提取单章url, 然后下载单章文本
================================
#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is mapped to its Chinese digit
    independently, so no positional units are produced: 10 becomes "一零",
    not "十".

    :param num: value whose string form consists only of the digits 0-9
    :return: the digit-by-digit Chinese string
    :raises KeyError: if ``str(num)`` contains a character other than 0-9
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))
#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name: num2chinese
# Author: yunhgu
# Date: 2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------
import numbers

_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)  # unit for each position inside a 4-digit group
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999  # inclusive bounds accepted by Num2Chinese


class NotIntegerError(Exception):
    """Raised when the value to convert is not an integer."""


class OutOfRangeError(Exception):
    """Raised when the integer to convert lies outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        :param number: integer in [_MIN, _MAX]
        :return: chinese number
        :raises NotIntegerError: if ``number`` is not an integer
        :raises OutOfRangeError: if ``number`` is outside [_MIN, _MAX]
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # bool is a subclass of int but is not a meaningful number here.
        if isinstance(num, bool) or not isinstance(num, numbers.Integral):
            raise NotIntegerError(u'%s is not a integer.' % num)
        # Negative integers are reported as out of range; a digits-only string
        # check would misreport them as "not an integer".
        if num < _MIN or num > _MAX:
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert 0 <= num < 10**4."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        # Decimal digits, least significant first.
        lst = []
        while num >= 10:
            lst.append(num % 10)
            num = num // 10
        lst.append(num)
        c = len(lst)  # number of digits
        # The text is assembled back to front (unit before digit, low digits
        # first) and reversed once at the end.
        result = u''
        for idx, val in enumerate(lst):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # Zeros directly above a non-zero digit are read as one "零".
                if idx < c - 1 and lst[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Only 10-19 drop the leading "一" ("十五", not "一十五"). Replacing
        # every "一十" would corrupt 110 into "一百十" and 1010 into "一千零十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert num < 10**8 as <high>万<low>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = num // _S4, num % _S4
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after "万".
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert num < 10**16 as <high>亿<low>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = num // _S8, num % _S8
        if low == 0:
            return to8(high) + u'亿'
        # A low group without a ten-millions digit needs a "零" after "亿".
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)
#========================================
# 从列表页提取单章url, 然后下载单章文本
#========================================
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Download a novel by looking up every chapter link on the chapter list page
# (matched by the chapter title "第<n>章") and then fetching that chapter.
num2chinese = Num2Chinese()
full_text = "小说:掌家小娘子"
full_text = full_text + "\n\n\n"
print(full_text)
list_url = "https://www.baihexs.com/0/54/"
chapter_start = 1
chapter_end = 306

chapter_texts = []  # joined into full_text once all chapters are fetched
web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        chinese_chapter_id = num2chinese.convert(i)  # Chinese numerals
        # chinese_chapter_id = str(i)  # use this for Arabic numerals instead
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # The converter may render e.g. 110 as "一百十"; normalise it to
        # "一百一十". str.replace() is a no-op when the text is absent, so no
        # guard is needed (a `.find()` result is -1, i.e. truthy, on a miss).
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Go back to the list page so the chapter link can be located.
        web.get(list_url)
        try:
            url = web.find_element(By.PARTIAL_LINK_TEXT, chinese_chapter_name).get_attribute("href")
        except WebDriverException:
            # Chapter not listed (or the lookup failed): mark it as unavailable.
            url = ""

        if url:
            web.get(url)
            # find_element(By.XPATH, ...) works on Selenium 3 and 4; the
            # find_element_by_* helpers were removed in Selenium 4.3.
            content_tag = web.find_element(By.XPATH, '''//*[@id="center"]''')
            content = content_tag.text
        else:
            content = "不提供下载"

        chapter_text = "\n\n\n======================\n第" + str(i) + "章\n" + content
        print(chapter_text)
        chapter_texts.append(chapter_text)
finally:
    # quit() ends the whole driver session, even if a chapter download fails.
    web.quit()

full_text = full_text + "".join(chapter_texts)
# print(full_text)
================================
每章支持多个分页
作了性能优化
自动输出到文件
增加番外篇下载
增加列表页面点击"显示全部页面"功能
代码逻辑优化
================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is mapped to its Chinese digit
    independently, so no positional units are produced: 10 becomes "一零",
    not "十".

    :param num: value whose string form consists only of the digits 0-9
    :return: the digit-by-digit Chinese string
    :raises KeyError: if ``str(num)`` contains a character other than 0-9
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))
#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name: num2chinese
# Author: yunhgu
# Date: 2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------
import numbers

_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)  # unit for each position inside a 4-digit group
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999  # inclusive bounds accepted by Num2Chinese


class NotIntegerError(Exception):
    """Raised when the value to convert is not an integer."""


class OutOfRangeError(Exception):
    """Raised when the integer to convert lies outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        :param number: integer in [_MIN, _MAX]
        :return: chinese number
        :raises NotIntegerError: if ``number`` is not an integer
        :raises OutOfRangeError: if ``number`` is outside [_MIN, _MAX]
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # bool is a subclass of int but is not a meaningful number here.
        if isinstance(num, bool) or not isinstance(num, numbers.Integral):
            raise NotIntegerError(u'%s is not a integer.' % num)
        # Negative integers are reported as out of range; a digits-only string
        # check would misreport them as "not an integer".
        if num < _MIN or num > _MAX:
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert 0 <= num < 10**4."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        # Decimal digits, least significant first.
        lst = []
        while num >= 10:
            lst.append(num % 10)
            num = num // 10
        lst.append(num)
        c = len(lst)  # number of digits
        # The text is assembled back to front (unit before digit, low digits
        # first) and reversed once at the end.
        result = u''
        for idx, val in enumerate(lst):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # Zeros directly above a non-zero digit are read as one "零".
                if idx < c - 1 and lst[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Only 10-19 drop the leading "一" ("十五", not "一十五"). Replacing
        # every "一十" would corrupt 110 into "一百十" and 1010 into "一千零十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert num < 10**8 as <high>万<low>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = num // _S4, num % _S4
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after "万".
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert num < 10**16 as <high>亿<low>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = num // _S8, num % _S8
        if low == 0:
            return to8(high) + u'亿'
        # A low group without a ten-millions digit needs a "零" after "亿".
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)
def get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id):
    """
    Build the URL of one sub-page of a chapter.

    The first page of a chapter is the chapter URL itself; later pages insert
    ``_<sub_page_id>`` before the ".html" extension, e.g.
    https://www.mht99.com/98886/82000964.html ->
    https://www.mht99.com/98886/82000964_1.html

    :param chapter_url: chapter url
    :param sub_page_count: total sub_page count of every chapter
    :param first_sub_page_url_index: first sub_page index that carries the ``_<n>`` suffix
    :param sub_page_id: index of the requested sub-page
    :return: ``chapter_url`` unchanged when there are no sub-pages, when
        ``sub_page_id`` is below ``first_sub_page_url_index``, or when the URL
        contains no ".html"; otherwise the suffixed URL
    """
    if sub_page_count == 0 or sub_page_id < first_sub_page_url_index:
        return chapter_url
    # Rewrite only the LAST ".html": str.replace() would also rewrite an
    # earlier occurrence (for example inside the host or a directory name).
    # Anything after the extension, such as a query string, is preserved.
    head, extension, tail = chapter_url.rpartition(".html")
    if not extension:
        return chapter_url
    return head + "_" + str(sub_page_id) + extension + tail
def output(text, file_name):
    """
    Echo ``text`` on the console, then append it to a file.

    :param text: string to emit; a trailing newline is added in the file
    :param file_name: path of the UTF-8 text file to append to
    :return: None
    """
    print(text)
    line = (text, "\n")
    with open(file_name, mode='a+', encoding='utf-8') as sink:
        sink.write(line[0] + line[1])
def download_chapter(chapter_url, file_name, chapter_webdriver):
    """
    Download the text of one chapter, sub-page by sub-page.

    Reads the module-level settings ``sub_page_count``,
    ``first_sub_page_url_index`` and ``content_tag_xpath``.

    :param chapter_url: URL of the chapter's first page; a falsy value means
        the chapter has no URL
    :param file_name: file that failure notices are appended to via ``output``
    :param chapter_webdriver: Selenium WebDriver used to load the pages
    :return: "不提供下载" when ``chapter_url`` is falsy; otherwise the text of
        every successfully loaded sub-page, each preceded by a newline
    """
    if not chapter_url:
        return "不提供下载"

    page_texts = []
    # Download every sub-page of the chapter.
    for j in range(sub_page_count):
        sub_page_id = j + first_sub_page_url_index - 1
        sub_page_url = get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id)
        # print("####第"+ str(sub_page_id)+":" +sub_page_url)
        try:
            try:
                chapter_webdriver.get(sub_page_url)
            except (WebDriverException, TimeoutException):
                time.sleep(60)  # wait 60 seconds, then retry the page once
                chapter_webdriver.get(sub_page_url)
            # "xpath" is the value of selenium's By.XPATH. The generic
            # find_element() exists in Selenium 3 and 4, whereas
            # find_element_by_xpath() was removed in Selenium 4.3.
            content_tag = chapter_webdriver.find_element("xpath", content_tag_xpath)
            page_texts.append("\n" + content_tag.text)
        except Exception:
            # Best effort: note the failed page and continue with the next one.
            # Catching Exception (not a bare except) lets Ctrl+C stop the run.
            output("####第" + str(sub_page_id) + "页:" + "下载失败", file_name)
    return "".join(page_texts)
# ========================================
# 从列表页提取单章url, 然后下载单章文本
# ========================================
import time
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By

# config
file_path = r"D:\\"
story_name = "侯府小财迷_唐初八"
list_url = "https://www.x52dus.com/xuanhuan/121760/"
chinese_chapter_id_flag = False  # True: chapter titles use Chinese numerals; False: Arabic digits
sub_page_count = 2  # number of pages per chapter
first_sub_page_url_index = 1  # index of the first suffixed sub-page, e.g. https://www.mht99.com/98886/82000964_1.html
chapter_start = 1
chapter_end = 666
content_tag_xpath = '''//*[@id="BookText"]'''
need_click_show_whole_list = False
show_whole_list_link_ele_id = """yc"""
# Chapter URLs are found by partial link-text match, so chapters such as 001
# and 1001 can both match the same title. From this chapter number onwards the
# second matching URL is used.
choose_2nd_url_from_chapter_id = 1000
# URLs of the extra (appendix) chapters
appendix_chapter_urls = []

# init
num2chinese = Num2Chinese()
file_name = file_path + story_name + ".txt"
full_text = story_name
output(full_text, file_name)
start_time = datetime.now()
output("下载开始时刻:" + start_time.strftime("%c"), file_name)

web = webdriver.Chrome()  # stays on the chapter list page
web2 = None  # loads the chapter pages
try:
    web2 = webdriver.Chrome()
    web2.implicitly_wait(60)  # implicit wait of up to 60 seconds when locating elements

    # Open the list page first; the per-chapter URLs are read from it.
    web.get(list_url)
    # Click "show all" so the complete chapter list is displayed.
    if need_click_show_whole_list:
        click_tag = web.find_element(By.ID, show_whole_list_link_ele_id)
        # NOTE(review): this snippet relies on the page providing jQuery's `$`.
        web.execute_script("$(arguments[0]).click()", click_tag)

    for i in range(chapter_start, chapter_end + 1):
        # Build the chapter title to search for, e.g. "第1章".
        if chinese_chapter_id_flag:
            chinese_chapter_id = num2chinese.convert(i)  # Chinese numerals
        else:
            chinese_chapter_id = str(i)  # Arabic digits
            # For zero-padded titles such as "001、..." use
            # chinese_chapter_id.zfill(3) instead.
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # chinese_chapter_name=chinese_chapter_id+""
        # The converter may render e.g. 110 as "一百十"; normalise it to
        # "一百一十". str.replace() is a no-op when the text is absent, so no
        # guard is needed (a `.find()` result is -1, i.e. truthy, on a miss).
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Look up the chapter URL on the list page.
        chapter_url = ""
        try:
            # find_elements(By..., ...) works on Selenium 3 and 4; the
            # find_elements_by_* helpers were removed in Selenium 4.3.
            chapter_url_tags = web.find_elements(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            chapter_urls = [url_tag.get_attribute("href") for url_tag in chapter_url_tags]
            if not chapter_urls:
                chapter_url = ""
            elif i < choose_2nd_url_from_chapter_id or len(chapter_urls) == 1:
                chapter_url = chapter_urls[0]
            else:
                chapter_url = chapter_urls[1]
        except Exception as e:
            # Best effort: report the problem and treat the chapter as missing.
            print(e)
            chapter_url = ""

        # Download the chapter and write it out.
        chapter_content = download_chapter(chapter_url, file_name, web2)
        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text

    # Download the appendix chapters.
    for i, chapter_url in enumerate(appendix_chapter_urls, start=1):
        chinese_chapter_name = "番外:" + str(i)
        print(chinese_chapter_name)
        chapter_content = download_chapter(chapter_url, file_name, web2)
        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text
finally:
    # quit() ends each driver session (browser and chromedriver process),
    # also when the download is interrupted by an error.
    web.quit()
    if web2 is not None:
        web2.quit()

# print(full_text)
output("\n\n\n======================\n", file_name)
end_time = datetime.now()
output("下载结束时刻:" + end_time.strftime("%c"), file_name)
total_seconds = (end_time - start_time).total_seconds()
output("下载耗时:" + str(total_seconds) + "秒", file_name)
================================
正文内容的 xpath 不固定, 需要根据章节 url 中的页面 id 动态生成
================================
#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is mapped to its Chinese digit
    independently, so no positional units are produced: 10 becomes "一零",
    not "十".

    :param num: value whose string form consists only of the digits 0-9
    :return: the digit-by-digit Chinese string
    :raises KeyError: if ``str(num)`` contains a character other than 0-9
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))
#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name: num2chinese
# Author: yunhgu
# Date: 2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------
import numbers

_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)  # unit for each position inside a 4-digit group
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999  # inclusive bounds accepted by Num2Chinese


class NotIntegerError(Exception):
    """Raised when the value to convert is not an integer."""


class OutOfRangeError(Exception):
    """Raised when the integer to convert lies outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        :param number: integer in [_MIN, _MAX]
        :return: chinese number
        :raises NotIntegerError: if ``number`` is not an integer
        :raises OutOfRangeError: if ``number`` is outside [_MIN, _MAX]
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # bool is a subclass of int but is not a meaningful number here.
        if isinstance(num, bool) or not isinstance(num, numbers.Integral):
            raise NotIntegerError(u'%s is not a integer.' % num)
        # Negative integers are reported as out of range; a digits-only string
        # check would misreport them as "not an integer".
        if num < _MIN or num > _MAX:
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert 0 <= num < 10**4."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        # Decimal digits, least significant first.
        lst = []
        while num >= 10:
            lst.append(num % 10)
            num = num // 10
        lst.append(num)
        c = len(lst)  # number of digits
        # The text is assembled back to front (unit before digit, low digits
        # first) and reversed once at the end.
        result = u''
        for idx, val in enumerate(lst):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # Zeros directly above a non-zero digit are read as one "零".
                if idx < c - 1 and lst[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Only 10-19 drop the leading "一" ("十五", not "一十五"). Replacing
        # every "一十" would corrupt 110 into "一百十" and 1010 into "一千零十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert num < 10**8 as <high>万<low>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = num // _S4, num % _S4
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after "万".
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert num < 10**16 as <high>亿<low>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = num // _S8, num % _S8
        if low == 0:
            return to8(high) + u'亿'
        # A low group without a ten-millions digit needs a "零" after "亿".
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)
def get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id):
    """
    Build the URL of one sub-page of a chapter.

    The first page of a chapter is the chapter URL itself; later pages insert
    ``_<sub_page_id>`` before the ".html" extension, e.g.
    https://www.mht99.com/98886/82000964.html ->
    https://www.mht99.com/98886/82000964_1.html

    :param chapter_url: chapter url
    :param sub_page_count: total sub_page count of every chapter
    :param first_sub_page_url_index: first sub_page index that carries the ``_<n>`` suffix
    :param sub_page_id: index of the requested sub-page
    :return: ``chapter_url`` unchanged when there are no sub-pages, when
        ``sub_page_id`` is below ``first_sub_page_url_index``, or when the URL
        contains no ".html"; otherwise the suffixed URL
    """
    if sub_page_count == 0 or sub_page_id < first_sub_page_url_index:
        return chapter_url
    # Rewrite only the LAST ".html": str.replace() would also rewrite an
    # earlier occurrence (for example inside the host or a directory name).
    # Anything after the extension, such as a query string, is preserved.
    head, extension, tail = chapter_url.rpartition(".html")
    if not extension:
        return chapter_url
    return head + "_" + str(sub_page_id) + extension + tail
def output(text, file_name):
    """
    Echo ``text`` on the console, then append it to a file.

    :param text: string to emit; a trailing newline is added in the file
    :param file_name: path of the UTF-8 text file to append to
    :return: None
    """
    print(text)
    line = (text, "\n")
    with open(file_name, mode='a+', encoding='utf-8') as sink:
        sink.write(line[0] + line[1])
def download_chapter(chapter_url, file_name, chapter_webdriver, chinese_chapter_id=""):
    """
    Download the text of one chapter, sub-page by sub-page.

    The id of the element holding the text is not fixed on this site: it is
    "con" followed by the page id taken from the chapter URL, e.g.
    https://www.kubiji.org/255565/4076023.html -> //*[@id="con4076023"]

    Reads the module-level settings ``sub_page_count`` and
    ``first_sub_page_url_index``.

    :param chapter_url: URL of the chapter's first page; a falsy value means
        the chapter has no URL
    :param file_name: file that failure notices are appended to via ``output``
    :param chapter_webdriver: Selenium WebDriver used to load the pages
    :param chinese_chapter_id: accepted for callers that pass it; not used here
    :return: "不提供下载" when ``chapter_url`` is falsy; otherwise the text of
        every successfully loaded sub-page, each preceded by a newline
    """
    # Check the URL before parsing it, so a missing URL (including None) is
    # reported as "not available" instead of failing inside split().
    if not chapter_url:
        return "不提供下载"

    page_id = chapter_url.split("/")[-1].split(".")[0]
    content_tag_xpath = '//*[@id="con' + page_id + '"]'

    page_texts = []
    # Download every sub-page of the chapter.
    for j in range(sub_page_count):
        sub_page_id = j + first_sub_page_url_index - 1
        sub_page_url = get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id)
        # print("####第"+ str(sub_page_id)+":" +sub_page_url)
        try:
            try:
                chapter_webdriver.get(sub_page_url)
            except (WebDriverException, TimeoutException):
                time.sleep(60)  # wait 60 seconds, then retry the page once
                chapter_webdriver.get(sub_page_url)
            # "xpath" is the value of selenium's By.XPATH. The generic
            # find_element() exists in Selenium 3 and 4, whereas
            # find_element_by_xpath() was removed in Selenium 4.3.
            content_tag = chapter_webdriver.find_element("xpath", content_tag_xpath)
            page_texts.append("\n" + content_tag.text)
        except Exception:
            # Best effort: note the failed page and continue with the next one.
            # Catching Exception (not a bare except) lets Ctrl+C stop the run.
            output("####第" + str(sub_page_id) + "页:" + "下载失败", file_name)
    return "".join(page_texts)
# ========================================
# 从列表页提取单章url, 然后下载单章文本
# ========================================
import time
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By

# config
file_path = r"D:\\"
story_name = "穿越之农家有女"
list_url = "https://www.kubiji.org/165779/"
chinese_chapter_id_flag = True  # True: chapter titles use Chinese numerals; False: Arabic digits
sub_page_count = 1  # number of pages per chapter
first_sub_page_url_index = 1  # index of the first suffixed sub-page, e.g. https://www.mht99.com/98886/82000964_1.html
chapter_start = 1
chapter_end = 507
# Not read by this version's download_chapter, which assigns its own
# content_tag_xpath derived from the chapter URL.
content_tag_xpath = '''//*[@id="con3636511"]'''
need_click_show_whole_list = False
show_whole_list_link_ele_id = """yc"""
# Chapter URLs are found by partial link-text match, so chapters such as 001
# and 1001 can both match the same title. From this chapter number onwards the
# second matching URL is used.
choose_2nd_url_from_chapter_id = 1000
# URLs of the extra (appendix) chapters
appendix_chapter_urls = [
    """https://www.kubiji.org/165779/4564577.html""",
    """https://www.kubiji.org/165779/4565113.html""",
    """https://www.kubiji.org/165779/4565541.html""",
    """https://www.kubiji.org/165779/4566038.html""",
    """https://www.kubiji.org/165779/4566433.html""",
    """https://www.kubiji.org/165779/4566834.html""",
    """https://www.kubiji.org/165779/4567957.html""",
    """https://www.kubiji.org/165779/4568025.html""",
    """https://www.kubiji.org/165779/4568840.html""",
    """https://www.kubiji.org/165779/4569742.html""",
    """https://www.kubiji.org/165779/4570705.html""",
    """https://www.kubiji.org/165779/4571203.html""",
    """https://www.kubiji.org/165779/4571557.html""",
    """https://www.kubiji.org/165779/4571990.html""",
    """https://www.kubiji.org/165779/4572324.html""",
    """https://www.kubiji.org/165779/4572716.html""",
    """https://www.kubiji.org/165779/4573193.html""",
    """https://www.kubiji.org/165779/4574522.html""",
    """https://www.kubiji.org/165779/4575255.html""",
    """https://www.kubiji.org/165779/4622983.html""",
]
# Alternative: generate the URLs when the page ids are consecutive, e.g.
# for i in range(0, 971):
#     # https://m.xinqingdou.net/84964/373110.html
#     appendix_chapter_urls.append("https://m.xinqingdou.net/84964/"+str(373110+i)+".html")
#     # appendix_chapter_urls.append("https://m.quyasw.com/yuedu/527u/" + str(i) + ".html?page=2")

# init
num2chinese = Num2Chinese()
file_name = file_path + story_name + ".txt"
full_text = story_name
output(full_text, file_name)
start_time = datetime.now()
output("下载开始时刻:" + start_time.strftime("%c"), file_name)

web = webdriver.Chrome()  # stays on the chapter list page
web2 = None  # loads the chapter pages
try:
    web2 = webdriver.Chrome()
    web2.implicitly_wait(60)  # implicit wait of up to 60 seconds when locating elements

    # Open the list page first; the per-chapter URLs are read from it.
    web.get(list_url)
    # Click "show all" so the complete chapter list is displayed.
    if need_click_show_whole_list:
        click_tag = web.find_element(By.ID, show_whole_list_link_ele_id)
        # NOTE(review): this snippet relies on the page providing jQuery's `$`.
        web.execute_script("$(arguments[0]).click()", click_tag)

    for i in range(chapter_start, chapter_end + 1):
        # Build the chapter title to search for, e.g. "第一章".
        if chinese_chapter_id_flag:
            chinese_chapter_id = num2chinese.convert(i)  # Chinese numerals
        else:
            chinese_chapter_id = str(i)  # Arabic digits
            # For zero-padded titles such as "001、..." use
            # chinese_chapter_id.zfill(3) instead.
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # chinese_chapter_name=chinese_chapter_id+""
        # The converter may render e.g. 110 as "一百十"; normalise it to
        # "一百一十". str.replace() is a no-op when the text is absent, so no
        # guard is needed (a `.find()` result is -1, i.e. truthy, on a miss).
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Look up the chapter URL on the list page.
        chapter_url = ""
        try:
            # find_elements(By..., ...) works on Selenium 3 and 4; the
            # find_elements_by_* helpers were removed in Selenium 4.3.
            chapter_url_tags = web.find_elements(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            chapter_urls = [url_tag.get_attribute("href") for url_tag in chapter_url_tags]
            if not chapter_urls:
                chapter_url = ""
            elif i < choose_2nd_url_from_chapter_id or len(chapter_urls) == 1:
                chapter_url = chapter_urls[0]
            else:
                chapter_url = chapter_urls[1]
        except Exception as e:
            # Best effort: report the problem and treat the chapter as missing.
            print(e)
            chapter_url = ""

        # Download the chapter and write it out.
        chapter_content = download_chapter(chapter_url, file_name, web2, chinese_chapter_id)
        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text

    # Download the appendix chapters.
    for i, chapter_url in enumerate(appendix_chapter_urls, start=1):
        chinese_chapter_name = "番外:" + str(i)
        print(chinese_chapter_name)
        chapter_content = download_chapter(chapter_url, file_name, web2)
        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text
finally:
    # quit() ends each driver session (browser and chromedriver process),
    # also when the download is interrupted by an error.
    web.quit()
    if web2 is not None:
        web2.quit()

# print(full_text)
output("\n\n\n======================\n", file_name)
end_time = datetime.now()
output("下载结束时刻:" + end_time.strftime("%c"), file_name)
total_seconds = (end_time - start_time).total_seconds()
output("下载耗时:" + str(total_seconds) + "秒", file_name)