def crawl(url):
    """Fetch a CSDN article page and save its title/body to local files.

    Parameters
    ----------
    url : str
        Address of the article to download.

    Side effects: prints progress, and on HTTP 200 writes the article to
    ./output via save(). On any other status code it only prints "failed!".
    """
    # Browser-like User-Agent header to get past trivial anti-crawler checks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    print("crawl...")
    # timeout added: requests.get without one can hang indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Only proceed on HTTP 200.
    if response.status_code == 200:
        html = response.content.decode("utf8")
        tree = etree.HTML(html)
        print("look for text...")
        # Locate the title text and the article-body element on the page.
        title = tree.xpath('//*[@id="articleContentId"]/text()')[0]
        block = tree.xpath('//*[@id="content_views"]')
        # Raw HTML of the body, with entities unescaped.
        ohtml = unescape(etree.tostring(block[0]).decode("utf8"))
        # Plain-text rendering of the body.
        text = block[0].xpath('string(.)').strip()
        print("title:", title)
        # BUG FIX: the original called save(ohtml, text) while save() read an
        # undefined global `title` -> NameError on every successful crawl.
        # Pass the title explicitly instead.
        save(ohtml, text, title)
        print("finish!")
    else:
        print("failed!")


def save(html, text, title="article"):
    """Write the article as HTML, plain text and Markdown under ./output.

    Parameters
    ----------
    html : str
        Raw HTML of the article body.
    text : str
        Plain-text version of the article body.
    title : str
        Base name for the output files. Defaults to "article" so the old
        two-argument call still works (backward compatible).
    """
    # BUG FIX: the original checked `"output" in os.listdir()` and only then
    # created the subdirectories — if ./output existed but a subdirectory did
    # not, every open() below failed. makedirs with exist_ok is race-free and
    # idempotent.
    os.makedirs("output/html", exist_ok=True)
    os.makedirs("output/text", exist_ok=True)
    os.makedirs("output/markdown", exist_ok=True)
    with open(f"output/html/{title}.html", 'w', encoding='utf8') as html_file:
        # Save raw HTML.
        print("write html...")
        html_file.write(html)
    with open(f"output/text/{title}.txt", 'w', encoding='utf8') as txt_file:
        # Save plain text.
        print("write text...")
        txt_file.write(text)
    with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
        # Convert the HTML to Markdown and save it.
        print("write markdown...")
        text_maker = HTML2Text()
        md_text = text_maker.handle(html)
        md_file.write(md_text)
if __name__ == '__main__':
    # URL of the article you want to crawl — replace this placeholder
    # with a real article address before running.
    url = "url"
    crawl(url)
# 感谢大佬帮助 (Thanks to the expert for the help)
# 搞定 (Done)
# 快乐 (Happy)
# \( ̄︶ ̄*\))
# 睡觉。 (Going to sleep.)
# ( ̄o ̄) . z Z