当前位置 : 主页 > 编程语言 > python >

【python】爬取CSDN博客文章(保存为html,txt,md)

来源:互联网 收集:自由互联 发布时间:2022-07-02
def crawl(url): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36", } print("crawl...") # 配置header破反爬 response = requests.get(url, headers
# NOTE(review): this script relies on `os`, `requests`, `etree` (lxml),
# `unescape` (html) and `HTML2Text` (html2text) being imported; the import
# lines are not part of the visible snippet -- confirm they exist.

# Characters that are illegal (or awkward) in file names on common platforms.
_INVALID_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
)


def _safe_filename(title):
    """Return *title* turned into a string that is safe to use as a file name."""
    cleaned = str(title).strip().translate(_INVALID_FILENAME_CHARS).strip(" .")
    return cleaned or "untitled"


def crawl(url, timeout=10):
    """Download one blog article and save it as HTML, plain text and Markdown.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    The function reports progress with ``print`` and returns ``None``.
    It prints ``failed!`` when the response status is not 200 or when the
    expected title/content elements are not present in the page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    print("crawl...")
    # Send a browser-like User-Agent header to get past the anti-crawler check.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Only continue on HTTP 200.
    if response.status_code != 200:
        print("failed!")
        return

    html = response.content.decode("utf8")
    tree = etree.HTML(html)
    print("look for text...")
    # Locate the HTML nodes we need: the title text and the article body.
    titles = tree.xpath('//*[@id="articleContentId"]/text()')
    blocks = tree.xpath('//*[@id="content_views"]')
    if not titles or not blocks:
        # The page does not have the expected structure; indexing [0] would
        # raise IndexError, so report the failure instead.
        print("failed!")
        return

    title = titles[0]
    # Article body as HTML markup.
    ohtml = unescape(etree.tostring(blocks[0]).decode("utf8"))
    # Article body as plain text.
    text = blocks[0].xpath('string(.)').strip()
    print("title:", title)
    # Pass the title explicitly: save() cannot see crawl()'s local variables.
    save(ohtml, text, title)
    # Done!
    print("finish!")


def save(html, text, title="untitled"):
    """Write the article to ``output/html``, ``output/text`` and ``output/markdown``.

    Args:
        html: Article body as HTML markup.
        text: Article body as plain text.
        title: Article title; used (after sanitising) as the file base name.
    """
    # Create each output folder if missing; exist_ok also covers the case
    # where "output" exists but one of its sub-folders does not.
    for sub in ("html", "text", "markdown"):
        os.makedirs(os.path.join("output", sub), exist_ok=True)

    name = _safe_filename(title)

    with open(f"output/html/{name}.html", 'w', encoding='utf8') as html_file:
        # Save the HTML version.
        print("write html...")
        html_file.write(html)

    with open(f"output/text/{name}.txt", 'w', encoding='utf8') as txt_file:
        # Save the plain-text version.
        print("write text...")
        txt_file.write(text)

    with open(f"output/markdown/{name}.md", 'w', encoding='utf8') as md_file:
        # Save the Markdown version.
        print("write markdown...")
        text_maker = HTML2Text()
        # Convert the HTML markup to Markdown.
        md_text = text_maker.handle(html)
        md_file.write(md_text)


if __name__ == '__main__':
    # URL of the article you want to crawl.
    url = "url"
    crawl(url)

感谢大佬帮助

搞定

快乐

\( ̄︶ ̄*\))

【python】爬取CSDN博客文章(保存为html,txt,md)

睡觉。

( ̄o ̄) . z Z

上一篇:pycharm有用快捷键
下一篇:没有了
网友评论