当前位置 : 主页 > 编程语言 > python >

Python爬取新浪新闻数据写入Excel

来源:互联网 收集:自由互联 发布时间:2022-07-22
先爬取最新消息列表,再循环爬取对应url中的新闻详细数据 # -*- coding: utf-8 -*- """ Spyder Editor news.py. """ import requests from bs4 import BeautifulSoup from datetime import datetime import json import xlwt def ge


先爬取最新消息列表,再循环爬取每条新闻对应 URL 中的详细数据。

Python爬取新浪新闻数据写入Excel_主函数

# -*- coding: utf-8 -*-
"""
Spyder Editor

news.py.
"""


import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import xlwt

def get_list(url, timeout=10):
    """Fetch a news listing page and collect the headline links on it.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` may block indefinitely on a stalled server.

    Returns:
        A list of dicts, one per ``.news-item`` element that contains an
        ``h2 a`` link, each with the keys ``'title'`` (link text) and
        ``'url'`` (the link's ``href`` attribute, ``None`` if absent).

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    # Download the listing page and force UTF-8 decoding.
    res = requests.get(url, timeout=timeout)
    res.encoding = 'utf-8'

    # Parse the complete HTML document.
    html = BeautifulSoup(res.text, 'html.parser')

    newList = []
    for item in html.select('.news-item'):
        links = item.select('h2 a')
        if not links:
            # An item without a headline link is reported and skipped. This
            # replaces a bare ``except`` that also hid unrelated errors.
            print('出现异常')
            continue
        link = links[0]
        newList.append({'title': link.text, 'url': link.get('href')})
    return newList


def get_detail(url, timeout=10):
    """Fetch a single news article page and extract its fields.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` may block indefinitely on a stalled server.

    Returns:
        A dict with the keys ``'title'``, ``'createtime'`` (publication
        date formatted as ``'YYYY-MM-DD'``), ``'place'`` (text of the first
        link inside ``.date-source``), ``'article'`` (paragraph texts joined
        with single spaces), ``'author'`` and ``'url'``.

    Raises:
        IndexError: If one of the expected elements is missing from the page.
        ValueError: If the publication time is not in the
            ``'%Y年%m月%d日 %H:%M'`` format.
        requests.RequestException: If the page cannot be downloaded.
    """
    # Download the article page and force UTF-8 decoding.
    res = requests.get(url, timeout=timeout)
    res.encoding = 'utf-8'

    # Parse the complete HTML document.
    html = BeautifulSoup(res.text, 'html.parser')

    result = {}

    # Headline.
    result['title'] = html.select('.main-title')[0].text

    # Publication time. The formatted string is stored; previously the
    # strftime() result was discarded and the raw datetime object was kept.
    timesource = html.select('.date-source span')[0].text.strip()
    createtime = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    result['createtime'] = createtime.strftime('%Y-%m-%d')

    # News source.
    result['place'] = html.select('.date-source a')[0].text

    # Body text. The last <p> inside #article is deliberately skipped
    # (presumably a trailing non-content line; confirm against the page).
    paragraphs = [p.text.strip() for p in html.select('#article p')[:-1]]
    result['article'] = ' '.join(paragraphs)

    # Editor. Remove the "责任编辑" label as a prefix. str.strip(chars) treats
    # its argument as a character set, so it could also eat characters that
    # belong to the name, and it did nothing when whitespace surrounded the
    # text. Either an ASCII or a full-width colon after the label is accepted.
    author = html.select('.show_author')[0].text.strip()
    label = '责任编辑'
    if author.startswith(label):
        author = author[len(label):].lstrip(':: \t')
    result['author'] = author

    # Link to the article itself.
    result['url'] = url

    return result




if __name__ == "__main__":  # script entry point
    # Collect the headline links from the listing page.
    newList = get_list('https://news.sina.com.cn/world/')

    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('ke_qq')

    # Header row.
    head = ['标题', '时间', '作者', '链接', '来源', '内容']
    for col, caption in enumerate(head):
        sheet.write(0, col, caption)

    # Keys of the detail dict, in the same order as the header columns.
    columns = ('title', 'createtime', 'author', 'url', 'place', 'article')

    for i, item in enumerate(newList):
        # Best effort: one failing article must not abort the whole export.
        # Row i + 1 stays reserved for item i, so a failure leaves that row
        # empty or partially filled.
        try:
            newObj = get_detail(item['url'])
            for col, key in enumerate(columns):
                sheet.write(i + 1, col, newObj[key])
            print(str(i), '写入成功')
        except Exception as exc:
            # Report the reason rather than hiding it. Unlike a bare
            # ``except``, this lets Ctrl-C (KeyboardInterrupt) through.
            print(str(i), '出现异常', exc)

    # Raw string: same path as before, without the invalid "\k" escape.
    book.save(r'F:\ke.xls')

Python爬取新浪新闻数据写入Excel_html_02

网友评论