# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup


class dbxs:

    def __init__(self):
        self.pageIndex = 0
        self.enable = True
        self.file = None
        self.content = []

    # Fetch the HTML content of one page
    def getPage(self, pageIndex):
        try:
            # Optionally route the request through a proxy IP
            enable_proxy = True
            # The protocol key must be lowercase ('http'/'https'), otherwise the
            # proxy handler is never applied; the target URL here uses https
            proxy_handler = urllib2.ProxyHandler({'http': '113.118.170.230:808'})
            null_proxy_handler = urllib2.ProxyHandler({})
            if enable_proxy:
                opener = urllib2.build_opener(proxy_handler)
            else:
                opener = urllib2.build_opener(null_proxy_handler)
            urllib2.install_opener(opener)
            # Build the URL for this page of results
            url = 'https://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book' + "?start=" + str(pageIndex)
            # Set a User-Agent header to mimic a browser
            my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0)'}
            request = urllib2.Request(url, headers=my_headers)
            response = urllib2.urlopen(request)
            return response.read()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print e.code
            if hasattr(e, "reason"):
                print e.reason
            return None

    # Parse one page and extract each novel's title, description and rating
    def getContent(self, pageIndex, content):
        pageCode = self.getPage(pageIndex)
        if pageCode is None:
            # The request failed, so there is nothing to parse; stop crawling
            self.enable = False
            return content
        soup = BeautifulSoup(pageCode, 'html.parser')
        # Every <dd> tag on the page holds one novel's information
        contents = soup.find_all('dd')

        if contents:
            for item in contents:
                title = item.find(class_='title').string.encode('utf-8')
                info = item.find(class_='desc').string.strip().encode('utf-8')
                rate = item.find(class_='rating_nums')
                # Some novels on a page have no rating; checking rate first
                # avoids an AttributeError on rate.string
                if rate:
                    rates = rate.string.encode('utf-8')
                    content.append([title, info, rates])
                else:
                    content.append([title, info])
        # If the page contains no <dd> tags, we have reached the last page
        else:
            print "All pages have been loaded"
            self.enable = False

        return content

    # Write the accumulated results to a text file
    def writeData(self, content):
        # Open (and truncate) the file outside the for loop, otherwise each
        # write would overwrite the data written before it
        self.file = open("bdxs.txt", "w+")
        for item in content:
            if len(item) == 3:
                self.file.write(item[0] + "\n")
                self.file.write(item[1] + "\n")
                self.file.write("Rating: " + item[2] + "\n\n")
            else:
                self.file.write(item[0] + "\n")
                self.file.write(item[1] + "\n")
                self.file.write("========================================\n\n")
        self.file.close()

    # Entry point: fetch, parse and write page after page
    def start(self):
        x = 1
        while self.enable:
            content = self.getContent(self.pageIndex, self.content)
            if self.enable:
                print "Writing page %s..." % x
                self.writeData(content)
                self.pageIndex += 15
                x += 1


DBXS = dbxs()
DBXS.start()

I have not fully digested this code yet. For example, once each page's novel information has been written, how do I append the page number after it? I will keep improving it later.
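One possible way to do that, sketched below as a rough idea rather than a finished change: have getContent return only the items found on the current page, collect (page number, items) pairs in start(), and write a page header before each group. The writeDataByPage method and the pages list are illustrative names, not part of the original script.

    # A minimal sketch, not the original design: 'pages' is assumed to be a
    # list of (page_number, items) pairs collected in start()
    def writeDataByPage(self, pages):
        self.file = open("bdxs.txt", "w+")
        for page_number, items in pages:
            # Mark where this page's novels begin in the output file
            self.file.write("---------- Page %d ----------\n\n" % page_number)
            for item in items:
                self.file.write(item[0] + "\n")
                self.file.write(item[1] + "\n")
                if len(item) == 3:
                    self.file.write("Rating: " + item[2] + "\n\n")
                else:
                    self.file.write("\n")
        self.file.close()

With that shape, the loop in start() would append (x, page_items) to a pages list on every iteration and call writeDataByPage(pages) instead of writeData(content), so each block in bdxs.txt carries its page number.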