评论链接 放入excel(negkey)中的内容 1 # -*- coding: utf-8 -*- 2 """ 3 Created on Thu Aug 16 14:52:09 2019 4 需要爬虫服务可以联系Q: 2960389193 6 """ 7 # 好评链接:http://www.dianping.com/shop/2044996/review_all/p2?q
评论链接
放入excel(negkey)中的内容 1 # -*- coding: utf-8 -*-
2 """ 3 Created on Thu Aug 16 14:52:09 2019 4 需要爬虫服务可以联系Q: 2960389193
抓取结果展示 爬一次IP就被封了 o(╥﹏╥)o
6 """ 7 #好评链接:http://www.dianping.com/shop/2044996/review_all/p2?queryType=reviewGrade&queryVal=good 8 #差评链接:http://www.dianping.com/shop/2044996/review_all/p2?queryType=reviewGrade&queryVal=bad
import random
import time

import requests
from lxml import etree
import xlrd

# Every cleaned review string scraped during this run is appended here.
data = []

# Headers sent with each page request. The Cookie value is placeholder text
# ("cookie obtained from the browser"); presumably it has to be replaced with
# a real session cookie before the site returns review pages.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'Cookie': '浏览器获取的cookie',
}

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 10

# Workbook listing what to scrape: column 0 holds the shop id, column 1 the
# number of review pages to fetch for that shop.
EXCEL_PATH = 'D:/anaconda/shirleylearn/dazhongdianping/negkey.xlsx'


def _clean_review_text(node, extra_junk=()):
    """Return the text content of *node* with layout characters removed.

    Tabs, newlines and spaces are deleted first, then each string in
    *extra_junk*, in order. The result is round-tripped through GBK with
    errors ignored, which drops every character GBK cannot represent.
    """
    # str.replace is used rather than split(): split() would also discard
    # \xa0, \t and \n, but it breaks the sentence into a list, which is
    # inconvenient for later processing.
    text = node.xpath('string(.)')
    for junk in ("\t", "\n", " ") + tuple(extra_junk):
        text = text.replace(junk, "")
    return text.encode('gbk', 'ignore').decode('gbk')


def Comments(url):
    """Fetch one review page and append its cleaned reviews to ``data``.

    Long reviews (``div.review-words Hide``) are appended first, followed by
    short reviews (``div.review-words``). Nothing is returned.

    Raises whatever ``requests.get`` raises, including a timeout after
    ``REQUEST_TIMEOUT`` seconds.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # Debugging hint: resp.content is bytes, so decode it as UTF-8 before
    # printing if the raw page needs to be inspected.
    page = etree.HTML(resp.text, parser=etree.HTMLParser(encoding='utf-8'))
    long_reviews = page.xpath('//div[@class="review-words Hide"]')
    short_reviews = page.xpath('//div[@class="review-words"]')

    # Long reviews additionally carry a "collapse review" link label that is
    # stripped after the whitespace has been removed.
    data.extend(_clean_review_text(node, ("收起评论",)) for node in long_reviews)
    data.extend(_clean_review_text(node) for node in short_reviews)


def getComments(path=EXCEL_PATH, grade='bad'):
    """Scrape the review pages for every shop listed in the workbook.

    Args:
        path: Workbook to read; its sheet ``Sheet1`` must hold the shop id in
            column 0 and the page count in column 1.
        grade: Value of the ``queryVal`` URL parameter; ``'bad'`` selects
            negative reviews and ``'good'`` positive ones.

    NOTE(review): xlrd 2.0 and later read only ``.xls`` files; opening this
    ``.xlsx`` workbook needs an older xlrd - confirm the installed version.
    """
    sheet = xlrd.open_workbook(path).sheet_by_name('Sheet1')
    for row_index in range(sheet.nrows):
        row = sheet.row(row_index)
        shop_id = int(row[0].value)
        page_count = int(row[1].value)
        for page in range(1, page_count + 1):
            url = 'http://www.dianping.com/shop/%d/review_all/p%d?queryType=reviewGrade&queryVal=%s' % (
                shop_id, page, grade)
            Comments(url)
            # Random pause of up to one second between requests.
            time.sleep(random.random())


if __name__ == "__main__":
    getComments()
    print(len(data))
    # Write the reviews to a text file; use pos.txt when scraping positive
    # reviews.
    # NOTE(review): reviews are written back to back with no separator, and
    # the file uses the platform's default encoding - confirm both are
    # intended.
    with open('neg.txt', 'w') as f:
        f.writelines(data)