Method 1: procedural scraping 1.0
import json

import requests
from lxml import etree

url = "http://www.lovehhy.net/joke/Detail/QSBK"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

response = requests.get(url, headers=headers)
# print(response.text)
dom = etree.HTML(response.text)

# Each post lives in one div.cat_llb block: the <h3> holds the title link,
# div#endtext holds the body, and a bare text node at the end of the div
# holds the post time and click count packed into a single string.
titles = dom.xpath('//div[@class="cat_llb"]/h3')
contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
times = dom.xpath('//div[@class="cat_llb"]')
# print(times)

title_list = []
for title in titles:
    title_list.append(title.xpath('./a/text()')[0])

content_list = []
for content in contents:
    content_list.append(content.xpath('.//text()')[0])

time_list = []
for block in times:  # named "block", not "time", to avoid shadowing the time module
    for i in block.xpath('./text()'):
        # print(len(i))  # debug
        time_list.append(i)

zip_item = zip(title_list, content_list, time_list)
with open('content.json', 'w', encoding='utf-8') as obj_f:
    for i in zip_item:
        item = {}
        item['title'] = i[0]
        item['content'] = i[1]
        # The trailing text node packs the timestamp (first 22 chars)
        # and the click count (the rest) into one string.
        item['time'] = i[2][0:22]
        item['click'] = i[2][22:-1]
        obj_f.write(json.dumps(item, ensure_ascii=False) + ',\n')
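Note that content.json is not a single valid JSON document: each line is one JSON object followed by a trailing comma. A minimal read-back sketch, assuming exactly the format written above:

import json

# content.json holds one object per line ending in ",", so strip the
# newline and the trailing comma before handing each line to json.loads
with open('content.json', encoding='utf-8') as f:
    items = [json.loads(line.rstrip().rstrip(',')) for line in f if line.strip()]
print(len(items))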
Method 2: procedural scraping 2.0 (wrapped in functions)
import time
import json

import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.url)
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    dom = etree.HTML(html)
    title_list = []
    for title in dom.xpath('//div[@class="cat_llb"]/h3'):
        title_list.append(title.xpath('./a/text()')[0])
    content_list = []
    for content in dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]'):
        content_list.append(content.xpath('.//text()')[0])
    time_list = []
    for block in dom.xpath('//div[@class="cat_llb"]'):  # avoid shadowing the time module
        for i in block.xpath('./text()'):
            time_list.append(i)
    for i in zip(title_list, content_list, time_list):
        item = {}
        item['title'] = i[0]
        item['content'] = i[1].strip()
        item['time'] = i[2][0:22].strip()
        item['click'] = i[2][22:-1].strip()
        print(item)
        yield item


def write_to_file(content):
    with open('result.json', 'a', encoding='utf-8') as f:
        # print(type(json.dumps(content)))  # debug
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    url = "http://www.lovehhy.net/joke/Detail/QSBK/" + str(start)
    html = get_one_page(url)
    if html is None:  # skip the page if the request failed
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == "__main__":
    for i in range(0, 5):
        main(start=i)
        time.sleep(1)  # be polite: pause between page requests
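One fragility shared by all three versions: titles, bodies, and timestamps are collected into three parallel lists and zipped together, so a single block missing a field silently misaligns every row after it. A sketch of a per-block variant of parse_one_page (the helper name is hypothetical; the XPath and slicing assumptions are the same as above):

from lxml import etree


def parse_blocks(html):
    """Hypothetical per-block variant: fields are paired inside one div,
    so an incomplete post is skipped instead of shifting the zipped lists."""
    dom = etree.HTML(html)
    for block in dom.xpath('//div[@class="cat_llb"]'):
        title = block.xpath('./h3/a/text()')
        body = block.xpath('./div[@id="endtext"]//text()')
        tail = [t for t in block.xpath('./text()') if t.strip()]
        if not (title and body and tail):
            continue  # incomplete block: skip rather than misalign
        yield {
            'title': title[0],
            'content': body[0].strip(),
            'time': tail[0][0:22].strip(),   # same slicing as the original
            'click': tail[0][22:-1].strip(),
        }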
Method 3: object-oriented scraping
import time
import json

import requests
from lxml import etree
from requests.exceptions import RequestException


class ChouShiBaiKe:
    def __init__(self):
        self.url = "http://www.lovehhy.net/joke/Detail/QSBK/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }

    def get_one_page(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        dom = etree.HTML(html)
        titles = dom.xpath('//div[@class="cat_llb"]/h3')
        contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
        times = dom.xpath('//div[@class="cat_llb"]')

        title_list = []
        for title in titles:
            title_list.append(title.xpath('./a/text()')[0])
        content_list = []
        for content in contents:
            content_list.append(content.xpath('.//text()')[0])
        time_list = []
        for block in times:  # avoid shadowing the time module
            for i in block.xpath('./text()'):
                time_list.append(i)

        for i in zip(title_list, content_list, time_list):
            item = {}
            item['title'] = i[0]
            item['content'] = i[1].strip()
            item['time'] = i[2][0:22].strip()
            item['click'] = i[2][22:-1].strip()
            print(item)
            yield item

    def save_file(self, content):
        with open('result_class.json', 'a', encoding='utf-8') as f:
            # print(type(json.dumps(content)))
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def run(self, start):
        url = self.url + str(start)  # reuse the base URL set in __init__
        html = self.get_one_page(url)
        if html is None:  # skip the page if the request failed
            return
        for item in self.process_data(html):
            self.save_file(item)


if __name__ == "__main__":
    qsbk = ChouShiBaiKe()
    for i in range(0, 5):
        qsbk.run(start=i)
        time.sleep(1)  # pause between page requests
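One optional refinement: requests.Session keeps the TCP connection alive across the five page requests instead of reconnecting each time, and carries the headers automatically. A sketch of a session-backed variant, reusing the class and imports above (the subclass name is hypothetical):

class SessionChouShiBaiKe(ChouShiBaiKe):
    """Hypothetical variant: one requests.Session shared across pages."""

    def __init__(self):
        super().__init__()
        self.session = requests.Session()
        self.session.headers.update(self.headers)  # headers sent on every request

    def get_one_page(self, url):
        try:
            response = self.session.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None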