
[Newbie Learns Python] Scraping Qiushibaike with Procedural and Object-Oriented Approaches

Method 1: Procedural crawl, v1.0
import json
import requests
from lxml import etree


url = "http://www.lovehhy.net/joke/Detail/QSBK"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

response = requests.get(url, headers=headers)
# print(response.text)

dom = etree.HTML(response.text)
titles = dom.xpath('//div[@class="cat_llb"]/h3')
contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
times = dom.xpath('//div[@class="cat_llb"]')
# print(times)

# Each <h3> holds the post title inside an <a> tag.
title_list = []
for title in titles:
    title_list.append(title.xpath('./a/text()')[0])


# The post body sits in the div with id="endtext".
content_list = []
for content in contents:
    content_list.append(content.xpath('.//text()')[0])

# The bare text nodes of each cat_llb block hold the publish time
# and click count as a single string.
time_list = []
for node in times:  # `node` rather than `time`, so the name doesn't shadow the time module
    for i in node.xpath('./text()'):
        print(len(i))  # debug: inspect string lengths to pick the slice points below
        time_list.append(i)


zip_item = zip(title_list, content_list, time_list)
with open("content.json", "w", encoding="utf-8") as obj_f:
    for i in zip_item:
        item = {}
        item["title"] = i[0]
        item["content"] = i[1]
        item["time"] = i[2][0:22]    # first 22 characters: publish time
        item["click"] = i[2][22:-1]  # the rest: click count
        obj_f.write(json.dumps(item, ensure_ascii=False) + ",\n")
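Note that each line written above ends with a trailing comma, so content.json is not valid line-delimited JSON. A minimal sketch of how it could be read back, assuming the content.json produced by the script above:

import json

items = []
with open("content.json", encoding="utf-8") as f:
    for line in f:
        line = line.strip().rstrip(",")  # drop the trailing comma before parsing
        if line:
            items.append(json.loads(line))
print(len(items), "records loaded")

Method 2 below drops the comma and writes one JSON object per line, which a plain json.loads per line can parse directly.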
Method 2: Procedural crawl, v2.0 (the steps wrapped in functions)
import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.url)
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    dom = etree.HTML(html)
    title_list = []
    for title in dom.xpath('//div[@class="cat_llb"]/h3'):
        title_list.append(title.xpath('./a/text()')[0])

    content_list = []
    for content in dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]'):
        content_list.append(content.xpath('.//text()')[0])

    time_list = []
    for node in dom.xpath('//div[@class="cat_llb"]'):
        for i in node.xpath('./text()'):
            time_list.append(i)

    zip_item = zip(title_list, content_list, time_list)
    for i in zip_item:
        item = {}
        item["title"] = i[0]
        item["content"] = i[1].strip()
        item["time"] = i[2][0:22].strip()
        item["click"] = i[2][22:-1].strip()
        print(item)
        yield item


def write_to_file(content):
    # Append one JSON object per line; mode "a" keeps earlier pages.
    with open("result.json", "a", encoding="utf-8") as f:
        # print(type(json.dumps(content)))
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(start):
    url = "http://www.lovehhy.net/joke/Detail/QSBK/" + str(start)
    html = get_one_page(url)
    if html is None:  # request failed; skip this page
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == "__main__":
    for i in range(0, 5):
        main(start=i)
        time.sleep(1)
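get_one_page returns None on any request error, and main then just skips the page. If transient network failures are a concern, a small retry wrapper could be layered on top. This is a sketch, not part of the original script; the function name, retry count, delay, and timeout are arbitrary assumptions:

import time
import requests
from requests.exceptions import RequestException


def get_one_page_with_retry(url, headers, retries=3, delay=2):
    # Try the request up to `retries` times, pausing between attempts.
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except RequestException:
            pass
        time.sleep(delay)
    return None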

Method 3: Object-oriented crawl

import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


class ChouShiBaiKe:

    def __init__(self):

        self.url = "http://www.lovehhy.net/joke/Detail/QSBK/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }

    def get_one_page(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):

        dom = etree.HTML(html)
        titles = dom.xpath('//div[@class="cat_llb"]/h3')
        contents = dom.xpath('//div[@class="cat_llb"]/div[@id="endtext"]')
        times = dom.xpath('//div[@class="cat_llb"]')
        title_list = []
        for title in titles:
            title_list.append(title.xpath('./a/text()')[0])

        content_list = []
        for content in contents:
            content_list.append(content.xpath('.//text()')[0])

        time_list = []

        for node in times:
            for i in node.xpath('./text()'):
                time_list.append(i)

        zip_item = zip(title_list, content_list, time_list)

        for i in zip_item:
            item = {}
            item["title"] = i[0]
            item["content"] = i[1].strip()
            item["time"] = i[2][0:22].strip()
            item["click"] = i[2][22:-1].strip()
            print(item)
            yield item

    def save_file(self, content):
        with open("result_class.json", "a", encoding="utf-8") as f:
            # print(type(json.dumps(content)))
            f.write(json.dumps(content, ensure_ascii=False) + \n)

    def run(self, start):
        url = self.url + str(start)  # reuse the base URL set in __init__
        html = self.get_one_page(url)
        if html is None:  # request failed; skip this page
            return
        for item in self.process_data(html):
            self.save_file(item)


if __name__ == "__main__":
    qsbk = ChouShiBaiKe()
    for i in range(0, 5):
        qsbk.run(start=i)
        time.sleep(1)
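Once result_class.json exists, the records can be converted to a spreadsheet-friendly format. A minimal sketch, assuming the one-JSON-object-per-line file written by Method 3 (the CSV file name is an arbitrary choice; csv is the standard-library module):

import csv
import json

with open("result_class.json", encoding="utf-8") as src, \
        open("result_class.csv", "w", newline="", encoding="utf-8") as dst:
    writer = csv.DictWriter(dst, fieldnames=["title", "content", "time", "click"])
    writer.writeheader()
    for line in src:
        line = line.strip()
        if line:
            writer.writerow(json.loads(line))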