The main source code is as follows:
import scrapy
from pc.items import FileItem
import json
import math
import datetime


class xxSpider(scrapy.Spider):
    name = 'xx'
    allowed_domains = ['xx.com']
    # offset = 1
    # max_page = 10  # maximum number of pages to crawl
    baseURL = 'https://www.xxx.com/js/piaofu.html'
    start_urls = [baseURL]
    def parse(self, response):
        # Collect the links to every detail page on the current list page
        url_list = response.xpath("//div[@class='list-pngjs']/dl/dd/a/@href").extract()
        for url in url_list:
            href = 'https://www.xx.com' + url
            print("href" * 30)
            print(href)
            print("href" * 30)
            if isinstance(href, str):
                yield scrapy.Request(
                    href,
                    callback=self.parse_detail
                )
        # If the pagination bar has an entry after the current page, follow the next page
        if len(response.xpath("//div[@class='dede_pages']/ul/li[@class='thisclass']/following-sibling::li")):
            url = response.xpath("//div[@class='dede_pages']/ul/li[@class='thisclass']/following-sibling::li/a/@href").extract()[0]
            url = 'https://www.xx.com' + url
            print("0" * 30)
            print(url)
            print("0" * 30)
            yield scrapy.Request(url, callback=self.parse)
    def parse_detail(self, response):
        # Read the file id from the download button and build the zip download URL
        pic_id = response.xpath("//div[@id='l']/div[@class='content-a']/div[@class='xiazai']/a[@class='bt-blue js-download']/@data-fileid").extract()[0]
        url = 'https://www.xxx.com/js/d' + pic_id + '.zip'
        # If the file URL is a relative path, response.urljoin(url) can be used to build the absolute URL
        item = FileItem(file_urls=[url])
        yield item  # note: this must be yield, not return
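For the file_urls field above to trigger an actual download, the project also needs a matching item class and Scrapy's built-in FilesPipeline enabled. Below is a minimal sketch: the file_urls/files field names are the defaults FilesPipeline expects, the pc/items.py module path follows the import at the top of the spider, and the FILES_STORE directory is an assumption.

# pc/items.py -- minimal item definition for Scrapy's FilesPipeline (sketch)
import scrapy

class FileItem(scrapy.Item):
    file_urls = scrapy.Field()  # URLs the pipeline should download
    files = scrapy.Field()      # filled in by the pipeline with the download results

# settings.py -- enable the built-in pipeline and choose a storage directory (assumed path)
# ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
# FILES_STORE = './downloads'

Run the spider with scrapy crawl xx; by default the FilesPipeline saves each zip under FILES_STORE/full/, named by the SHA-1 hash of its URL.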
Download the source code