当前位置 : 主页 > 编程语言 > python >

Python爬虫--爬取CCTV连续剧

来源:互联网 收集:自由互联 发布时间:2022-09-02
1 #encoding=utf-8 2 import requests 3 from bs4 import BeautifulSoup 4 import re 5 import os 6 from aria2rpc import rpc_addUri 7 class Cntv(): 8 9 def openUrl(self,url): 10 """ 11 This method is used to open a web site 12 :param url:Web site
1 #encoding=utf-8
2 import requests
3 from bs4 import BeautifulSoup
4 import re
5 import os
6 from aria2rpc import rpc_addUri
class Cntv:
    """Collect download URLs for every episode of a TV series on tv.cctv.com.

    The pipeline is: series index page -> episode page URLs -> video-info
    API URLs (one per episode) -> lists of chapter download URLs, which can
    then be queued in aria2 through ``_add_aria2_task``.
    """

    # Series index page scraped when the caller does not supply one.
    DEFAULT_SERIES_URL = "http://tv.cctv.com/2014/07/07/VIDA1404730290373811.shtml"

    def openUrl(self, url):
        """
        Send a GET request with a desktop-browser User-Agent.

        :param url: Web site to request
        :return: The ``requests`` response object
        """
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        # The headers must be passed by keyword: the second positional
        # parameter of requests.get() is ``params``, which would append the
        # User-Agent to the query string instead of sending it as a header.
        response = requests.get(url, headers=header)
        return response

    def getEachEpisodeUrl(self, url=None):
        """
        Get the address of each episode of the TV play.

        :param url: Series index page to scrape; defaults to
            ``DEFAULT_SERIES_URL`` when omitted
        :return: List of episode page URLs
        """
        if url is None:
            url = self.DEFAULT_SERIES_URL
        response = self.openUrl(url)
        html = response.content.decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')

        # The title is only echoed for the user; a page without it should
        # not abort the crawl.
        title = soup.select(".text_mod h3")
        if title:
            print(title[0].text)

        urls = []
        episodes = soup.select('.img a')
        # Only every third anchor, starting at index 1, is used.
        # NOTE(review): presumably each episode is rendered with three
        # ".img a" anchors -- confirm against the live page markup.
        for episode in episodes[1::3]:
            print(episode['title'], "link:" + episode['href'])
            urls.append(episode['href'])
        print("Get Each Episode Url Come Over !!!")
        return urls

    def getEachDLUrl(self):
        """
        Build the video-info API URL for each episode.

        Each episode page is fetched and the ``guid`` value embedded in its
        source is used as the ``pid`` query parameter of the API URL.

        :return: List of API URLs, one per episode, in episode order
        :raises ValueError: If an episode page contains no ``guid`` value
        """
        urls = self.getEachEpisodeUrl()
        links = []
        # Count from 1 so the progress message matches human episode numbers.
        for num, url in enumerate(urls, 1):
            response = self.openUrl(url)
            html = response.text
            match = re.search(r'guid = "(\w+?)";', html)
            if match is None:
                # Fail loudly rather than skip: callers number episodes by
                # list position, so dropping one would shift the rest.
                raise ValueError("no guid found in episode page: %s" % url)
            pid = match.group(1)
            link = (
                "http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do"
                "?pid=%s&tz=%s&from=%s&url=%s&idl=%s&idlr=%s&modifyed=%s"
                % (pid, '-8', '000news', url, '32', '32', 'false')
            )
            links.append(link)
            print("获取第%d集" % num)
        return links

    def getDLList(self):
        """
        Get the download address for each episode of the TV play.

        :return: List with one entry per episode; each entry is the list of
            ``url`` values found under ``video.chapters4`` in the API's JSON
            response
        """
        links = self.getEachDLUrl()
        dl_urls = []
        for link in links:
            response = self.openUrl(link)
            chapters = response.json()['video']['chapters4']
            dl_url = []
            for chapter in chapters:
                downloadurl = chapter['url']
                dl_url.append(downloadurl)
                print(downloadurl)
            dl_urls.append(dl_url)
        return dl_urls

    def _add_aria2_task(self, url, name):
        """
        Queue one download in aria2 through ``rpc_addUri``.

        :param url: Download url
        :param name: Output file name, passed to aria2 as the ``out`` option
        :return: Whatever ``rpc_addUri`` returns, or None if it raised
        """
        try:
            result = rpc_addUri(url, {'out': name})
            return result
        except Exception as e:
            # Best effort: report the failure and let the caller carry on
            # with the remaining downloads.
            print(e)
            return None
87
88
89 # response.json()['video']['lowChapters'][0]['url']
90 # response.json()['video']['chapters4'][0]['url']
91 """
92 def dlTv(self):
93
94 dl_urls_list = self.getDLList()
95 if os.path.exists("tv_list") == False:
96 os.mkdir("tv_list")
97 os.chdir("tv_list")
98 for dl_urls in dl_urls_list:
99 for dl_url in dl_urls:
100 print("download" + dl_url)
101 # response = self.openUrl(dl_url)
102 # with open("first.mp4",'ab') as tl:
103 # tl.write(response.content)
104 print("-"*20)
105 """
if __name__ == "__main__":
    # Fetch the chapter download URLs of every episode, then queue each one
    # in aria2 under the name "<episode>_<chapter>.mp4" (both 1-based).
    crawler = Cntv()
    episode_downloads = crawler.getDLList()
    for episode_no, chapter_urls in enumerate(episode_downloads, 1):
        for chapter_no, chapter_url in enumerate(chapter_urls, 1):
            out_name = "%d_%d.mp4" % (episode_no, chapter_no)
            crawler._add_aria2_task(chapter_url, out_name)

 



网友评论