当前位置 : 主页 > 编程语言 > python >

Python 多线程爬取网页名称写入到 Excel

来源:互联网 收集:自由互联 发布时间:2022-07-07
#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import threading import requests from bs4 import BeautifulSoup from time import sleep from datetime import datetime # In[2]: df = pd . read_excel ( "网站对应名字.xlsx"
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import threading
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime


# In[2]:


# Load the spreadsheet; its "URL" column supplies the addresses to fetch.
df = pd.read_excel("网站对应名字.xlsx")


# In[16]:


# Shared state used by the worker threads.
thread_count = 16
n_loops = range(thread_count)
threads = []
sites = df["URL"]
data_count = len(sites)


# In[17]:


# One result slot per site, filled in by index as titles arrive.
names = [None for _ in range(data_count)]


# In[18]:


def get_url_title(site, timeout=10):
    """Fetch *site* and return the text of its ``<title>`` element.

    Args:
        site: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block its worker
            thread indefinitely.

    Returns:
        The title text, or the placeholder string "网址有误" when the request
        fails, times out, or the page has no ``<title>`` element.
    """
    try:
        html = requests.get(site, timeout=timeout)
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed on the machine.
        soup = BeautifulSoup(html.content, "html.parser")
        # A missing <title> makes find() return None; the resulting
        # AttributeError is handled below like any other failure.
        return soup.find("title").text
    except Exception:
        # Exception (not BaseException) so SystemExit and KeyboardInterrupt
        # are not silently swallowed.
        return "网址有误"


# In[19]:


def write_title(start):
    """Fetch titles for every ``thread_count``-th site, beginning at ``start``.

    Reads the module-level ``sites``, ``data_count`` and ``thread_count``,
    stores each title at the matching index of the module-level ``names``
    list, and prints the index together with the stored title.
    """
    for idx in range(start, data_count, thread_count):
        url = sites[idx]
        names[idx] = get_url_title(url)
        print(idx, names[idx])


# In[20]:


def main():
    """Run one ``write_title`` worker per entry of ``n_loops`` and wait for all.

    Worker ``i`` is started with ``start=i``. The module-level ``threads``
    list is updated to hold exactly the threads created by this call.
    """
    # Build fresh Thread objects on every call: a Thread can be started only
    # once, so appending to a list left over from an earlier call and then
    # starting its first entries again would raise RuntimeError.
    workers = [
        threading.Thread(target=write_title, args=(i,)) for i in n_loops
    ]
    # Replace the contents in place so other references to the module-level
    # list still see this run's threads.
    threads[:] = workers
    # Start all workers before joining any, so they run concurrently.
    for worker in workers:
        worker.start()
    # Wait for all threads to finish.
    for worker in workers:
        worker.join()


# In[21]:


# Start the crawl only when this file is executed directly.
if __name__ == "__main__":
    main()


# In[22]:


# Bare expression: has no effect in a plain script. The "# In[n]:" markers
# suggest a notebook export, where it would display the collected titles.
names


# In[10]:


names


# In[11]:


# Number of result slots; names was created with data_count entries.
len(names)


# In[12]:


# NOTE(review): the method is referenced but not called; presumably
# df.info() was intended. Confirm.
df.info


# In[23]:


# Print the CPU count reported by multiprocessing.
import multiprocessing
print(multiprocessing.cpu_count())


# In[ ]:


【文章出处:香港gpu服务器 http://www.558idc.com/hkgpu.html 复制请保留原URL】
上一篇:python输入输出详情
下一篇:没有了
网友评论