特色栏目： python 批处理 net编程 Javascript Php Asp Css Html5 Android seo centos

python多线程爬取网页名称写入到excel

来源：互联网 | 收集：自由互联 | 发布时间：2022-07-07

#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import threading import requests from bs4 import BeautifulSoup from time import sleep from datetime import datetime # In[2]: df = pd . read_excel ( "网站对应名字.xlsx"

#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import threading
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime

# In[2]:

# Load the spreadsheet whose "URL" column lists the sites to look up.
df = pd.read_excel("网站对应名字.xlsx")

# In[16]:

# Shared work-partitioning state read by the worker threads.
sites = df.URL
thread_count = 16
data_count = len(sites)
n_loops = range(thread_count)
threads = []

# In[17]:

# One result slot per URL; workers fill these in by index.
names = [None for _ in range(data_count)]

# In[18]:

def get_url_title(site, timeout=10):
    """Fetch *site* and return the text of its ``<title>`` element.

    Args:
        site: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a single unresponsive host would block its worker
            thread indefinitely.

    Returns:
        The title text, or the placeholder ``"网址有误"`` when the page cannot
        be fetched or parsed, or has no ``<title>`` element.
    """
    try:
        response = requests.get(site, timeout=timeout)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.content, "html.parser")
        title_tag = soup.find("title")
        if title_tag is None:
            return "网址有误"
        return title_tag.text
    except Exception:
        # Best-effort lookup: any failure maps to the placeholder. Exception
        # (rather than BaseException) lets KeyboardInterrupt and SystemExit
        # propagate.
        return "网址有误"

# In[19]:

def write_title(start):
    """Look up titles for every ``thread_count``-th site, beginning at *start*.

    Reads the module-level ``sites``, ``data_count`` and ``thread_count``,
    stores each title in the matching slot of the module-level ``names``
    list, and prints the index together with the title.
    """
    for idx in range(start, data_count, thread_count):
        title = get_url_title(sites[idx])
        names[idx] = title
        print(idx, title)

# In[20]:

def main():
    """Run one ``write_title`` worker per entry of ``n_loops`` and wait for all.

    The module-level ``threads`` list is replaced in place on every call.
    Appending to it instead would leave the previous run's thread objects at
    the front of the list, and starting those again raises ``RuntimeError``
    because a thread can only be started once.
    """
    # Slice assignment mutates the existing list object, so code holding a
    # reference to ``threads`` still sees the current workers.
    threads[:] = [
        threading.Thread(target=write_title, args=(offset,))
        for offset in n_loops
    ]
    # Start all workers before joining any, so they run concurrently.
    for worker in threads:
        worker.start()
    # Wait for all threads to finish.
    for worker in threads:
        worker.join()

# In[21]:

# Run the crawl only when executed as a script (or notebook cell), not on import.
if __name__ == "__main__":
    main()

# In[22]:

# Notebook cell: display the collected titles.
names

# In[10]:

# Notebook cell: display the collected titles again (same expression as above).
names

# In[11]:

# Notebook cell: show the number of result slots.
len(names)

# In[12]:

# NOTE(review): this references the method without calling it; presumably
# df.info() was intended — confirm.
df.info

# In[23]:

# Notebook cell: print the CPU count reported by multiprocessing.
import multiprocessing
print(multiprocessing.cpu_count())

# In[ ]:

【文章出处:香港gpu服务器 http://www.558idc.com/hkgpu.html 复制请保留原URL】

上一篇：python输入输出详情
下一篇：没有了

python多线程爬取网页名称写入到excel

相关文章