特色栏目： python 批处理 net编程 Javascript Php Asp Css Html5 Android seo centos

Python爬虫爬取2021高教社杯数学建模优秀论文

来源：互联网 | 收集：自由互联 | 发布时间：2022-07-13

Python爬虫爬取2021高教社杯数学建模优秀论文。程序背景：在准备数学建模国赛时，每年国赛的优秀论文是必看的内容，但是官方放出来的优秀论文都是以图片的形式，看起来非常麻烦……

Python爬虫爬取2021高教社杯数学建模优秀论文

程序背景

在准备数学建模国赛时，每年国赛的优秀论文是必看的内容，但官方放出的优秀论文都是以图片形式发布的，阅读起来非常麻烦。本篇文章使用Python爬虫爬取中国大学生在线放出的优秀论文图片，并将其整合成PDF文档，方便阅读学习，拼接效果如下图所示。有疑问欢迎大家评论或私信。

Python爬虫爬取2021高教社杯数学建模优秀论文_数学建模

Python爬虫爬取2021高教社杯数学建模优秀论文_爬虫_02

环境搭建

本篇文章使用的均是一些常用库，仅PyMuPDF库不常见，安装方法如下：

pip install PyMuPDF -i https://pypi.tuna.tsinghua.edu.cn/simple

程序层次介绍

| 文件或文件夹名 | 作用 |
| --- | --- |
| img | 临时存储从网站爬取的论文图片，自行创建 |
| pdf | 存储整合后的论文PDF文档，自行创建 |
| InitProcedure.py | 初始化论文链接以及转成PDF的文档名 |
| ParseHtml.py | 爬取论文链接的img标签，匹配论文图片，将图片存储至img文件夹下 |
| PdfFileWriter.py | 调用PyMuPDF库，将图片拼接成PDF文件 |

开源代码

InitProcedure.py

import PdfFileWriter
from ParseHtml import *
from PdfFileWriter import *

# Article pages to crawl, one page per paper.
urls = [
    'https://dxs.moe.gov.cn/zx/a/hd_sxjm_sxjmlw_2021qgdxssxjmjslwzs/211025/1734093.shtml',
    'https://dxs.moe.gov.cn/zx/a/hd_sxjm_sxjmlw_2021qgdxssxjmjslwzs/211025/1734095.shtml',
]
# Output PDF file names, paired with ``urls`` by position.
titles = ['C085.pdf', 'C169.pdf']

# Folder that temporarily holds the images downloaded for one paper.
fileDir = './img/'

# Folder that receives the assembled PDFs (the same for every paper, so it
# is set once instead of on every loop iteration).
target_path = './pdf/'

# Download the images of each page, then stitch them into one PDF per paper.
for index, (url, title) in enumerate(zip(urls, titles), start=1):
    print('正在处理第%s个文件' % index)
    print(url)
    # getimage() wipes and refills fileDir, so the folder only ever holds
    # the images of the page currently being processed.
    getimage(fileDir, url)
    PdfFileWriter.pic2pdf(fileDir, target_path + title, title)

print('Completed Resource Crawl')

ParseHtml.py

import os
import requests
from bs4 import BeautifulSoup
import shutil
from pathlib import Path
from tqdm import tqdm

def getHtmlCode(url, timeout=30):
    """Fetch a web page and return its HTML source as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Send a well-formed browser User-Agent so the request looks like an
    # ordinary browser visit (presumably to avoid being rejected as a bot).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    # Force UTF-8 rather than relying on the charset guessed by requests.
    r.encoding = 'UTF-8'
    return r.text

def getImg(page, localPath):
    """Download the paper images referenced by an HTML page into a folder.

    Only ``<img>`` tags whose ``alt`` text contains "2021" are downloaded.
    Images are saved as ``0<suffix>``, ``1<suffix>``, ... in page order,
    keeping each image's original file suffix.

    Args:
        page: HTML source of the article page.
        localPath: Folder to fill with the images. WARNING: if it already
            exists it is deleted with everything in it and recreated.

    Returns:
        The file suffix (for example ``'.jpg'``) of the last image saved, or
        an empty string when no image was saved.
    """
    import posixpath
    from urllib.parse import urljoin, urlsplit

    # Start from an empty folder so images of a previous page never leak
    # into the current one.
    if os.path.exists(localPath):
        shutil.rmtree(localPath)
    os.makedirs(localPath)

    soup = BeautifulSoup(page, 'html.parser')
    imgList = soup.find_all('img')

    x = 0            # running index used as the saved file name
    p_suffix = ''    # defined up front so the return is safe with no images
    seen = set()     # query-less sources of the images already saved
    print("Downdloading images...")
    for imgTag in tqdm(imgList):
        src = imgTag.get('src')
        alt = imgTag.get('alt')
        # Skip tags without a usable src, or whose alt text does not mark
        # them as a paper page.
        if not src or not alt or '2021' not in alt:
            continue

        # Check for repeats before downloading to avoid a wasted request.
        file_id = src.split('?')[0]
        if file_id in seen:
            # NOTE(review): a repeated image ends the whole loop rather than
            # being skipped; kept as in the original logic. Confirm whether
            # `continue` was intended.
            break

        # urljoin leaves absolute URLs untouched and resolves relative,
        # root-relative and protocol-relative ones against the site root.
        full_url = urljoin('http://dxs.moe.gov.cn/', src)
        # Take the suffix from the URL path only, so a query string such as
        # "?v=1.2" cannot be mistaken for the file extension.
        suffix = posixpath.splitext(urlsplit(src).path)[1]

        try:
            ir = requests.get(full_url, timeout=30)
            ir.raise_for_status()
            with open(os.path.join(localPath, str(x) + suffix), 'wb') as f:
                f.write(ir.content)
        except (requests.RequestException, OSError) as err:
            # Best effort: one broken image must not abort the whole paper,
            # but say so instead of failing silently.
            print('Skipping %s: %s' % (full_url, err))
            continue

        seen.add(file_id)
        p_suffix = suffix
        x += 1
    print("Image Download Complete!")
    return p_suffix

def getimage(file_path, target_url):
    """Download the paper images found on a page into a local folder.

    Args:
        file_path: Folder that receives the images; it is passed straight
            to ``getImg``, which clears it before downloading.
        target_url: Address of the article page to crawl.

    Returns:
        The image file suffix reported by ``getImg``, so callers that assign
        the result receive a real value instead of ``None``.
    """
    page = getHtmlCode(target_url)
    return getImg(page, file_path)

PdfFileWriter.py

import glob
import os
import fitz

def pic2pdf(img_dir, pdf_dir, title):
    """Merge every image in a folder into a single PDF, one image per page.

    Images are added in natural file-name order, so ``2.jpg`` comes before
    ``10.jpg``.

    Args:
        img_dir: Folder containing the page images.
        pdf_dir: Path of the PDF file to write. Despite the name this is a
            file path, not a directory; its parent folder is created when
            missing.
        title: Not used for the output location; kept so existing callers
            keep working.
    """
    import re

    def _natural_key(path):
        # Split "10.jpg" into ['', 10, '.jpg']: odd positions are always the
        # digit runs, so numbered pages compare numerically, not as text.
        parts = re.split(r'([0-9]+)', os.path.basename(path))
        return [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(parts)]

    # os.path.join keeps the pattern portable; a hard-coded backslash only
    # matches on Windows.
    images = sorted(glob.glob(os.path.join(img_dir, '*')), key=_natural_key)

    out_dir = os.path.dirname(pdf_dir)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    doc = fitz.open()
    try:
        for img in images:
            print(img)
            imgdoc = fitz.open(img)  # open the image as a document
            try:
                pdfbytes = imgdoc.convert_to_pdf()  # single-page PDF bytes
            finally:
                imgdoc.close()
            imgPdf = fitz.open("pdf", pdfbytes)
            try:
                doc.insert_pdf(imgPdf)  # append the page to the output
            finally:
                imgPdf.close()
        # Remove a stale copy of the real output file (not a same-named file
        # in the working directory) before saving.
        if os.path.exists(pdf_dir):
            os.remove(pdf_dir)
        doc.save(pdf_dir)
    finally:
        doc.close()

上一篇：NumPy 与 Python 内置列表计算标准差的区别
下一篇：没有了

Python爬虫爬取2021高教社杯数学建模优秀论文