Hangzhou Land Transfer Announcements
- Approach
- Code
- Code Walkthrough
- Disclaimer
Approach
Target page:
http://gtj.hangzhou.gov.cn/col/col1376014/index.html?uid=4420029&pageNum=1
The data is collected in two passes.
The first pass drives Chrome through a Selenium webdriver to simulate browsing, collects each announcement's title, detail-page URL, and publication date, and saves them to a CSV file.
The second pass uses the requests module to fetch each saved detail page and parse the land-parcel tables in the announcements.
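The delicate part of the first pass is Selenium's window-handle bookkeeping: each click opens the announcement in a new browser tab, so the script has to switch into that tab, record its URL, close it, and switch back before clicking the next link. Here is a minimal sketch of that pattern (the chromedriver path and the XPath are placeholders, not the real values from the scripts below):

```python
from selenium import webdriver

# Placeholder chromedriver path; substitute your own installation.
driver = webdriver.Chrome(executable_path=r"C:\path\to\chromedriver.exe")
driver.get("http://gtj.hangzhou.gov.cn/col/col1376014/index.html?uid=4420029&pageNum=1")

link = driver.find_element_by_xpath("//a[1]")       # placeholder XPath for one announcement link
link.click()                                        # the detail page opens in a new tab
driver.switch_to.window(driver.window_handles[1])   # focus the new tab
print(driver.current_url)                           # record the detail-page URL
driver.close()                                      # close the detail tab
driver.switch_to.window(driver.window_handles[0])   # back to the list page
```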
Code
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 27 15:16:58 2020
project name: 杭州国土出让公告
@author: 帅帅de三叔
"""
import requests
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
          "Accept": "*/*",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Connection": "keep-alive"}  # request headers
def click_nextpage(url):  # click through to the next page of the announcement list
    driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")  # initialize the driver; add options=chrome_options to run headless
    driver.get(url)
    driver.implicitly_wait(1)  # implicit wait
    driver.find_element_by_xpath("//*[@id='4420029']/table/tbody/tr/td/table/tbody/tr/td[8]/a").click()  # click "next page"
    #next_page = driver.current_url
    #return next_page

def click_detail(url):  # open every announcement on the current list page
    driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")  # initialize the driver; add options=chrome_options to run headless
    driver.get(url)
    driver.implicitly_wait(1)  # implicit wait
    for j in range(1, 11):  # ten announcements per page
        detail_xpath = "//*[@id='4420029']/div/table/tbody/tr[{}]/td[1]/a".format(j)  # XPath of the j-th announcement link
        driver.find_element_by_xpath(detail_xpath).click()  # open the detail page in a new tab
        driver.switch_to.window(driver.window_handles[1])  # switch to the new tab
        time.sleep(2)  # pause 2 seconds
        url_list.append(driver.current_url)  # record the detail-page URL
        driver.close()  # close the detail tab
        driver.switch_to.window(driver.window_handles[0])  # switch back to the list page
        time.sleep(2)  # pause 2 seconds
def parser(url):  # parse one detail page
    response = requests.get(url, headers=header)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    tr_list = soup.find("table", class_="MsoNormalTable").find("tbody").findAll("tr")
    with open("杭州市土地出让信息表.csv", 'a+', newline='') as file:
        writer = csv.writer(file)
        #Header = ["land_code", "location", "land_area", "user", "building_area", "price", "deposit", "year_span", "public_date"]
        #writer.writerow(Header)
        for i in range(1, len(tr_list)):  # skip the table header row
            td = tr_list[i].findAll("td")
            land_code = td[0].get_text()
            location = td[1].get_text()
            land_area = td[2].get_text()
            user = td[3].get_text()
            building_area = td[4].get_text()
            price = td[5].get_text()
            deposit = td[6].get_text()
            year_span = td[7].get_text()
            public_date = date
            writer.writerow([land_code, location, land_area, user, building_area, price, deposit, year_span, public_date])
title = []
url_list = []
date = []
url = "http://gtj.hangzhou.gov.cn/col/col1376014/index.html?uid=4420029&pageNum=1"  # landing page of the announcement list
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options=chrome_options)  # initialize the headless driver
driver.get(url)  # open the landing page
driver.maximize_window()  # maximize the window
driver.implicitly_wait(1)  # implicit wait
page = 0
with open("杭州市土地出让信息表.csv", 'w+', newline='') as file:  # open once, outside the loop, so earlier pages are not overwritten
    writer = csv.writer(file)
    Header = ["title", "detail_url", "date"]
    writer.writerow(Header)
    while page < 33:  # 33 pages in total
        for j in range(1, 11):  # ten announcements per page
            detail_xpath = "//*[@id='4420029']/div/table/tbody/tr[{}]/td[1]/a".format(j)  # XPath of the j-th announcement link
            time.sleep(2)
            driver.find_element_by_xpath(detail_xpath).click()  # open the detail page in a new tab
            driver.switch_to.window(driver.window_handles[1])  # switch to the detail tab
            time.sleep(1)  # pause 1 second
            title = driver.find_element_by_xpath("//*[@id='article']/tbody/tr[1]/td").text  # announcement title
            detail_url = driver.current_url  # detail-page URL
            date = driver.find_element_by_xpath("//*[@id='article']/tbody/tr[2]/td/table/tbody/tr[2]/td[1]/span").text.split(":")[1]  # publication date
            print(title, detail_url, date)
            writer.writerow([title, detail_url, date])
            driver.close()  # close the detail tab
            driver.switch_to.window(driver.window_handles[0])  # switch back to the list page
            driver.implicitly_wait(2)  # implicit wait
        driver.find_element_by_xpath("//*[@id='4420029']/table/tbody/tr/td/table/tbody/tr/td[8]/a").click()  # click "next page"
        driver.switch_to.window(driver.window_handles[0])  # stay on the list page
        driver.implicitly_wait(1)  # implicit wait
        page = page + 1

The second script requests each saved detail page and parses its land table; because the table layout changed over the years, there is one parser per period.

# -*- coding: utf-8 -*-
"""
project_name:get_detail
@author: 帅帅de三叔
Created on Mon Mar 2 09:25:12 2020
"""
import time  # time module
import pandas as pd  # data analysis module
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import csv  # csv module

data = pd.read_csv("杭州市土地出让详情页列表.csv", encoding="ANSI")  # read the detail-page list collected by the first script (Windows ANSI code page)
title = data['title']  # announcement titles
url_list = data['detail_url']  # detail-page URLs
date = data['date']  # publication dates
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
          "Accept": "*/*",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Connection": "keep-alive"}  # request headers
def parser2018(url):  # parser for the 2018-onward table layout
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("0-30杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "land_area", "user", "building_area", "price", "deposit", "year_span", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):  # skip the table header row
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                price = td[5].get_text().strip()
                deposit = td[6].get_text().strip()
                year_span = td[7].get_text().strip()
                public_date = date
                print(land_code, location, land_area, user, building_area, price, deposit, year_span, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, price, deposit, year_span, public_date, url])
    except Exception:
        print("no message")  # no parsable table on this page
def parser2017(url):  # parser for the 2017 table layout (no price or deposit columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("30-44杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            for i in range(1, len(tr_list)):  # skip the table header row
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                year_span = td[5].get_text().strip()  # price and deposit columns are absent in this layout
                public_date = date
                print(land_code, location, land_area, user, building_area, year_span, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, year_span, public_date, url])
    except Exception:
        print("no message")  # no parsable table on this page
def parser2016(url):  # parser for the 2016 table layout (same columns as 2017)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2016杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            for i in range(1, len(tr_list)):  # skip the table header row
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                year_span = td[5].get_text().strip()  # price and deposit columns are absent in this layout
                public_date = date
                print(land_code, location, land_area, user, building_area, year_span, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, year_span, public_date, url])
    except Exception:
        print("no message")  # no parsable table on this page
def parser2014(url):  # parser for the 2014 table layout (no land_area, price, or deposit columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2014杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            for i in range(1, len(tr_list)):  # skip the table header row
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                user = td[2].get_text().strip()  # land_area is absent in this layout
                building_area = td[3].get_text().strip()
                year_span = td[4].get_text().strip()
                public_date = date
                print(land_code, location, user, building_area, year_span, public_date)
                writer.writerow([land_code, location, user, building_area, year_span, public_date, url])
    except Exception:
        print("no message")  # no parsable table on this page
def parser2010(url):  # parser for the 2010 table layout (planning indicators instead of price columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2010杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            for i in range(1, len(tr_list)):  # skip the table header row
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                capacity_ratio = td[4].get_text().strip()  # floor-area ratio
                green_ratio = td[5].get_text().strip()  # green-space ratio
                density = td[6].get_text().strip()  # building density
                public_date = date
                print(land_code, location, land_area, user, capacity_ratio, green_ratio, density, public_date)
                writer.writerow([land_code, location, land_area, user, capacity_ratio, green_ratio, density, public_date, url])
    except Exception:
        print("no message")  # no parsable table on this page
if __name__ == "__main__":
    for url in url_list[90:97]:  # this slice of detail pages uses the 2014 layout
        print("current detail page: %s" % url)
        parser2014(url)
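As written, the matching parser has to be picked by hand for each slice of url_list. A possible refinement, sketched below, is to dispatch on the announcement year; the pick_parser helper is my own addition, not part of the original scripts, and it assumes the date column is formatted like 2014-05-12:

```python
def pick_parser(date_str):
    # Assumes dates like "2014-05-12"; adjust the slicing if the format differs.
    year = int(str(date_str)[:4])
    if year >= 2018:
        return parser2018
    elif year >= 2016:
        return parser2016  # the 2016 and 2017 layouts are parsed identically
    elif year >= 2014:
        return parser2014
    else:
        return parser2010

for u, d in zip(url_list, date):
    pick_parser(d)(u)  # route each detail page to the parser for its year
```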
Code Walkthrough
I'll take this project as a reminder to myself not to be such a perfectionist; it really isn't that scary. One open issue remains: how to handle nested tables.
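On that nested-table problem, one approach worth trying is to restrict BeautifulSoup to direct children with recursive=False, so rows belonging to an inner table are not swept up with the outer ones. A self-contained sketch with illustrative HTML (not taken from the target site):

```python
from bs4 import BeautifulSoup

html = """
<table id="outer"><tbody>
  <tr><td>outer row 1</td></tr>
  <tr><td><table><tbody><tr><td>inner row</td></tr></tbody></table></td></tr>
</tbody></table>
"""
soup = BeautifulSoup(html, "lxml")
tbody = soup.find("table", id="outer").find("tbody")
rows = tbody.find_all("tr", recursive=False)  # direct children only
print(len(rows))  # 2 -- the inner table's row is excluded
```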
Disclaimer
This Python crawler is for learning and exchange only. If it causes any offense, please let me know and it will be taken down.