
Hangzhou Land Transfer Announcements

  • Approach
  • Code
  • Code Walkthrough
  • Disclaimer

Approach

Target page:

http://gtj.hangzhou.gov.cn/col/col1376014/index.html?uid=4420029&pageNum=1

The data is collected in two passes.

The first pass drives a browser with webdriver to walk the listing pages and save each announcement's title, detail-page URL, and publication date.

The second pass uses the requests module to fetch each saved detail page and parse its table of land parcels.
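
In outline, the two passes hand off through a single CSV file of detail-page URLs; a minimal sketch of that handoff (the full scripts follow in the next section):

# Pass 1 (Selenium) saves title, detail_url, date into 杭州市土地出让详情页列表.csv.
# Pass 2 (requests) reads that file back and parses each detail page.
import pandas as pd

data = pd.read_csv("杭州市土地出让详情页列表.csv", encoding="gbk")
for url in data["detail_url"]:
    pass  # request the page and parse its parcel table (see parser2018 below)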

Code

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 27 15:16:58 2020
project name: 杭州国土出让公告 (Hangzhou land transfer announcements)
@author: 帅帅de三叔
"""
import requests
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
          "Accept": "*/*",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Connection": "keep-alive"}  # request headers


def click_nextpage(url):  # click through to the next listing page (defined but unused below)
    driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options=chrome_options)  # initialize the driver
    driver.get(url)
    driver.implicitly_wait(1)  # implicit wait
    driver.find_element_by_xpath("//*[@id='4420029']/table/tbody/tr/td/table/tbody/tr/td[8]/a").click()  # click "next page"
    #next_page = driver.current_url
    #return next_page

def click_detail(url):  # open every announcement on the current listing page (defined but unused below)
    driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options=chrome_options)  # initialize the driver
    driver.get(url)
    driver.implicitly_wait(1)  # implicit wait
    for j in range(1, 11):
        detail_xpath = "//*[@id='4420029']/div/table/tbody/tr[{}]/td[1]/a".format(j)  # XPath of the j-th detail link
        driver.find_element_by_xpath(detail_xpath).click()  # open the detail page
        driver.switch_to.window(driver.window_handles[1])  # switch to the new tab
        time.sleep(2)  # pause 2 seconds
        url_list.append(driver.current_url)
        driver.close()  # close the detail tab
        driver.switch_to.window(driver.window_handles[0])  # switch back to the listing tab
        time.sleep(2)  # pause 2 seconds


def parser(url):  # parse one detail page (superseded by the second script below)
    response = requests.get(url, headers=header)  # headers must be passed by keyword
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    tr_list = soup.find("table", class_="MsoNormalTable").find("tbody").findAll("tr")
    with open("杭州市土地出让信息表.csv", 'a+', newline='') as file:
        writer = csv.writer(file)
        #Header = ["land_code", "location", "land_area", "user", "building_area", "price", "deposit", "year_spam", "public_date"]
        #writer.writerow(Header)
        for i in range(1, len(tr_list)):
            td = tr_list[i].findAll("td")
            land_code = td[0].get_text()
            location = td[1].get_text()
            land_area = td[2].get_text()
            user = td[3].get_text()
            building_area = td[4].get_text()
            price = td[5].get_text()
            deposit = td[6].get_text()
            year_spam = td[7].get_text()
            public_date = date
            writer.writerow([land_code, location, land_area, user, building_area, price, deposit, year_spam, public_date])
title = []
url_list = []  # appended to by click_detail if it is used
date = []
url = "http://gtj.hangzhou.gov.cn/col/col1376014/index.html?uid=4420029&pageNum=1"  # entry page
driver = webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe", options=chrome_options)  # initialize the driver
driver.get(url)  # open the entry page
driver.maximize_window()  # maximize the window
driver.implicitly_wait(1)  # implicit wait
with open("杭州市土地出让详情页列表.csv", 'w', newline='') as file:  # opened once, before the loop, so each page appends instead of overwriting; this is the file the second script reads
    writer = csv.writer(file)
    writer.writerow(["title", "detail_url", "date"])  # header row, written once
    page = 0
    while page < 33:  # 33 listing pages in total
        for j in range(1, 11):  # 10 announcements per page
            detail_xpath = "//*[@id='4420029']/div/table/tbody/tr[{}]/td[1]/a".format(j)  # XPath of the j-th detail link
            time.sleep(2)
            driver.find_element_by_xpath(detail_xpath).click()  # open the detail page
            driver.switch_to.window(driver.window_handles[1])  # switch to the detail tab
            time.sleep(1)  # pause 1 second
            title = driver.find_element_by_xpath("//*[@id='article']/tbody/tr[1]/td").text  # title
            detail_url = driver.current_url  # detail-page URL
            date = driver.find_element_by_xpath("//*[@id='article']/tbody/tr[2]/td/table/tbody/tr[2]/td[1]/span").text.split(":")[1]  # publication date
            print(title, detail_url, date)
            writer.writerow([title, detail_url, date])
            driver.close()  # close the detail tab
            driver.switch_to.window(driver.window_handles[0])  # switch back to the listing tab
        driver.implicitly_wait(2)  # implicit wait
        driver.find_element_by_xpath("//*[@id='4420029']/table/tbody/tr/td/table/tbody/tr/td[8]/a").click()  # click "next page"
        driver.switch_to.window(driver.window_handles[0])  # stay on the listing tab
        driver.implicitly_wait(1)  # implicit wait
        page = page + 1  # advance the page counter
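
As a side note (not part of the original post), the fixed time.sleep calls above can be replaced with explicit waits. A minimal sketch using the same Selenium 3 API generation as the script, reusing the driver defined above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)  # poll for up to 10 seconds
link = wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//*[@id='4420029']/div/table/tbody/tr[1]/td[1]/a")))
link.click()
wait.until(EC.number_of_windows_to_be(2))  # block until the detail tab has opened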
"""
project_name:get_detail
@author: 帅帅de三叔
Created on Mon Mar 2 09:25:12 2020
"""
import time #导入时间模块
import pandas as pd #导入数据分析模块
import requests #导入网页请求模块
from bs4 import BeautifulSoup #导入网页解析模块
import csv #导入csv模块
data=pd.read_csv("杭州市土地出让详情页列表.csv", encoding="ANSI") #读取详情页地址, encoding="ANSI"
title=data['title'] #公告名称
url_list=data['detail_url'] #详情列表
date=data['date']

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
          "Accept": "*/*",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Connection": "keep-alive"}  # request headers

def parser2018(url):  # parse a 2018-layout detail page
    response = requests.get(url, headers=header, timeout=10)  # headers must be passed by keyword
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("0-30杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "land_area", "user", "building_area", "price", "deposit", "year_spam", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                price = td[5].get_text().strip()
                deposit = td[6].get_text().strip()
                year_spam = td[7].get_text().strip()
                public_date = date
                print(land_code, location, land_area, user, building_area, price, deposit, year_spam, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, price, deposit, year_spam, public_date, url])
    except Exception:  # page without the expected table
        print("no message")

def parser2017(url):  # parse a 2017-layout detail page (no price/deposit columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("30-44杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "land_area", "user", "building_area", "year_spam", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                #price = td[5].get_text().strip()
                #deposit = td[6].get_text().strip()
                year_spam = td[5].get_text().strip()
                public_date = date
                print(land_code, location, land_area, user, building_area, year_spam, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, year_spam, public_date, url])
    except Exception:  # page without the expected table
        print("no message")

def parser2016(url):  # parse a 2016-layout detail page (no price/deposit columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2016杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "land_area", "user", "building_area", "year_spam", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                building_area = td[4].get_text().strip()
                #price = td[5].get_text().strip()
                #deposit = td[6].get_text().strip()
                year_spam = td[5].get_text().strip()
                public_date = date
                print(land_code, location, land_area, user, building_area, year_spam, public_date)
                writer.writerow([land_code, location, land_area, user, building_area, year_spam, public_date, url])
    except Exception:  # page without the expected table
        print("no message")

def parser2014(url):  # parse a 2014-layout detail page (no land_area/price/deposit columns)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2014杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "user", "building_area", "year_spam", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                #land_area = td[2].get_text().strip()
                user = td[2].get_text().strip()
                building_area = td[3].get_text().strip()
                #price = td[5].get_text().strip()
                #deposit = td[6].get_text().strip()
                year_spam = td[4].get_text().strip()
                public_date = date
                print(land_code, location, user, building_area, year_spam, public_date)
                writer.writerow([land_code, location, user, building_area, year_spam, public_date, url])
    except Exception:  # page without the expected table
        print("no message")

def parser2010(url):  # parse a 2010-layout detail page (planning ratios instead of prices)
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'lxml')  # parse the page
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]  # publication date
    try:
        tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
        with open("2010杭州市土地出让信息表.csv", 'a+', newline='', encoding="utf8") as file:
            writer = csv.writer(file)
            #Header = ["land_code", "location", "land_area", "user", "capacity_ratio", "green_ratio", "density", "public_date"]
            #writer.writerow(Header)
            for i in range(1, len(tr_list)):
                td = tr_list[i].findAll("td")
                land_code = td[0].get_text().strip()
                location = td[1].get_text().strip()
                land_area = td[2].get_text().strip()
                user = td[3].get_text().strip()
                capacity_ratio = td[4].get_text().strip()
                green_ratio = td[5].get_text().strip()
                density = td[6].get_text().strip()
                public_date = date
                print(land_code, location, land_area, user, capacity_ratio, green_ratio, density, public_date)
                writer.writerow([land_code, location, land_area, user, capacity_ratio, green_ratio, density, public_date, url])
    except Exception:  # page without the expected table
        print("no message")

if __name__ == "__main__":
    for url in url_list[90:97]:  # pick the slice and the matching parserXXXX for the era being parsed
        print("current detail page: %s" % url)
        parser2014(url)
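
Since the five parserXXXX functions differ only in column layout and output file, they could arguably be collapsed into one parametrized parser. A sketch under that assumption (the LAYOUTS table and the parse_year name are illustrative, not from the original post; the header dict is the one defined above):

import csv
import requests
from bs4 import BeautifulSoup

# Column layouts per era, read off the functions above.
LAYOUTS = {
    2018: ["land_code", "location", "land_area", "user", "building_area",
           "price", "deposit", "year_spam"],
    2016: ["land_code", "location", "land_area", "user", "building_area", "year_spam"],
    2014: ["land_code", "location", "user", "building_area", "year_spam"],
}

def parse_year(url, year, outfile):
    fields = LAYOUTS[year]
    response = requests.get(url, headers=header, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'lxml')
    date = soup.find("table", id="article").find("tbody").findAll("tr")[1].findAll("td")[0].get_text().split(":")[1]
    tr_list = soup.find("td", class_="bt_content").find("div").find("tbody").findAll("tr")
    with open(outfile, 'a+', newline='', encoding="utf8") as file:
        writer = csv.writer(file)
        for tr in tr_list[1:]:  # skip the header row
            cells = [td.get_text().strip() for td in tr.findAll("td")[:len(fields)]]
            writer.writerow(cells + [date, url])  # same trailing date + url as above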

Code Walkthrough

I write this down mainly to remind myself not to be such a perfectionist; it is really not that scary. One problem is left open: how to handle nested tables. A sketch of one possible approach follows.
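
For the nested-table problem, one candidate approach (a sketch, not verified against these particular pages) is to restrict BeautifulSoup's row search to direct children, so the rows of inner tables are not swept into tr_list:

from bs4 import BeautifulSoup

html = """<table id="outer"><tbody>
<tr><td>outer row 1<table><tbody><tr><td>inner row</td></tr></tbody></table></td></tr>
<tr><td>outer row 2</td></tr>
</tbody></table>"""
soup = BeautifulSoup(html, "lxml")
tbody = soup.find("table", id="outer").find("tbody")
rows = tbody.findAll("tr", recursive=False)  # direct children only
print(len(rows))  # 2, not 3: the inner table's row is excluded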

Disclaimer

This Python crawler is for learning and exchange only. If it causes any offense, please let me know and it will be removed.


