文章目录
- 代码功能介绍:
- 代码如下:
import asyncio
from pyppeteer import launch
import os
import sys
import inspect
import arrow
import requests
from requests.cookies import RequestsCookieJar
from datetime import datetime
import json
import re

# Put the directory three levels above this file on sys.path so that the
# local ``ai`` package imported below can be resolved.
common_dir = os.path.realpath(
    os.path.abspath(
        os.path.join(
            os.path.split(inspect.getfile(inspect.currentframe()))[0],
            "../../../",
        )
    )
)
if common_dir not in sys.path:
    sys.path.insert(0, common_dir)

# import logging
# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

from ai.AIfactory.AIfactory import AIfactory  # noqa: E402  (needs the sys.path tweak above)

# Agency name typed into the search form when the caller does not supply one.
DEFAULT_AGENCY_NAME = '世纪保险经纪股份有限公司上海分公司'
# Number of characters a recognised captcha must have to be submitted.
CAPTCHA_LENGTH = 4
# Upper bound on form submissions before giving up.
MAX_CAPTCHA_ATTEMPTS = 5
# Seconds to wait for the site when calling it through ``requests``.
REQUEST_TIMEOUT = 30
# Text that marks an empty result table. Only the punctuation-free tail of
# the message is matched so the check does not depend on how the page
# punctuates it.
NOT_FOUND_MARKER = '没有找到符合条件的记录'
# First row of the result table.
_ROW_XPATH = '//*[@id="interme"]/tr[1]'
# Result key -> XPath (relative to the first row) of the cell holding it.
_FIELD_XPATHS = (
    ('agentname', '/td[2]/a'),
    ('agencycode', '/td[3]'),
    ('regulatorycode', '/td[4]'),
    ('registertime', '/td[5]'),
)


class AgentQuery():
    """Hold the HTTP session, cookie jar, OCR client and site URLs for a query."""

    def __init__(self, *args, **kwargs):
        # The Baidu AI API is wrapped by the project's AIfactory; the keyword
        # arguments are forwarded to the created client unchanged.
        self.ai = AIfactory.create_ai(ai_company='baidu')(**kwargs)
        self.session = requests.Session()
        self.session.headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        }
        self.jar = RequestsCookieJar()
        self.host = 'http://iir.circ.gov.cn/ipq/ipiQuery.html'  # landing page with the search form
        self.captcha_url = 'http://iir.circ.gov.cn/ipq/captchacn.svl'  # captcha image URL
        self.base_url = 'http://iir.circ.gov.cn/ipq/ipiqueryru.html'  # result list page shown after a query
        self.baseinfo_url = 'http://iir.circ.gov.cn/ipq/ipiqueryru.do?validate'  # endpoint behind the result preview
        self.detail_url = 'http://iir.circ.gov.cn/ipq/ipiqueryru.do?query'  # endpoint returning the detailed record


async def _copy_cookies(page, jar):
    """Copy every cookie of the browser page into the requests cookie jar."""
    for cookie in await page.cookies():
        jar.set(cookie['name'], cookie['value'])


async def _first_node_property(page, xpath, prop_name):
    """Return a JS property of the first node matching ``xpath``, or None if nothing matches."""
    nodes = await page.xpath(xpath)
    if not nodes:
        return None
    handle = await nodes[0].getProperty(prop_name)
    return await handle.jsonValue()


async def _recognize_captcha(page, obj):
    """Download the captcha with the browser's cookies and run OCR on it.

    Returns the recognised text, or None when the OCR result is missing or
    does not have ``CAPTCHA_LENGTH`` characters.
    """
    # The captcha must be requested with the browser's cookies so that the
    # image belongs to the same server-side session as the form.
    await _copy_cookies(page, obj.jar)
    response = obj.session.get(
        obj.captcha_url, cookies=obj.jar, timeout=REQUEST_TIMEOUT
    )
    img_path = os.path.join(common_dir, 'agencycaptchacn.png')
    with open(img_path, 'wb') as f:
        f.write(response.content)
    captcha = obj.ai.get_result(img_path)
    print('识别后的验证码是', captcha)
    if captcha is None or len(captcha) != CAPTCHA_LENGTH:
        return None
    return captcha


async def _scrape_basic_info(page):
    """Read the first result row into a dict; return {} when the page reports no match."""
    first_cell = await _first_node_property(page, _ROW_XPATH + '/td', 'textContent')
    if first_cell and NOT_FOUND_MARKER in first_cell:
        return {}
    info = {}
    for key, suffix in _FIELD_XPATHS:
        # A cell that is absent from the page is recorded as None.
        info[key] = await _first_node_property(page, _ROW_XPATH + suffix, 'textContent')
    return info


async def _fetch_detail(page, obj):
    """Call the detail endpoint for the first result row and return the response text.

    The organisation number is taken from the double-quoted part of the
    agent link's ``href``. Returns None when the link is missing or has no
    quoted part.
    """
    href = await _first_node_property(page, _ROW_XPATH + '/td[2]/a', 'href')
    if not href:
        return None
    quoted = re.findall(r"\".*\"", str(href))
    if not quoted:
        return None
    org_no = re.sub(r'\"', '', quoted[0])
    await _copy_cookies(page, obj.jar)
    response = obj.session.post(
        obj.detail_url,
        data={'org_no': org_no},
        cookies=obj.jar,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


async def main(agency_name=DEFAULT_AGENCY_NAME, *,
               max_attempts=MAX_CAPTCHA_ATTEMPTS, fetch_detail=False):
    """Query the registry for ``agency_name`` and return the first result row.

    Args:
        agency_name: Text typed into the agency-name field of the search form.
        max_attempts: Maximum number of form submissions. A new captcha is
            fetched for each attempt.
        fetch_detail: When true, also call the detail endpoint for the first
            row and print its response text.

    Returns:
        A dict with the keys ``agentname``, ``agencycode``, ``regulatorycode``
        and ``registertime`` (a value is None when its cell is missing). The
        dict is empty when the page reports no match or when every attempt
        failed.
    """
    result = {}
    obj = AgentQuery(choice='general_basic')
    # headless=False runs the browser with a visible window.
    browser = await launch(ignoreHTTPSErrors=True, headless=False, autoClose=False)
    try:
        page = await browser.newPage()
        # Set the size of the page viewport.
        await page.setViewport(viewport={'width': 1280, 'height': 800})
        # Enable JavaScript on the page.
        await page.setJavaScriptEnabled(enabled=True)
        # Set the User-Agent before navigating so the browser and the
        # requests session, which share cookies, send the same one.
        await page.setUserAgent(obj.session.headers.get('User-Agent'))

        # Retry in a bounded loop on the same browser instead of recursing
        # into main(), which launched a new browser per attempt and
        # discarded the nested result.
        for _ in range(max_attempts):
            await page.goto(obj.host)
            await page.type('input#agencyname', agency_name)
            captcha = await _recognize_captcha(page, obj)
            if captcha is None:
                continue

            await page.type('input#yzm', captcha)
            await page.click('#chaxun')
            await page.waitFor(3000)
            # Required: without the reload browser.pages() only lists two pages.
            await page.reload()
            result_page = (await browser.pages())[-1]
            if result_page.url != obj.base_url:
                # NOTE(review): presumably the site rejected the captcha;
                # try again with a fresh one - confirm against the site.
                continue

            result = await _scrape_basic_info(result_page)
            if fetch_detail and result:
                print(await _fetch_detail(result_page, obj))
            return result
        return result
    finally:
        # autoClose=False leaves the browser running after the script ends
        # unless it is closed explicitly.
        await browser.close()


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())