时间:2021-05-07 python爬虫 查看: 1209
最近接了个小项目,需要批量搜索企查查上的相关企业,并把指定信息保存到Excel文件中。由于企查查需要登录后才能查看所有搜索到的信息,所以第一步需要模拟登录企查查。
python模拟登录企查查最重要的是自动拖拽验证插件
Selenium是一个用于Web应用程序测试的工具,它可以模拟用户在浏览器中的操作,就像真实用户使用一样。
官方技术文档:https://www.selenium.dev/selenium/docs/api/py/index.html
谷歌浏览器,不作过多介绍
Chromedriver是配合Selenium操作Chrome浏览器的驱动程序。注意:下载Chromedriver时,其版本号的前3位必须与已安装的Chrome浏览器版本号保持一致。
官方下载地址:http://chromedriver.storage.googleapis.com/index.html
假设电脑中已安装Chrome最新版(如果没有安装请自行下载安装),下载与电脑系统、Chrome版本相匹配的版本(Chromedriver版本号的前3位必须与已安装的Chrome版本号一致)。
从官网下载的文件是一个压缩包,解压出Chromedriver.exe文件,
网上有很多文章说要正常使用Chromedriver.exe,需要配置系统的环境变量,其实这是一种比较麻烦的方法。
为了项目的可移动性和操作方便使用另一种方法,就是把Chrome浏览器安装目录下的整个Application目录都复制到项目目录下,这样就可以随便移动项目到新开发环境中而不用考虑新环境的系统环境变量了。
把解压出的Chromedriver.exe文件复制到项目目录下的Application目录(即从Chrome浏览器安装目录中复制过来的那个目录)中,保证Chromedriver.exe文件与chrome.exe文件在同一目录下。
pip install selenium
在Pycharm菜单栏中找到并点击【file】->【settings】
在弹出窗口中按下图所示操作
import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException

# --- Chrome configuration -------------------------------------------------
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # uncomment to run without a visible window
options.add_argument('--disable-gpu')  # disable the GPU; works around assorted odd failures
options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed
options.add_argument('--disable-infobars')  # hide the "controlled by automated software" bar
options.add_argument('--start-maximized')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# DesiredCapabilities.CHROME is a dict shared by every user of the class, so
# work on a copy instead of mutating the shared template.
d = DesiredCapabilities.CHROME.copy()
# The performance log is what the request headers are later read from.
d['goog:loggingPrefs'] = {'performance': 'ALL'}

driver = webdriver.Chrome(options=options, executable_path="Application/chromedriver.exe", desired_capabilities=d)

# Make navigator.webdriver read as undefined on every new document, before the
# page's own scripts (including the verification widget) get to inspect it.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})

driver.implicitly_wait(2)  # implicit wait (seconds) applied to element lookups
driver.set_window_size(width=800, height=600)
driver.get("https://www.QCC.com/")
# By-based lookups replace the find_element_by_* helpers, which newer
# Selenium releases no longer provide.
driver.find_element(By.XPATH, '//a[@class="navi-btn"][1]').click()

# Wait up to 20 s (polling every 0.5 s) for the element with id "dom_id_two"
# to be present before touching the login form.
locator = (By.ID, "dom_id_two")
try:
    WebDriverWait(driver, 20, 0.5).until(EC.presence_of_element_located(locator))
except TimeoutException:
    # The form never appeared: shut the browser down and surface the error
    # rather than continuing to drive a closed window.
    driver.quit()
    raise

# Fill in the account (phone number) input box.
driver.find_element(By.XPATH, '//input[@id="nameVerify"]').send_keys('手机号')
验证插件会检测浏览器是否由webdriver控制,即使用JS检查window.navigator.webdriver的值,
所以需要在页面加载前手动修改window.navigator.webdriver的值:
# JavaScript evaluated at the start of every new document: it redefines
# navigator.webdriver so that reading it yields undefined.
hide_webdriver_js = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": hide_webdriver_js},
)
修改完window.navigator.webdriver的值后,再模拟拖动验证插件的滑块:
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)

# Upper bound, in seconds, on how long to wait for the user to type the SMS code.
CODE_WAIT_SECONDS = 120

# Locate the slider handle of the verification widget and drag it 308 px right.
start = driver.find_element(By.XPATH, '//span[@class="nc_iconfont btn_slide"]')
action = ActionChains(driver)
action.click_and_hold(start)
action.drag_and_drop_by_offset(start, 308, 0).perform()
time.sleep(2)

# Inject a full-size overlay into the first "login-sao-panel" element; its two
# <b> children (ids "tt" and "ts") are used below to show status text.
style = 'position:absolute;top:0;left:0;width:100%;z-index:999;font-size:40px;line-height:100px;background:rgba(255,217,0,90%);height:100%;text-align:center;color:#000;'
driver.execute_script(
    'var htm=document.getElementsByClassName("login-sao-panel")[0];htm.innerHTML+="<div style={style}><b id=tt></b><b id=ts></b></div>"'.format(
        style=style))
# Looking the nodes up fails fast if the overlay was not injected.
ts = driver.find_element(By.ID, 'ts')
tt = driver.find_element(By.ID, 'tt')

# If the widget shows its error element, or its status text is not the
# "verification passed" string, ask the user to solve it by hand.
try:
    driver.find_element(By.XPATH, '//div[@class="errloading"][1]')
except NoSuchElementException:
    tr = driver.find_element(By.XPATH, '//span[@class="nc-lang-cnt"][1]')
    if tr.text != '验证通过':
        set_id_att(driver, 'tt', 'innerHTML', '请手工验证')
else:
    set_id_att(driver, 'tt', 'innerHTML', '请手工验证')

# Best effort: click the "get mobile code" link if it is there and clickable.
try:
    driver.find_element(By.XPATH, '//a[@class="text-primary vcode-btn get-mobile-code"]').click()
except WebDriverException:
    pass

set_id_att(driver, 'tt', 'innerHTML', '请填入手机验证码')
rjs = 'return document.getElementById("vcodeNormal").value'
locator = (By.CLASS_NAME, "nav-user")
but = driver.find_element(By.XPATH, '//form[@id="user_login_verify"]/button')

# Poll the code input once per second. The original range(1, 1) was empty, so
# this loop never ran and the login button was never clicked.
for i in range(1, CODE_WAIT_SECONDS + 1):
    code = driver.execute_script(rjs)
    if len(code) == 6:
        but.click()
        # Give the next page a few seconds to show the "nav-user" element,
        # then carry on regardless of whether it appeared.
        try:
            WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))
        except TimeoutException:
            pass
        break
    set_id_att(driver, 'ts', 'innerHTML', i)  # show the elapsed seconds
    time.sleep(1)
上面的代码中在页面里增加了一些状态显示元素及JS代码
# Inline CSS for the overlay appended to the login panel.
style = 'position:absolute;top:0;left:0;width:100%;z-index:999;font-size:40px;line-height:100px;background:rgba(255,217,0,90%);height:100%;text-align:center;color:#000;'
# JS template: append a <div> holding two <b> nodes (ids "tt" and "ts") to the
# first element with class "login-sao-panel".
overlay_js = 'var htm=document.getElementsByClassName("login-sao-panel")[0];htm.innerHTML+="<div style={style}><b id=tt></b><b id=ts></b></div>"'
driver.execute_script(overlay_js.format(style=style))
def set_id_att(bor, id, att, val):
    """Set a property on the page element that has the given DOM id.

    Args:
        bor: Browser/driver object exposing ``execute_script(script, *args)``.
        id: DOM id of the target element.
        att: Name of the JavaScript property to assign, e.g. ``"innerHTML"``.
        val: Value to assign; it is converted to ``str`` first, as before.
    """
    # The id and value are passed as script arguments instead of being spliced
    # into the source, so quotes or newlines in them cannot break the script.
    # The property name still has to be part of the script text.
    bor.execute_script(
        'document.getElementById(arguments[0]).{b} = arguments[1]'.format(b=att),
        id,
        str(val),
    )
def set_class_att(bor, classs, id, att, val):
    """Set a property on the n-th page element that carries a CSS class.

    Args:
        bor: Browser/driver object exposing ``execute_script(script, *args)``.
        classs: CSS class name handed to ``getElementsByClassName``.
        id: Index into the resulting collection (not a DOM id).
        att: Name of the JavaScript property to assign.
        val: Value to assign; it is converted to ``str`` first, as before.
    """
    # Class name, index and value are passed as script arguments so special
    # characters in them cannot break the generated JavaScript.
    bor.execute_script(
        'document.getElementsByClassName(arguments[0])[arguments[1]].{b} = arguments[2]'.format(b=att),
        classs,
        id,
        str(val),
    )
登录成功后,还需要获取页面的headers和Cookie,方便后面的requests库使用:
def getheader(browser):
    """Return the request headers logged for the browser's current URL.

    Walks the driver's ``performance`` log and returns the ``requestHeaders``
    of the first logged response whose URL equals ``browser.current_url``.

    Args:
        browser: Driver object providing ``get_log('performance')`` and
            ``current_url``.

    Returns:
        The ``requestHeaders`` value of the matching entry, or ``None`` when
        no entry matches.
    """
    # Imported here because the module never imported json; the old bare
    # ``except`` hid the resulting NameError, so the function always
    # returned None.
    import json

    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return response['requestHeaders']
        except (KeyError, TypeError, ValueError):
            # Not a response event, or one without request headers: skip it.
            continue
    return None
# Turn each browser cookie into a "name=value" pair, then store them all in
# the headers as a single ';'-separated cookie string.
cookie = ["=".join((item["name"], item["value"])) for item in driver.get_cookies()]
headers['cookie'] = ';'.join(cookie)
import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def getheader(browser):
    """Return the request headers logged for the browser's current URL.

    Walks the driver's ``performance`` log and returns the ``requestHeaders``
    of the first logged response whose URL equals ``browser.current_url``.

    Args:
        browser: Driver object providing ``get_log('performance')`` and
            ``current_url``.

    Returns:
        The ``requestHeaders`` value of the matching entry, or ``None`` when
        no entry matches.
    """
    # Imported here because the module never imported json; the old bare
    # ``except`` hid the resulting NameError, so the function always
    # returned None.
    import json

    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry['message'])['message']['params']['response']
            if response['url'] == browser.current_url:
                return response['requestHeaders']
        except (KeyError, TypeError, ValueError):
            # Not a response event, or one without request headers: skip it.
            continue
    return None
def set_id_att(bor, id, att, val):
    """Set a property on the page element that has the given DOM id.

    Args:
        bor: Browser/driver object exposing ``execute_script(script, *args)``.
        id: DOM id of the target element.
        att: Name of the JavaScript property to assign, e.g. ``"innerHTML"``.
        val: Value to assign; it is converted to ``str`` first, as before.
    """
    # The id and value are passed as script arguments instead of being spliced
    # into the source, so quotes or newlines in them cannot break the script.
    # The property name still has to be part of the script text.
    bor.execute_script(
        'document.getElementById(arguments[0]).{b} = arguments[1]'.format(b=att),
        id,
        str(val),
    )
def set_class_att(bor, classs, id, att, val):
    """Set a property on the n-th page element that carries a CSS class.

    Args:
        bor: Browser/driver object exposing ``execute_script(script, *args)``.
        classs: CSS class name handed to ``getElementsByClassName``.
        id: Index into the resulting collection (not a DOM id).
        att: Name of the JavaScript property to assign.
        val: Value to assign; it is converted to ``str`` first, as before.
    """
    # Class name, index and value are passed as script arguments so special
    # characters in them cannot break the generated JavaScript.
    bor.execute_script(
        'document.getElementsByClassName(arguments[0])[arguments[1]].{b} = arguments[2]'.format(b=att),
        classs,
        id,
        str(val),
    )
def login(phone='19942496979', ip="202.121.178.244", code_timeout=120):
    """Drive Chrome through the qcc.com SMS login and return request headers.

    Steps: start Chrome with automation hints disabled, open the login form,
    type the phone number, drag the slider of the verification widget, ask
    for an SMS code, wait for the user to type the 6-character code into the
    page, submit, then read the request headers from the performance log and
    add the browser cookies to them.

    Args:
        phone: Phone number typed into the account field.
        ip: Address written into the ``X-Forwarded-For`` style headers.
        code_timeout: Maximum number of seconds to wait for the SMS code.

    Returns:
        The headers dict from ``getheader`` with the cookie and IP headers
        added, or ``None`` when no headers were captured.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login form does
            not appear within 20 seconds (the browser is quit first).
    """
    # Imported locally so the function stands on its own without changing the
    # module-level import block.
    from selenium.common.exceptions import (
        NoSuchElementException,
        TimeoutException,
        WebDriverException,
    )

    # --- Chrome configuration ---------------------------------------------
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # uncomment to run without a visible window
    options.add_argument('--disable-gpu')  # disable the GPU; works around assorted odd failures
    options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed
    options.add_argument('--disable-infobars')  # hide the "controlled by automated software" bar
    options.add_argument('--start-maximized')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Copy the template: DesiredCapabilities.CHROME is shared class state and
    # must not be mutated in place.
    d = DesiredCapabilities.CHROME.copy()
    d['goog:loggingPrefs'] = {'performance': 'ALL'}  # getheader() reads this log
    driver = webdriver.Chrome(options=options, executable_path="Application/chromedriver.exe", desired_capabilities=d)
    # Make navigator.webdriver read as undefined before any page script runs.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
    })
    driver.implicitly_wait(2)
    driver.set_window_size(width=800, height=600)

    # --- Open the login form ------------------------------------------------
    driver.get("https://www.QCC.com/")
    driver.find_element(By.XPATH, '//a[@class="navi-btn"][1]').click()
    form_locator = (By.ID, "dom_id_two")
    try:
        WebDriverWait(driver, 20, 0.5).until(EC.presence_of_element_located(form_locator))
    except TimeoutException:
        # Quit and re-raise instead of carrying on with a closed browser.
        driver.quit()
        raise
    # Fill in the account (phone number) input box.
    driver.find_element(By.XPATH, '//input[@id="nameVerify"]').send_keys(phone)

    # --- Slider verification ------------------------------------------------
    start = driver.find_element(By.XPATH, '//span[@class="nc_iconfont btn_slide"]')
    action = ActionChains(driver)
    action.click_and_hold(start)
    action.drag_and_drop_by_offset(start, 308, 0).perform()
    time.sleep(2)

    # Status overlay: two <b> nodes (ids "tt" and "ts") appended to the panel.
    style = 'position:absolute;top:0;left:0;width:100%;z-index:999;font-size:40px;line-height:100px;background:rgba(255,217,0,90%);height:100%;text-align:center;color:#000;'
    driver.execute_script(
        'var htm=document.getElementsByClassName("login-sao-panel")[0];htm.innerHTML+="<div style={style}><b id=tt></b><b id=ts></b></div>"'.format(
            style=style))
    # Looking the nodes up fails fast if the overlay was not injected.
    driver.find_element(By.ID, 'ts')
    driver.find_element(By.ID, 'tt')

    # Ask for manual verification when the widget shows its error element or
    # its status text is not the "verification passed" string.
    try:
        driver.find_element(By.XPATH, '//div[@class="errloading"][1]')
    except NoSuchElementException:
        tr = driver.find_element(By.XPATH, '//span[@class="nc-lang-cnt"][1]')
        if tr.text != '验证通过':
            set_id_att(driver, 'tt', 'innerHTML', '请手工验证')
    else:
        set_id_att(driver, 'tt', 'innerHTML', '请手工验证')

    # --- SMS code -------------------------------------------------------------
    # Best effort: click the "get mobile code" link if present and clickable.
    try:
        driver.find_element(By.XPATH, '//a[@class="text-primary vcode-btn get-mobile-code"]').click()
    except WebDriverException:
        pass
    set_id_att(driver, 'tt', 'innerHTML', '请填入手机验证码')
    rjs = 'return document.getElementById("vcodeNormal").value'
    logged_in_locator = (By.CLASS_NAME, "nav-user")
    but = driver.find_element(By.XPATH, '//form[@id="user_login_verify"]/button')
    # Poll once per second. The original range(1, 1) was empty, so the login
    # button was never clicked.
    for i in range(1, code_timeout + 1):
        code = driver.execute_script(rjs)
        if len(code) == 6:
            but.click()
            # Give the next page a few seconds to show the "nav-user" element
            # before the headers are read; continue either way.
            try:
                WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(logged_in_locator))
            except TimeoutException:
                pass
            break
        set_id_att(driver, 'ts', 'innerHTML', i)  # show the elapsed seconds
        time.sleep(1)

    # --- Headers and cookies --------------------------------------------------
    headers = getheader(driver)
    if headers:
        # Store the browser cookies in the headers as one cookie string.
        cookie = [item["name"] + "=" + item["value"] for item in driver.get_cookies()]
        headers['cookie'] = ';'.join(cookie)
        # Drop the ':'-prefixed pseudo-header keys; pop() tolerates captures
        # where they are absent.
        for pseudo in (':authority', ':method', ':path', ':scheme'):
            headers.pop(pseudo, None)
        for name in ('X-Forwarded-For', 'X-Remote-IP', 'X-Originating-IP', 'X-Remote-Addr', 'X-Client-IP'):
            headers[name] = ip
    return headers
headers=login()  # log in automatically and fetch the post-login headers, cookies included
要获取完整项目代码(selenium模拟登录企查查+requests库自动搜索获取指定信息并保存Excel)请关注上面的公众号“python客栈”然后回复“qcc”
本文主要介绍了如何使用python的selenium模拟登录企查查,主要介绍了如何使用selenium保存Cookies与headers、自动验证及selenium库对页面元素的一些操作方法
下一篇将介绍Python使用requests库自动在企查查上搜索相关企业并获取指定信息