Python 天生就适合用来做爬虫,结合 Selenium 更是如虎添翼。
1) 安装库
pip install selenium
pip install selenium-wire
2) 添加驱动:例如 Chrome 需要下载对应的驱动程序(chromedriver),放到项目目录或 Python 安装目录下;驱动版本要与本机安装的 Chrome 版本一致。我是放在 python3.exe 所在的目录。
下载地址:
CNPM Binaries Mirror
Selenium 功能比较强大,但仍然缺少一些特性,比如无法获取每个请求的请求头、响应头等信息;比较靠谱的办法是使用 selenium-wire。需要注意:不要使用 IPv6,测试发现只能使用 IPv4!
效果如下:
比如我的需求是:测试某网页全页面加载时长,各个子元素请求时长,并且截图,测试代码如下:
import time
from PIL import Image # pip install pillow
import json
#from selenium import webdriver
# https://pypi.org/project/selenium-wire/#response-objects
from seleniumwire import webdriver # Import from seleniumwire
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
# Build the Chrome options for the measurement run and start the driver.
chrome_options = Options()
# Headless mode is mandatory here: otherwise the screenshot cannot cover the
# whole page, only the height of the local screen.
chrome_options.add_argument('--headless')
# Disable GPU computation to work around otherwise unexplained problems.
chrome_options.add_argument('--disable-gpu')
# Disable the sandbox to work around the browser otherwise not responding.
chrome_options.add_argument('--no-sandbox')
# NOTE(review): "[user]" looks like a placeholder for the Windows account
# name - confirm and replace it before running.
chrome_options.add_argument("--user-data-dir=C:\\Users\\[user]\\AppData\\Local\\Google\\Chrome\\User Data")
# chrome_options.add_extension("adblock_v3.6.12.crx")  # load a .crx extension

# Settings for Chrome's print dialog ("Save as PDF").
# Printing does not work in --headless mode (it needs a visible window) and
# the resulting PDF layout is sometimes broken; a screenshot is more reliable.
settings = {
    "recentDestinations": [{
        "id": "Save as PDF",
        "origin": "local",
        "account": ""
    }],
    "selectedDestinationId": "Save as PDF",
    "version": 2,
    "isHeaderFooterEnabled": False,
    # "customMargins": {},
    # "marginsType": 2,  # margins (2 = minimum, 0 = default)
    # "scaling": 100,
    # "scalingType": 3,
    # "scalingTypePdf": 3,
    # "isLandscapeEnabled": True,  # portrait is used when this is not set
    "isCssBackgroundEnabled": True,
    "mediaSize": {
        "height_microns": 297000,
        "name": "ISO_A4",
        "width_microns": 210000,
        "custom_display_name": "A4"
    },
}
chrome_options.add_argument('--enable-print-browser')
prefs = {
    'printing.print_preview_sticky_settings.appState': json.dumps(settings),
    # Directory where saved files end up; change it to your own path.
    'savefile.default_directory': 'd:\\test\\'
}
# Silent printing: no need to confirm the print dialog by hand.
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_experimental_option('prefs', prefs)
# Exclude the "enable-logging" switch. This has to be set on the very options
# object that is handed to the driver; setting it on a second, unused
# ChromeOptions instance has no effect.
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
# `options=` is the keyword supported by Selenium 4; the older
# `chrome_options=` keyword is no longer accepted by current releases.
driver = webdriver.Chrome(options=chrome_options)
# Maximise the window.
driver.maximize_window()
# Load the page under test.
driver.get('https://weibo.com')
# Walk over every request captured while the page loaded: count responses
# with status 200 (n1) versus any other status (n2) and print how long each
# answered request took.
n1 = 0
n2 = 0
for request in driver.requests:
    if not request.response:
        # No response was captured for this request, so there is nothing to time.
        continue
    if request.response.status_code == 200:
        n1 += 1
    else:
        n2 += 1
    print(
        request.url,
        request.response.status_code,
        request.response.headers['Content-Type'])
    # print(request.headers)
    # print(request.response.headers)
    # print(request.date)
    # print(request.response.date)
    # total_seconds() covers the whole duration; the `.microseconds`
    # attribute holds only the sub-second part and would under-report any
    # request that takes one second or longer.
    delta = round((request.response.date - request.date).total_seconds(), 2)
    print("cost ", delta, "s")
print("%d, %d" % (n1, n2))
# Wait for the page, scroll through it, take a full-page screenshot and always
# shut the browser down afterwards.
try:
    # Wait up to 10 seconds for the element with id "app" to be present.
    # (driver.implicitly_wait(10) would be the implicit-wait alternative.)
    WebDriverWait(driver, 10).until(
        expected_conditions.presence_of_element_located((By.ID, "app")))
    try:
        # Scroll down in 500px steps like a human would, so that lazily
        # loaded images get fetched. The height is re-read after every step
        # because it can change while scrolling.
        k = 1
        js_height = "return document.body.clientHeight"
        height = driver.execute_script(js_height)
        while k * 500 < height:
            js_move = "window.scrollTo(0,{})".format(k * 500)
            print(js_move)
            driver.execute_script(js_move)
            time.sleep(0.2)
            height = driver.execute_script(js_height)
            k += 1
        time.sleep(1)
        # A plain screenshot does not cover the whole page, so query the
        # largest width/height the document reports.
        width = driver.execute_script(
            "return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);")
        height = driver.execute_script(
            "return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")
        print(width, height)
        # Resize the browser window to the size just measured (plus a margin).
        driver.set_window_size(width + 100, height + 100)
        time.sleep(1)
        png_path = "d:\\test\\" + '{}.png'.format('xx网址截图')
        # Take the screenshots.
        driver.save_screenshot(png_path)
        driver.get_screenshot_as_file("d:\\test\\selenium.png")
        # driver.execute_script('document.title="test.pdf";window.print();')
        # Converting the PNG to a PDF:
        # image1 = Image.open(png_path)
        # im1 = image1.convert('RGB')
        # pdf_path = png_path.replace('.png', '.pdf')
        # im1.save(pdf_path)
    except Exception as e:
        # Scrolling and capturing stay best-effort, but the failure is
        # reported instead of being discarded silently.
        print("screenshot failed:", repr(e))
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window. Running it in `finally` keeps the browser from being
    # left behind when the wait above times out.
    driver.quit()
后记:
在 Linux 下,Chrome 默认是启动不了的,需要修改 /usr/bin/google-chrome 启动脚本;但这样改动之后 Selenium 又无法正常工作,
因此需要在代码中指定 Chrome 程序的绝对路径:
from seleniumwire import webdriver # Import from seleniumwire
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# Assemble the Chrome options for a Linux host and start the driver.
options = webdriver.ChromeOptions()
for switch in (
    "--disable-dev-shm-usage",
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    "--disable-gpu",
    "--no-sandbox",
    "--user-data-dir=/root/chrome/data",
):
    options.add_argument(switch)
# Point Selenium at the Chrome binary explicitly.
options.binary_location = "/opt/google/chrome/chrome"
s = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(service=s, options=options)
# Maximise the window.
driver.maximize_window()
# Load the page.
driver.get('https://mail.qq.com')
这样就OK了。
后记2:
selenium-wire 这个组件是基于 Selenium 和 mitmproxy 两个组件来做信息检测的,
也就是说它自己加了一个中间人代理,通过代理把数据拦截下来,并记录到内存或者目录中。
文档链接:
Event Hooks & API
到了这里,关于python使用selenium以及selenium-wire做质量与性能检测的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!