目录
1.介绍
2.使用 selenium
1.安装
2.使用
1.测试打开网页,抓取雷速体育日职乙信息
2.通过xpath查找
3.输入文本框内容 send_keys
4.点击事件 click
5.获取网页源码:
6.获取cookies
7.selenium提供元素定位方式:8种
8.控制浏览器前进、后退、刷新
9.控制鼠标
10. 设置等待
11.设置后台运行
12.后台终止
3.实战
1.介绍
1. selenium是一个用于web应用程序自动化测试工具,Selenium测试直接运行在浏览器中;
2.像真正的用户在操作一样,驱动浏览器执行特定的动作,如点击、下拉等操作;
3.selenium支持浏览器
4.支持的语言
5.selenium在爬虫的应用
2.使用 selenium
1.安装
pip3 install selenium
2.使用
1.测试打开网页,抓取雷速体育日职乙信息
# coding:utf-8
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Demo: open leisu.com, print the home teams of two tabs, then dump the page
# title, cookies and HTML source.

# Open the browser.
webdriver_chrome = webdriver.Chrome()
try:
    # Maximize the browser window.
    webdriver_chrome.maximize_window()
    # Load the J2 League data page.
    webdriver_chrome.get("https://www.leisu.com/data/zuqiu/comp-568/season-11286")
    # The page is said to open on the promotion play-off tab: print its home teams.
    for home in webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a"):
        print("升级附加赛:主队信息:", home.text)
    # Click the stage selector to switch to the league tab.
    # (The original had a bare `By.` here, which is a syntax error; the
    # selector string is an XPath expression, so By.XPATH is required.)
    webdriver_chrome.find_element(By.XPATH, "//div[@class='stage_name']").click()
    # Print the home teams of the league tab.
    for home in webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a"):
        print("联赛:主队信息:", home.text)
    # Page title.
    print("标题:", webdriver_chrome.title)
    print("获取cookie", webdriver_chrome.get_cookies())
    # page_source is a property, not a method: calling it raises TypeError.
    print("获取页面源码", webdriver_chrome.page_source)
    # Keep the window open for 5 seconds before closing.
    time.sleep(5)
finally:
    # Always close the browser, even if one of the steps above failed.
    webdriver_chrome.quit()
2.通过xpath查找
# Return every <a> element located inside a <td class="home"> cell.
webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a")
3.输入文本框内容 send_keys
# Type text into an <input>; the empty id is presumably a placeholder to be
# replaced with the id of the real input box.
webdriver_chrome.find_element(By.XPATH,"//input[@id='']").send_keys("内容")
4.点击事件 click
# Click an element; the empty id is presumably a placeholder to be replaced
# with the id of the real element.
webdriver_chrome.find_element(By.XPATH,"//input[@id='']").click()
5.获取网页源码:
# page_source is a property that holds the HTML of the current page; it must
# be read without parentheses (calling it raises TypeError on the str result).
webdriver_chrome.page_source
6.获取cookies
# Fetch the cookies of the current browser session.
webdriver_chrome.get_cookies()
7.selenium提供元素定位方式:8种
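老版本使用(Selenium 4.3 起已移除):find_element_by_xpath("原始值")、find_element_by_id("原始值")、find_element_by_class_name("原始值") 等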
新版使用
find_elements(By.XPATH, "原始值")
find_elements(By.ID, "原始值")
find_elements(By.CLASS_NAME, "原始值")
等等
8.控制浏览器前进、后退、刷新
9.控制鼠标
import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
# Demo: hover the mouse over Baidu's "settings" entry using ActionChains.

# Open the browser.
webdriver_chrome = webdriver.Chrome()
try:
    # Maximize the browser window.
    webdriver_chrome.maximize_window()
    # Open Baidu.
    webdriver_chrome.get("https://www.baidu.com/")
    # Locate the "settings" entry by its element id.
    set_element_above = webdriver_chrome.find_element(By.ID, "s-usersetting-top")
    print(set_element_above.text)
    # Move the mouse pointer onto the element; perform() executes the queued action.
    ActionChains(webdriver_chrome).move_to_element(set_element_above).perform()
    # Keep the window open for 5 seconds so the hover effect can be observed.
    time.sleep(5)
finally:
    # Close the browser even when the element lookup above fails.
    webdriver_chrome.quit()
10. 设置等待
1. 使用场景:有时候需要等某些元素加载后进行操作,或者网络原因需要加载;
2.等待分为2种方式,分为显式等待和隐式等待
3.显式等待代码:打开百度,输入内容
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Demo: explicit wait. Wait for Baidu's search box to appear, then type into it.

# Open the browser.
webdriver_chrome = webdriver.Chrome()
try:
    # Maximize the browser window.
    webdriver_chrome.maximize_window()
    # Open Baidu.
    webdriver_chrome.get("https://www.baidu.com/")
    # WebDriverWait arguments: 1. the driver, 2. the timeout in seconds,
    # 3. the polling interval in seconds.
    # until() polls the expected condition and returns the located element;
    # it raises TimeoutException when the element never shows up, which is
    # why the browser shutdown lives in the finally clause below.
    element = WebDriverWait(webdriver_chrome, 5, 0.5).until(
        EC.presence_of_element_located((By.ID, "kw"))
    )
    # Type the search term into the located input box.
    element.send_keys("Python")
    time.sleep(5)
finally:
    webdriver_chrome.quit()
4.隐式等待代码:打开百度,输入内容
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
# Demo: implicit wait. Element lookups retry for up to 5 seconds before failing.

# Open the browser.
webdriver_chrome = webdriver.Chrome()
try:
    # Maximize the browser window.
    webdriver_chrome.maximize_window()
    # Implicit wait: applies to every subsequent find_element call.
    webdriver_chrome.implicitly_wait(5)
    # Open Baidu.
    webdriver_chrome.get("https://www.baidu.com/")
    try:
        # NOTE(review): "kw1" presumably does not exist on the page on purpose,
        # so that the timeout path below is demonstrated - confirm.
        webdriver_chrome.find_element(By.ID, "kw1").send_keys("python")
    except NoSuchElementException as e:
        print("超时没有找到元素:", e)
    time.sleep(5)
finally:
    # Close the browser regardless of how the steps above ended.
    webdriver_chrome.quit()
11.设置后台运行
# Run Chrome in the background (headless) by passing command-line switches.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Descriptive name -> Chrome command-line switch.
options = {
    "headless": "--headless",
    "no_sandbox": "--no-sandbox",
    "gpu": "--disable-gpu",
}
chrome_options = Options()
# The switches only take effect once they are added to the Options object;
# the original snippet built the dict but never applied it.
for argument in options.values():
    chrome_options.add_argument(argument)
driver = webdriver.Chrome(options=chrome_options)
12.后台终止
1.如运行异常可使用任务管理器,找到进程“chromedriver.exe”结束进程
3.实战
1. 自动爬取比赛信息:彩票500
2.自动翻页(文章来源:https://www.toymoban.com/news/detail-838929.html)
3.导出到excel中(文章来源地址:https://www.toymoban.com/news/detail-838929.html)
# coding:utf-8
import xlsxwriter
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class caipao500_project:
    """Scrape match results from a 500.com league page into an Excel workbook.

    The scraper opens the page, parses the round that is shown, then clicks
    backwards through the earlier rounds, appending every match row to one
    worksheet.
    """

    # XPath template of the link for a given round number in the round selector.
    _ROUND_LINK_XPATH = "//div[@class='lsaiguo_round_list_wrap_in']/ul/li/a[@data-group={}]"
    # XPath of the arrow that scrolls the round selector.
    _ROUND_ARROW_XPATH = "//a[@class='itm_arrow itm_arrow_up']"

    def __init__(self, chrome_options):
        """Start Chrome with *chrome_options* and maximize its window."""
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.maximize_window()

    def open_page(self, url):
        """Open *url*, scrape every round and save the workbook.

        :param url: address of the league page
        :return: True when the page opened and was scraped, False when the
            "联赛赛程" link did not appear within 5 seconds
        """
        # Imported locally so the method does not depend on the file's import block.
        from selenium.common.exceptions import TimeoutException

        print("打开页面:{}".format(url))
        self.driver.get(url)
        # until() raises TimeoutException instead of returning a falsy value,
        # so the failure branch has to be an except clause.
        try:
            WebDriverWait(self.driver, 5, 0.5).until(
                EC.presence_of_element_located((By.LINK_TEXT, "联赛赛程"))
            )
        except TimeoutException:
            print("打开页面失败")
            return False

        print("打开页面成功")
        self.create_excel()
        try:
            # parse_html returns the round number shown on the page.
            round_num = self.parse_html(self.driver.page_source)
            while True:
                round_num -= 1
                # "<=" rather than "==" so a non-positive round can never loop forever.
                if round_num <= 0:
                    break
                round_xpath = self._ROUND_LINK_XPATH.format(round_num)
                round_label = self.driver.find_element(By.XPATH, round_xpath).text
                # An empty label is treated as "link not visible": scroll the selector.
                if not round_label:
                    self.driver.find_element(By.XPATH, self._ROUND_ARROW_XPATH).click()
                # NOTE(review): the original indentation was lost, so it is
                # unclear whether this pause was meant only after scrolling;
                # it is applied on every iteration here - confirm.
                time.sleep(5)
                round_link = WebDriverWait(self.driver, 5, 0.5).until(
                    EC.presence_of_element_located((By.XPATH, round_xpath))
                )
                round_link.click()
                round_num = self.parse_html(self.driver.page_source)
        finally:
            # xlsxwriter only writes the file on close(); closing here keeps
            # the rows collected so far when a later step raises.
            self.book.close()
        return True

    def create_excel(self):
        """Create the workbook and worksheet and write the header row.

        The file name is the current UTC timestamp followed by "文件.xlsx".
        """
        self.book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "文件.xlsx")
        self.sheet = self.book.add_worksheet("sheet1")
        # Index of the next worksheet row to fill (row 0 holds the header).
        self.curr_row = 1
        title_data = ("轮次", "时间", "主队", "全场比分", "全场总分", "半场比分", "半场总分", "客队")
        for index, title_datum in enumerate(title_data):
            self.sheet.write(0, index, title_datum)

    @staticmethod
    def _score_total(parts):
        """Return the sum of the first two goal counts in *parts*, or None if unavailable."""
        if len(parts) < 2:
            return None
        try:
            return int(parts[0]) + int(parts[1])
        except ValueError:
            return None

    @staticmethod
    def _extract_half_score(text):
        """Return the text between the first "(" and the following ")", or "" if absent."""
        start = text.find("(")
        end = text.find(")", start + 1)
        if start == -1 or end == -1:
            return ""
        return text[start + 1:end]

    def parse_html(self, content):
        """Parse the result table in *content* and append its rows to the worksheet.

        Rows without a round number or team names are skipped; rows whose
        score cannot be parsed are written with empty totals.

        :param content: HTML source of the page
        :return: round number (int) of the last row that was written
        :raises ValueError: when no match row could be parsed
        """
        html = etree.HTML(content)
        table_trs = html.xpath("//table[@class='lsaiguo_list ltable jTrHover']/tbody/tr")
        last_round = None
        for tr in table_trs:
            round_cells = tr.xpath("./td[1]/text()")
            home_cells = tr.xpath("./td[3]/a/text()")
            away_cells = tr.xpath("./td[5]/a/text()")
            if not (round_cells and home_cells and away_cells):
                continue
            # Named match_time so the local does not shadow the time module.
            match_time = "".join(tr.xpath("./td[2]/text()"))
            # Full-time score: one <span> per side.
            whole_score_array = tr.xpath("./td[4]/span/text()")
            whole_score = ":".join(whole_score_array)
            whole_score_total = self._score_total(whole_score_array)
            # Half-time score is the parenthesised text of the same cell.
            half_score = self._extract_half_score("".join(tr.xpath("./td[4]/text()")))
            half_score_total = self._score_total(half_score.split(":"))
            # Key order matches the header columns written by create_excel.
            row_content = {
                "round_num": round_cells[0],
                "time": match_time,
                "home": home_cells[0],
                "whole_score": whole_score,
                "whole_score_total": whole_score_total,
                "half_score": half_score,
                "half_score_total": half_score_total,
                "away": away_cells[0],
            }
            print("row:", row_content)
            for index, value in enumerate(row_content.values()):
                self.sheet.write(self.curr_row, index, value)
            self.curr_row += 1
            last_round = round_cells[0]
        if last_round is None:
            raise ValueError("未解析到任何比赛数据")
        return int(last_round)
if __name__ == '__main__':
    # Descriptive name -> Chrome command-line switch. Uncomment the first
    # three entries to run the browser in the background.
    options = {
        # "headless": "--headless",
        # "no_sandbox": "--no-sandbox",
        # "gpu": "--disable-gpu",
        # NOTE(review): hard-coded proxy address; the browser cannot load the
        # page if this proxy is unreachable - confirm it is still valid.
        "proxy-server": "--proxy-server=https://121.37.201.60:8118",
    }
    chrome_options = Options()
    for k, v in options.items():
        print("设置浏览器参数:{}:{}".format(k, v))
        chrome_options.add_argument(v)
    scraper = caipao500_project(chrome_options=chrome_options)
    try:
        scraper.open_page("https://liansai.500.com/zuqiu-6779/jifen-19426/")
    finally:
        # Shut the browser and its chromedriver process down even when scraping fails.
        scraper.driver.quit()
到了这里,关于第十六天-爬虫selenium库的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!