selenium爬虫——以爬取澎湃新闻某搜索结果为例
前言
本程序致力于实现以下目标:
(1)爬取澎湃新闻关于“反腐”的全部文章内容;
(2)按标题、链接将其整理到excel中;
(3)将标题和文章整合到一个word文档中。
许久没有正经写过了,有些生疏,代码耦合度蛮高的,所幸目标达成了。
需要导入的包
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
需要避雷的点
webdriver的版本要与浏览器一致
如果用的是 Google Chrome,可以在 ChromeDriver 官方下载页面找到与浏览器版本对应的 driver;
其他浏览器的话直接百度就能找到。
如果使用爬虫打开了新网页,要记得跳转
一开始不知道这一点,试了半天都定位不到要爬取的元素,结果最后发现一直没跳转到第二个页面上,那在第一个页面上当然定位不到了……跳转的代码如下:
# Point the driver at the second entry of its window-handle list, then reload that window.
new_window = driver.window_handles[1] # handle at index 1, i.e. the second window
driver.switch_to.window(new_window) # make that window the driver's current target
driver.refresh() # reload the page in the newly selected window
XPath和selector都可以直接复制
在浏览器开发者工具的 Elements 面板中右键目标元素,选择 Copy → Copy XPath(或 Copy selector)即可直接复制,比自己写方便多了。
爬取多网页时记得try
比如这次爬取的澎湃新闻的文章,有些链接点进去是视频,是我们所不需要的,定位的位置也不一样,极有可能会报错中断。这时,就需要try-except语句来帮助我们跳过了。
# Imported here so the snippet is self-contained.
from selenium.common.exceptions import NoSuchElementException

# Try to read the headline and body of the page currently loaded in `driver`.
# Pages that lack these elements (the article mentions video pages) are skipped.
try:
    x_path = "//main/div[4]/div[1]/div[1]/div/h1"
    title = driver.find_element(By.XPATH, x_path)
    x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
    article = driver.find_element(By.XPATH, x_path)
    print(title.text)
    print(article.text)
    file.add_paragraph(article.text)
except NoSuchElementException:
    # Only a missing element means "not a text article"; anything else
    # (Ctrl+C, a crashed browser, ...) should still surface to the caller.
    print("非文字")
打入word时调整字体的问题
具体程序如下:
# Give every run already present in the document the same size and fonts.
body_size = docx.shared.Pt(10)  # 10 pt
for paragraph in file.paragraphs:
    for text_run in paragraph.runs:
        run_font = text_run.font
        run_font.size = body_size
        run_font.name = 'Times New Roman'  # font used for Latin text
        # The East Asian font is set directly on the run's XML properties.
        text_run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷体')
值得注意的是,中文字体名前面的 u 前缀只在 Python 2 中才需要(Python 3 的字符串默认就是 Unicode,写不写都可以),而 qn 需要单独导包:
from docx.oxml.ns import qn
完整程序
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
def main(keyword="反腐"):
    """Crawl thepaper.cn search results for one keyword and export them.

    Opens an Edge browser, types ``keyword`` into the first ``<input>`` on
    the home page, clicks the search control, then clicks the ``li[2]``
    entry of a list on the results page (presumably a result-type filter --
    confirm against the live page). The page is scrolled until its height
    stops growing, every result link's text and ``href`` are collected, and
    each link is then visited in turn.

    Output files (written to the current directory):
        crawlerResult.xls  -- one row per result: title, link.
        crawlerResult.docx -- each title followed by the article body when
                              the page contains one.

    Args:
        keyword: Search term typed into the site's search box.
    """
    # Imported locally so this function does not depend on extra
    # module-level imports being present.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    driver = webdriver.Edge()
    try:
        driver.get("https://www.thepaper.cn/")
        time.sleep(1)
        search = driver.find_element(By.TAG_NAME, 'input')
        search.send_keys(keyword)
        time.sleep(1)

        # Click the search control next to the input box.
        x_path = "//main/div/div/div/div/div/div/div/span"
        send_button = driver.find_element(By.XPATH, x_path)
        ActionChains(driver).move_to_element(send_button).click(send_button).perform()
        time.sleep(1)

        # Click the second item of the list on the results page.
        x_path = "//main/div[3]/div[1]/div/div[2]/div/ul/li[2]"
        send_button = driver.find_element(By.XPATH, x_path)
        ActionChains(driver).move_to_element(send_button).click(send_button).perform()
        time.sleep(1)

        # Jump to the bottom and back to the middle once, then keep
        # scrolling to the bottom until the page height stops changing.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # give the page time to load more results
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Read text/href once per element: each access is a WebDriver call.
        x_path = "//main/div[3]/div[1]/div/div/div/ul/li/div/a"
        name_text = []
        name_href = []
        for name in driver.find_elements(By.XPATH, x_path):
            text = name.text
            href = name.get_attribute("href")
            name_text.append(text)
            name_href.append(href)
            print(text)
            print(href)

        file = docx.Document()
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
        sheet1.write(0, 0, '标题')
        sheet1.write(0, 1, '链接')

        # Row 0 is the header, so data rows start at 1.
        for row, (text, href) in enumerate(zip(name_text, name_href), start=1):
            print(text)
            print(href)
            file.add_paragraph(text)
            sheet1.write(row, 0, text)
            sheet1.write(row, 1, href)
            try:
                driver.get(href)
            except WebDriverException:
                # A dead link must not abort the run; also skip the
                # extraction below, which would read the previous page.
                print("网址失效")
                continue
            try:
                x_path = "//main/div[4]/div[1]/div[1]/div/h1"
                title = driver.find_element(By.XPATH, x_path)
                x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
                article = driver.find_element(By.XPATH, x_path)
                print(title.text)
                print(article.text)
                file.add_paragraph(article.text)
            except NoSuchElementException:
                # Page has no headline/body at these paths (e.g. a video).
                print("非文字")

        for para in file.paragraphs:
            for run in para.runs:
                run.font.size = docx.shared.Pt(10)  # 10 pt
                run.font.name = 'Times New Roman'  # font for Latin text
                run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷体')  # East Asian font
        file.save("crawlerResult.docx")
        workbook.save('./crawlerResult.xls')
    finally:
        # Always shut the browser and its driver process down.
        driver.quit()


if __name__ == '__main__':
    main()
扩展
现将功能扩展如下:
(1)爬取分别以“反腐”,“从严治党”,“廉洁”,三个关键词搜索的文章内容并存储;
(2)只保留不重复的部分。
为实现该功能,需要一个字典,来判断该文章是否已经被搜索过:
# Titles collected so far; a result whose title is already a key is skipped.
# Named `seen_titles` rather than `dict` so the builtin is not shadowed.
seen_titles = {}
names = driver.find_elements(By.XPATH, x_path)
for name in names:
    # Read each value once: every access is a round trip to the browser.
    title = name.text
    if title in seen_titles:
        continue
    href = name.get_attribute("href")
    name_text.append(title)
    name_href.append(href)
    num = num + 1
    print(title)
    print(href)
    seen_titles[title] = 1
另外发现,爬取过程中可能出现某网址已经失效的情况,在这种情况下需要跳过,否则程序也会因执行不下去而异常结束,此处使用try-except处理:
# Imported here so the snippet is self-contained.
from selenium.common.exceptions import WebDriverException

address = name_href[i]
try:
    driver.get(address)
except WebDriverException:
    # The link could not be opened. The browser is still showing the
    # previous page, so the caller should not scrape it for this entry.
    print("网址失效")
扩展之后的程序如下:
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
def main(search_words=None):
    """Crawl thepaper.cn for several keywords and export the unique results.

    For each keyword a fresh Edge browser is opened, the keyword is
    searched, the ``li[2]`` entry of a list on the results page is clicked
    (presumably a result-type filter -- confirm against the live page), and
    the page is scrolled until its height stops growing. Result links whose
    title text has not been seen before are collected. The browser used for
    the last keyword then visits every collected link.

    Output files (written to the current directory):
        crawlerResult.xls  -- one row per unique title: title, link.
        crawlerResult.docx -- each title followed by the article body when
                              the page contains one.

    Args:
        search_words: Iterable of keywords to search. Defaults to
            ``['反腐', '从严治党', '廉洁']``. Nothing is crawled or written
            when it is empty.
    """
    # Imported locally so this function does not depend on extra
    # module-level imports being present.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    if search_words is None:
        search_words = ['反腐', '从严治党', '廉洁']
    search_words = list(search_words)
    if not search_words:
        return

    seen_titles = {}  # titles already collected, used to drop duplicates
    name_text = []
    name_href = []
    driver = None
    try:
        for word in search_words:
            # One browser per keyword; fully shut the previous one down.
            if driver is not None:
                driver.quit()
            driver = webdriver.Edge()
            driver.get("https://www.thepaper.cn/")
            time.sleep(1)
            search = driver.find_element(By.TAG_NAME, 'input')
            search.send_keys(word)
            time.sleep(1)

            # Click the search control next to the input box.
            x_path = "//main/div/div/div/div/div/div/div/span"
            send_button = driver.find_element(By.XPATH, x_path)
            ActionChains(driver).move_to_element(send_button).click(send_button).perform()
            time.sleep(1)

            # Click the second item of the list on the results page.
            x_path = "//main/div[3]/div[1]/div/div[2]/div/ul/li[2]"
            send_button = driver.find_element(By.XPATH, x_path)
            ActionChains(driver).move_to_element(send_button).click(send_button).perform()
            time.sleep(1)

            # Jump to the bottom and back to the middle once, then keep
            # scrolling to the bottom until the page height stops changing.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)  # give the page time to load more results
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            x_path = "//main/div[3]/div[1]/div/div/div/ul/li/div/a"
            for name in driver.find_elements(By.XPATH, x_path):
                # Read each value once: every access is a WebDriver call.
                title_text = name.text
                if title_text in seen_titles:
                    continue
                href = name.get_attribute("href")
                name_text.append(title_text)
                name_href.append(href)
                print(title_text)
                print(href)
                seen_titles[title_text] = 1

        file = docx.Document()
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
        sheet1.write(0, 0, '标题')
        sheet1.write(0, 1, '链接')

        # Row 0 is the header, so data rows start at 1.
        for row, (title_text, href) in enumerate(zip(name_text, name_href), start=1):
            print(title_text)
            print(href)
            file.add_paragraph(title_text)
            sheet1.write(row, 0, title_text)
            sheet1.write(row, 1, href)
            try:
                driver.get(href)
            except WebDriverException:
                # The browser still shows the previous article here, so
                # skip extraction instead of filing that text under this title.
                print("网址失效")
                continue
            try:
                x_path = "//main/div[4]/div[1]/div[1]/div/h1"
                title = driver.find_element(By.XPATH, x_path)
                x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
                article = driver.find_element(By.XPATH, x_path)
                print(title.text)
                print(article.text)
                file.add_paragraph(article.text)
            except NoSuchElementException:
                # Page has no headline/body at these paths (e.g. a video).
                print("非文字")

        for para in file.paragraphs:
            for run in para.runs:
                run.font.size = docx.shared.Pt(10)  # 10 pt
                run.font.name = 'Times New Roman'  # font for Latin text
                run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷体')  # East Asian font
        file.save("crawlerResult.docx")
        workbook.save('./crawlerResult.xls')
    finally:
        # Always shut the last browser and its driver process down.
        if driver is not None:
            driver.quit()
    print(seen_titles.keys())


if __name__ == '__main__':
    main()
爬取效果
word共2203页324万字
excel共1768行(1767个文章标题,第一行为表头)
文章来源地址https://www.toymoban.com/news/detail-741229.html
到了这里,关于selenium爬虫——以爬取澎湃新闻某搜索结果为例的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!