1. Introduction
- In everyday life we keep running into the same question: what should we eat? Especially in an unfamiliar city or neighbourhood, it is hard to decide among countless restaurants. With the growth of the internet, restaurant review platforms such as Dianping (大众点评) have emerged, offering a huge amount of restaurant information and user reviews. Yet even on such a platform, the sheer volume of data makes it difficult to quickly find the places that best match our taste.
2. Scraping Target
- Data to collect: shop name, cuisine type, location, rating, number of reviews, and average spend per person.
3. Preparation
- Version: Python 3.x or later
- Packages used: requests, selenium, bs4, tqdm, plus the standard-library modules re, subprocess, time, random, and bag (the author's own package; contact the author to obtain it)
- JSON files: the full files are too large, so only a few entries are shown here
# city.json
{
  "郑州": "https://www.dianping.com/zhengzhou",
  "珠海": "https://www.dianping.com/zhuhai",
  "张家口": "https://www.dianping.com/zhangjiakou"
}

# menu.json
{
  "美食": "https://www.dianping.com/{}/ch10",
  "丽人": "https://www.dianping.com/{}/beauty",
  "周边游": "https://www.dianping.com/{}/ch35"
}
"""menu.json is generated automatically by the code later in this article; the format is as shown above."""

# cookies.json
[{}]
"""The cookies themselves are omitted here for privacy.
The steps below walk through how to obtain usable cookies automatically
and save them locally so they can be loaded whenever needed."""
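Since bag is the author's private helper package, the following minimal sketch uses only the standard-library json module to show how these three files fit together. The file names and the 珠海/美食 keys come from the examples above; the read_json helper is merely an assumption about what bag.Bag.read_json roughly does.

# A minimal sketch, assuming the JSON files above sit next to the script.
# read_json is assumed to behave like bag.Bag.read_json.
import json

def read_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

city_map = read_json('./city.json')    # {"珠海": "https://www.dianping.com/zhuhai", ...}
menu_map = read_json('./menu.json')    # {"美食": "https://www.dianping.com/{}/ch10", ...}

city_url = city_map['珠海']             # https://www.dianping.com/zhuhai
city_slug = city_url.split('/')[-1]     # 'zhuhai'
target = menu_map['美食'].format(city_slug)
print(target)                           # https://www.dianping.com/zhuhai/ch10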
4. Crawler Implementation
- Use Selenium to obtain cookies from a logged-in session
@echo off
cd "C:\Program Files\Google\Chrome\Application"
start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
#!/usr/bin/env python3
# coding:utf-8
import subprocess
import bag
import time
import random

# batch_file_content = r'''
# @echo off
# cd "C:\Program Files\Google\Chrome\Application"
# start chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenium\AutomationProfile"
# '''
#
# with open('run_chrome.bat', 'w') as f:
#     f.write(batch_file_content)

# Launch the batch file, which starts Chrome with remote debugging enabled
subprocess.Popen('run_chrome.bat', shell=True)

# Attach to the debug-mode Chrome, open Dianping, wait a moment, then save the cookies
web = bag.Bag.web_debug()
web.get(r'https://www.dianping.com/')
time.sleep(random.randint(5, 10))

cookie = web.get_cookies()
web.close()

bag.Bag.save_json(cookie, r'./cookies.json')
- Create a new text file, paste the first code block (the batch script) into it, and change the extension to .bat. The reason for doing it this way is that the batch file can then be launched and controlled from Python via subprocess.
- Then run the Python script (the second code block above): a usable set of cookies will be generated automatically and saved to cookies.json.
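bag.Bag.web_debug() comes from the author's private package. As a rough sketch of what it presumably does, plain Selenium can attach to the Chrome instance started by run_chrome.bat through the debugging port configured above; 127.0.0.1:9222 mirrors the --remote-debugging-port flag, and everything else here is an assumption rather than the bag implementation.

# Sketch: attach Selenium to the already-running Chrome from run_chrome.bat,
# then dump its cookies to cookies.json (assumes Selenium 4.x).
import json
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')  # matches --remote-debugging-port=9222
web = webdriver.Chrome(options=options)

web.get('https://www.dianping.com/')
time.sleep(random.randint(5, 10))  # log in manually in the opened window if prompted

with open('./cookies.json', 'w', encoding='utf-8') as f:
    json.dump(web.get_cookies(), f, ensure_ascii=False, indent=2)

web.close()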
- Choose the category to scrape and generate the menu.json file
#!/usr/bin/env python3
# coding:utf-8
import bag
from bs4 import BeautifulSoup
import re

session = bag.session.create_session()
for cookie in bag.Bag.read_json(r'./cookies.json'):
    session.cookies.set(cookie['name'], cookie['value'])


# Ask for the city to scrape and build menu.json for it
def choose_city():
    js_data = bag.Bag.read_json('./city.json')
    choose = input('输入城市名:')
    judge = js_data.get(choose)  # check whether the entered city exists

    # pattern = re.compile(r'<a.*?data-click-title="first".*?href="(.*?)".*?>(.*?)</a>', re.S)
    pattern = re.compile(r'<a.*?href="(.*?)".*?>(.*?)</a>', re.S)

    dic = {}
    if judge:
        resp = session.get(judge)
        html = BeautifulSoup(resp.text, 'lxml')
        soup = html.findAll('span', class_='span-container')
        for info in soup:
            data = re.findall(pattern, str(info))
            mid: list = data[0][0].split('/')
            mid[-2] = '{}'  # swap the city segment for a placeholder
            dic[data[0][1]] = 'https:' + '/'.join(mid)
    else:
        print('无效输入!')
        choose_city()
        return

    print(dic)
    # Result generated from the input above:
    '''输入城市名:珠海
    {
        "美食": "https://www.dianping.com/{}/ch10",
        "休闲娱乐": "https://www.dianping.com/{}/ch30",
        "结婚": "https://www.dianping.com/{}/wedding",
        "电影演出赛事": "https://www.dianping.com/{}/movie",
        "丽人": "https://www.dianping.com/{}/beauty",
        "酒店": "https://www.dianping.com/{}/hotel",
        "亲子": "https://www.dianping.com/{}/baby",
        "周边游": "https://www.dianping.com/{}/ch35",
        "运动健身": "https://www.dianping.com/{}/ch45",
        "购物": "https://www.dianping.com/{}/ch20",
        "家装": "https://www.dianping.com/{}/home",
        "学习培训": "https://www.dianping.com/{}/education",
        "生活服务": "https://www.dianping.com/{}/ch80",
        "医疗健康": "https://www.dianping.com/{}/ch85",
        "爱车": "https://www.dianping.com/{}/ch65",
        "宠物": "https://www.dianping.com/{}/ch95"
    }'''
    bag.Bag.save_json(dic, r'./menu.json')


if __name__ == '__main__':
    choose_city()
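To make the URL-template trick in choose_city() concrete, here is what happens to a single category link. The href value is hypothetical, but it follows the //host/city/channel shape the code expects:

# Worked example of the split/replace/join step used in choose_city()
href = '//www.dianping.com/zhuhai/ch10'   # hypothetical href taken from a category <a> tag

mid = href.split('/')      # ['', '', 'www.dianping.com', 'zhuhai', 'ch10']
mid[-2] = '{}'             # swap the city segment for a placeholder
template = 'https:' + '/'.join(mid)
print(template)                       # https://www.dianping.com/{}/ch10
print(template.format('zhengzhou'))   # https://www.dianping.com/zhengzhou/ch10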
- Full code
# chooses.py  (the file name must match the `import chooses` in get_shop.py below)
#!/usr/bin/env python3
# coding:utf-8
import bag


def choose_city():
    session = bag.session.create_session()
    for cookie in bag.Bag.read_json(r'./cookies.json'):
        session.cookies.set(cookie['name'], cookie['value'])
    session.headers['Connection'] = 'close'

    js_data = bag.Bag.read_json('./city.json')
    choose = input('输入城市名:')
    judge = js_data.get(choose)

    if judge:
        city = judge.split('/')[-1]
        choose_1 = input('输入爬取类型:')
        js_data1 = bag.Bag.read_json('./menu.json')
        judge1 = js_data1.get(choose_1)
        if judge1:
            return judge1.format(city), session
        else:
            print('开发中......')
            return None
    else:
        print('无效输入!')
        return None
# get_shop.py
#!/usr/bin/env python3
# coding:utf-8
import bag
import chooses
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests

proxies = {
    "http": "http://{}:{}",
}


def check():
    """Pick a working proxy from the local proxy list and plug it into `proxies`."""
    url_ = r'https://www.dianping.com/zhuhai/ch10'
    ip_ls = bag.Bag.read_json('../代理ip/IP地址.json')
    index = 0
    if len(ip_ls) == 0:
        print('IP地址全部失效')
        exit()
    for ip_address in ip_ls:
        proxies_ = {
            "http": "{}:{}".format(ip_address[0], ip_address[1]),
        }
        resp = session.get(url_, proxies=proxies_)
        if resp.status_code == 200:
            proxies['http'] = proxies['http'].format(ip_address[0], ip_address[1])  # fill in the working proxy
            bag.Bag.save_json(ip_ls[index:], r'../代理ip/IP地址.json')  # drop the dead proxies in front
            print(f'[{index}] 更换ip成功')
            return
        index += 1


url, session = chooses.choose_city()


def get_types():
    check()  # make sure a working proxy is selected first
    pattern = re.compile(r'<a.*?href="(.*?)".*?<span>(.*?)</span></a>', re.S)
    if bool(url):
        resp = session.get(url, proxies=proxies)
        html = BeautifulSoup(resp.text, 'lxml')
        soup = html.findAll('div', id='classfy')
        links = re.findall(pattern, str(soup))
        return links
    else:
        check()
        return get_types()


def get_shop():
    links = get_types()
    pattern = re.compile(r'<div class="tit">.*?<a.*?data-shopid="(.*?)".*?href="(.*?)".*?title="(.*?)"'
                         r'(?:.*?<div class="star_icon">.*?<span class="(.*?)"></span>.*?<b>(.*?)</b>)?'
                         r'(?:.*?<b>(.*?)</b>)?'
                         r'(?:.*?<div class="tag-addr">.*?<span class="tag">(.*?)</span>.*?<em class="sep">.*?<span class="tag">(.*?)</span>)?',
                         re.S)
    number = re.compile(r'data-ga-page="(.*?)"', re.S)

    result = []
    if not bool(links):
        print('获取异常')
        return

    for link in links:
        try:
            # first page of results
            resp = session.get(link[0], proxies=proxies)
            page = [int(i) for i in re.findall(number, resp.text)]
            page_num = sorted(page, reverse=True)[0]

            html = BeautifulSoup(resp.text, 'lxml')
            soup = html.findAll('li', class_='')
            for i in soup:
                for j in re.findall(pattern, str(i)):
                    result.append(j)

            if page_num >= 2:
                # pages after the first
                for count in tqdm(range(page_num)[1:]):
                    try:
                        resp1 = session.get(link[0] + 'p{}'.format(count + 1), proxies=proxies)
                        html1 = BeautifulSoup(resp1.text, 'lxml')
                        soup1 = html1.findAll('li', class_='')
                        for k in soup1:
                            info = pattern.search(str(k))
                            if info:
                                groups = list(info.groups())
                                for i in range(len(groups)):
                                    if not groups[i]:
                                        groups[i] = 'null'
                                result.append(tuple(groups))
                    except requests.exceptions.RequestException as e:
                        print(e)
                        check()
                    except Exception as e:
                        print(e)
                        continue
            else:
                pass
        except requests.exceptions.RequestException as e:
            print(e)
            check()
        except Exception as e:
            print(e)
            check()

    return result


end = get_shop()
bag.Bag.save_excel(end, './商店.xlsx')
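The long regular expression in get_shop() is easier to read against a concrete snippet. The HTML below is synthetic, shaped only by the fields the pattern looks for (it is not real Dianping markup), but it shows what each captured group ends up holding:

# Synthetic <li> snippet modelled on the fields the pattern expects; not real Dianping markup.
import re

pattern = re.compile(r'<div class="tit">.*?<a.*?data-shopid="(.*?)".*?href="(.*?)".*?title="(.*?)"'
                     r'(?:.*?<div class="star_icon">.*?<span class="(.*?)"></span>.*?<b>(.*?)</b>)?'
                     r'(?:.*?<b>(.*?)</b>)?'
                     r'(?:.*?<div class="tag-addr">.*?<span class="tag">(.*?)</span>.*?<em class="sep">.*?<span class="tag">(.*?)</span>)?',
                     re.S)

demo = '''
<li class="">
  <div class="tit">
    <a data-shopid="abc123" href="https://www.dianping.com/shop/abc123" title="某某餐厅">某某餐厅</a>
  </div>
  <div class="star_icon"><span class="star star_45"></span><b>4.6</b></div>
  <b>1024条评价</b>
  <div class="tag-addr">
    <span class="tag">粤菜</span><em class="sep">|</em><span class="tag">香洲区</span>
  </div>
</li>
'''

print(pattern.search(demo).groups())
# ('abc123', 'https://www.dianping.com/shop/abc123', '某某餐厅',
#  'star star_45', '4.6', '1024条评价', '粤菜', '香洲区')
# i.e. shop id, shop URL, shop name, star class, rating, review count, cuisine type, district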
5. Results
6. Summary
- Driving the site with Selenium and then reusing its cookies in requests sidesteps a lot of the convoluted reverse-engineering that pure request-based scraping would require.
- Dianping's anti-scraping measures are fairly thorough. To avoid getting your IP blacklisted while crawling, it is recommended to use proxy IPs; how to obtain and use them is easy to look up online, and a minimal sketch follows below.
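For readers who have not used proxies with requests before, the idea is simply to pass a proxies mapping on each request and drop addresses that stop responding. A minimal sketch, with placeholder addresses rather than real proxies:

# Minimal proxy-rotation sketch; the (ip, port) pairs are placeholders, not working proxies.
import requests

proxy_pool = [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]

def get_with_proxy(url):
    for ip, port in list(proxy_pool):
        proxies = {'http': f'http://{ip}:{port}'}
        try:
            resp = requests.get(url, proxies=proxies, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.RequestException:
            proxy_pool.remove((ip, port))  # discard a dead proxy and try the next one
    raise RuntimeError('no usable proxy left')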