python 爬虫爬取高考录取分数线信息

这篇具有很好参考价值的文章主要介绍了python 爬虫爬取高考录取分数线信息。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

原文链接: python 爬虫爬取高考录取分数线信息

上一篇: axios 原生上传xlsx文件

下一篇: pandas 表格数据补全空值

网页

https://gkcx.eol.cn/school/search

完整资料共有一个多 GB。

高校信息爬取接口

import requests_html
import json

# Demo: request a single page of the school listing and save its items.
sess = requests_html.HTMLSession()
url = "https://api.eol.cn/gkcx/api/"
# Form fields posted to the endpoint. Only the paging, sorting and "uri"
# fields carry values here; every other field is sent as an empty string.
data = {
    "access_token": "",
    "admissions": "",
    "central": "",
    "department": "",
    "dual_class": "",
    "f211": "",
    "f985": "",
    "is_dual_class": "",
    "keyword": "",
    "page": 8,
    "province_id": "",
    "request_type": 1,
    "school_type": "",
    "size": 20,
    "sort": "view_total",
    "type": "",
    "uri": "apigkcx/api/school/hotlists"
}
res = sess.post(url, data)
js = res.json()
print(type(js))  # <class 'dict'>

with open('./school.json', mode='w+') as f:
    # js['data']['item'] is parsed JSON rather than text, so handing it to
    # f.write() raises TypeError; serialise it with json.dump instead.
    json.dump(js['data']['item'], f)

省份信息：打开带有省份下拉框的页面，在浏览器开发者工具中查看对应的网络请求，即可找到省份数据接口。

def get_province():
    """Download the province JSON, echo it, and store it in ./province.json."""
    response = sess.get("https://static-data.eol.cn/www/config/detial/1.json")
    province_data = response.json()
    print(province_data)
    with open('./province.json', mode='w+') as out_file:
        json.dump(province_data, out_file)

接口对单次返回的数据量有限制，因此需要先按分页逐页请求全部数据，再合并学校信息，并输出为 xlsx 文件。

完整代码

import requests_html
import json
import glob
import pandas as pd

# One HTTP session shared by get_school() and get_province() below.
sess = requests_html.HTMLSession()


def get_school():
    """Download the school listing page by page into ./school/.

    Posts the same search form once for every page number in ``range(300)``
    and stores the ``data.item`` value of each JSON response in
    ``./school/school_<page>.json``.
    """
    import os  # local import: this script's import block does not include os

    url = "https://api.eol.cn/gkcx/api/"
    # open() does not create parent directories, so make sure the output
    # folder exists before the first page is written.
    os.makedirs('./school', exist_ok=True)
    for pageNo in range(300):
        data = {
            "access_token": "",
            "admissions": "",
            "central": "",
            "department": "",
            "dual_class": "",
            "f211": "",
            "f985": "",
            "is_dual_class": "",
            "keyword": "",
            "page": pageNo,
            "province_id": "",
            "request_type": 1,
            "school_type": "",
            "size": 20,
            "sort": "view_total",
            "type": "",
            "uri": "apigkcx/api/school/hotlists"
        }
        res = sess.post(url, data)
        js = res.json()
        # Presumably a list of school records: merge_school() concatenates
        # these page files with `+=` and counts the result.
        print(type(js['data']['item']))
        with open(f'./school/school_{pageNo}.json', mode='w+') as f:
            json.dump(js['data']['item'], f)


def get_province():
    """Fetch the province JSON, print it, and write it to ./province.json."""
    province_url = "https://static-data.eol.cn/www/config/detial/1.json"
    payload = sess.get(province_url).json()
    print(payload)
    with open('./province.json', mode='w+') as target:
        json.dump(payload, target)


def show_province():
    """Load ./province.json and print the parsed content."""
    with open('./province.json') as province_file:
        raw_text = province_file.read()
    print(json.loads(raw_text))


def merge_school():
    """Merge every page file under ./school/ and export the combined list.

    The merged records are written to ./json/school.json as a JSON array
    and to ./school.xlsx as a spreadsheet.
    """
    import os  # local imports: not part of this script's import block
    from io import StringIO

    all_school = []
    # glob returns paths in arbitrary order; sorting keeps runs reproducible.
    for path in sorted(glob.glob('./school/*')):
        with open(path) as f:
            all_school += json.load(f)

    print(len(all_school))  # 2855

    # open() does not create parent directories.
    os.makedirs('./json', exist_ok=True)
    with open('./json/school.json', mode='w+') as f:
        # Dump the list itself. Dumping an already-serialised string would
        # store one big JSON *string*, and json.load() would then hand
        # readers a str instead of a list of school records.
        json.dump(all_school, f)

    # read_json expects a path or file-like object; passing literal JSON
    # text directly is deprecated, so wrap it in StringIO.
    df = pd.read_json(StringIO(json.dumps(all_school)))
    df.to_excel('./school.xlsx')


if __name__ == '__main__':
    # Pipeline steps; the two download steps are commented out here, so
    # only the merge of already-downloaded page files runs.
    # get_school()
    # get_province()
    merge_school()

查询每个学校每年在各个省份的录取成绩

文章来源地址：https://www.toymoban.com/news/detail-408254.html

import json
import requests
import os
from multiprocessing import Pool
import time
import random

# Admission years to query: 2014 through 2018 inclusive.
years = range(2014, 2019)
# Page indexes requested for each (track, province, year) query.
# NOTE(review): the original remark said three pages are basically enough,
# yet range(2) yields only pages 0 and 1 — confirm the intended count.
pages = range(2)
# Subject tracks: code 1 is the science track, code 2 the liberal-arts track.
types = [
    {'code': code, 'name': name}
    for code, name in ((1, '理科'), (2, '文科'))
]

# Both input files are loaded once at import time.
# NOTE(review): get_province() in the earlier script writes ./province.json,
# while this reads ./json/province.json — confirm the file was moved.
with open('./json/province.json') as f:
    provinces = json.load(f)
with open('./json/school.json') as f:
    schools = json.load(f)

# Headers sent with every score request.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'https://gkcx.eol.cn',
    'Referer': 'https://gkcx.eol.cn/school/332',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}


def get_score(school):
    """Fetch and save one school's scores for every track, province and year.

    The result is written to ``./score/<school name>.json`` shaped as
    ``{track name: {province name: {year: [items, ...]}}}``. A school whose
    file already exists is skipped, so an interrupted run can be restarted.

    Args:
        school: mapping that provides at least ``'name'`` and ``'school_id'``.
    """
    path = f"./score/{school['name']}.json"
    if os.path.exists(path):
        print(school['name'], 'exists')
        return
    # Create the output folder up front: open() does not create it, and
    # failing only at the final write would throw away every request made.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    url = "https://api.eol.cn/gkcx/api/"
    score = {}
    # Using the session as a context manager closes its connections on exit.
    with requests.Session() as sess:
        # `subject` rather than `type`, which would shadow the builtin.
        for subject in types:
            by_province = score[subject['name']] = {}
            for province in provinces:
                by_year = by_province[province['name']] = {}
                for year in years:
                    all_data = []
                    try:
                        for page in pages:
                            data = {
                                "access_token": "",
                                "local_province_id": province['code'],  # applicant's province
                                "local_type_id": subject['code'],  # track: 1 science, 2 liberal arts
                                "page": page,  # page index
                                "school_id": school['school_id'],  # school id
                                "size": 20,
                                "uri": "apidata/api/gk/score/special",
                                "year": year  # admission year
                            }
                            # Optional throttle, left disabled:
                            # time.sleep(random.random() * 0.1)
                            # NOTE(review): `headers` declares a JSON body, but a dict
                            # passed as `data` is form-encoded — confirm the API accepts it.
                            # The timeout keeps a stalled connection from hanging forever.
                            js = sess.post(url, data, headers=headers, timeout=30).json()
                            all_data += js['data']['item']
                    except Exception as e:
                        # Deliberate best effort: keep the pages gathered so far
                        # and move on, but say which query failed.
                        print(school['name'], province['name'], year, e)
                    by_year[str(year)] = all_data

    with open(path, mode='w+') as f:
        json.dump(score, f)
    print(school['name'], 'finished')


def start():
    """Fetch scores sequentially for the first ten schools only."""
    first_ten = schools[:10]
    for entry in first_ten:
        print(entry['name'], 'start')
        get_score(entry)


def thread_start():
    """Fetch scores for every school using a pool of four worker processes.

    Despite the name, the workers come from ``multiprocessing.Pool``, so
    they are processes rather than threads.
    """
    # Per the original author's note, too many concurrent workers gets the
    # requests refused by the server, hence the small pool.
    p = Pool(4)
    for school in schools:
        print(school['name'], 'start')
        p.apply_async(
            get_score,
            args=(school,),
            # Without a callback, an exception raised in a worker stays in
            # the unread AsyncResult and the failure goes unnoticed.
            # `name` is bound as a default to avoid late binding in the loop.
            error_callback=lambda exc, name=school['name']: print(name, 'failed', repr(exc)),
        )
    p.close()  # stop accepting new tasks
    p.join()  # block the main process until every queued task has finished


def main():
    """Entry point: run the pooled crawl over all schools."""
    # start() is the sequential alternative limited to the first ten schools.
    # start()
    thread_start()


# Run only when executed as a script. The guard also matters for
# multiprocessing: worker processes that import this module must not
# start the crawl again.
if __name__ == '__main__':
    main()

到了这里，关于python 爬虫爬取高考录取分数线信息的文章就介绍完了。如果您还想了解更多内容，请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章，希望大家以后多多支持TOY模板网！