import requests
import re
import json
import os
# One HTTP session shared by every request made from this module.
session = requests.Session()
def fetch_url(url, encoding='gbk', timeout=30):
    """Download *url* through the shared session and return its body as text.

    Args:
        url: Address to request with HTTP GET.
        encoding: Codec used to decode the raw response bytes. Defaults to
            ``'gbk'``, the value this function previously hard-coded.
        timeout: Seconds to wait for the server, passed straight to
            ``session.get``. ``None`` waits indefinitely.

    Returns:
        The response body decoded to ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid in ``encoding``.
        Any exception raised by ``session.get`` (including on timeout)
        propagates unchanged.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = session.get(url, timeout=timeout)
    return response.content.decode(encoding)
def get_doc_id(url):
    """Extract the document id from a ``.../view/<id>.html`` URL.

    Args:
        url: Document page address containing a ``view/<id>.html`` segment.

    Returns:
        The text between ``view/`` and the first following ``.html``.

    Raises:
        IndexError: If ``url`` contains no ``view/<id>.html`` segment.
    """
    # The dot is escaped and the capture is lazy so the id stops at the first
    # literal ".html"; a query string such as "?fr=htmlview" is not swallowed.
    match = re.search(r'view/(.*?)\.html', url)
    if match is None:
        # IndexError is what the earlier findall(...)[0] form raised, so
        # callers that catch it keep working.
        raise IndexError('no document id found in URL: {!r}'.format(url))
    return match.group(1)
def parse_type(content):
    """Return the first ``docType`` value embedded in *content*.

    Looks for the text ``docType``, then a colon, then a single-quoted value
    immediately followed by a comma (all on one line, since ``.`` does not
    cross newlines here) and returns what is between the quotes.

    Args:
        content: Page source to search.

    Returns:
        The quoted value of the first such occurrence.

    Raises:
        IndexError: If *content* holds no matching ``docType`` entry.
    """
    match = re.search(r"docType.*?\:.*?\'(.*?)\'\,", content)
    if match is None:
        # IndexError matches what indexing an empty findall() result raised.
        raise IndexError('docType not found in content')
    return match.group(1)
def parse_title(content):
    """Return the first ``title`` value embedded in *content*.

    Looks for the text ``title``, then a colon, then a single-quoted value
    immediately followed by a comma (all on one line, since ``.`` does not
    cross newlines here) and returns what is between the quotes.

    Args:
        content: Page source to search.

    Returns:
        The quoted value of the first such occurrence.

    Raises:
        IndexError: If *content* holds no matching ``title`` entry.
    """
    match = re.search(r"title.*?\:.*?\'(.*?)\'\,", content)
    if match is None:
        # IndexError matches what indexing an empty findall() result raised.
        raise IndexError('title not found in content')
    return match.group(1)
def parse_doc(content):
result = ''
url_list = re.findall('(https.*?0.json.*?)\\\\x22}', content)
url_list = [addr.replace("\\\\\\/", "/") for addr in url_list]
for url in url_list[:-5]:
content = fetch_url(url)
y = 0
txtlists = re.findall('"c":"(.*?)".*?"y":文章来源地址https://www.toymoban.com/news/detail-422823.html
到了这里,关于百度文库爬虫(爬取需要下载券的文档)的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!