适用抖音、快手视频和标题获取
1、前言
本篇介绍从如何在电脑上安装多版本Chrome,到使用Java结合selenium爬虫获取网页、API数据的完整流程。抖音和快手会不定期更新请求方式,请注意版本适配。本文适用于win10、win11,有需要的小伙伴可以继续往下看。
2、环境配置
2.1、浏览器环境
浏览器安装参考链接:点击链接
首先在电脑任意盘创建一个文件夹(根据自己喜好命名),暂且命名为old_chrome,然后在网上下载主启动程序GoogleChromePortable.exe,放置到old_chrome文件夹下
GoogleChromePortable.exe下载地址:点击链接
通过360压缩或其他压缩软件右击打开,不是解压,是右击选择360压缩软件打开,把GoogleChromePortable.exe拖出来(如上图所示)
在old_chrome下创建一个新的文件夹。我使用的是114版本,为了方便查看,文件夹命名为old_chrome114。在网上下载对应Chrome版本的离线安装包(一般文件大小>50MB的为离线安装包),即后缀为.exe的文件,放到old_chrome114文件夹下。
如果找不到离线安装包,可参考Chrome 的107以前版本下载地址:点击链接
然后检查安装包:按照下图所示步骤,右击安装包点击属性,点击"数字签名",双击下面的签名者名称,查看数字签名信息是否正常。此处必须在数字签名正常的情况下才可以进行后续操作。
然后用同样的方法,右击选择360压缩软件打开看到chrome.7z。
在old_chrome114下新建文件夹APP,把chrome.7z文件拖拽到APP文件夹下进行解压,得到Chrome-bin文件夹。之后chrome.7z压缩包就可以删除了。
把开始下载好的GoogleChromePortable.exe文件复制到old_chrome114文件夹下,我这里在名字中加了版本号,改成了GoogleChromePortable114.exe。
然后双击GoogleChromePortable114.exe就可以启动114版本的浏览器了,启动之后会在当前文件夹里面创建一个Data文件夹存放数据。之后可以在浏览器的"关于Chrome"里查看版本,下图是我两个不同版本谷歌浏览器的运行展示。也可以右击GoogleChromePortable114.exe,点击"发送到——桌面快捷方式",即可在桌面创建快捷访问。如果想安装多个版本的浏览器,可以按照这种操作逐个添加。
2.2、浏览器驱动
通过百度搜索下载与浏览器版本对应的Chrome驱动chromedriver,放到对应版本的文件夹下,然后进行解压,拿到chromedriver.exe,记住这个路径(后面代码中需要配置)。
2.3、开发环境
开发使用的jdk1.8,搭建的spring项目
爬虫使用selenium 4.10.0版本的依赖
<!-- Web crawler (Selenium) dependencies, all pinned to 4.10.0 -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.10.0</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chromium-driver</artifactId>
<version>4.10.0</version>
</dependency>
<!-- DevTools bindings for v114; the Java code imports org.openqa.selenium.devtools.v114.*
     NOTE(review): presumably this must match the major version of the Chrome build installed above (114) - confirm when upgrading -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-devtools-v114</artifactId>
<version>4.10.0</version>
</dependency>
3、抖音爬虫获取
抖音改版后,不能再通过同一个API同时获取视频链接和标题:视频链接仍然可以通过API获取,标题则需要请求前端html,再从中截取获得。文章来源:https://www.toymoban.com/news/detail-805724.html
3.1、视频获取
package cn.executor;
import cn.hutool.http.HttpUtil;
import cn.perfectlinks.node.properties.RemoveWatermarkProperties;
import cn.perfectlinks.node.utils.RemoveWatermarkConstant;
import cn.perfectlinks.node.vo.VideoRemoveWatermarkVo;
import cn.perfectlinks.node.vo.VideoUrlVo;
import com.alibaba.fastjson2.JSONObject;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v114.network.Network;
import org.openqa.selenium.devtools.v114.network.model.Request;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Resolves a Douyin share text to a watermark-free video URL.
 *
 * <p>The share link is opened in a Chrome instance driven by Selenium, and the
 * outgoing network requests are observed through Chrome DevTools to capture the
 * request URL of the video resource.
 */
@Component
@Slf4j
@RequiredArgsConstructor
public class DYVideo {

    /** Pattern used to pull the link out of the share text; compiled once instead of per call. */
    private static final Pattern SHARE_URL_PATTERN = Pattern.compile(RemoveWatermarkConstant.REGEX);

    private final RemoveWatermarkProperties removeWatermarkProperties;

    /**
     * Entry point: extracts the link from the share text and resolves the video URL.
     *
     * <p>The browser capture is repeated while no URL has been captured and the attempt
     * counter is still {@code <= RemoveWatermarkConstant.FIVE}, because the page may show
     * a human-verification challenge instead of the video.
     *
     * @param oldVideoUrl the raw share text, which must contain {@code RemoveWatermarkConstant.D_Y_COM}
     * @return a value object whose {@code url} holds the captured URL with
     *         {@code PLAY_WM} replaced by {@code PLAY}
     * @throws Exception (sneakily thrown) with {@code ONLY_SUPPORT_ERR} when the text is not a
     *         Douyin link, or with {@code SHARING_FAILURE} when no URL could be captured
     */
    @SneakyThrows
    public VideoRemoveWatermarkVo executor(String oldVideoUrl) throws IOException {
        log.info("请求参数:"+ oldVideoUrl);
        if (!oldVideoUrl.contains(RemoveWatermarkConstant.D_Y_COM)) {
            throw new Exception(RemoveWatermarkConstant.ONLY_SUPPORT_ERR);
        }

        VideoRemoveWatermarkVo videoRemoveWatermarkVo = new VideoRemoveWatermarkVo();
        // Keep only the link part of the share text.
        String filterUrl = this.filterUrl(oldVideoUrl);

        int attempts = RemoveWatermarkConstant.ZERO;
        do {
            attempts++;
            VideoUrlVo videoUrlVo = this.getTrueAddress(filterUrl, RemoveWatermarkConstant.D_Y_TYPE);
            String capturedUrl = videoUrlVo.getResponseVideoUrl();
            // Nothing captured on this attempt: leave the url unset so the loop retries.
            // (Calling replaceAll on the null value used to abort the retry loop with an NPE.)
            if (StringUtils.isNotBlank(capturedUrl)) {
                // Strip the watermark marker from the captured URL.
                videoRemoveWatermarkVo.setUrl(
                        capturedUrl.replaceAll(RemoveWatermarkConstant.PLAY_WM, RemoveWatermarkConstant.PLAY));
            }
        } while (StringUtils.isBlank(videoRemoveWatermarkVo.getUrl())
                && attempts <= RemoveWatermarkConstant.FIVE);

        if (StringUtils.isBlank(videoRemoveWatermarkVo.getUrl())) {
            throw new Exception(RemoveWatermarkConstant.SHARING_FAILURE);
        }
        return videoRemoveWatermarkVo;
    }

    /**
     * Legacy resolution path: captures an API URL with the browser, fetches it over HTTP
     * and reads the video address and title from the JSON response.
     *
     * <p>Currently not called from this class (the call in {@link #executor(String)} is
     * disabled); kept for reference.
     *
     * @param url the share link
     * @return a value object with url and title set, or an empty one when any step fails
     */
    private VideoRemoveWatermarkVo douYinParseUrl(String url) {
        VideoRemoveWatermarkVo videoRemoveWatermarkVo = new VideoRemoveWatermarkVo();
        try {
            VideoUrlVo trueAddress = this.getTrueAddress(url, RemoveWatermarkConstant.D_Y_TYPE);
            log.info(RemoveWatermarkConstant.D_Y_DATA, trueAddress);
            if (StringUtils.isBlank(trueAddress.getResponseVideoUrl())) {
                return videoRemoveWatermarkVo;
            }
            // Call the captured API URL to get the video data.
            String jsonStr = HttpUtil.get(trueAddress.getResponseVideoUrl());
            log.info(RemoveWatermarkConstant.D_Y_API_DATA, jsonStr);
            if (StringUtils.isBlank(jsonStr)) {
                return videoRemoveWatermarkVo;
            }
            JSONObject firstItem = JSONObject.parseObject(jsonStr)
                    .getJSONArray(RemoveWatermarkConstant.ITEM_LIST)
                    .getJSONObject(RemoveWatermarkConstant.ZERO);
            // First entry of the play-address URL list.
            String videoAddress = firstItem
                    .getJSONObject(RemoveWatermarkConstant.VIDEO)
                    .getJSONObject(RemoveWatermarkConstant.PLAY_ADDR)
                    .getJSONArray(RemoveWatermarkConstant.URL_LIST)
                    .get(RemoveWatermarkConstant.ZERO)
                    .toString();
            // Replace the watermark marker with the plain one.
            videoAddress = videoAddress.replaceAll(RemoveWatermarkConstant.PLAY_WM, RemoveWatermarkConstant.PLAY);
            String title = firstItem.getString(RemoveWatermarkConstant.DESC);
            videoRemoveWatermarkVo.setUrl(videoAddress)
                    .setTitle(title);
        } catch (Exception e) {
            // Best effort: report the failure (with stack trace) and return what we have.
            log.error(RemoveWatermarkConstant.D_Y_API_ERR, e.getMessage(), e);
        }
        log.info("videoRemoveWatermarkVo参数:{}",videoRemoveWatermarkVo.toString());
        return videoRemoveWatermarkVo;
    }

    /**
     * Extracts the first match of {@code RemoveWatermarkConstant.REGEX} from the share text,
     * dropping the surrounding (Chinese) description text.
     *
     * @param url the raw share text
     * @return the matched link, or an empty string when nothing matches
     */
    private String filterUrl(String url) {
        Matcher m = SHARE_URL_PATTERN.matcher(url);
        return m.find() ? m.group() : "";
    }

    /**
     * Opens {@code url} in Chrome and records the request URLs of interest seen on the network.
     *
     * @param url  the page to load
     * @param type platform type marker; not used by this implementation
     * @return the captured URLs; fields stay unset when no matching request was observed
     */
    private VideoUrlVo getTrueAddress(String url, Integer type) {
        VideoUrlVo videoUrlVo = new VideoUrlVo();
        // For local testing the driver path can be hard-coded instead, e.g.:
        // System.setProperty(RemoveWatermarkConstant.DRIVER_URL, "D:\\Program Files\\old_chrome\\old_chrome114\\chromedriver114win32\\chromedriver.exe");
        System.setProperty(RemoveWatermarkConstant.DRIVER_URL, removeWatermarkProperties.getDriver_url());

        ChromeOptions options = new ChromeOptions();
        // For local testing a specific browser binary can be selected, e.g.:
        // options.setBinary("D:\\Program Files\\old_chrome\\old_chrome114\\APP\\Chrome-bin\\chrome.exe");
        options.addArguments(RemoveWatermarkConstant.CHROME_USER_AGENT + RemoveWatermarkConstant.CHROME_USER_AGENT_ANDROID);
        options.addArguments(RemoveWatermarkConstant.DISABLE_BLINK_FEATURES);
        options.addArguments(RemoveWatermarkConstant.DISABLE_EXTENSIONS);
        options.addArguments(RemoveWatermarkConstant.DISABLE_POPUP_BLOCKING);
        // Window size used to imitate a mobile device.
        options.addArguments(RemoveWatermarkConstant.WINDOW_SIZE);

        ChromeDriver driver = new ChromeDriver(options);
        try {
            DevTools devTools = driver.getDevTools();
            devTools.createSession();
            // Douyin uses GET requests, so the request URL alone is enough.
            devTools.addListener(Network.requestWillBeSent(), event -> {
                Request request = event.getRequest();
                if (Objects.isNull(request)) {
                    return;
                }
                String requestUrl = request.getUrl();
                if (requestUrl.contains(RemoveWatermarkConstant.D_Y_VIDEO_URL)) {
                    videoUrlVo.setResponseVideoUrl(requestUrl);
                }
                if (requestUrl.contains(RemoveWatermarkConstant.D_Y_RE_VIDEO_URL)) {
                    videoUrlVo.setRedirectUrl(requestUrl);
                }
            });
            // Start delivering network events to the listener.
            devTools.send(Network.enable(Optional.empty(), Optional.empty(), Optional.empty()));
            driver.get(url);
            // Give the page a moment to fire its requests.
            TimeUnit.SECONDS.sleep(2);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before propagating.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } finally {
            // Always shut the browser down, even when loading the page fails.
            driver.quit();
        }
        return videoUrlVo;
    }
}
3.2、标题获取
package cn.executor;
import cn.perfectlinks.node.properties.RemoveWatermarkProperties;
import cn.perfectlinks.node.utils.RemoveWatermarkConstant;
import com.perfectlinks.applink.common.core.exception.Assert;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Component
@Slf4j
@RequiredArgsConstructor
public class Title{
public String titleFetch(String redirectUrl){
CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = null;
//2.创建get请求
HttpGet request = new HttpGet(redirectUrl);
//设置请求头,将爬虫伪装成浏览器
request.setHeader("User-Agent", RemoveWatermarkConstant.CHROME_USER_AGENT_ANDROID);
try {
//3.执行get请求
response = httpClient.execute(request);
//4.判断响应状态为200,进行处理
Assert.isTrue(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK, "视频标题获取失败");
//5.获取响应内容
HttpEntity httpEntity = response.getEntity();
Assert.isTrue(httpEntity != null, "视频标题获取失败");
String html = EntityUtils.toString(httpEntity, "utf-8");
String extractedContent = extractContent(html);
Assert.isTrue(!"".equals(extractedContent), "视频标题获取失败");
return extractedContent.split("\"")[3].split(" - ")[0];
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
//6.关闭
HttpClientUtils.closeQuietly(response);
HttpClientUtils.closeQuietly(httpClient);
}
return null;
}
//截取字符串
public static String extractContent(String htmlString) {
String startTag = "name=\"description\" content=\"";
String endTag = "\"/><meta data-react-helmet=\"true\" name=\"keywords\"";
int startIndex = htmlString.indexOf(startTag);
int endIndex = htmlString.indexOf(endTag);
if (startIndex == -1 || endIndex == -1) {
return "";
}
return htmlString.substring(startIndex, endIndex);
}
}
4、快手爬虫获取
package cn.perfectlinks.node.executor;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.json.JSONUtil;
import cn.perfectlinks.node.properties.RemoveWatermarkProperties;
import cn.perfectlinks.node.utils.RemoveWatermarkConstant;
import cn.perfectlinks.node.vo.VideoRemoveWatermarkVo;
import cn.perfectlinks.node.vo.VideoUrlVo;
import com.alibaba.fastjson2.JSONObject;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.devtools.DevTools;
import org.openqa.selenium.devtools.v114.network.Network;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Resolves a Kuaishou share text to a watermark-free video URL and title.
 *
 * <p>The share link is opened in a Chrome instance driven by Selenium; the POST request
 * the page sends is captured through Chrome DevTools and then replayed with a freshly
 * obtained cookie.
 */
@Component
@Slf4j
@RequiredArgsConstructor
public class KSVideo {

    /** Pattern used to pull the link out of the share text; compiled once instead of per call. */
    private static final Pattern SHARE_URL_PATTERN = Pattern.compile(RemoveWatermarkConstant.REGEX);

    private final RemoveWatermarkProperties removeWatermarkProperties;

    /**
     * Entry point: extracts the link from the share text and resolves the video.
     *
     * @param oldVideoUrl the raw share text, which must contain {@code RemoveWatermarkConstant.K_S_COM}
     * @return a value object carrying the video URL and title
     * @throws Exception (sneakily thrown) with {@code ONLY_SUPPORT_ERR} when the text is not a
     *         Kuaishou link, or with {@code SHARING_FAILURE} when no URL could be resolved
     */
    @SneakyThrows
    public VideoRemoveWatermarkVo executor(String oldVideoUrl) throws IOException {
        log.info("请求参数:"+ oldVideoUrl);
        if (!oldVideoUrl.contains(RemoveWatermarkConstant.K_S_COM)) {
            throw new Exception(RemoveWatermarkConstant.ONLY_SUPPORT_ERR);
        }
        // Keep only the link part of the share text.
        String filterUrl = this.filterUrl(oldVideoUrl);
        VideoRemoveWatermarkVo videoRemoveWatermarkVo = this.ksParseUrl(filterUrl);
        if (videoRemoveWatermarkVo.getUrl() == null) {
            throw new Exception(RemoveWatermarkConstant.SHARING_FAILURE);
        }
        return videoRemoveWatermarkVo;
    }

    /**
     * Captures the page's POST request with the browser and replays it to read
     * the title and the video URL from the JSON response.
     *
     * @param url the share link
     * @return a value object with title and url set, or an empty one when any step fails
     */
    private VideoRemoveWatermarkVo ksParseUrl(String url) {
        VideoRemoveWatermarkVo videoRemoveWatermarkVo = new VideoRemoveWatermarkVo();

        VideoUrlVo trueAddress = this.getTrueAddress(url, RemoveWatermarkConstant.K_S_TYPE);
        log.info(RemoveWatermarkConstant.K_S_DATA, trueAddress.getResponseVideoUrl(), trueAddress.getReferer());
        // All three captured pieces are required; check them before spending a cookie request.
        if (StringUtils.isBlank(trueAddress.getResponseVideoUrl())
                || StringUtils.isBlank(trueAddress.getReferer())
                || StringUtils.isBlank(trueAddress.getVideoPostBody())) {
            return videoRemoveWatermarkVo;
        }

        this.getCookieInfo(trueAddress);
        log.info(RemoveWatermarkConstant.K_S_COOKIE, trueAddress.getCookieInfo());
        if (StringUtils.isBlank(trueAddress.getCookieInfo())) {
            return videoRemoveWatermarkVo;
        }

        try {
            JSONObject capturedBody = JSONObject.parseObject(trueAddress.getVideoPostBody());
            // Build the request body from the captured one.
            cn.hutool.json.JSONObject map = JSONUtil.createObj();
            this.setPostParams(capturedBody, map);

            HttpResponse execute = HttpRequest.post(trueAddress.getResponseVideoUrl())
                    .header(RemoveWatermarkConstant.USER_AGENT, RemoveWatermarkConstant.CHROME_USER_AGENT_IPHONE)
                    .header(RemoveWatermarkConstant.COOKIE, trueAddress.getCookieInfo())
                    .header(RemoveWatermarkConstant.REFERER, trueAddress.getReferer())
                    .body(map.toString())
                    .execute();
            String body = execute.body();
            if (StringUtils.isBlank(body)) {
                return videoRemoveWatermarkVo;
            }

            JSONObject jsonObject = JSONObject.parseObject(body);
            String title = jsonObject.getJSONObject(RemoveWatermarkConstant.SHARE_INFO)
                    .getString(RemoveWatermarkConstant.SHARE_TITLE);
            String videoAddress = jsonObject.getString(RemoveWatermarkConstant.MP4_URL);
            videoRemoveWatermarkVo.setTitle(title)
                    .setUrl(videoAddress);
        } catch (Exception e) {
            // Best effort: report the failure (with stack trace) and return what we have.
            log.error(RemoveWatermarkConstant.K_S_API_ERR, e.getMessage(), e);
        }
        return videoRemoveWatermarkVo;
    }

    /**
     * Copies the fields the replayed request needs from the captured body into {@code map},
     * each read as a string.
     *
     * @param obj the parsed captured POST body
     * @param map the request body being built
     */
    private void setPostParams(JSONObject obj, cn.hutool.json.JSONObject map) {
        String[] forwardedKeys = {
                RemoveWatermarkConstant.FID,
                RemoveWatermarkConstant.SHARE_TOKEN,
                RemoveWatermarkConstant.SHARE_OBJECT_ID,
                RemoveWatermarkConstant.SHARE_METHOD,
                RemoveWatermarkConstant.SHARE_ID,
                RemoveWatermarkConstant.SHARE_RESOURCE_TYPE,
                RemoveWatermarkConstant.SHARE_CHANNEL,
                RemoveWatermarkConstant.KPN,
                RemoveWatermarkConstant.SUB_BIZ,
                RemoveWatermarkConstant.ENV,
                RemoveWatermarkConstant.H5_DOMAIN,
                RemoveWatermarkConstant.PHOTO_ID,
                RemoveWatermarkConstant.IS_LONG_VIDEO
        };
        for (String key : forwardedKeys) {
            map.set(key, obj.getString(key));
        }
    }

    /**
     * Extracts the first match of {@code RemoveWatermarkConstant.REGEX} from the share text,
     * dropping the surrounding (Chinese) description text.
     *
     * @param url the raw share text
     * @return the matched link, or an empty string when nothing matches
     */
    private String filterUrl(String url) {
        Matcher m = SHARE_URL_PATTERN.matcher(url);
        return m.find() ? m.group() : "";
    }

    /**
     * Opens {@code url} in Chrome and records the URL, POST body and referer of the
     * POST request whose URL contains {@code RemoveWatermarkConstant.K_S_URL}.
     *
     * @param url  the page to load
     * @param type platform type marker; not used by this implementation
     * @return the captured request data; fields stay unset when no matching request was observed
     */
    private VideoUrlVo getTrueAddress(String url, Integer type) {
        VideoUrlVo videoUrlVo = new VideoUrlVo();
        // For local testing the driver path can be hard-coded instead, e.g.:
        // System.setProperty(RemoveWatermarkConstant.DRIVER_URL, "D:\\Program Files\\old_chrome\\old_chrome114\\chromedriver114win32\\chromedriver.exe");
        System.setProperty(RemoveWatermarkConstant.DRIVER_URL, removeWatermarkProperties.getDriver_url());

        ChromeOptions options = new ChromeOptions();
        // For local testing a specific browser binary can be selected, e.g.:
        // options.setBinary("D:\\Program Files\\old_chrome\\old_chrome114\\APP\\Chrome-bin\\chrome.exe");
        options.addArguments(RemoveWatermarkConstant.CHROME_USER_AGENT + RemoveWatermarkConstant.CHROME_USER_AGENT_ANDROID);
        options.addArguments(RemoveWatermarkConstant.DISABLE_BLINK_FEATURES);
        options.addArguments(RemoveWatermarkConstant.DISABLE_EXTENSIONS);
        options.addArguments(RemoveWatermarkConstant.DISABLE_POPUP_BLOCKING);
        // Window size used to imitate a mobile device.
        options.addArguments(RemoveWatermarkConstant.WINDOW_SIZE);

        ChromeDriver driver = new ChromeDriver(options);
        try {
            DevTools devTools = driver.getDevTools();
            devTools.createSession();
            // Kuaishou uses a POST request, so the request body has to be captured too.
            devTools.addListener(Network.requestWillBeSent(), event -> {
                if (Objects.isNull(event.getRequest())
                        || !RemoveWatermarkConstant.POST.equals(event.getRequest().getMethod())
                        || !event.getRequest().getUrl().contains(RemoveWatermarkConstant.K_S_URL)) {
                    return;
                }
                videoUrlVo.setResponseVideoUrl(event.getRequest().getUrl());
                event.getRequest()
                        .getPostData()
                        .ifPresent(videoUrlVo::setVideoPostBody);
                // A missing referer is left unset (the caller treats it as "not captured")
                // instead of throwing inside the DevTools listener.
                Object referer = event.getRequest()
                        .getHeaders()
                        .get(RemoveWatermarkConstant.REFERER);
                if (referer != null) {
                    videoUrlVo.setReferer(referer.toString());
                }
            });
            // Start delivering network events to the listener.
            devTools.send(Network.enable(Optional.empty(), Optional.empty(), Optional.empty()));
            driver.get(url);
            // Give the page a moment to fire its requests.
            TimeUnit.SECONDS.sleep(2);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before propagating.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } finally {
            // Always shut the browser down, even when loading the page fails.
            driver.quit();
        }
        return videoUrlVo;
    }

    /**
     * Requests {@code RemoveWatermarkConstant.GET_COOKIE_URL} and stores the first segment of the
     * returned {@code SET_COOKIE} header on {@code trueAddress}.
     *
     * <p>When the header is absent or the request fails, the cookie is left unset and the
     * problem is logged; the caller checks for a blank cookie.
     *
     * @param trueAddress the captured request data to enrich with the cookie
     */
    private void getCookieInfo(VideoUrlVo trueAddress) {
        HttpURLConnection connection = null;
        try {
            URL urlOne = new URL(RemoveWatermarkConstant.GET_COOKIE_URL);
            connection = (HttpURLConnection) urlOne.openConnection();
            connection.setRequestMethod(RemoveWatermarkConstant.POST);
            connection.setRequestProperty(RemoveWatermarkConstant.USER_AGENT, RemoveWatermarkConstant.USER_AGENT_V);

            // getHeaderField returns null when the header is missing or the connection failed.
            String cookieHeader = connection.getHeaderField(RemoveWatermarkConstant.SET_COOKIE);
            if (StringUtils.isBlank(cookieHeader)) {
                log.error(RemoveWatermarkConstant.GET_COOKIE_ERR, "response carried no cookie header");
                return;
            }
            String[] cookies = cookieHeader.split(RemoveWatermarkConstant.SPLIT);
            if (cookies.length > 0) {
                trueAddress.setCookieInfo(cookies[RemoveWatermarkConstant.ZERO]);
            }
        } catch (IOException e) {
            log.error(RemoveWatermarkConstant.GET_COOKIE_ERR, e.getMessage(), e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}
5、结语
java爬虫限制较多,还是建议使用python,会更便捷。本篇仅供参考,如有问题可留言探讨。文章来源地址https://www.toymoban.com/news/detail-805724.html
到了这里,关于保姆级爬虫无水印视频大全 最新版java+selenium的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!