<!-- Apache POI core library; kept at the same version as poi-ooxml below. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Apache POI OOXML module: provides the org.apache.poi.xwpf classes
     (XWPFDocument, XWPFWordExtractor) imported by Word2ListUtil. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Hutool: provides the cn.hutool helpers (FileUtil, HttpUtil, ...) imported by Word2ListUtil.
     NOTE(review): the class also imports Lombok's @Slf4j, which is not declared in this
     snippet; presumably Lombok and an SLF4J binding are declared elsewhere in the POM. Verify. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.16</version>
</dependency>
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
* word文档提取文字集合
*/
@Slf4j
public class Word2ListUtil {
/**
* word文档提取文字集合
*/
public static List<String> word2List(String filePath) throws IOException {
XWPFWordExtractor extractor = null;
FileInputStream fis = null;
InputStream inputStream = null;
File file = null;
try {
// 获取云存储Word文档输入流
inputStream = HttpUtil.createGet(filePath).execute().bodyStream();
// 创建临时文件
File tempFile = File.createTempFile("tmp", ".doc");
// 将输入流写入临时文件
file = FileUtil.writeFromStream(inputStream, tempFile);
log.info("临时文件所在路径: {}", file);
// 输入流转文件输入流
fis = new FileInputStream(file);
// fis = new FileInputStream(filePath);
XWPFDocument document = new XWPFDocument(fis);
// 提取文本内容
extractor = new XWPFWordExtractor(document);
// 特殊操作:将文本内容中所有中文字符转成英文并去掉空格
String text = extractor.getText().replaceAll(":", ":")
.replaceAll("。", ".")
.replaceAll(" ", "")
.replaceAll(";", ";")
.replaceAll(",", ",");
// 文本根据换行符分割数组
String[] textArray = text.split("\n");
// 去掉数组内空串行并组装集合
return Arrays.stream(textArray).filter(str -> !str.trim().isEmpty()).collect(Collectors.toList());
} catch (Exception e) {
log.error("word文档提取文字集合失败异常! {}", e.getMessage());
e.printStackTrace();
} finally {
// 资源关闭遵循:先开后关原则
if (ObjectUtil.isNotNull(extractor)) {
extractor.close();
}
if (ObjectUtil.isNotNull(fis)) {
fis.close();
}
if (ObjectUtil.isNotNull(inputStream)) {
inputStream.close();
}
if (ObjectUtil.isNotNull(file)) {
// 删除临时文件
boolean deleteFlag = file.delete();
log.info("临时文件删除状态: {}", deleteFlag);
}
}
return null;
}
}
文章来源地址:https://www.toymoban.com/news/detail-548040.html
文章来源:https://www.toymoban.com/news/detail-548040.html
到了这里,关于Java Word提取内容的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!