一、引入依赖
<!-- Apache Tika parsers: supplies the org.apache.tika.parser.* classes (e.g. PDFParser) imported by the utility class in this article. -->
<!-- NOTE(review): the utility class also imports org.json (JSONArray/JSONObject); confirm that library is on the classpath, as it is not declared here. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
<!-- Apache POI: not imported directly by the utility class; presumably pinned for Tika's Office-document parsing - TODO confirm it is still needed. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
二、实现工具类
package com.xiaobai.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.json.JSONArray;
import org.json.JSONObject;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX content handler that collects the text of a parsed document page by page,
 * together with static helpers for reading metadata and text through Apache Tika.
 *
 * <p>A new page starts at every {@code <div class="page">} element. Character data
 * received before the first such element is discarded. The element and character
 * callbacks below never call the superclass implementation.
 *
 * <p>Created 2023/7/12.
 *
 * @author xiaobai
 */
public class ReadContentHandler extends ToXMLContentHandler {

    /** Qualified name of the element that opens a page. */
    private static final String PAGE_TAG = "div";

    /** Value of the {@code class} attribute that marks a page element. */
    private static final String PAGE_CLASS = "page";

    /** Number of the page currently being read; 0 until the first page element is seen. */
    private int pageNumber = 0;

    /** Text collected so far, keyed by 1-based page number; sorted so pages iterate in order. */
    private final Map<Integer, StringBuilder> pageMap = new TreeMap<>();

    /** Creates a handler with no pages collected yet. */
    public ReadContentHandler() {
        super();
    }

    /**
     * Opens a new page buffer when a {@code <div class="page">} element starts.
     * All other elements are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        if (PAGE_TAG.equals(qName) && PAGE_CLASS.equals(atts.getValue("class"))) {
            pageNumber++;
            pageMap.put(pageNumber, new StringBuilder());
        }
    }

    /**
     * Does nothing: a page's text simply ends when the next page element opens.
     * The override is kept so that the superclass implementation is not invoked.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // Intentionally empty.
    }

    /**
     * Appends character data to the buffer of the current page.
     *
     * <p>Only the {@code [start, start + length)} slice of {@code ch} is valid per the
     * SAX contract, so only that slice is inspected and appended. A chunk consisting of
     * a single line feed is skipped, and data arriving before the first page is dropped.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (length <= 0 || pageNumber <= 0) {
            return;
        }
        if (length == 1 && ch[start] == '\n') {
            return;
        }
        pageMap.get(pageNumber).append(ch, start, length);
    }

    /**
     * Reads the basic metadata (author, title, and so on) of a file.
     *
     * @param file the file to inspect; the parser is chosen by {@link AutoDetectParser}
     * @return the metadata populated by the parser
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the parser reports a SAX error
     * @throws TikaException if the document cannot be parsed
     */
    public static Metadata fileData(File file) throws IOException, SAXException, TikaException {
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        // try-with-resources closes the stream even when parsing fails.
        try (FileInputStream input = new FileInputStream(file)) {
            // Only the metadata is returned, so the body text is sent to a no-op handler
            // instead of being buffered in a size-limited BodyContentHandler.
            parser.parse(input, new DefaultHandler(), metadata, new ParseContext());
        }
        return metadata;
    }

    /**
     * Extracts the text content of a file using the {@link Tika} facade.
     *
     * <p>NOTE(review): the Tika facade applies a default maximum string length, so very
     * large documents may come back truncated - confirm whether that limit is acceptable.
     *
     * @param file the file to read; the original author lists txt/word/excel/pdf as supported
     * @return the extracted text
     * @throws TikaException if the document cannot be parsed
     * @throws IOException if the file cannot be read
     */
    public static String parseText(File file) throws TikaException, IOException {
        Tika tika = new Tika();
        return tika.parseToString(file);
    }

    /**
     * Extracts the text of a PDF file page by page.
     *
     * @param file the PDF file to read (only PDF is supported; it is parsed with {@link PDFParser})
     * @return a JSON array with one object per page, in page order, each holding the
     *     1-based page number under {@code "page"} and that page's text under {@code "content"}
     * @throws Exception if the file cannot be read or parsed
     */
    public static JSONArray parsePageToPdf(File file) throws Exception {
        ReadContentHandler handler = new ReadContentHandler();
        // try-with-resources closes the stream whether or not parsing succeeds.
        try (FileInputStream input = new FileInputStream(file)) {
            new PDFParser().parse(input, handler, new Metadata(), new ParseContext());
        }

        JSONArray pages = new JSONArray();
        for (Map.Entry<Integer, StringBuilder> entry : handler.pageMap.entrySet()) {
            JSONObject page = new JSONObject();
            page.put("page", entry.getKey());
            page.put("content", entry.getValue().toString());
            pages.put(page);
        }
        return pages;
    }
}
parseText():文件转文本内容
parsePageToPdf():pdf转按页文本内容
文章来源地址:https://www.toymoban.com/news/detail-576973.html
文章来源:https://www.toymoban.com/news/detail-576973.html
到了这里,关于【Tika】快速使用:文件(word/pdf)内容转文本的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!