以下为瞎扯淡:
温馨提示:有很多方法均可以解析这些常见的文件,以下内容使用的是apache-poi + apache-pdfbox实现的。
关于文档解析,我在网上搜索了很久,但相关内容太过繁杂,很难找到合适的代码,而且大多数示例只支持提取文本。最后只能把网上零散的代码一点一点拼起来,整理出了下面这些代码——不敢说好用,但应该可以解燃眉之急。目前 doc 和 pdf 文档的解析仍有不少问题,欢迎各位在帖子下面多多指正;如果能帮忙优化代码,那就更好了。文章来源:https://www.toymoban.com/news/detail-776245.html
以下为正文内容:
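首先在 pom.xml 中加入以下依赖。注意:下面的代码还 import 了 org.apache.poi.xwpf / org.apache.poi.xslf(位于 poi-ooxml)、org.apache.http(httpclient)以及 dubbo 的 CollectionUtils,如果项目中尚未引入这些依赖,需要一并添加,否则无法编译。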
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>4.1.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>4.1.0</version> </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.22</version> </dependency>
如需测试,可以直接使用下面示例代码中的在线文档地址(注意:该在线文档不包含图片)。
/**
 * Demo entry point: converts one local file and one remote file to HTML and
 * prints each result to standard output.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if either document cannot be read or an image cannot be written
 */
public static void main(String[] args) throws IOException {
    // Directory that receives the images extracted from both documents.
    final String imageDirectory = "E:\\临时图片";

    // 1) Local file.
    final String localFile = "E:\\VPN系统使用手册.pptx";
    String document = processDocumentFromFilePath(localFile, imageDirectory);
    System.out.println(document);

    // 2) Remote file, fetched over HTTP.
    final String remoteFile = "http://api.idocv.com/data/doc/manual.docx";
    String documentFromUrl = processDocumentFromUrl(remoteFile, imageDirectory);
    System.out.println(documentFromUrl);
}
然后上车:飕飕飕
文章来源地址https://www.toymoban.com/news/detail-776245.html
import com.alibaba.dubbo.common.utils.CollectionUtils; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.HttpClients; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.hslf.usermodel.*; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.sl.usermodel.TextParagraph; import org.apache.poi.xslf.usermodel.*; import org.apache.poi.xwpf.usermodel.*; import java.io.*; import java.util.Date; import java.util.List; import java.util.stream.Collectors; public class FileProcessorUtils { /*** * 此方法针对本地文件 * 提取文件信息并返回内容 * @param filePath 文件储存地址 * @param imgRoot 图片存储地址 * @return */ public static String processDocumentFromFilePath(String filePath,String imgRoot) throws IOException { File file = new File(filePath); FileInputStream fileInputStream = new FileInputStream(file); // 根据文件类型调用适当的处理方法 switch (fileTypeName(filePath)) { case "doc": return processWordDocDocumentFromStream(fileInputStream,imgRoot); case "docx": return processWordDocxDocumentFromStream(fileInputStream,imgRoot); case "pdf": return processPdfDocumentFromStream(fileInputStream,imgRoot); case "ppt": return processPptDocumentFromStream(fileInputStream,imgRoot); case "pptx": return processPptxDocumentFromStream(fileInputStream,imgRoot); default: throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)"); } } /*** * 此方法针对网络文件 * 提取文件信息并返回内容 * @param downloadUrl 文件下载链接 * @param imgRoot 图片存储地址 * @return */ public static String processDocumentFromUrl(String downloadUrl,String imgRoot) throws IOException { HttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(downloadUrl); HttpResponse response = httpClient.execute(httpGet); //获取文件类型 // TODO: 2023/9/14 此处并不是所有的下载链接都存在后缀信息,如果为了提升代码的健壮性,可以在此处修改代码以获取文件类型 
String typeName = fileTypeName(downloadUrl); // 根据文件类型调用适当的处理方法 switch (typeName) { case "doc": return processWordDocDocumentFromStream(response.getEntity().getContent(),imgRoot); case "docx": return processWordDocxDocumentFromStream(response.getEntity().getContent(),imgRoot); case "pdf": return processPdfDocumentFromStream(response.getEntity().getContent(),imgRoot); case "ppt": return processPptDocumentFromStream(response.getEntity().getContent(),imgRoot); case "pptx": return processPptxDocumentFromStream(response.getEntity().getContent(),imgRoot); default: throw new RuntimeException("不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)"); } } /*** * word(doc)文件处理 * @param inputStream(文件流) * @return */ private static String processWordDocDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { HWPFDocument document = new HWPFDocument(inputStream); StringBuilder htmlText = new StringBuilder(); WordExtractor extractor = new WordExtractor(document); try { String[] paragraphs = extractor.getParagraphText(); for (int paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex++) { String paragraphText = paragraphs[paragraphIndex]; //获取文本对齐方式 String justification = getJustification(document.getRange().getParagraph(paragraphIndex).getJustification()); // 根据需要添加其他HTML标签 htmlText.append("<p style='text-align:").append(justification).append("'><span>").append(paragraphText).append("</span>").append("</p>"); } // 提取图片 List<Picture> pictures = document.getPicturesTable().getAllPictures(); for (int i = 0; i < pictures.size(); i++) { Picture picture = pictures.get(i); byte[] pictureData = picture.getContent(); String newFileName = new Date().getTime() + i + "_image." 
+ picture.suggestFileExtension(); // 可以根据需要更改扩展名,suggestFileExtension()方法自动获取合适的图片类型 String imgPath = saveImageToFile(pictureData, newFileName, imageRoot); htmlText.append("<p><img alt='' src='").append(imgPath).append("'></p >"); } } finally { extractor.close(); document.close(); } return htmlText.toString(); } /*** * word(docx)文件处理 * @param inputStream(文件流) * @return */ private static String processWordDocxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { //获取文件内容 XWPFDocument document = new XWPFDocument(inputStream); StringBuilder htmlText = new StringBuilder(); try { //获取所有元素 List<XWPFParagraph> paragraphs = document.getParagraphs(); //根据元素类型追加 for (XWPFParagraph paragraph : paragraphs) { //获取文本对齐方式 ParagraphAlignment alignment = paragraph.getAlignment(); htmlText.append("<p style='text-align:").append(alignment).append("'>"); List<XWPFRun> runs = paragraph.getRuns(); for (XWPFRun run : runs) { // 处理字体大小、样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); // 添加样式信息到HTML htmlText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily +";'>" + run.text() + "</span>"); } htmlText.append("</p>"); // 检查当前行段落是否有图片存在 List<XWPFPicture> pictures = paragraph.getRuns().stream() .flatMap(run -> run.getEmbeddedPictures().stream()) .collect(Collectors.toList()); if(CollectionUtils.isNotEmpty(pictures)){ if(pictures.size()>0){ pictures.forEach( bean ->{ XWPFPictureData pictureData = bean.getPictureData(); String newFileName = new Date().getTime() + "_image." 
+ pictureData.suggestFileExtension(); String imgPath = null; try { imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); } catch (IOException e) { throw new RuntimeException(e); } htmlText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>"); }); } } } } finally { document.close(); } return htmlText.toString(); } /*** * Pdf文件处理 * @param inputStream(文件流) * @return */ private static String processPdfDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { PDDocument pdfDocument = PDDocument.load(inputStream); PDFTextStripper textStripper = new PDFTextStripper(); StringBuilder htmlText = new StringBuilder(); String[] lines = textStripper.getText(pdfDocument).split("\n"); for (String line : lines) { htmlText.append("<p style='text-align:left'>").append(line).append("</p>"); } pdfDocument.close(); return htmlText.toString(); } /** * 处理PPT(.ppt)文件 * @param inputStream(文件流) * @return * @throws IOException */ private static String processPptDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { HSLFSlideShow ppt = new HSLFSlideShow(inputStream); StringBuilder pptText = new StringBuilder(); try { // 提取文本内容 for (HSLFSlide slide : ppt.getSlides()) { for (HSLFShape shape : slide.getShapes()) { //如果是文本处理文本 if (shape instanceof HSLFTextShape) { HSLFTextShape textShape = (HSLFTextShape) shape; for (HSLFTextParagraph paragraph : textShape.getTextParagraphs()) { //获取文本对齐方式 TextParagraph.TextAlign textAlign = paragraph.getTextAlign(); pptText.append("<p style='text-align:").append(textAlign).append("'>"); for (HSLFTextRun run : paragraph.getTextRuns()) { // 处理字体大小、字体样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); run.getRawText(); // 添加样式信息到HTML pptText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily + ";'>" + run.getRawText() + "</span>"); } pptText.append("</p>"); // 换行处理 } }else if (shape instanceof 
HSLFPictureShape) { // 如果是图片,处理图片 HSLFPictureShape pictureShape = (HSLFPictureShape) shape; HSLFPictureData pictureData = pictureShape.getPictureData(); String contentType = pictureData.getContentType(); String newFileName = new Date().getTime() + "_image." + imageTypeName(contentType); String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); pptText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>"); } } } } finally { ppt.close(); } return pptText.toString(); } /** * 处理PPTX(.pptx)文件 * @param inputStream(文件流) * @return * @throws IOException */ private static String processPptxDocumentFromStream(InputStream inputStream,String imageRoot) throws IOException { XMLSlideShow pptx = new XMLSlideShow(inputStream); StringBuilder pptxText = new StringBuilder(); try { // 提取文本内容 for (XSLFSlide slide : pptx.getSlides()) { for (XSLFShape shape : slide.getShapes()) { if (shape instanceof XSLFTextShape) { XSLFTextShape textShape = (XSLFTextShape) shape; for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) { //获取文本对齐方式 TextParagraph.TextAlign textAlign = paragraph.getTextAlign(); pptxText.append("<p style='text-align:").append(textAlign).append("'>"); for (XSLFTextRun run : paragraph.getTextRuns()) { // 处理字体大小、字体样式等信息 String fontSize = run.getFontSize() + "pt"; String fontFamily = run.getFontFamily(); // 添加样式信息到HTML pptxText.append("<span style='font-size:" + fontSize + "; font-family:" + fontFamily + ";'>" + run.getRawText() + "</span>"); } pptxText.append("</p>"); // 换行处理 } }else if (shape instanceof XSLFPictureShape) { // 如果是图片,处理图片 XSLFPictureShape pictureShape = (XSLFPictureShape) shape; XSLFPictureData pictureData = pictureShape.getPictureData(); String newFileName = new Date().getTime() + "_image." 
+ pictureData.suggestFileExtension(); String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot); pptxText.append("<p style='text-align:center'><img src='").append(imgPath).append("'></p>"); } } } } finally { pptx.close(); } return pptxText.toString(); } /** * 保存图片到指定位置,并返回引用地址 * @param imageData * @param imageRoot * @return * @throws IOException */ public static String saveImageToFile(byte[] imageData, String imageFileName, String imageRoot) throws IOException { String imagePath = imageRoot + File.separator + imageFileName; File file = new File(imageRoot); if(!file.exists()){ file.mkdir(); } try (FileOutputStream fos = new FileOutputStream(imagePath)) { fos.write(imageData); } return imagePath; } /** * 表格处理 * @param table * @return */ private static String getTableHtmlText(XWPFTable table) { StringBuilder tableHtml = new StringBuilder("<table>"); for (XWPFTableRow row : table.getRows()) { tableHtml.append("<tr>"); for (XWPFTableCell cell : row.getTableCells()) { tableHtml.append("<td>").append(cell.getText()).append("</td>"); } tableHtml.append("</tr>"); } tableHtml.append("</table>"); return tableHtml.toString(); } /*** * 获取文件后缀 * @param filePath * @return */ private static String fileTypeName(String filePath) { int dotIndex = filePath.lastIndexOf("."); if (dotIndex > 0) { return filePath.substring(dotIndex + 1).toLowerCase(); } return ""; } /*** * 获取图片类型 * @param imagePath * @return */ private static String imageTypeName(String imagePath) { int dotIndex = imagePath.lastIndexOf("/"); if (dotIndex > 0) { return imagePath.substring(dotIndex + 1).toLowerCase(); } return ""; } /*** * doc文档获取当前行对齐方式 默认左对齐 * @param type * @return */ private static String getJustification(Integer type) { switch (type) { case 0: return "left"; case 1: return "center"; case 2: return "right"; default: return "left"; } } }
到了这里,关于JAVA读取(DOC、DOCX、PDF、PPT、PPTX)文件文本内容及图片的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!