JAVA解析pdf文件-Toy模板网

这篇具有很好参考价值的文章主要介绍了JAVA解析pdf文件。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

自己记录一下，方便下次用，判断文件是否为pdf类型，并且解析文件内容
1.需要依赖的包

<!-- Apache PDFBox: supplies the org.apache.pdfbox.* classes imported by the utility class below. -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.19</version>
</dependency>

2.Util类

文章来源地址：https://www.toymoban.com/news/detail-656830.html

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import sun.misc.BASE64Decoder;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;

/**
 * 解析PDF文件內容
 * @author: Tang qiqi
 * @create: 2023-02-21 15:11
 * @Description:
 */
@Slf4j
public class PDFParserUtils {
	public static final String PDF = "PDF";

    /**
     * base64文件字符串
     * @param base64Content
     * @return
     */
    public static String parserFileContent(String base64Content, String fileName){
        // 判斷base64字符串內容是否為空
        if(base64Content == null || base64Content.trim().length() == 0) {
            return "";
        }
        
        FileOutputStream fos = null;
        try {
            File tempFile = File.createTempFile(fileName, ".PDF");
            fos = new FileOutputStream(tempFile);
            BASE64Decoder decoder = new BASE64Decoder();
            // Base64解码,对字节数组字符串进行Base64解码并生成文件
            byte[] byt = decoder.decodeBuffer(base64Content);
            for (int i = 0, len = byt.length; i < len; ++i) {
                // 调整异常数据
                if (byt[i] < 0) {
                    byt[i] += 256;
                }
            }
            // 判断文件是否PDF类型，如果不是，直接返回
            String fileType = getFileType(byt);
            if(!PDF.equals(fileType)) {
                log.info("文件格式不是PDF");
                return "";
            }
            fos.write(byt);
            fos.flush();
            fos.close();
            String text = readFile(tempFile);
            tempFile.deleteOnExit();
            return text;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if(fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    /**
     * 一次獲取整個文件內容
     * @param file 文件
     * @return
     */
    public static String readFile(File file) {
        if(file == null) {
            return "";
        }
        PDDocument doc = null;
        try {
            RandomAccessFile  is = new RandomAccessFile(file, "r");
            PDFParser parser = null;
            parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper textStripper = new PDFTextStripper();
            String content = textStripper.getText(doc);
            doc.close();
            return content;
        } catch (IOException e) {
            e.printStackTrace();
            log.info("解析PDF文件內容異常[{}]", e.getMessage());
        } finally {
            if(doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    /**
     * 分頁獲取文字內容，並將多頁內容拼接返回
     * @param file 文件
     * @return
     * @throws Exception
     */
    public static String readPage(File file) {
        if(file == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder("");
        PDDocument doc = null;
        try {
            RandomAccessFile is = new RandomAccessFile(file, "r");
            PDFParser parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 一次输出多个页时，按顺序输出
                textStripper.setSortByPosition(true);
                String s = textStripper.getText(doc);
                sb.append(s);
            }
            doc.close();
        } catch (Exception e) {
            e.printStackTrace();
            log.info("解析PDF文件內容異常[{}]", e.getMessage());
        } finally {
            if(doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    /**
     * 读取文本内容和图片
     * @param file 文件路徑
     */
    public static void readTextImage(File file) {
        if(file == null) {
            return;
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
//                String s = textStripper.getText(doc);
                // 读取图片
                PDPage page = doc.getPage(i - 1);
                PDResources resources = page.getResources();
                // 获取页中的对象
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects != null) {
                    Iterator<COSName> imageIter = xobjects.iterator();
                    while (imageIter.hasNext()) {
                        COSName cosName = imageIter.next();
                        boolean isImageXObject = resources.isImageXObject(cosName);
                        if (isImageXObject) {
                            // 获取每页资源的图片
                            PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                            File outputfile = new File("第 " + (i) + " 页" + cosName.getName() + ".jpg");
                            ImageIO.write(ixt.getImage(), "jpg", outputfile);
                        }
                    }
                }
            }
            doc.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if(doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

    }

    /**
     *
     * @param file
     * @param x 指定的x坐标
     * @param y 指定的y坐标
     * @param width 矩形的宽度
     * @param height 矩形的高度
     * @return
     */
    public static String readRectangle(File file, int x, int y, int width, int height){
        if(file == null) {
            return "";
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            // y轴向下为正，x轴向右为正。
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
            stripperByArea.setSortByPosition(true);
            // 划定区域
            Rectangle2D rect = new Rectangle(x, y, width, height);
            stripperByArea.addRegion("area", rect);
            PDPage page = doc.getPage(1);
            stripperByArea.extractRegions(page);
            // 获取区域的text
            String text = stripperByArea.getTextForRegion("area");
            text = text.trim();
            doc.close();
            return text;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if(doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }
	
	/**
     * 根據io流前4個字節，判斷文件類型
     * @param ioBytes
     * @return
     */
    private static String getFileType(byte []ioBytes){
        if(ioBytes == null || ioBytes.length < 4){
            log.error("非正常文件");
            throw new ErrorException("Abnormal image file.");
        }
        byte[] b = new byte[4];
        for(int i = 0; i < 4; i ++){
            b[i] = ioBytes[i];
        }
        String type = bytesToHexString(b).toUpperCase();
        if (type.contains("25504446")){
            return "PDF";
        }else if(type.contains("504B0304")) {
            return "ZIP";
        } else if(type.contains("52617221")){
            return "RAR";
        }
        return "";
    }

    /**
     * byte数组转换成16进制字符串
     * @param src
     * @return
     */
    private static String bytesToHexString(byte[] src){
        StringBuilder stringBuilder = new StringBuilder();
        if (src == null || src.length <= 0) {
            return null;
        }
        for (int i = 0; i < src.length; i++) {
            int v = src[i] & 0xFF;
            String hv = Integer.toHexString(v);
            if (hv.length() < 2) {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }
/** 常用文件的文件头如下：
JPEG (jpg)，文件头：FFD8FF
PNG (png)，文件头：89504E47
GIF (gif)，文件头：47494638
TIFF (tif)，文件头：49492A00
Windows Bitmap (bmp)，文件头：424D
CAD (dwg)，文件头：41433130
Adobe Photoshop (psd)，文件头：38425053
Rich Text Format (rtf)，文件头：7B5C727466
XML (xml)，文件头：3C3F786D6C
HTML (html)，文件头：68746D6C3E
Email [thorough only] (eml)，文件头：44656C69766572792D646174653A
Outlook Express (dbx)，文件头：CFAD12FEC5FD746F
Outlook (pst)，文件头：2142444E
MS Word/Excel (xls.or.doc)，文件头：D0CF11E0
MS Access (mdb)，文件头：5374616E64617264204A
WordPerfect (wpd)，文件头：FF575043
Postscript. (eps.or.ps)，文件头：252150532D41646F6265
Adobe Acrobat (pdf)，文件头：255044462D312E
Quicken (qdf)，文件头：AC9EBD8F
Windows Password (pwl)，文件头：E3828596
ZIP Archive (zip)，文件头：504B0304
RAR Archive (rar)，文件头：52617221
Wave (wav)，文件头：57415645
AVI (avi)，文件头：41564920
Real Audio (ram)，文件头：2E7261FD
Real Media (rm)，文件头：2E524D46
MPEG (mpg)，文件头：000001BA
MPEG (mpg)，文件头：000001B3
Quicktime (mov)，文件头：6D6F6F76
Windows Media (asf)，文件头：3026B2758E66CF11
MIDI (mid)，文件头：4D546864
*/	
}