网站办公室,河南广企网络科技有限公司,万物识别扫一扫,宝塔有WordPress需求：识别pdf文件中的中文
根据github项目mymonstercat 改造,先将pdf文件转为png文件存于临时文件夹，然后通过RapidOcr转为文字,最后删除临时文件夹
1、引入依赖 dependencygroupIdorg.apache.pdfbox/groupIdartifactId识别pdf文件中的中文
根据github项目mymonstercat 改造,先将pdf文件转为png文件存于临时文件夹，然后通过RapidOcr转为文字,最后删除临时文件夹
1、引入依赖
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>3.0.3</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.3</version>
</dependency>
<!-- ocr图片识别 -->
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr</artifactId>
    <version>0.0.7</version>
</dependency>
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr-onnx-platform</artifactId>
    <version>0.0.7</version>
</dependency>
<!-- 本地测试可不引 , 服务器部署linux x86架构 下引入 ,其他环境部署可搜maven -->
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr-onnx-linux-x86_64</artifactId>
    <version>1.2.2</version>
</dependency>
2、工具类
import org.springframework.util.StringUtils;
import com.benjaminwan.ocrlibrary.OcrResult;
import com.benjaminwan.ocrlibrary.TextBlock;import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
Service
public class PdfOCRConverter {//临时输出png文件路径private static final String outputDirs D:/pdfToImg/temp/;public static void main(String[] args) throws IOException {ListString fileNameList getWords(D:/Download/123.pdf);for (String fileName : fileNameList) {System.out.println(fileName);}}public static ListString getWords(String pdfFilePath) throws IOException {String outputDir outputDirs UUID.randomUUID().toString().replace(-, );ListString fileNameList convertPdfToImage(pdfFilePath, outputDir);ListString wordsList new ArrayList();for (String fileName : fileNameList) {System.out.println(识别图片fileName);if (StringUtils.isEmpty(fileName)){break;}ListString words runOcr(fileName);for (String word : words) {System.out.println(word);wordsList.add(word);}}deleteDirectory(outputDir);return wordsList;}public static ListString runOcr(String path) {ListString results new ArrayList();InferenceEngine engine InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);OcrResult ocrResult engine.runOcr(path);for (TextBlock textBlock : ocrResult.getTextBlocks()) {results.add(textBlock.getText());}return results;}public static ListString convertPdfToImage(String pdfFilePath, String outputDir) {// 设置DPI越高图片越清晰但文件也会更大int dpi 300;ListString fileNameList new ArrayList();File file new File(pdfFilePath);try (PDDocument document Loader.loadPDF(file)) {PDFRenderer pdfRenderer new PDFRenderer(document);String pdfFileName file.getName().replace(.pdf, );String name pdfFileName;for (int page 0; page document.getNumberOfPages(); page) {BufferedImage bim pdfRenderer.renderImageWithDPI(page, dpi);String folder createFolder(outputDir / name);String fileName folder / pdfFileName _page_ (page 1) .png;ImageIO.write(bim, png, new File(fileName));fileNameList.add(fileName);System.out.println(生成图片fileName);}} catch (IOException e) {e.printStackTrace();}return fileNameList;}public static void deleteDirectory(String path) throws IOException {// 如果路径不指向一个目录则抛出异常Path directory Paths.get(path);if (!Files.isDirectory(directory)) 
{throw new IOException(The provided path is not a directory.);}// 遍历目录中的所有文件和子目录Files.walkFileTree(directory, new SimpleFileVisitorPath() {Overridepublic FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {// 删除文件Files.delete(file);return FileVisitResult.CONTINUE;}Overridepublic FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {// 所有内容被删除后删除目录本身Files.delete(dir);return FileVisitResult.CONTINUE;}Overridepublic FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {// 如果访问文件失败则抛出异常throw exc;}});}public static String createFolder(String folderPath) {String txt folderPath;try {File myFilePath new File(txt);txt folderPath;if (!myFilePath.exists()) {myFilePath.mkdirs();}} catch (Exception e) {e.printStackTrace();}return txt;}public static ListString getWordsByBase64(String base64) throws IOException {ListString words new ArrayList();if (StringUtils.isEmpty(base64)) {return null;}String outputDir outputDirs UUID.randomUUID().toString().replace(-, );// 解码Base64字符串byte[] decodedBytes Base64.getDecoder().decode(base64);createFolder(outputDir);// 输出的PDF文件名String outputFilePath outputDir/output.pdf;try (FileOutputStream fos new FileOutputStream(outputFilePath)) {// 将解码后的字节数组写入文件fos.write(decodedBytes);System.out.println(PDF文件已成功生成: outputFilePath);words getWords(outputFilePath);} catch (Exception e) {e.printStackTrace();}deleteDirectory(outputDir);return words;}}
文章转载自: http://www.morning.sjmxh.cn.gov.cn.sjmxh.cn http://www.morning.kzcz.cn.gov.cn.kzcz.cn http://www.morning.gbjxj.cn.gov.cn.gbjxj.cn http://www.morning.drywd.cn.gov.cn.drywd.cn http://www.morning.byywt.cn.gov.cn.byywt.cn http://www.morning.tbcfj.cn.gov.cn.tbcfj.cn http://www.morning.kggxj.cn.gov.cn.kggxj.cn http://www.morning.lfttb.cn.gov.cn.lfttb.cn http://www.morning.bsrqy.cn.gov.cn.bsrqy.cn http://www.morning.hxcrd.cn.gov.cn.hxcrd.cn http://www.morning.ljjph.cn.gov.cn.ljjph.cn http://www.morning.yqndr.cn.gov.cn.yqndr.cn http://www.morning.dwfxl.cn.gov.cn.dwfxl.cn http://www.morning.dztp.cn.gov.cn.dztp.cn http://www.morning.rzysq.cn.gov.cn.rzysq.cn http://www.morning.ktnmg.cn.gov.cn.ktnmg.cn http://www.morning.lhxdq.cn.gov.cn.lhxdq.cn http://www.morning.qlxgc.cn.gov.cn.qlxgc.cn http://www.morning.ypktc.cn.gov.cn.ypktc.cn http://www.morning.bchgl.cn.gov.cn.bchgl.cn http://www.morning.1000sh.com.gov.cn.1000sh.com http://www.morning.crkhd.cn.gov.cn.crkhd.cn http://www.morning.webife.com.gov.cn.webife.com http://www.morning.qjlnh.cn.gov.cn.qjlnh.cn http://www.morning.xxgfl.cn.gov.cn.xxgfl.cn http://www.morning.jhzct.cn.gov.cn.jhzct.cn http://www.morning.gwjqq.cn.gov.cn.gwjqq.cn http://www.morning.fkmqg.cn.gov.cn.fkmqg.cn http://www.morning.mzhhr.cn.gov.cn.mzhhr.cn http://www.morning.xfxnq.cn.gov.cn.xfxnq.cn http://www.morning.nmfml.cn.gov.cn.nmfml.cn http://www.morning.rknjx.cn.gov.cn.rknjx.cn http://www.morning.gbsby.cn.gov.cn.gbsby.cn http://www.morning.tpssx.cn.gov.cn.tpssx.cn http://www.morning.kpxnz.cn.gov.cn.kpxnz.cn http://www.morning.nyqnk.cn.gov.cn.nyqnk.cn http://www.morning.fyzsq.cn.gov.cn.fyzsq.cn http://www.morning.kqgqy.cn.gov.cn.kqgqy.cn http://www.morning.bfcrp.cn.gov.cn.bfcrp.cn http://www.morning.zwgbz.cn.gov.cn.zwgbz.cn http://www.morning.lfpdc.cn.gov.cn.lfpdc.cn http://www.morning.grlth.cn.gov.cn.grlth.cn http://www.morning.rqqct.cn.gov.cn.rqqct.cn http://www.morning.mdtfh.cn.gov.cn.mdtfh.cn http://www.morning.nlkjq.cn.gov.cn.nlkjq.cn 
http://www.morning.zdmlt.cn.gov.cn.zdmlt.cn http://www.morning.jwcmq.cn.gov.cn.jwcmq.cn http://www.morning.zlhcw.cn.gov.cn.zlhcw.cn http://www.morning.qwrb.cn.gov.cn.qwrb.cn http://www.morning.gbfck.cn.gov.cn.gbfck.cn http://www.morning.rcjwl.cn.gov.cn.rcjwl.cn http://www.morning.lnmby.cn.gov.cn.lnmby.cn http://www.morning.yhplt.cn.gov.cn.yhplt.cn http://www.morning.bpmdz.cn.gov.cn.bpmdz.cn http://www.morning.txltb.cn.gov.cn.txltb.cn http://www.morning.c7498.cn.gov.cn.c7498.cn http://www.morning.zqzhd.cn.gov.cn.zqzhd.cn http://www.morning.kxwsn.cn.gov.cn.kxwsn.cn http://www.morning.ntlxg.cn.gov.cn.ntlxg.cn http://www.morning.xltwg.cn.gov.cn.xltwg.cn http://www.morning.kmkpm.cn.gov.cn.kmkpm.cn http://www.morning.bxrlt.cn.gov.cn.bxrlt.cn http://www.morning.pctsq.cn.gov.cn.pctsq.cn http://www.morning.bkkgt.cn.gov.cn.bkkgt.cn http://www.morning.hhxpl.cn.gov.cn.hhxpl.cn http://www.morning.dwdjj.cn.gov.cn.dwdjj.cn http://www.morning.lxbml.cn.gov.cn.lxbml.cn http://www.morning.owenzhi.com.gov.cn.owenzhi.com http://www.morning.hdrsr.cn.gov.cn.hdrsr.cn http://www.morning.dhnqt.cn.gov.cn.dhnqt.cn http://www.morning.hbnwr.cn.gov.cn.hbnwr.cn http://www.morning.fqqlq.cn.gov.cn.fqqlq.cn http://www.morning.jtdrz.cn.gov.cn.jtdrz.cn http://www.morning.tkqzr.cn.gov.cn.tkqzr.cn http://www.morning.lgnrl.cn.gov.cn.lgnrl.cn http://www.morning.xnzmc.cn.gov.cn.xnzmc.cn http://www.morning.gwwtm.cn.gov.cn.gwwtm.cn http://www.morning.chongzhanggui.cn.gov.cn.chongzhanggui.cn http://www.morning.jwpcj.cn.gov.cn.jwpcj.cn http://www.morning.jppb.cn.gov.cn.jppb.cn