百度飞桨PaddleOCR-VL识别发票图片输出json格式
飞桨ocr 识别 图片或pdf, 最终生成标准的json文件。
百度飞桨产品线 PP-OCRv5、PP-StructureV3、PP-ChatOCRv4、PaddleOCR-VL 的区别。
1. PP-OCRv5(基础 OCR)
- 定位:纯文字识别(检测 + 识别)
- 能力:只输出「文字 + 坐标」,无结构、无表格、无字段抽取
- 输出:
[{"text":"xxx","bbox":[]}, ...]
- 适合:纯文字场景(名片、街景、截图)
- 不适合:发票(要结构、要表格、要字段)
2. PP-StructureV3(文档结构化)
- 定位:版面分析 + 表格识别 + 文档结构化
- 能力:
- 识别文字、标题、表格、公式、印章
- 自动解析发票 / 表单 / 合同结构
- 直接输出 JSON/Markdown/Excel
- 输出:带字段名的结构化 JSON(invoice_num、total_amount、date 等)
- 适合:你的需求(发票→JSON)首选
3. PP-ChatOCRv4(对话式 OCR)
- 定位:OCR + 大模型(ERNIE 4.5),支持自然语言提问
- 能力:
- 问:"发票总金额多少?" → 直接答数字
- 支持复杂信息抽取、纠错、总结
- 适合:需要对话交互、模糊查询、多文档问答
- 缺点:重、慢、依赖 API,没必要用于单纯发票转 JSON
4. PaddleOCR-VL(多模态文档大模型)
- 定位:0.9B 视觉 - 语言模型,超强文档理解
- 能力:
- 支持 109 种语言、异形文本、倾斜 / 模糊文档
- 表格 / 公式 / 图表全能,结构还原极强
- 直接输出 JSON/Markdown
- 适合:极复杂发票、低质扫描件、混合多语言票据
- 版本:你要的 PaddleOCR-VL-1.5(最强)
---------------------------
docker OCR 最终环境
PaddlePaddle 版本: 3.0.0
PaddleOCR 版本 : 2.9.1
docker VL最终环境
✅ Python 3.10.11
✅ PaddlePaddle 3.2.1
✅ PaddleOCR 3.5.0
✅ PaddleNLP 2.6.1
✅ PaddleX 3.5.1
👉 全部满足 PaddleOCR-VL-1.5 运行要求,环境 100% 合格!
要支持识别pdf: pip install pymupdf -i https://pypi.tuna.tsinghua.edu.cn/simple
docker 环境
Dockerfile
FROM paddlepaddle/paddle:3.2.0

WORKDIR /app

# Point APT at the Tsinghua mirror, then install the shared libraries that
# OpenCV needs at import time. The package index is removed afterwards to keep
# the layer small.
RUN sed -i \
        -e 's|deb.debian.org|mirrors.tuna.tsinghua.edu.cn|g' \
        -e 's|security.debian.org|mirrors.tuna.tsinghua.edu.cn|g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip (Tsinghua PyPI mirror).
RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple

# Core libraries, pinned to the versions this setup was validated with.
RUN pip install "paddlex[ocr]==3.5.1" -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install "paddleocr[vl]==3.5.0" paddlenlp -i https://pypi.tuna.tsinghua.edu.cn/simple

# NOTE: the invalid `paddlex setup ocr_vl` step was removed on purpose;
# the models are downloaded automatically the first time the code runs.
CMD ["/bin/bash"]
docker-compose.yml
version: "3.8"

services:
  paddle-ocr-vl:
    build:
      context: .
    container_name: paddle-ocr-vl
    working_dir: /app
    volumes:
      # Bind-mount the project directory so host files are visible at /app.
      - ./:/app
    # Keep an interactive terminal attached to the container.
    stdin_open: true
    tty: true
识别图片和pdf代码:
"""Run PaddleOCR-VL on an image or a PDF and save the results as JSON/Markdown."""

import os

import cv2
import numpy as np
from paddleocr import PaddleOCRVL
from PIL import Image

# File extensions that are opened directly as a single image.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")


def enhance_image(img):
    """Binarize and sharpen an image before recognition.

    Args:
        img: A ``PIL.Image.Image`` (converted from RGB to BGR here) or an
            OpenCV array. NOTE(review): arrays are assumed to already be BGR.

    Returns:
        A ``PIL.Image.Image`` in RGB mode holding the enhanced image.
    """
    if isinstance(img, Image.Image):
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Gaussian adaptive threshold: 15-pixel neighbourhood, constant offset 10.
    binary = cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        15,
        10,
    )
    # 3x3 sharpening kernel.
    kernel = np.array(
        [[0, -1, 0], [-1, 5, -1], [0, -1, 0]],
        dtype=np.float32,
    )
    sharp = cv2.filter2D(binary, -1, kernel)
    return Image.fromarray(sharp).convert("RGB")


def pdf_to_images(pdf_path):
    """Render every page of a PDF to an RGB image at 2x zoom.

    This is best-effort: a failure is reported on stdout and an empty list is
    returned, so the caller can carry on.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, or ``[]`` when
        PyMuPDF is missing or the file cannot be rendered.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("PDF转换失败,请安装:pip install pymupdf")
        return []

    images = []
    try:
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            zoom = fitz.Matrix(2.0, 2.0)
            for page in doc:
                # No alpha channel, so the samples match the "RGB" mode below.
                pix = page.get_pixmap(matrix=zoom, alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
    except Exception as exc:  # best-effort boundary: report the real cause
        print(f"PDF转换失败: {exc}")
        return []
    return images


def recognize_file(file_path, pipeline):
    """Recognize an image or a PDF with the given pipeline.

    Args:
        file_path: Path of the input file; the type is chosen by extension.
        pipeline: Object exposing ``predict(ndarray)`` that returns an iterable
            of results (a ``PaddleOCRVL`` instance in this script).

    Returns:
        A list with the results of every page, or ``[]`` when the file type is
        not supported or no page could be loaded.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext in IMAGE_EXTENSIONS:
        # convert() loads the pixel data, so the file can be closed right away.
        with Image.open(file_path) as src:
            pages = [src.convert("RGB")]
    elif ext == ".pdf":
        pages = pdf_to_images(file_path)
    else:
        print(f"不支持的文件类型: {file_path}")
        return []

    results = []
    for page in pages:
        enhanced = enhance_image(page)
        results.extend(pipeline.predict(np.array(enhanced)))
    return results


if __name__ == "__main__":
    # Load the model.
    pipeline = PaddleOCRVL(device="cpu")

    # Input file: either an image or a PDF. To process a PDF instead, use e.g.
    # os.path.join("2026-04-22", "年审发票.pdf").
    file_path = os.path.join("2026-04-22", "141bdbfd-2aa629c7af95.jpg")

    output = recognize_file(file_path, pipeline)

    # Print each result and save it as JSON and Markdown.
    for res in output:
        res.print()
        res.save_to_json("./output")
        res.save_to_markdown("./output")
下面是 paddleOCR 环境
Dockerfile
FROM paddlepaddle/paddle:3.0.0

WORKDIR /app

# System dependencies: point APT at the Tsinghua mirror, install the shared
# libraries, then drop the package index to keep the layer small.
RUN sed -i \
        -e 's|deb.debian.org|mirrors.tuna.tsinghua.edu.cn|g' \
        -e 's|security.debian.org|mirrors.tuna.tsinghua.edu.cn|g' \
        /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libsm6 \
        libxrender1 \
        libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Install the pinned Python dependencies.
COPY requirements.txt .
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

# Copy the application code.
COPY . .

# Start an interactive shell by default.
# To run the script directly instead, use: CMD ["python", "main.py"]
CMD ["/bin/bash"]
docker-compose.yml
version: "3.8"

services:
  paddle-ocr:
    build:
      context: .
    container_name: paddle-ocr
    working_dir: /app
    volumes:
      # Bind-mount the project directory so host files are visible at /app.
      - ./:/app
    # Keep an interactive terminal attached to the container.
    stdin_open: true
    tty: true
发票结构化解析
"""Run plain PaddleOCR (text detection + recognition) on one invoice image."""

import os

import cv2
import numpy as np
from paddleocr import PaddleOCR

# Build the OCR engine: Chinese model, CPU inference, library logging disabled.
ocr = PaddleOCR(
    lang="ch",
    device="cpu",
    show_log=False,
)

# Invoice image to recognize.
file_path = "../images/491b5d3a-ab7e-4ebb-a53b-5f108ea18527.jpg"

# Run recognition on the image.
result = ocr.ocr(file_path)

# Dump the raw result, one entry per page/image.
print("\n===== 发票结构化结果 =====\n")
for item in result or []:
    print(item)

# Print only the recognized text of every line.
print("===== 识别文字结果 =====")
for res in result or []:
    # NOTE(review): PaddleOCR 2.x appears to report a page without any detected
    # text as None; skip it instead of failing on iteration - confirm against
    # the installed version.
    if not res:
        continue
    for line in res:
        # Each line is indexed as [box, (text, score)] by the original code;
        # only the text is printed.
        text = line[1][0]
        print(text)
检查环境
"""Check that the local environment matches the PaddleOCR-VL version rules."""

import sys
from importlib import metadata

from packaging import version


def _installed_version(dist_name):
    """Return the installed version string of *dist_name*, or None if absent."""
    try:
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        return None


def _report_min_version(pkg, rule, too_old_msg):
    """Print the status of a package that is checked against a minimum version.

    Args:
        pkg: Distribution name to look up.
        rule: Dict with the ``"min"`` and ``"recommend"`` version strings.
        too_old_msg: Line printed when the installed version is below ``min``.
    """
    found = _installed_version(pkg)
    print(f"\n🔹 {pkg.upper()}: {found or '未安装'}")
    if found is None:
        print(" ❌ 未安装!")
    elif version.parse(found) >= version.parse(rule["min"]):
        print(f" ✅ 正常(要求 ≥{rule['min']},推荐 {rule['recommend']})")
    else:
        print(too_old_msg)


def main():
    """Print a report on Python and the Paddle packages installed locally.

    The report covers the Python version and the installed versions of
    paddlepaddle, paddleocr, paddlenlp and paddlex. Nothing is returned;
    all output goes to stdout.
    """
    print("=" * 60)
    print("📌 PaddleOCR-VL 官方要求版本检测脚本")
    print("=" * 60)

    # Version rules the environment is checked against.
    required = {
        "python": {"min": "3.8.0", "max": "3.11.0", "recommend": "3.10.x"},
        "paddlepaddle": {"min": "3.0.0", "recommend": "3.2.0"},
        "paddleocr": {"options": ["3.4.0", "3.5.0"]},
        "paddlenlp": {"min": "2.6.0", "recommend": "3.0.0"},
        "paddlex": {"must_install": True},
    }

    # 1. Python interpreter.
    info = sys.version_info
    py_ver = f"{info.major}.{info.minor}.{info.micro}"
    print(f"\n🐍 Python 版本: {py_ver}")

    # Compare on (major, minor) only, so that every 3.11.x patch release is
    # accepted, as the "3.8 ~ 3.11" message below promises.
    py_rule = required["python"]
    lowest = version.parse(py_rule["min"]).release[:2]
    highest = version.parse(py_rule["max"]).release[:2]
    if lowest <= tuple(info[:2]) <= highest:
        print(f" ✅ 符合要求(要求 {py_rule['min']} ~ {py_rule['max']})")
    else:
        print(" ❌ 不支持!请安装 3.8 ~ 3.11(推荐 3.10)")

    # 2. Installed libraries.
    print("\n📦 依赖库版本检查:")

    # PaddlePaddle: minimum version.
    pkg = "paddlepaddle"
    _report_min_version(
        pkg,
        required[pkg],
        f" ❌ 版本过低,请升级至 ≥{required[pkg]['min']}",
    )

    # PaddleOCR: must be one of the explicitly listed versions.
    pkg = "paddleocr"
    found = _installed_version(pkg)
    print(f"\n🔹 {pkg.upper()}: {found or '未安装'}")
    if found is None:
        print(" ❌ 未安装!请安装 paddleocr[vl]==3.4.0")
    elif found in required[pkg]["options"]:
        print(f" ✅ 支持 VL-1.5(支持版本:{'/'.join(required[pkg]['options'])})")
    else:
        print(" ⚠️ 非推荐版本,建议安装 3.4.0 或 3.5.0")

    # PaddleNLP: minimum version.
    pkg = "paddlenlp"
    _report_min_version(pkg, required[pkg], " ❌ 版本过低,请升级")

    # PaddleX: only its presence is checked.
    pkg = "paddlex"
    found = _installed_version(pkg)
    print(f"\n🔹 {pkg.upper()}: {found or '未安装'}")
    if found is None:
        print(" ❌ 缺失!PaddleOCR-VL 必须依赖 paddlex")
    else:
        print(" ✅ 已安装")

    # 3. Legend for the symbols used above.
    print("\n" + "=" * 60)
    print("📊 总结:")
    print("✅ = 符合要求")
    print("⚠️ = 可运行但非最优")
    print("❌ = 必须修复才能运行 PaddleOCR-VL-1.5")
    print("=" * 60)


if __name__ == "__main__":
    main()

浙公网安备 33010602011771号