百度飞桨PaddleOCR-VL识别发票图片输出json格式

使用飞桨 OCR 识别图片或 PDF,最终生成标准的 JSON 文件。

百度飞桨产品线 PP-OCRv5、PP-StructureV3、PP-ChatOCRv4、PaddleOCR-VL 的区别。

1. PP-OCRv5(基础 OCR)

  • 定位:纯文字识别(检测 + 识别)
  • 能力:只输出「文字 + 坐标」,无结构、无表格、无字段抽取
  • 输出:[{"text":"xxx","bbox":[]}, ...]
  • 适合:纯文字场景(名片、街景、截图)
  • 不适合:发票(要结构、要表格、要字段)

2. PP-StructureV3(文档结构化)

  • 定位:版面分析 + 表格识别 + 文档结构化
  • 能力:
    • 识别文字、标题、表格、公式、印章
    • 自动解析发票 / 表单 / 合同结构
    • 直接输出 JSON/Markdown/Excel
     
  • 输出:带字段名的结构化 JSON(invoice_num, total_amount, date 等)
  • 适合:你的需求(发票→JSON)首选

3. PP-ChatOCRv4(对话式 OCR)

  • 定位:OCR + 大模型(ERNIE 4.5),支持自然语言提问
  • 能力:
    • 问:"发票总金额多少?" → 直接答数字
    • 支持复杂信息抽取、纠错、总结
     
  • 适合:需要对话交互、模糊查询、多文档问答
  • 缺点:重、慢、依赖 API,没必要用于单纯发票转 JSON

4. PaddleOCR-VL(多模态文档大模型)

  • 定位:0.9B 视觉 - 语言模型,超强文档理解
  • 能力:
    • 支持 109 种语言、异形文本、倾斜 / 模糊文档
    • 表格 / 公式 / 图表全能,结构还原极强
    • 直接输出 JSON/Markdown
     
  • 适合:极复杂发票、低质扫描件、混合多语言票据
  • 版本:你要的 PaddleOCR-VL-1.5(最强)

---------------------------

docker OCR 最终环境

PaddlePaddle 版本: 3.0.0
PaddleOCR 版本 : 2.9.1

docker VL最终环境

✅ Python 3.10.11
✅ PaddlePaddle 3.2.1
✅ PaddleOCR 3.5.0
✅ PaddleNLP 2.6.1
✅ PaddleX 3.5.1
👉 全部满足 PaddleOCR-VL-1.5 运行要求,环境 100% 合格!

要支持识别pdf: pip install pymupdf -i https://pypi.tuna.tsinghua.edu.cn/simple

docker 环境

Dockerfile
# Image for running PaddleOCR-VL: starts from the PaddlePaddle 3.2.0 base image
# and adds the system libraries and Python packages installed below.
FROM paddlepaddle/paddle:3.2.0

WORKDIR /app

# Point APT at the Tsinghua mirror, then install the system libraries that
# OpenCV needs; the APT package lists are removed afterwards.
RUN sed -i 's|deb.debian.org|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list && \
    sed -i 's|security.debian.org|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip (via the Tsinghua PyPI mirror).
RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple

# Install the core libraries with pinned versions (the author notes this was
# the key to fixing the earlier Docker build error).
RUN pip install "paddlex[ocr]==3.5.1" -i https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install "paddleocr[vl]==3.5.0" paddlenlp -i https://pypi.tuna.tsinghua.edu.cn/simple

# The incorrect command `paddlex setup ocr_vl` was removed from this file.
# Per the author's note, models are downloaded automatically the first time the code runs.

CMD ["/bin/bash"]

 

docker-compose.yml
# Compose definition for the PaddleOCR-VL container, built from the Dockerfile
# in the current directory.
version: '3.8'

services:
  paddle-ocr-vl:
    build: .
    container_name: paddle-ocr-vl
    volumes:
      - ./:/app  # bind-mount the local directory onto /app inside the container
    # Keep STDIN open and allocate a TTY so the container's shell stays interactive.
    stdin_open: true
    tty: true
    working_dir: /app

 

 

 

识别图片和pdf代码:

from paddleocr import PaddleOCRVL
from PIL import Image
import numpy as np
import cv2
import os


# ======================
# 图像增强(提升准确率)
# ======================
def enhance_image(img):
    """Binarize and sharpen an image before it is handed to the recognizer.

    Args:
        img: Either a ``PIL.Image.Image`` (converted from RGB to BGR channel
            order first) or an array that is passed to OpenCV as a BGR image.

    Returns:
        A ``PIL.Image.Image`` in ``"RGB"`` mode built from the thresholded,
        sharpened single-channel result.
    """
    bgr = img
    if isinstance(bgr, Image.Image):
        rgb_pixels = np.array(bgr)
        bgr = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    # Collapse to one channel, then apply a Gaussian adaptive threshold.
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    max_value = 255
    block_size = 15
    offset = 10
    thresholded = cv2.adaptiveThreshold(
        grayscale,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )

    # 3x3 sharpening kernel: centre weight 5, the four direct neighbours -1.
    sharpen_kernel = np.array(
        [
            [0, -1, 0],
            [-1, 5, -1],
            [0, -1, 0],
        ],
        dtype=np.float32,
    )
    sharpened = cv2.filter2D(thresholded, -1, sharpen_kernel)

    enhanced = Image.fromarray(sharpened)
    return enhanced.convert("RGB")


# ======================
# PDF 转图片(自动处理)
# ======================
def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Each page is rasterized with a 2x zoom matrix and wrapped in an RGB
    ``PIL.Image.Image``.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one RGB image per page, in page order. An empty list is
        returned (after printing a diagnostic) when PyMuPDF is not installed
        or when the document cannot be opened or rendered.
    """
    # Only a missing dependency should produce the "please install" hint.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("PDF转换失败,请安装:pip install pymupdf")
        return []

    try:
        zoom = fitz.Matrix(2.0, 2.0)
        images = []
        # The context manager closes the document even if a page fails to render.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # alpha=False keeps the samples at 3 bytes per pixel, matching "RGB".
                pix = page.get_pixmap(matrix=zoom, alpha=False)
                images.append(
                    Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                )
        return images
    except Exception as e:
        # Best-effort contract kept from the original: report the real cause
        # and return no pages rather than raising into the caller.
        print(f"PDF转换失败:{e}")
        return []


# ======================
# 统一识别函数(图片/PDF通用)
# ======================
def recognize_file(file_path, pipeline):
    """Run the recognition pipeline on an image file or on every page of a PDF.

    The file type is chosen from the (case-insensitive) extension. Each page
    image is passed through ``enhance_image`` and then to
    ``pipeline.predict`` as a NumPy array.

    Args:
        file_path: Path to a ``.jpg``/``.jpeg``/``.png``/``.bmp``/``.tif``/
            ``.tiff`` image or a ``.pdf`` document.
        pipeline: Object exposing ``predict(array)`` that returns an iterable
            of results (the script passes a ``PaddleOCRVL`` instance).

    Returns:
        A flat list of all results produced by ``pipeline.predict`` across
        pages. Empty when the extension is unsupported or a PDF yields no
        pages.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext in (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"):
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block exits.
        with Image.open(file_path) as source:
            pages = [source.convert("RGB")]
    elif ext == ".pdf":
        pages = pdf_to_images(file_path)
    else:
        # Say why nothing was recognized instead of returning silently.
        print(f"不支持的文件类型: {ext or file_path}")
        return []

    results = []
    for page in pages:
        enhanced = enhance_image(page)
        results.extend(pipeline.predict(np.array(enhanced)))
    return results


# ======================
# 主程序
# ======================
if __name__ == "__main__":
    # Build the PaddleOCR-VL pipeline for CPU inference.
    pipeline = PaddleOCRVL(device="cpu")

    # Input to recognize: may be an image or a PDF.
    # For a PDF, point file_path at it instead, e.g.
    #   os.path.join("2026-04-22", "年审发票.pdf")
    file_path = os.path.join("2026-04-22", "141bdbfd-2aa629c7af95.jpg")

    # Recognize, then print each result and save it as JSON and Markdown.
    output_dir = "./output"
    for page_result in recognize_file(file_path, pipeline):
        page_result.print()
        page_result.save_to_json(output_dir)
        page_result.save_to_markdown(output_dir)

 

 

 

下面是 paddleOCR 环境

 Dockerfile
# Image for the classic PaddleOCR script: PaddlePaddle 3.0.0 base image plus
# the system libraries and the Python packages listed in requirements.txt.
FROM paddlepaddle/paddle:3.0.0

WORKDIR /app

# System dependencies: switch APT to the Tsinghua mirror, install the shared
# libraries listed below, then remove the APT package lists.
RUN sed -i 's|deb.debian.org|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list && \
    sed -i 's|security.debian.org|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    libgl1-mesa-glx libglib2.0-0 libsm6 libxrender1 libxext6 && \
    rm -rf /var/lib/apt/lists/*

# Install the pinned Python dependencies. requirements.txt is copied on its own
# before the rest of the sources.
COPY requirements.txt .
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

# Copy the application code into the image.
COPY . .

# Default command: open an interactive shell.
CMD ["/bin/bash"]
# Alternative (disabled): run the script directly instead of opening a shell.
#CMD ["python", "main.py"]

 

docker-compose.yml

# Compose definition for the classic PaddleOCR container, built from the
# Dockerfile in the current directory.
version: '3.8'

services:
  paddle-ocr:
    build: .
    container_name: paddle-ocr
    volumes:
      - ./:/app  # bind-mount the local directory onto /app inside the container
    # Keep STDIN open and allocate a TTY so the container's shell stays interactive.
    stdin_open: true
    tty: true
    working_dir: /app

 

 

发票结构化解析

import os
import cv2
import numpy as np
from paddleocr import PaddleOCR

# Build the OCR engine with the Chinese model. Despite the section title, this
# script calls the plain PaddleOCR `ocr()` API and prints the recognized text
# lines; it does not extract named invoice fields.
# NOTE(review): `show_log` is accepted by PaddleOCR 2.x (the 2.9.1 environment
# listed for this script); confirm the options if the installed version differs.
ocr = PaddleOCR(
    lang="ch",
    device="cpu",
    show_log=False
)

# Invoice image to recognize.
file_path = "../images/491b5d3a-ab7e-4ebb-a53b-5f108ea18527.jpg"

# Run OCR on the file; the result is iterated as one entry per image/page.
result = ocr.ocr(file_path)

# PaddleOCR 2.x returns None for an image in which no text was detected, so
# normalize before iterating to avoid "TypeError: 'NoneType' is not iterable".
pages = result or []

# Dump the raw per-page results.
print("\n===== 发票结构化结果 =====\n")
for item in pages:
    print(item)

# Print only the recognized text of every detected line.
print("===== 识别文字结果 =====")
for res in pages:
    if not res:
        # Page with no detected text: nothing to print.
        continue
    for line in res:
        # Each line is indexed as line[1][0] for its text
        # (presumably [box, (text, score)] — verify against the installed version).
        text = line[1][0]
        print(text)

 

 

 

检查环境

import sys
from packaging import version

def main():
    """Print a report on whether this environment meets the PaddleOCR-VL needs.

    Checks the running Python version and the installed versions of
    ``paddlepaddle``, ``paddleocr``, ``paddlenlp`` and ``paddlex`` against the
    rules in ``required``, printing one status line per item. Nothing is
    returned and nothing is raised for a failed check; the report is purely
    informational.
    """
    # importlib.metadata (stdlib since Python 3.8) replaces the deprecated
    # pkg_resources API and does not depend on setuptools being installed.
    from importlib import metadata

    print("=" * 60)
    print("📌 PaddleOCR-VL 官方要求版本检测脚本")
    print("=" * 60)

    # ======================
    # Version rules to check against
    # ======================
    required = {
        "python": {"min": "3.8.0", "max": "3.11.0", "recommend": "3.10.x"},
        "paddlepaddle": {"min": "3.0.0", "recommend": "3.2.0"},
        "paddleocr": {"options": ["3.4.0", "3.5.0"]},
        "paddlenlp": {"min": "2.6.0", "recommend": "3.0.0"},
        "paddlex": {"must_install": True}
    }

    # ======================
    # 1. Python version
    # ======================
    py_ver = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
    print(f"\n🐍 Python 版本: {py_ver}")

    # Compare on major.minor only: the supported range is "3.8 ~ 3.11", so any
    # 3.11.x patch release must pass (comparing the full version against
    # "3.11.0" would wrongly reject 3.11.1 and later).
    py_series = version.parse(f"{sys.version_info.major}.{sys.version_info.minor}")
    py_ok = (version.parse(required["python"]["min"]) <= py_series
             <= version.parse(required["python"]["max"]))

    if py_ok:
        print(f"  ✅ 符合要求(要求 {required['python']['min']} ~ {required['python']['max']})")
    else:
        print(f"  ❌ 不支持!请安装 3.8 ~ 3.11(推荐 3.10)")

    # ======================
    # 2. Installed package versions
    # ======================
    installed = {}
    for dist_name in ("paddlepaddle", "paddleocr", "paddlenlp", "paddlex"):
        try:
            installed[dist_name] = metadata.version(dist_name)
        except metadata.PackageNotFoundError:
            # Not installed: leave it out so the checks below report it.
            continue

    print("\n📦 依赖库版本检查:")

    # PaddlePaddle: must be at least the minimum version.
    pkg = "paddlepaddle"
    v = installed.get(pkg, "未安装")
    print(f"\n🔹 {pkg.upper()}: {v}")
    if pkg in installed:
        if version.parse(v) >= version.parse(required[pkg]["min"]):
            print(f"  ✅ 正常(要求 ≥{required[pkg]['min']},推荐 {required[pkg]['recommend']})")
        else:
            print(f"  ❌ 版本过低,请升级至 ≥{required[pkg]['min']}")
    else:
        print(f"  ❌ 未安装!")

    # PaddleOCR: must be exactly one of the listed versions.
    pkg = "paddleocr"
    v = installed.get(pkg, "未安装")
    print(f"\n🔹 {pkg.upper()}: {v}")
    if pkg in installed:
        if v in required[pkg]["options"]:
            print(f"  ✅ 支持 VL-1.5(支持版本:{'/'.join(required[pkg]['options'])})")
        else:
            print(f"  ⚠️  非推荐版本,建议安装 3.4.0 或 3.5.0")
    else:
        print(f"  ❌ 未安装!请安装 paddleocr[vl]==3.4.0")

    # PaddleNLP: must be at least the minimum version.
    pkg = "paddlenlp"
    v = installed.get(pkg, "未安装")
    print(f"\n🔹 {pkg.upper()}: {v}")
    if pkg in installed:
        if version.parse(v) >= version.parse(required[pkg]["min"]):
            print(f"  ✅ 正常(要求 ≥{required[pkg]['min']},推荐 {required[pkg]['recommend']})")
        else:
            print(f"  ❌ 版本过低,请升级")
    else:
        print(f"  ❌ 未安装!")

    # PaddleX: only presence is checked, any version is accepted.
    pkg = "paddlex"
    v = installed.get(pkg, "未安装")
    print(f"\n🔹 {pkg.upper()}: {v}")
    if pkg in installed:
        print(f"  ✅ 已安装")
    else:
        print(f"  ❌ 缺失!PaddleOCR-VL 必须依赖 paddlex")

    # ======================
    # 3. Legend for the symbols used above
    # ======================
    print("\n" + "=" * 60)
    print("📊 总结:")
    print("✅ = 符合要求")
    print("⚠️  = 可运行但非最优")
    print("❌ = 必须修复才能运行 PaddleOCR-VL-1.5")
    print("=" * 60)

if __name__ == "__main__":
    main()

 

posted @ 2026-04-23 16:56  与f  阅读(40)  评论(0)    收藏  举报