手写字体文字识别

点击查看代码
import os

import cv2
import numpy as np
import pytesseract
from PIL import Image

class TesseractHandwritingRecognizer:
    """Recognize text in an image file using Tesseract via pytesseract.

    The image is loaded with OpenCV, converted to grayscale, median-blurred
    and Otsu-binarized, then handed to ``pytesseract.image_to_string``.
    """

    def __init__(self, lang='eng', psm=8, oem=3):
        """Build the Tesseract configuration string.

        Args:
            lang: Language code(s) passed to Tesseract's ``-l`` option.
            psm: Page segmentation mode passed to ``--psm``.
            oem: OCR engine mode passed to ``--oem``.

        The defaults yield ``'--oem 3 --psm 8 -l eng'``.
        """
        # Set the Tesseract binary path if it is not on PATH (adjust to your
        # install location), e.g. on Windows:
        # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        # On Linux/Mac this is usually unnecessary; `which tesseract` shows
        # the path if it is needed.

        # NOTE: this selects the stock language model, not a dedicated
        # handwriting model; only the segmentation/engine modes are tuned.
        self.config = f'--oem {oem} --psm {psm} -l {lang}'

    def preprocess_image(self, image_path):
        """Load an image and return a denoised, binarized grayscale array.

        Args:
            image_path: Path (``str`` or path-like) of the image to load.

        Returns:
            The binarized image produced by ``cv2.threshold``.

        Raises:
            FileNotFoundError: If ``image_path`` is not an existing file.
            ValueError: If the file exists but OpenCV cannot decode it.
        """
        path = os.fspath(image_path)
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Image file not found: {path}")

        image = cv2.imread(path)
        # cv2.imread signals failure by returning None rather than raising;
        # check here so the caller gets a clear message instead of an opaque
        # OpenCV assertion from cvtColor.
        if image is None:
            raise ValueError(f"Could not decode image: {path}")

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal.
        denoised = cv2.medianBlur(gray, 5)

        # Binarization; with THRESH_OTSU the threshold argument (0) is
        # ignored and the threshold is chosen automatically.
        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        return binary

    def recognize_text(self, image_path):
        """Run OCR on the image at ``image_path``.

        Args:
            image_path: Path of the image to recognize.

        Returns:
            On success, a dict with ``'text'`` (the stripped OCR output) and
            ``'confidence'`` (always ``'N/A'``). On any failure, a dict with
            a single ``'error'`` key holding the exception message.
        """
        try:
            # Preprocess.
            processed_image = self.preprocess_image(image_path)

            # Wrap the array as a PIL image for pytesseract.
            pil_image = Image.fromarray(processed_image)

            # Recognize text.
            text = pytesseract.image_to_string(pil_image, config=self.config)

            return {
                'text': text.strip(),
                'confidence': 'N/A'  # Tesseract confidence needs extra processing
            }

        except Exception as e:
            # Deliberate best-effort boundary: callers receive an error dict
            # rather than an exception.
            return {'error': str(e)}

# 使用示例
def test_tesseract():
    """Run the recognizer on a sample image and print the returned dict."""
    sample_path = 'handwritten_sample.jpg'
    result = TesseractHandwritingRecognizer().recognize_text(sample_path)
    print(f"识别结果: {result}")
posted @ 2025-11-20 23:40  Lay“  阅读(49)  评论(0)    收藏  举报