使用 OCaml 和 Tesseract 实现验证码识别

一、准备环境
安装 OCaml

sudo apt install ocaml opam
opam init
opam install dune
安装 Tesseract
更多内容访问ttocr.com或联系1436423940
sudo apt install tesseract-ocr
二、创建项目

mkdir ocaml_captcha && cd ocaml_captcha
dune init exe captcha
三、添加依赖
我们需要使用 str 库进行正则字符串处理;调用系统命令用的是标准库自带的 Sys.command,unix 库在下面的代码中并未直接用到,可按需保留。

编辑 dune 文件,确保包含:

; Build the `captcha` executable from captcha.ml.
(executables
(names captcha)
; Libraries linked into the executable; str supplies the regexp
; functions (regexp / global_replace) used in captcha.ml.
(libraries unix str))
四、实现识别程序
编辑 captcha.ml:

open Unix
open Str

(** [run_tesseract image_path] runs the [tesseract] command-line tool on
    [image_path] and returns the recognised text reduced to the characters
    [A-Z0-9].

    Only the first line of tesseract's output file is used; it is
    upper-cased and every character outside [A-Z0-9] is stripped.

    @return the cleaned text on success, ["识别失败"] when the tesseract
    command exits with a non-zero status, or ["读取失败"] when the output
    file cannot be opened or is empty. *)
let run_tesseract image_path =
  let output_base = "ocaml_output" in
  let whitelist = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" in
  (* Quote the path arguments: the command goes through the shell, so an
     unquoted path containing spaces or metacharacters (;, |, $, ...) would
     be split or executed as shell syntax. *)
  let cmd =
    Printf.sprintf "tesseract %s %s -l eng -c tessedit_char_whitelist=%s"
      (Filename.quote image_path)
      (Filename.quote output_base)
      whitelist
  in
  if Sys.command cmd <> 0 then "识别失败"
  else
    let txt_file = output_base ^ ".txt" in
    (* Best-effort removal of the output file; it must not raise because it
       runs as a finaliser. *)
    let remove_output () = try Sys.remove txt_file with Sys_error _ -> () in
    let read_first_line () =
      let ch = open_in txt_file in
      (* Close the channel even when [input_line] raises. *)
      Fun.protect
        ~finally:(fun () -> close_in_noerr ch)
        (fun () -> input_line ch)
    in
    match Fun.protect ~finally:remove_output read_first_line with
    | content ->
        Str.global_replace (Str.regexp "[^A-Z0-9]") ""
          (String.uppercase_ascii content)
    (* Sys_error: file missing/unreadable; End_of_file: file is empty. *)
    | exception (Sys_error _ | End_of_file) -> "读取失败"

(* Program entry point: recognise the image given as the first command-line
   argument and print the result, or print a usage line when no argument
   was supplied. *)
let () =
  match Array.to_list Sys.argv with
  | _ :: image_path :: _ ->
      Printf.printf "识别结果: %s\n" (run_tesseract image_path)
  | _ -> Printf.printf "用法: %s <图片路径>\n" Sys.argv.(0)
五、构建并运行
构建:

dune build
运行:

dune exec ./captcha.exe captcha.png
输出示例:

识别结果: 7FML
六、扩展:批量识别目录下所有图片
你可以添加如下逻辑遍历目录中的 .png 文件:

(** [process_dir dir] runs [run_tesseract] on every entry of [dir] whose
    name ends in [".png"] and prints one ["<file> -> <result>"] line per
    image to standard output. Entries with any other suffix are skipped. *)
let process_dir dir =
  (* Recognise a single entry and print its result. *)
  let report name =
    let outcome = run_tesseract (Filename.concat dir name) in
    Printf.printf "%s -> %s\n" name outcome
  in
  Sys.readdir dir
  |> Array.iter (fun name ->
         if Filename.check_suffix name ".png" then report name)

posted @ 2025-07-07 11:32  ttocr、com  阅读(14)  评论(0)    收藏  举报