OmniParser - 改为 REST API 调用(二)
上一章( OmniParser - 安装 linux 系统(一) )讲的是如何安装以及运行 gradio_demo.py,启动一个 web 应用,如下图。这一章的内容是如何启动一个 REST API 服务,通过接口来提供服务。

一、服务端代码
在 gradio_demo.py 同一个目录中添加 main.py 文件,将如下代码复制进去
# pip install fastapi uvicorn python-multipart pillow requests
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
import torch
from PIL import Image
import io
import base64
from typing import Optional
import numpy as np
from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
# FastAPI application exposing the /process_image endpoint.
app = FastAPI()
# Load the detection and captioning models once at import time so every
# request reuses them instead of reloading the weights.
yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
caption_model_processor = get_caption_model_processor(
model_name="florence2",
model_name_or_path="weights/icon_caption_florence"
)
# Prefer GPU when available. NOTE(review): DEVICE is defined but not visibly
# used in this file — presumably consumed by the model helpers; verify.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@app.post("/process_image")
async def process_image(
    file: UploadFile = File(...),
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1,
    use_paddleocr: bool = True,
    imgsz: int = 640,
):
    """Detect and caption UI elements in an uploaded screenshot.

    Parameters
    ----------
    file : the uploaded image (any PIL-readable format).
    box_threshold : confidence threshold for the YOLO icon detector.
    iou_threshold : IoU threshold used when merging overlapping boxes.
    use_paddleocr : use PaddleOCR when True, EasyOCR otherwise.
    imgsz : inference image size passed to the detector.

    Returns a JSON payload with the base64-encoded labeled image, the parsed
    content (one 'icon N: ...' line per element) and the label coordinates;
    any failure is reported as an HTTP 500 with the error message.
    """
    try:
        # Read the uploaded bytes and decode them as an image.
        contents = await file.read()
        image = Image.open(io.BytesIO(contents))

        # The downstream helpers take a file path, so persist a temp copy.
        # BUGFIX: create the directory first — on a fresh checkout 'imgs/'
        # may not exist and image.save() would raise FileNotFoundError.
        import os
        os.makedirs('imgs', exist_ok=True)
        image_save_path = 'imgs/temp_image.png'
        image.save(image_save_path)

        # Scale overlay drawing parameters with image width so labels
        # stay readable at different resolutions.
        box_overlay_ratio = image.size[0] / 3200
        draw_bbox_config = {
            'text_scale': 0.8 * box_overlay_ratio,
            'text_thickness': max(int(2 * box_overlay_ratio), 1),
            'text_padding': max(int(3 * box_overlay_ratio), 1),
            'thickness': max(int(3 * box_overlay_ratio), 1),
        }

        # OCR pass: returns recognized text plus its bounding boxes.
        # (The second element, goal filtering, is unused here.)
        ocr_bbox_rslt, _ = check_ocr_box(
            image_save_path,
            display_img=False,
            output_bb_format='xyxy',
            goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9},
            use_paddleocr=use_paddleocr,
        )
        text, ocr_bbox = ocr_bbox_rslt

        # Detection + captioning; draws the set-of-marks overlay image.
        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image_save_path,
            yolo_model,
            BOX_TRESHOLD=box_threshold,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            iou_threshold=iou_threshold,
            imgsz=imgsz,
        )

        # One human-readable line per detected element.
        parsed_content = '\n'.join(
            f'icon {i}: {str(v)}' for i, v in enumerate(parsed_content_list)
        )
        return JSONResponse({
            "status": "success",
            "labeled_image": dino_labled_img,  # base64-encoded overlay image
            "parsed_content": parsed_content,
            "label_coordinates": label_coordinates,
        })
    except Exception as e:
        # Surface any failure to the client as a structured 500 response.
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": str(e)},
        )
if __name__ == "__main__":
    # Serve the API on all interfaces, port 8000.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
目录如下图

或者直接使用压缩包 main.zip 中的 main.py
二、启动服务端
执行 python main.py 启动服务端,启动后如下图

三、编写客户端
在电脑上创建一个目录,目录中 s.png 是待识别的截图,client.py 是依次完成"上传截图 → 遍历解析结果 → 执行点击操作"的代码。
import requests
from PIL import Image
import base64
import io
import pyautogui
from time import sleep
import json
import ast # 用于解析字符串形式的字典
from pynput.mouse import Button, Controller
import time
def process_image(
    image_path: str,
    api_url: str = "http://172.31.100.27:8000/process_image",
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1,
    use_paddleocr: bool = True,
    imgsz: int = 640,
):
    """Upload a screenshot to the OmniParser REST endpoint and decode the reply.

    Parameters mirror the server-side /process_image endpoint.

    Returns a dict: on success, {'status': 'success', 'labeled_image': PIL
    image, 'parsed_content': str, 'label_coordinates': dict}; on failure,
    {'status': 'error', 'message': str}.
    """
    params = {
        'box_threshold': box_threshold,
        'iou_threshold': iou_threshold,
        'use_paddleocr': use_paddleocr,
        'imgsz': imgsz,
    }
    # BUGFIX: open the image inside a context manager so the file handle is
    # always closed (the original leaked it on every call).
    with open(image_path, 'rb') as fh:
        files = {
            'file': ('image.png', fh, 'image/png')
        }
        response = requests.post(api_url, files=files, params=params)

    if response.status_code != 200:
        return {'status': 'error', 'message': f'HTTP error {response.status_code}'}

    result = response.json()
    if result['status'] != 'success':
        return {'status': 'error', 'message': result.get('message', 'Unknown error')}

    # The server returns the overlay image base64-encoded; decode back to PIL.
    labeled_image = Image.open(io.BytesIO(base64.b64decode(result['labeled_image'])))
    return {
        'status': 'success',
        'labeled_image': labeled_image,
        'parsed_content': result['parsed_content'],
        'label_coordinates': result['label_coordinates'],
    }
def parse_icon_data(content_str):
    """Parse the server's newline-separated icon text into a list of dicts.

    Each line shaped like ``icon N: {...}`` has its brace-delimited portion
    evaluated as a Python literal; lines that fail to parse are reported
    and skipped.
    """
    parsed = []
    for raw_line in content_str.strip().split('\n'):
        if not raw_line.startswith('icon '):
            continue
        try:
            # Slice out the outermost {...} span and evaluate it safely.
            start = raw_line.index('{')
            end = raw_line.rindex('}') + 1
            parsed.append(ast.literal_eval(raw_line[start:end]))
        except Exception as e:
            print(f"解析错误: {e}")
    return parsed
def bbox_to_coords(bbox, screen_width, screen_height):
    """Map a normalized (xmin, ymin, xmax, ymax) bbox to an absolute
    on-screen click point, compensating for the macOS menu bar."""
    xmin, ymin, xmax, ymax = bbox
    menu_bar_height = 25   # macOS top menu bar height in pixels
    y_offset = -15         # nudge upward so the click misses the file-name label

    # Center of the box in normalized coordinates.
    mid_x = (xmin + xmax) / 2
    mid_y = (ymin + ymax) / 2

    # Scale to pixels; the y axis is squeezed below the menu bar.
    x_center = int(mid_x * screen_width)
    y_center = int(mid_y * (screen_height - menu_bar_height)) + menu_bar_height + y_offset

    # Debug trace of the transformation.
    print(f"\n坐标转换详情:")
    print(f"屏幕尺寸: {screen_width} x {screen_height}")
    print(f"原始bbox: {bbox}")
    print(f"x轴变换: {xmin:.4f} -> {xmax:.4f} 中点: {mid_x:.4f}")
    print(f"y轴变换: {ymin:.4f} -> {ymax:.4f} 中点: {mid_y:.4f}")
    print(f"考虑菜单栏偏移: {menu_bar_height}px")
    print(f"向上偏移: {y_offset}px")
    print(f"计算结果: x={x_center}, y={y_center}")

    # Clamp into the visible screen area.
    x_center = min(max(x_center, 0), screen_width)
    y_center = min(max(y_center, 0), screen_height)
    return x_center, y_center
# def bbox_to_coords(bbox, screen_width, screen_height):
# """将 bbox 坐标转换为屏幕坐标."""
# xmin, ymin, xmax, ymax = bbox
#
# # 考虑 Mac 顶部菜单栏的偏移(大约25像素)
# menu_bar_height = 25
#
# # 考虑窗口边框和其他可能的偏移
# x_offset = 0
# y_offset = menu_bar_height
#
# # 计算相对坐标
# x_center = int((xmin + xmax) / 2 * screen_width)
# y_center = int((ymin + ymax) / 2 * (screen_height - menu_bar_height)) + y_offset
#
# # 添加调试信息
# print(f"\n坐标转换详情:")
# print(f"屏幕尺寸: {screen_width} x {screen_height}")
# print(f"原始bbox: {bbox}")
# print(f"x轴变换: {xmin:.4f} -> {xmax:.4f} 中点: {(xmin + xmax) / 2:.4f}")
# print(f"y轴变换: {ymin:.4f} -> {ymax:.4f} 中点: {(ymin + ymax) / 2:.4f}")
# print(f"考虑菜单栏偏移: {menu_bar_height}px")
# print(f"计算结果: x={x_center}, y={y_center}")
#
# # 确保坐标在屏幕范围内
# x_center = max(0, min(x_center, screen_width))
# y_center = max(0, min(y_center, screen_height))
#
# return x_center, y_center
def click_bbox(bbox):
    """Move the cursor to the center of *bbox* and double-click it."""
    mouse = Controller()

    # Screen resolution drives the normalized->absolute coordinate mapping.
    screen_width, screen_height = pyautogui.size()
    print(f"当前屏幕分辨率: {screen_width}x{screen_height}")

    target_x, target_y = bbox_to_coords(bbox, screen_width, screen_height)

    print(f"\n即将执行双击:")
    print(f"目标坐标: x={target_x}, y={target_y}")
    print("1秒准备时间...")
    sleep(1)

    # Glide the cursor into place, pause briefly, then double-click.
    pyautogui.moveTo(target_x, target_y, duration=0.5)
    print("鼠标已就位,0.3秒后双击...")
    sleep(0.3)

    # NOTE(review): pyautogui.doubleClick() was left commented out in the
    # original in favor of a pynput double left-click.
    mouse.click(Button.left, 2)
    print(f"已双击坐标: x={target_x}, y={target_y}")
# def click_bbox(bbox):
# """点击指定的 bbox."""
# # 获取屏幕分辨率
# screen_width, screen_height = pyautogui.size()
# print(f"当前屏幕分辨率: {screen_width}x{screen_height}")
#
# # 获取点击坐标
# x, y = bbox_to_coords(bbox, screen_width, screen_height)
#
# print(f"\n即将执行点击:")
# print(f"目标坐标: x={x}, y={y}")
# print("3秒准备时间...")
# sleep(3)
#
# # 移动鼠标到指定位置(使用缓动效果)
# pyautogui.moveTo(x, y, duration=1, tween=pyautogui.easeOutQuad)
#
# print("鼠标已就位,1秒后点击...")
# sleep(1)
#
# # 获取当前鼠标位置以验证
# current_x, current_y = pyautogui.position()
# print(f"当前鼠标位置: x={current_x}, y={current_y}")
#
# # 点击鼠标
# pyautogui.click()
# print(f"已点击坐标: x={x}, y={y}")
def find_dog_avif_coordinates(icons, icon_name):
    """Return the bbox of the first icon whose content contains *icon_name*.

    Matching is a case-insensitive substring test against each icon dict's
    'content' field. Returns None when nothing matches.

    BUGFIX: entries whose 'content' is not a string (e.g. None for
    uncaptioned icons) are skipped instead of raising AttributeError.
    """
    print("================= 遍历 icons,开始 =================")
    for i, icon in enumerate(icons):
        if isinstance(icon, dict) and 'content' in icon:
            raw = icon['content']
            if not isinstance(raw, str):
                continue  # uncaptioned entry; nothing to match against
            content = raw.strip().lower()
            print(f"遍历 icons,content: {content}")
            if icon_name in content:
                print(f"找到 {icon_name},图标索引: {i}")
                print("================= 遍历 icons,结束,找到 =================")
                return icon['bbox']
    print("================= 遍历 icons,结束,未找到 =================")
    return None
if __name__ == "__main__":
    # Report the local screen resolution up front for easier debugging.
    screen_width, screen_height = pyautogui.size()
    print(f"当前屏幕分辨率: {screen_width}x{screen_height}")

    image_path = "s.png"
    result = process_image(
        image_path=image_path,
        box_threshold=0.05,
        iou_threshold=0.1,
        use_paddleocr=True,
        imgsz=640,
    )

    if result['status'] != 'success':
        print("Error:", result['message'])
    else:
        print("================= 接口 /process_image 返回,开始 =================")
        print(result)
        print("================= 接口 /process_image 返回,结束 =================")

        # Locate the target icon in the parsed list and double-click it.
        icons = parse_icon_data(result['parsed_content'])
        image_bbox = find_dog_avif_coordinates(icons, 'ragfiow.txt')
        if image_bbox:
            print("找到 image 坐标:", image_bbox)
            click_bbox(image_bbox)
        else:
            print("未找到 image 图标")
创建完成后,如下图。

也可以直接下载 client.zip
1、修改服务器地址,根据你运行 main.py 服务器的地址,修改对应 ip 和 端口

2、修改为你要解析的截图文件名

3、改成你想点击的图标的名称

4、修改为大模型
目前代码中是使用遍历的方式查找要点击的图标,当然你可以改成通过 大模型 + 提示词 的方式返回要点击的图标。

四、运行
启动服务器,再运行客户端 python client.py,运行后如下图


浙公网安备 33010602011771号