本地运行Agent-S，替换多模态大模型为豆包。

设置环境变量
OPENAI_BASE_URL: https://ark.cn-beijing.volces.com/api/v3
OPENAI_API_KEY: xxxx-xxxx-xxxx-xxxx-xxxx
修改embedding模型
将 gui_agents/s2/core/engine.py 中 OpenAIEmbeddingEngine 类的 self.model 变量修改为豆包的向量模型名称

新建main.py

import io
import logging
import os
import platform
import sys
import time
from datetime import datetime

import pyautogui
from PIL import Image

from gui_agents.s2.agents.agent_s import AgentS2
from gui_agents.s2.agents.grounding import OSWorldACI


def scale_screen_dimensions(width: int, height: int, max_dim_size: int):
    """Shrink a screen size so that neither side exceeds ``max_dim_size``.

    A single scale factor is applied to both sides and is capped at 1, so
    the size is never enlarged. Each scaled side is truncated with ``int``.

    Args:
        width: Original width in pixels.
        height: Original height in pixels.
        max_dim_size: Largest value allowed for either side.

    Returns:
        A ``(width, height)`` tuple of the scaled integer dimensions.
    """
    ratio = min(max_dim_size / width, max_dim_size / height, 1)
    return tuple(int(side * ratio) for side in (width, height))

def _notify_task_completed():
    """Show a desktop "Task Completed" dialog on macOS or Linux (no-op elsewhere)."""
    system_name = platform.system()
    if system_name == "Darwin":
        os.system(
            'osascript -e \'display dialog "Task Completed" with title "OpenACI Agent" buttons "OK" default button "OK"\''
        )
    elif system_name == "Linux":
        os.system(
            'zenity --info --title="OpenACI Agent" --text="Task Completed" --width=200 --height=100'
        )


def run_agent(
    agent,
    instruction: str,
    scaled_width: int,
    scaled_height: int,
    max_steps: int = 15,
):
    """Drive ``agent`` through a screenshot -> predict -> execute loop.

    Every step grabs the screen with pyautogui, resizes it to
    ``scaled_width`` x ``scaled_height``, passes the PNG bytes to
    ``agent.predict`` and acts on the first returned action string:

    * contains ``done`` or ``fail``: notify the user, store the trajectory
      via ``agent.update_narrative_memory`` and stop;
    * contains ``next``: go straight to the next step;
    * contains ``wait``: sleep 5 seconds, then go to the next step;
    * anything else: execute the string as Python code and record the step.

    Args:
        agent: Object exposing ``predict``, ``update_narrative_memory`` and
            ``update_episodic_memory`` (an ``AgentS2`` in this script).
        instruction: Natural-language task given to the agent.
        scaled_width: Width the screenshot is resized to before prediction.
        scaled_height: Height the screenshot is resized to before prediction.
        max_steps: Maximum number of predict/act iterations.

    Returns:
        None.
    """
    obs = {}
    traj = "Task:\n" + instruction
    subtask_traj = ""
    for _ in range(max_steps):
        # Capture the screen and bring it to the size the agent expects.
        screenshot = pyautogui.screenshot()
        screenshot = screenshot.resize((scaled_width, scaled_height), Image.LANCZOS)

        # Encode as PNG in memory; the observation carries the raw PNG bytes
        # (no base64 encoding is applied here).
        buffered = io.BytesIO()
        screenshot.save(buffered, format="PNG")
        obs["screenshot"] = buffered.getvalue()

        # Ask the agent for the next action.
        info, code = agent.predict(instruction=instruction, observation=obs)
        action = code[0]
        action_lower = action.lower()

        if "done" in action_lower or "fail" in action_lower:
            _notify_task_completed()
            agent.update_narrative_memory(traj)
            break

        if "next" in action_lower:
            continue

        if "wait" in action_lower:
            time.sleep(5)
            continue

        time.sleep(1.0)
        print("EXECUTING CODE:", action)

        # SECURITY: the model-generated code is executed verbatim, with no
        # confirmation prompt and no sandboxing. Only run this against a
        # trusted model and on a machine where arbitrary GUI actions are
        # acceptable.
        exec(action)
        time.sleep(1.0)

        # Extend the task trajectory and let the agent update its episodic
        # memory for the current subtask.
        traj += (
            "\n\nReflection:\n"
            + str(info["reflection"])
            + "\n\n----------------------\n\nPlan:\n"
            + info["executor_plan"]
        )
        subtask_traj = agent.update_episodic_memory(info, subtask_traj)

current_platform = platform.system().lower()
screen_width, screen_height = pyautogui.size()
scaled_width, scaled_height = scale_screen_dimensions(
    screen_width, screen_height, max_dim_size=2400
)

# Width of the coordinate space in which the grounding model reports
# positions. It has to be defined before engine_params_for_grounding is built.
# NOTE(review): defaulting to the width of the screenshots sent to the model;
# the correct value depends on the grounding model, so confirm it against the
# model's documentation and adjust if clicks land in the wrong place.
grounding_width = scaled_width

engine_params = {"engine_type": 'openai', "model": "doubao-1.5-vision-pro-250328"}
engine_params_for_grounding = {
    "engine_type": "openai",
    "model": "doubao-1.5-vision-pro-250328",
    "grounding_width": grounding_width,
    # Keep the screen's aspect ratio in the grounding coordinate space.
    "grounding_height": screen_height
        * grounding_width
        / screen_width,
}
grounding_agent = OSWorldACI(
    platform=current_platform,
    engine_params_for_generation=engine_params,
    engine_params_for_grounding=engine_params_for_grounding,
    width=screen_width,
    height=screen_height,
)

agent = AgentS2(
    engine_params,
    grounding_agent,
    platform=current_platform,
    action_space="pyautogui",
    observation_type="mixed",
    search_engine=None,
)
if __name__ == '__main__':
    # Simple REPL: read a task, run the agent on it, then ask whether to go on.
    while True:
        query = input("Query: ")

        agent.reset()
        # Run the agent on your own device
        run_agent(agent, query, scaled_width, scaled_height)

        response = input("Would you like to provide another query? (y/n): ")
        if response.lower() != "y":
            break

运行main.py

当出现矩阵维度不兼容错误时，修改 gui_agents/s2/core/knowledge.py 中 KnowledgeBase 类的 retrieve_narrative_experience 和 retrieve_episodic_experience 方法，使向量维度对齐即可。以下为示例代码。

class KnowledgeBase:
    """Embedding-based lookup over the stored narrative and episodic memories."""

    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str]:
        """Retrieve narrative experience using embeddings.

        Args:
            instruction: Text to look up in the narrative memory.

        Returns:
            ``(key, value)`` of the most similar stored entry other than
            ``instruction`` itself, or ``("None", "None")`` if there is none.
        """
        return self._retrieve_most_similar(instruction, self.narrative_memory_path)

    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str]:
        """Retrieve similar task experience using embeddings.

        Args:
            instruction: Text to look up in the episodic memory.

        Returns:
            ``(key, value)`` of the most similar stored entry other than
            ``instruction`` itself, or ``("None", "None")`` if there is none.
        """
        return self._retrieve_most_similar(instruction, self.episodic_memory_path)

    def _retrieve_most_similar(self, instruction: str, memory_path) -> Tuple[str, str]:
        """Return the entry of the memory at ``memory_path`` closest to ``instruction``.

        Embeddings are read from ``self.embeddings_path``; missing ones are
        computed with ``self.embedding_engine`` and written back. Cached
        candidate embeddings whose width differs from the instruction
        embedding are truncated or zero-padded before comparison.
        """
        knowledge_base = load_knowledge_base(memory_path)
        if not knowledge_base:
            return "None", "None"

        embeddings = load_embeddings(self.embeddings_path)

        # Get or create the instruction embedding.
        instruction_embedding = embeddings.get(instruction)
        if instruction_embedding is None:
            instruction_embedding = self.embedding_engine.get_embeddings(instruction)
            embeddings[instruction] = instruction_embedding
        # Embeddings are indexed as 2-D arrays; axis 1 is the vector width.
        target_dim = instruction_embedding.shape[1]

        # Get or create embeddings for the knowledge base entries.
        keys = list(knowledge_base)
        candidate_embeddings = []
        for key in keys:
            candidate_embedding = embeddings.get(key)
            if candidate_embedding is None:
                candidate_embedding = self.embedding_engine.get_embeddings(key)
                embeddings[key] = candidate_embedding
            # Only the copy used for comparison is aligned; the cache keeps
            # the embedding as it was produced.
            candidate_embeddings.append(
                self._align_embedding_dim(candidate_embedding, target_dim)
            )

        save_embeddings(self.embeddings_path, embeddings)

        similarities = cosine_similarity(
            instruction_embedding, np.vstack(candidate_embeddings)
        )[0]
        sorted_indices = np.argsort(similarities)[::-1]

        # Skip the best match when it is the instruction itself.
        idx = 1 if keys[sorted_indices[0]] == instruction else 0
        if idx >= len(keys):
            # The only stored entry is the instruction itself, so there is
            # no other experience to return.
            return "None", "None"
        best_key = keys[sorted_indices[idx]]
        return best_key, knowledge_base[best_key]

    @staticmethod
    def _align_embedding_dim(embedding, target_dim: int):
        """Truncate or zero-pad axis 1 of ``embedding`` to ``target_dim`` columns."""
        current_dim = embedding.shape[1]
        if current_dim > target_dim:
            # Wider than the target: drop the trailing columns.
            return embedding[:, :target_dim]
        if current_dim < target_dim:
            # Narrower than the target: append zero columns.
            padding = target_dim - current_dim
            return np.pad(
                embedding, ((0, 0), (0, padding)), mode='constant', constant_values=0
            )
        return embedding

posted @ 2025-05-13 15:20 墨雨听风阅读(162) 评论(0) 收藏举报

刷新页面返回顶部

墨雨听风

本地运行Agent-S，替换多模态大模型为豆包。

公告