Initial commit with translated description

2026-03-29 13:02:57 +08:00
commit d97055d190
7 changed files with 2719 additions and 0 deletions
--- a/ai_agent.py
+++ b/ai_agent.py
@@ -0,0 +1,613 @@
+"""
+AI Desktop Agent - Cognitive Desktop Automation
+Combines vision, reasoning, and control for autonomous task execution
+"""
+
+import base64
+import time
+from typing import Dict, List, Optional, Any, Callable
+from pathlib import Path
+import logging
+
+from desktop_control import DesktopController
+
+logger = logging.getLogger(__name__)
+
+
+class AIDesktopAgent:
+    """
+    Intelligent desktop agent that combines computer vision, LLM reasoning,
+    and desktop control for autonomous task execution.
+    
+    Can understand screen content, plan actions, and execute complex workflows.
+    """
+    
+    def __init__(self, llm_client=None, failsafe: bool = True):
+        """
+        Initialize AI Desktop Agent.
+        
+        Args:
+            llm_client: OpenClaw LLM client for reasoning (optional, will try to auto-detect)
+            failsafe: Enable failsafe mode
+        """
+        self.dc = DesktopController(failsafe=failsafe)
+        self.llm_client = llm_client
+        self.screen_width, self.screen_height = self.dc.get_screen_size()
+        
+        # Action history for learning
+        self.action_history = []
+        
+        # Application knowledge base
+        self.app_knowledge = self._load_app_knowledge()
+        
+        logger.info("AI Desktop Agent initialized")
+    
+    def _load_app_knowledge(self) -> Dict[str, Dict]:
+        """
+        Load application-specific knowledge.
+        This can be extended with learned patterns.
+        """
+        return {
+            "mspaint": {
+                "name": "Microsoft Paint",
+                "launch_command": "mspaint",
+                "common_actions": {
+                    "select_pencil": {"menu": "Tools", "position": "toolbar_left"},
+                    "select_brush": {"menu": "Tools", "position": "toolbar"},
+                    "select_color": {"menu": "Colors", "action": "click_palette"},
+                    "draw_line": {"action": "drag", "tool_required": "line"},
+                }
+            },
+            "notepad": {
+                "name": "Notepad",
+                "launch_command": "notepad",
+                "common_actions": {
+                    "type_text": {"action": "type"},
+                    "save": {"hotkey": ["ctrl", "s"]},
+                    "new_file": {"hotkey": ["ctrl", "n"]},
+                }
+            },
+            "calculator": {
+                "name": "Calculator",
+                "launch_command": "calc",
+                "common_actions": {
+                    "calculate": {"action": "type_numbers"},
+                }
+            }
+        }
+    
+    def execute_task(self, task: str, max_steps: int = 50) -> Dict[str, Any]:
+        """
+        Execute a high-level task autonomously.
+        
+        Args:
+            task: Natural language task description
+            max_steps: Maximum number of steps to attempt
+            
+        Returns:
+            Execution result with status and details
+        """
+        logger.info(f"Executing task: {task}")
+        
+        # Initialize result
+        result = {
+            "task": task,
+            "status": "in_progress",
+            "steps": [],
+            "screenshots": [],
+            "success": False
+        }
+        
+        try:
+            # Step 1: Analyze task and plan
+            plan = self._plan_task(task)
+            logger.info(f"Generated plan with {len(plan)} steps")
+            
+            # Step 2: Execute plan step by step
+            for step_num, step in enumerate(plan, 1):
+                if step_num > max_steps:
+                    logger.warning(f"Reached max steps ({max_steps})")
+                    break
+                
+                logger.info(f"Step {step_num}/{len(plan)}: {step['description']}")
+                
+                # Capture screen before action
+                screenshot_before = self.dc.screenshot()
+                
+                # Execute step
+                step_result = self._execute_step(step)
+                result["steps"].append(step_result)
+                
+                # Capture screen after action
+                screenshot_after = self.dc.screenshot()
+                result["screenshots"].append({
+                    "step": step_num,
+                    "before": screenshot_before,
+                    "after": screenshot_after
+                })
+                
+                # Verify step success
+                if not step_result.get("success", False):
+                    logger.error(f"Step {step_num} failed: {step_result.get('error')}")
+                    result["status"] = "failed"
+                    result["failed_at_step"] = step_num
+                    return result
+                
+                # Small delay between steps
+                time.sleep(0.5)
+            
+            result["status"] = "completed"
+            result["success"] = True
+            logger.info(f"Task completed successfully in {len(result['steps'])} steps")
+            
+        except Exception as e:
+            logger.error(f"Task execution error: {e}")
+            result["status"] = "error"
+            result["error"] = str(e)
+        
+        return result
+    
+    def _plan_task(self, task: str) -> List[Dict[str, Any]]:
+        """
+        Plan task execution using LLM reasoning.
+        
+        Args:
+            task: Task description
+            
+        Returns:
+            List of execution steps
+        """
+        # For now, use rule-based planning
+        # TODO: Integrate with OpenClaw LLM for intelligent planning
+        
+        # Parse task intent
+        task_lower = task.lower()
+        
+        # Pattern matching for common tasks
+        if "draw" in task_lower and "paint" in task_lower:
+            return self._plan_paint_drawing(task)
+        elif "type" in task_lower or "write" in task_lower:
+            return self._plan_text_entry(task)
+        elif "play" in task_lower and "game" in task_lower:
+            return self._plan_game_play(task)
+        elif "open" in task_lower or "launch" in task_lower:
+            return self._plan_app_launch(task)
+        else:
+            # Generic plan - analyze and improvise
+            return self._plan_generic(task)
+    
+    def _plan_paint_drawing(self, task: str) -> List[Dict]:
+        """Plan for drawing in MS Paint."""
+        # Extract what to draw
+        drawing_subject = self._extract_subject(task)
+        
+        return [
+            {
+                "type": "launch_app",
+                "app": "mspaint",
+                "description": "Launch Microsoft Paint"
+            },
+            {
+                "type": "wait",
+                "duration": 2.0,
+                "description": "Wait for Paint to load"
+            },
+            {
+                "type": "activate_window",
+                "title": "Paint",
+                "description": "Ensure Paint window is active"
+            },
+            {
+                "type": "select_tool",
+                "tool": "pencil",
+                "description": "Select pencil tool"
+            },
+            {
+                "type": "draw",
+                "subject": drawing_subject,
+                "description": f"Draw {drawing_subject}"
+            },
+            {
+                "type": "screenshot",
+                "save_as": "drawing_result.png",
+                "description": "Capture the drawing"
+            }
+        ]
+    
+    def _plan_text_entry(self, task: str) -> List[Dict]:
+        """Plan for text entry task."""
+        # Extract text to type
+        text_content = self._extract_text_content(task)
+        
+        return [
+            {
+                "type": "launch_app",
+                "app": "notepad",
+                "description": "Launch Notepad"
+            },
+            {
+                "type": "wait",
+                "duration": 1.0,
+                "description": "Wait for Notepad to load"
+            },
+            {
+                "type": "type_text",
+                "text": text_content,
+                "wpm": 80,
+                "description": f"Type: {text_content[:50]}..."
+            }
+        ]
+    
+    def _plan_game_play(self, task: str) -> List[Dict]:
+        """Plan for playing a game."""
+        game_name = self._extract_game_name(task)
+        
+        return [
+            {
+                "type": "analyze_screen",
+                "description": "Analyze game screen"
+            },
+            {
+                "type": "detect_game_state",
+                "game": game_name,
+                "description": f"Detect {game_name} state"
+            },
+            {
+                "type": "execute_game_loop",
+                "game": game_name,
+                "max_iterations": 100,
+                "description": f"Play {game_name}"
+            }
+        ]
+    
+    def _plan_app_launch(self, task: str) -> List[Dict]:
+        """Plan for launching an application."""
+        app_name = self._extract_app_name(task)
+        
+        return [
+            {
+                "type": "launch_app",
+                "app": app_name,
+                "description": f"Launch {app_name}"
+            },
+            {
+                "type": "wait",
+                "duration": 2.0,
+                "description": f"Wait for {app_name} to load"
+            }
+        ]
+    
+    def _plan_generic(self, task: str) -> List[Dict]:
+        """Generic planning fallback."""
+        return [
+            {
+                "type": "analyze_screen",
+                "description": "Analyze current screen state"
+            },
+            {
+                "type": "infer_action",
+                "task": task,
+                "description": f"Infer action for: {task}"
+            }
+        ]
+    
+    def _execute_step(self, step: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute a single step.
+        
+        Args:
+            step: Step definition
+            
+        Returns:
+            Execution result
+        """
+        step_type = step.get("type")
+        result = {"step": step, "success": False}
+        
+        try:
+            if step_type == "launch_app":
+                self._do_launch_app(step["app"])
+                result["success"] = True
+                
+            elif step_type == "wait":
+                time.sleep(step["duration"])
+                result["success"] = True
+                
+            elif step_type == "activate_window":
+                success = self.dc.activate_window(step["title"])
+                result["success"] = success
+                
+            elif step_type == "select_tool":
+                self._do_select_tool(step["tool"])
+                result["success"] = True
+                
+            elif step_type == "draw":
+                self._do_draw(step["subject"])
+                result["success"] = True
+                
+            elif step_type == "type_text":
+                self.dc.type_text(step["text"], wpm=step.get("wpm", 80))
+                result["success"] = True
+                
+            elif step_type == "screenshot":
+                filename = step.get("save_as", "screenshot.png")
+                self.dc.screenshot(filename=filename)
+                result["success"] = True
+                result["saved_to"] = filename
+                
+            elif step_type == "analyze_screen":
+                analysis = self._analyze_screen()
+                result["analysis"] = analysis
+                result["success"] = True
+                
+            elif step_type == "execute_game_loop":
+                game_result = self._execute_game_loop(step)
+                result["game_result"] = game_result
+                result["success"] = True
+                
+            else:
+                result["error"] = f"Unknown step type: {step_type}"
+                
+        except Exception as e:
+            logger.error(f"Step execution error: {e}")
+            result["error"] = str(e)
+        
+        return result
+    
+    def _do_launch_app(self, app: str) -> None:
+        """Launch an application."""
+        # Get launch command from knowledge base
+        app_info = self.app_knowledge.get(app, {})
+        launch_cmd = app_info.get("launch_command", app)
+        
+        # Open Run dialog
+        self.dc.hotkey('win', 'r')
+        time.sleep(0.5)
+        
+        # Type and execute command
+        self.dc.type_text(launch_cmd, wpm=100)
+        self.dc.press('enter')
+        
+        logger.info(f"Launched: {app}")
+    
+    def _do_select_tool(self, tool: str) -> None:
+        """Select a tool (e.g., in Paint)."""
+        # This is simplified - in reality would use computer vision
+        # to find and click the tool button
+        
+        # For Paint, tools are typically in the ribbon
+        # We'll use hotkeys where possible
+        if tool == "pencil":
+            # In Paint, press 'P' for pencil
+            self.dc.press('p')
+        elif tool == "brush":
+            self.dc.press('b')
+        elif tool == "eraser":
+            self.dc.press('e')
+        
+        logger.info(f"Selected tool: {tool}")
+    
+    def _do_draw(self, subject: str) -> None:
+        """
+        Draw something on screen.
+        This is a simplified implementation - would be enhanced with:
+        - Image generation (use wan2gp to generate reference)
+        - Trace generation (convert image to draw commands)
+        - Executed drawing (execute the commands)
+        """
+        logger.info(f"Drawing: {subject}")
+        
+        # Get canvas center (simplified - would detect canvas)
+        canvas_x = self.screen_width // 2
+        canvas_y = self.screen_height // 2
+        
+        # Simple drawing pattern (example: draw a    simple shape)
+        if "circle" in subject.lower():
+            self._draw_circle(canvas_x, canvas_y, radius=100)
+        elif "square" in subject.lower():
+            self._draw_square(canvas_x, canvas_y, size=200)
+        elif "star" in subject.lower():
+            self._draw_star(canvas_x, canvas_y, size=100)
+        else:
+            # Generic: draw a simple pattern
+            self._draw_simple_pattern(canvas_x, canvas_y)
+        
+        logger.info(f"Completed drawing: {subject}")
+    
+    def _draw_circle(self, cx: int, cy: int, radius: int) -> None:
+        """Draw a circle."""
+        import math
+        
+        points = []
+        for angle in range(0, 360, 5):
+            rad = math.radians(angle)
+            x = int(cx + radius * math.cos(rad))
+            y = int(cy + radius * math.sin(rad))
+            points.append((x, y))
+        
+        # Draw by connecting points
+        for i in range(len(points) - 1):
+            self.dc.drag(points[i][0], points[i][1], 
+                        points[i+1][0], points[i+1][1], 
+                        duration=0.01)
+        # Close the circle
+        self.dc.drag(points[-1][0], points[-1][1], 
+                    points[0][0], points[0][1], 
+                    duration=0.01)
+    
+    def _draw_square(self, cx: int, cy: int, size: int) -> None:
+        """Draw a square."""
+        half = size // 2
+        corners = [
+            (cx - half, cy - half),  # Top-left
+            (cx + half, cy - half),  # Top-right
+            (cx + half, cy + half),  # Bottom-right
+            (cx - half, cy + half),  # Bottom-left
+        ]
+        
+        # Draw sides
+        for i in range(4):
+            start = corners[i]
+            end = corners[(i + 1) % 4]
+            self.dc.drag(start[0], start[1], end[0], end[1], duration=0.2)
+    
+    def _draw_star(self, cx: int, cy: int, size: int) -> None:
+        """Draw a 5-pointed star."""
+        import math
+        
+        points = []
+        for i in range(10):
+            angle = math.radians(i * 36 - 90)
+            radius = size if i % 2 == 0 else size // 2
+            x = int(cx + radius * math.cos(angle))
+            y = int(cy + radius * math.sin(angle))
+            points.append((x, y))
+        
+        # Draw by connecting points
+        for i in range(len(points)):
+            start = points[i]
+            end = points[(i + 1) % len(points)]
+            self.dc.drag(start[0], start[1], end[0], end[1], duration=0.1)
+    
+    def _draw_simple_pattern(self, cx: int, cy: int) -> None:
+        """Draw a simple decorative pattern."""
+        # Draw a few curved lines
+        for offset in [-50, 0, 50]:
+            self.dc.drag(cx - 100, cy + offset, 
+                        cx + 100, cy + offset, 
+                        duration=0.3)
+    
+    def _analyze_screen(self) -> Dict[str, Any]:
+        """
+        Analyze current screen state.
+        Would use OCR, object detection in full implementation.
+        """
+        screenshot = self.dc.screenshot()
+        active_window = self.dc.get_active_window()
+        mouse_pos = self.dc.get_mouse_position()
+        
+        analysis = {
+            "active_window": active_window,
+            "mouse_position": mouse_pos,
+            "screen_size": (self.screen_width, self.screen_height),
+            "timestamp": time.time()
+        }
+        
+        # TODO: Add OCR, object detection, UI element detection
+        
+        return analysis
+    
+    def _execute_game_loop(self, step: Dict) -> Dict:
+        """
+        Execute game playing loop.
+        Would use reinforcement learning in full implementation.
+        """
+        game = step.get("game", "unknown")
+        max_iter = step.get("max_iterations", 100)
+        
+        logger.info(f"Starting game loop for: {game}")
+        
+        result = {
+            "game": game,
+            "iterations": 0,
+            "actions_taken": []
+        }
+        
+        # Simple game loop - would be much more sophisticated
+        for i in range(max_iter):
+            # Analyze game state
+            state = self._analyze_screen()
+            
+            # Decide action (simplified - would use ML model)
+            action = self._decide_game_action(state, game)
+            
+            # Execute action
+            self._execute_game_action(action)
+            
+            result["iterations"] += 1
+            result["actions_taken"].append(action)
+            
+            # Check win/lose condition
+            # (would detect from screen)
+            
+            time.sleep(0.1)
+        
+        return result
+    
+    def _decide_game_action(self, state: Dict, game: str) -> str:
+        """Decide next game action based on state."""
+        # Simplified - would use game-specific AI
+        return "continue"
+    
+    def _execute_game_action(self, action: str) -> None:
+        """Execute a game action."""
+        # Simplified - would translate to specific inputs
+        pass
+    
+    # Helper methods for parsing
+    
+    def _extract_subject(self, text: str) -> str:
+        """Extract subject from drawing request."""
+        # Simple extraction - would use NLP
+        if "draw" in text.lower():
+            parts = text.lower().split("draw")
+            if len(parts) > 1:
+                return parts[1].strip()
+        return "unknown"
+    
+    def _extract_text_content(self, text: str) -> str:
+        """Extract text content from typing request."""
+        # Simple extraction
+        if "type" in text.lower():
+            parts = text.split("type")
+            if len(parts) > 1:
+                return parts[1].strip().strip('"').strip("'")
+        return text
+    
+    def _extract_game_name(self, text: str) -> str:
+        """Extract game name from request."""
+        # Would use NER for better extraction
+        return "unknown_game"
+    
+    def _extract_app_name(self, text: str) -> str:
+        """Extract application name from request."""
+        # Simple extraction - would use NER
+        for app in self.app_knowledge.keys():
+            if app in text.lower():
+                return app
+        return "notepad"  # Default fallback
+
+
+# Quick access function
+def create_agent(**kwargs) -> AIDesktopAgent:
+    """Create an AI Desktop Agent instance."""
+    return AIDesktopAgent(**kwargs)
+
+
+if __name__ == "__main__":
+    print("🤖 AI Desktop Agent - Cognitive Automation")
+    print("=" * 60)
+    
+    # Create agent
+    agent = AIDesktopAgent(failsafe=True)
+    
+    print("\n✨ Examples of what you can ask:")
+    print("  - 'Draw a circle in Paint'")
+    print("  - 'Type Hello World in Notepad'")
+    print("  - 'Open Calculator'")
+    print("  - 'Play Solitaire for me'")
+    
+    print("\n🎯 Try it:")
+    task = input("\nWhat would you like me to do? ")
+    
+    if task.strip():
+        result = agent.execute_task(task)
+        print(f"\n{'='* 60}")
+        print(f"Task Status: {result['status']}")
+        print(f"Steps Executed: {len(result['steps'])}")
+        print(f"Success: {result['success']}")
+        
+        if result.get('screenshots'):
+            print(f"Screenshots captured: {len(result['screenshots'])}")
+    else:
+        print("\nNo task entered. Exiting.")