commit 952514b0827dde98151684ddadf47eef87158c9b
Author: zlei9
Date:   Sun Mar 29 09:39:00 2026 +0800

    Initial commit with translated description

diff --git a/SKILL.md b/SKILL.md
new file mode 100644
index 0000000..9280b66
--- /dev/null
+++ b/SKILL.md
@@ -0,0 +1,61 @@
+---
+name: linux-gui-control
+description: "使用xdotool、wmctrl和dogtail控制Linux桌面GUI。在需要与非浏览器应用程序交互、模拟鼠标/键盘输入、管理窗口或检查X11/GNOME上应用程序的UI层次结构时使用。支持:(1)在应用中点击/输入,(2)调整大小/移动窗口,(3)从应用中提取基于文本的UI树(A11y),(4)截取屏幕截图进行视觉分析。"
+---
+
+# Linux GUI Control
+
+This skill provides tools and procedures for automating interactions with the Linux desktop environment.
+
+## Quick Start
+
+### 1. Identify Target Window
+Use `wmctrl` to find the exact name of the window you want to control.
+```bash
+wmctrl -l
+```
+
+### 2. Inspect UI Hierarchy
+For apps supporting accessibility (GNOME apps, Electron apps with `--force-renderer-accessibility`), use the inspection script to find button names without taking screenshots.
+```bash
+python3 scripts/inspect_ui.py "<app_name>"
+```
+
+### 3. Perform Actions
+Use `xdotool` via the helper script for common actions.
+```bash
+# Activate window
+./scripts/gui_action.sh activate "<window_name>"
+
+# Click coordinates
+./scripts/gui_action.sh click 500 500
+
+# Type text
+./scripts/gui_action.sh type "Hello World"
+
+# Press a key
+./scripts/gui_action.sh key "Return"
+```
+
+## Workflows
+
+### Operating an App via Text UI
+1. List windows with `wmctrl -l`.
+2. Activate the target window.
+3. Run `scripts/inspect_ui.py` to get the list of buttons and inputs.
+4. Use `xdotool key Tab` and `Return` to navigate, or `click` if coordinates are known.
+5. If text-based inspection fails, fall back to taking a screenshot and using vision.
+
+### Forcing Accessibility in Electron Apps
+Many modern apps (VS Code, Discord, Cider, Chrome) need a flag to expose their UI tree:
+```bash
+pkill <app_name>
+nohup <app_name> --force-renderer-accessibility > /dev/null 2>&1 &
+```
+
+## Tool Reference
+
+- **wmctrl**: Window management (list, activate, move, resize).
+- **xdotool**: Input simulation (click, type, key, mousemove).
+- **dogtail**: UI tree extraction via AT-SPI (Accessibility bus).
+- **scrot**: Lightweight screenshot tool.
diff --git a/_meta.json b/_meta.json
new file mode 100644
index 0000000..9762446
--- /dev/null
+++ b/_meta.json
@@ -0,0 +1,6 @@
+{
+  "ownerId": "kn73q6e6sgjtn6nd5pca7wgh9n80ax0y",
+  "slug": "guicountrol",
+  "version": "1.0.0",
+  "publishedAt": 1769949980822
+}
\ No newline at end of file
diff --git a/scripts/gui_action.sh b/scripts/gui_action.sh
new file mode 100644
index 0000000..1c5bb86
--- /dev/null
+++ b/scripts/gui_action.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# A simple wrapper for xdotool, wmctrl and scrot actions.
+# Usage: ./gui_action.sh ACTION [ARGS...]
+#
+# Actions:
+#   click X Y            Move the pointer to (X, Y) and click button 1.
+#   type TEXT            Type TEXT with xdotool.
+#   key KEYSYM           Press a key, e.g. "Return".
+#   activate NAME        Activate the window named NAME (wmctrl -a).
+#   list-windows         List windows (wmctrl -l).
+#   screenshot FILENAME  Take a screenshot with scrot into FILENAME.
+
+if [ "$#" -eq 0 ]; then
+    echo "Usage: $0 ACTION [ARGS...]" >&2
+    exit 1
+fi
+
+ACTION=$1
+shift
+
+case "$ACTION" in
+    click)
+        # click x y
+        xdotool mousemove --sync "$1" "$2" click 1
+        ;;
+    type)
+        # type "text"; "--" keeps text starting with "-" from being parsed as an option
+        xdotool type -- "$1"
+        ;;
+    key)
+        # key "Return"
+        xdotool key "$1"
+        ;;
+    activate)
+        # activate "Window Name"
+        wmctrl -a "$1"
+        ;;
+    list-windows)
+        wmctrl -l
+        ;;
+    screenshot)
+        # screenshot filename
+        scrot -z "$1"
+        ;;
+    *)
+        echo "Unknown action: $ACTION" >&2
+        exit 1
+        ;;
+esac
diff --git a/scripts/inspect_ui.py b/scripts/inspect_ui.py
new file mode 100644
index 0000000..1e0988b
--- /dev/null
+++ b/scripts/inspect_ui.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""Dump the accessibility (AT-SPI) UI tree of a running application using dogtail."""
+import sys
+import argparse
+from dogtail.tree import root
+
+# Roles that are printed even when the node has no accessible name.
+INTERACTIVE_ROLES = ('push button', 'entry', 'menu item', 'toggle button', 'check box')
+
+
+def dump_node(node, depth=0, max_depth=10):
+    """Recursively print a node and its descendants as an indented tree.
+
+    Args:
+        node: Object exposing ``name``, ``roleName`` and ``children``.
+        depth: Nesting level of ``node``; determines the indentation.
+        max_depth: Nodes nested deeper than this are not visited.
+    """
+    if depth > max_depth:
+        return
+    try:
+        # A missing (None) name must not hide the node's whole subtree.
+        name = (node.name or '').strip()
+        role = node.roleName
+        # Only print if it has a name or is an interesting interactive element
+        if name or role in INTERACTIVE_ROLES:
+            print(' ' * depth + f'<{role}> {name}')
+        for child in node.children:
+            dump_node(child, depth + 1, max_depth)
+    except Exception:
+        # Best effort: skip nodes that fail while being queried, but let
+        # KeyboardInterrupt and SystemExit propagate (a bare except would not).
+        pass
+
+
+def main():
+    """Parse the command line and dump the UI tree of the named application.
+
+    Returns:
+        0 on success, 1 if the application could not be found or inspected.
+    """
+    parser = argparse.ArgumentParser(description="Inspect Linux GUI application UI tree.")
+    parser.add_argument("app_name", help="Name of the application to inspect.")
+    parser.add_argument("--max-depth", type=int, default=15, help="Maximum depth to traverse.")
+    args = parser.parse_args()
+
+    try:
+        app = root.application(args.app_name)
+        print(f"Dump for application: {args.app_name}")
+        dump_node(app, max_depth=args.max_depth)
+    except Exception as e:
+        # Report on stderr and return non-zero so callers can detect failure.
+        print(f"Error: Could not find or inspect application '{args.app_name}'. {e}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())