---
license: cc-by-nc-sa-4.0
language:
- en
base_model:
- Qwen/Qwen2.5-VL-7B-Instruct
pipeline_tag: image-text-to-text
tags:
- gui-grounding
- gui-agent
---

## Quick Start

```python
import re

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_PATH = "Qwen/Qwen2.5-VL-7B-Instruct"
IMAGE_PATH = "path/to/your/image.jpg"
INSTRUCTION = "Click the 'Search' button"

def parse_coordinates(raw_string: str) -> tuple[int, int]:
    matches = re.findall(r'\[(\d+),\s*(\d+)\]', raw_string)
    if matches:
        return tuple(map(int, matches[0]))
    return -1, -1
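
# Example of the parsing (the response string below is hypothetical, shown only
# to illustrate the expected [x, y] coordinate format):
#   parse_coordinates('{"name": "grounding", "arguments": {"action": "click", "coordinate": [512, 384]}}')
#   -> (512, 384)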
print("Loading model...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto"
).eval()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
image = Image.open(IMAGE_PATH).convert("RGB")
messages = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a helpful assistant."
            },
            {
                "type": "text",
"text": """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n \n\n{"name": "grounding", "arguments": }\n \n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
            }
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": INSTRUCTION}
        ]
    }
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
print("Running inference...")
generated_ids = model.generate(**inputs, max_new_tokens=128)
response_ids = generated_ids[0, len(inputs["input_ids"][0]):]
raw_response = processor.decode(response_ids, skip_special_tokens=True)
point_x, point_y = parse_coordinates(raw_response)
print("\n" + "="*20 + " RESULT " + "="*20)
print(f"Instruction: {INSTRUCTION}")
print(f"Raw Response: {raw_response}")
if point_x != -1:
    # Qwen2.5-VL flattens image patches, so recover the resized input size from
    # image_grid_thw (grid sizes are given in units of the vision patch size).
    _, grid_h, grid_w = inputs["image_grid_thw"][0].tolist()
    patch_size = processor.image_processor.patch_size  # 14 for Qwen2.5-VL
    resized_height = grid_h * patch_size
    resized_width = grid_w * patch_size
    norm_x = point_x / resized_width
    norm_y = point_y / resized_height
    print(f"✅ Parsed Point (on resized image): ({point_x}, {point_y})")
    print(f"✅ Normalized Point (0.0 to 1.0): ({norm_x:.4f}, {norm_y:.4f})")
else:
    print("❌ Could not parse coordinates from the response.")
print("="*48)
```
## 📮 Contact
Feel free to contact `liangyuchen@ruc.edu.cn` if you have any questions.
## License
This repo is released under the CC-BY-NC-SA 4.0 license. Please use it for non-commercial purposes only.
## Citation
If you use this repository or find it helpful in your research, please cite it as follows:
```bibtex
@misc{chen2025uiinsenhancingguigrounding,
  title={UI-Ins: Enhancing GUI Grounding with Multi-Perspective Instruction-as-Reasoning},
  author={Liangyu Chen and Hanzhang Zhou and Chenglin Cai and Jianan Zhang and Panrong Tong and Quyu Kong and Xu Zhang and Chen Liu and Yuqi Liu and Wenxuan Wang and Yue Wang and Qin Jin and Steven Hoi},
  year={2025},
  eprint={2510.20286},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2510.20286}
}
```