code and req

Browse files

Files changed (4) hide show

.gitignore +216 -0
README.md +1 -1
requirements.txt +4 -0
sample_code.py +104 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,216 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject the date and other information into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# Redis
+*.rdb
+*.aof
+*.pid
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+# ActiveMQ
+activemq-data/
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# Streamlit
+.streamlit/secrets.toml

README.md CHANGED Viewed

@@ -186,7 +186,7 @@ pip install -r requirements.txt
 You can use [vLLM](https://docs.vllm.ai/en/latest/index.html) to serve the model.
 ```bash
-vllm serve uniphore/ActIO-UI-7B-SFT
 ```
 Then you can use the `demo.py` we provide to check out a sample response of the model with the training prompt.

 You can use [vLLM](https://docs.vllm.ai/en/latest/index.html) to serve the model.
 ```bash
+vllm serve Uniphore/actio-ui-7b-sft
 ```
 Then you can use the `demo.py` we provide to check out a sample response of the model with the training prompt.

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Pillow
+torch
+transformers
+vllm

sample_code.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import base64
+import torch
+from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
+from PIL import Image
+def encode_image(image_path: str) -> str:
+    """Encode image to base64 string for model input."""
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode()
+def load_model(
+    model_path: str,
+) -> tuple[AutoModel, AutoTokenizer, AutoImageProcessor]:
+    """Load OpenCUA model, tokenizer, and image processor."""
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModel.from_pretrained(
+        model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
+    )
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_path, trust_remote_code=True
+    )
+    return model, tokenizer, image_processor
+def create_grounding_messages(image_path: str, instruction: str) -> list[dict]:
+    """Create chat messages for GUI grounding task."""
+    system_prompt = (
+        "You are a GUI agent. You are given a task and a screenshot of the screen. "
+        "You need to perform a series of pyautogui actions to complete the task."
+    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": f"data:image/png;base64,{encode_image(image_path)}",
+                },
+                {"type": "text", "text": instruction},
+            ],
+        },
+    ]
+    return messages
+def run_inference(
+    model: AutoModel,
+    tokenizer: AutoTokenizer,
+    image_processor: AutoImageProcessor,
+    messages: list[dict],
+    image_path: str,
+) -> str:
+    """Run inference on the model."""
+    # Prepare text input
+    input_ids = tokenizer.apply_chat_template(
+        messages, tokenize=True, add_generation_prompt=True
+    )
+    input_ids = torch.tensor([input_ids]).to(model.device)
+    # Prepare image input
+    image = Image.open(image_path).convert("RGB")
+    image_info = image_processor.preprocess(images=[image])
+    pixel_values = torch.tensor(image_info["pixel_values"]).to(
+        dtype=torch.bfloat16, device=model.device
+    )
+    grid_thws = torch.tensor(image_info["image_grid_thw"])
+    # Generate response
+    with torch.no_grad():
+        generated_ids = model.generate(
+            input_ids,
+            pixel_values=pixel_values,
+            grid_thws=grid_thws,
+            max_new_tokens=2048,
+            temperature=0,
+        )
+    # Decode output
+    prompt_len = input_ids.shape[1]
+    generated_ids = generated_ids[:, prompt_len:]
+    output_text = tokenizer.batch_decode(
+        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    return output_text
+# Example usage
+model_path = "Uniphore/actio-ui-7b-sft"  # or other model variants
+image_path = "screenshot.png"
+instruction = "Click on the submit button"
+# Load model
+model, tokenizer, image_processor = load_model(model_path)
+# Create messages and run inference
+messages = create_grounding_messages(image_path, instruction)
+result = run_inference(model, tokenizer, image_processor, messages, image_path)
+print("Model output:", result)