import gradio as gr
import base64
import json
from PIL import ImageDraw
from io import BytesIO
import re
import requests
from transformers import Qwen2VLProcessor

# Processor used only for its chat template (see `apply_chat_template` in
# `ask_triton`); it is loaded once at import time.
processor = Qwen2VLProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    use_fast=True,
)

# Generate endpoint that inference requests are POSTed to.
url = "http://localhost:8000/v2/models/vllm_model/generate"

# Function to handle the inference and visualization

# Matches a box written as "(x1,y1),(x2,y2)". Whitespace around the numbers and
# commas is tolerated so slightly reformatted model output still parses.
_BBOX_PATTERN = re.compile(
    r"\(\s*(\d+)\s*,\s*(\d+)\s*\)\s*,\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
)

# Marker preceding the assistant turn when the server echoes the prompt back.
_ASSISTANT_MARKER = "<|im_start|>assistant\n"

# Box coordinates are divided by this value before being scaled to pixels.
_COORD_SCALE = 1000

# (connect, read) timeout in seconds so a stalled server cannot hang the UI
# forever; the read timeout is generous because generation can be slow.
_REQUEST_TIMEOUT = (10, 300)


def _parse_bbox(text):
    """Return the first ``(x1, y1, x2, y2)`` box found in *text*, or ``None``."""
    match = _BBOX_PATTERN.search(text)
    if match is None:
        return None
    return tuple(int(group) for group in match.groups())


def _to_pixel_box(bbox, width, height):
    """Scale a 0-1000 box to pixel coordinates, ordered left/top/right/bottom."""
    x1, y1, x2, y2 = bbox
    # Sort each axis: Pillow rejects rectangles whose second corner lies
    # before the first, and the model is not guaranteed to order them.
    left, right = sorted((x1 / _COORD_SCALE * width, x2 / _COORD_SCALE * width))
    top, bottom = sorted((y1 / _COORD_SCALE * height, y2 / _COORD_SCALE * height))
    return left, top, right, bottom


def ask_triton(image):
    """Ask the model for the nutrition-table box and draw it on *image*.

    The image is PNG-encoded, base64'd and POSTed together with the templated
    chat prompt to ``url``. If the reply contains a box of the form
    ``(x1,y1),(x2,y2)``, it is scaled from the 0-1000 range to the image size
    and drawn in green directly onto *image* (the input is modified in place).

    Args:
        image: PIL image supplied by the Gradio input, or ``None`` when the
            user submitted without uploading anything.

    Returns:
        A ``(image, text)`` pair matching the two Gradio outputs. On success
        this is the (possibly annotated) image and the model's reply. On any
        failure it is ``(None, "Error: ...")`` so the UI always receives the
        number of outputs it expects.
    """
    if image is None:
        return None, "Error: no image provided."

    try:
        # Image Input
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        img_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Build conversation
        messages = [
            {
                'role': 'system',
                'content': [{'type': 'text', 'text': "You are a Vision Language Model specialized in product images. Detect nutrition tables."}],
            },
            {
                'role': 'user',
                'content': [
                    {'type': 'image', 'image': img_b64},
                    {'type': 'text', 'text': "Detect the bounding box of the nutrition table."},
                ],
            },
        ]

        # Apply chat template and build payload
        chat_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        payload = {
            "text_input": chat_text,
            "image": img_b64,
            "parameters": {
                "stream": False,
                "temperature": 0,
                "max_tokens": 2048,
            },
        }

        # Send POST request; fail loudly on HTTP errors instead of silently
        # showing an empty answer.
        response = requests.post(url, json=payload, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        output_text = response.json().get("text_output", "")

        # Extract assistant response: keep only what follows the last
        # assistant marker (the whole text is kept when the marker is absent).
        output_text = output_text.rsplit(_ASSISTANT_MARKER, 1)[-1]

        # Extract and draw bounding box
        bbox = _parse_bbox(output_text)
        if bbox is not None:
            width, height = image.size
            ImageDraw.Draw(image).rectangle(
                _to_pixel_box(bbox, width, height), outline="green", width=10
            )

        return image, output_text

    except Exception as e:
        # UI boundary: report the failure as text, keeping the two-output
        # shape that the Gradio interface declares.
        return None, f"Error: {e}"

# Gradio Interface
# Build the UI object first, then start serving it. `launch` runs as soon as
# this module is executed and binds to all interfaces on port 7860.
# NOTE(review): the description text has a grammar slip ("to visualizes");
# left unchanged here because it is user-facing runtime text.
demo = gr.Interface(
    fn=ask_triton,
    inputs=[gr.Image(type="pil")],
    outputs=["image", "text"],
    title="Nutrition Table Detection",
    description="Please upload image containing a nutrition table to visualizes bounding box prediction.",
)
demo.launch(server_name="0.0.0.0", server_port=7860)