import gradio as gr
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import pipeline
import requests
from io import BytesIO
import os
from ultralytics import YOLO

class ImageStoryteller:
    def __init__(self):
        print("Initializing Image Storyteller with YOLOv8...")
        
        # Load YOLOv8 model for object detection
        try:
            self.yolo_model = YOLO('yolov8s.pt')  # Using nano version for faster inference
            print("YOLOv8 model loaded successfully!")
        except Exception as e:
            print(f"YOLOv8 loading failed: {e}")
            self.yolo_model = None
        
        # Initialize text generation pipelines
        try:
            # For narrative generation - using a smaller model for Hugging Face Spaces
            self.story_pipeline = pipeline(
                "text-generation",
                model="distilgpt2",  # Lighter model for Spaces
                torch_dtype=torch.float32
            )
            print("Story pipeline initialized!")
        except Exception as e:
            print(f"Story pipeline failed: {e}")
            self.story_pipeline = None
        
        # Common objects for fallback detection
        self.common_objects = [
            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
            'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
            'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
            'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
            'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
            'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
            'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
            'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
            'toothbrush'
        ]
    
    def detect_objects(self, image):
        """Detect objects in the image using YOLOv8"""
        if self.yolo_model is not None:
            try:
                # Convert PIL to numpy for YOLOv8
                img_np = np.array(image)
                
                # Run YOLOv8 detection
                results = self.yolo_model(img_np)
                
                objects = []
                for result in results:
                    boxes = result.boxes
                    if boxes is not None:
                        for box in boxes:
                            confidence = box.conf.item()
                            if confidence > 0.25:  # Confidence threshold
                                class_id = int(box.cls.item())
                                class_name = self.yolo_model.names[class_id]
                                bbox = box.xyxy[0].tolist()
                                
                                objects.append({
                                    'name': class_name,
                                    'confidence': confidence,
                                    'bbox': bbox
                                })
                
                return objects
                
            except Exception as e:
                print(f"YOLOv8 detection failed: {e}")
        
        # Fallback: Simple color-based object detection
        return self.fallback_object_detection(image)
    
    def draw_detections(self, image, objects):
        """Draw bounding boxes and labels on the image"""
        img_np = np.array(image)
        img_with_boxes = img_np.copy()
        
        # Colors for different object types
        colors = {
            'person': (0, 255, 0),      # Green
            'vehicle': (255, 0, 0),     # Blue (cars, bikes, etc.)
            'animal': (0, 165, 255),    # Orange
            'default': (255, 255, 0)    # Yellow
        }
        
        for obj in objects:
            bbox = obj['bbox']
            name = obj['name']
            confidence = obj['confidence']
            
            # Determine color based on object type
            if 'person' in name:
                color = colors['person']
            elif any(vehicle in name for vehicle in ['car', 'bicycle', 'motorcycle', 'bus', 'truck']):
                color = colors['vehicle']
            elif any(animal in name for animal in ['bird', 'cat', 'dog', 'horse', 'sheep', 'cow']):
                color = colors['animal']
            else:
                color = colors['default']
            
            # Convert coordinates to integers
            x1, y1, x2, y2 = map(int, bbox)
            
            # Draw bounding box
            cv2.rectangle(img_with_boxes, (x1, y1), (x2, y2), color, 2)
            
            # Draw label background
            label = f"{name} {confidence:.2f}"
            label_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
            cv2.rectangle(img_with_boxes, (x1, y1 - label_size[1] - 10), 
                         (x1 + label_size[0], y1), color, -1)
            
            # Draw label text
            cv2.putText(img_with_boxes, label, (x1, y1 - 5), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        
        return Image.fromarray(img_with_boxes)
    
    def detect_facial_expressions(self, image, objects):
        """Simple facial expression detection based on face analysis"""
        img_np = np.array(image)
        expressions = []
        
        # Look for person objects
        person_objects = [obj for obj in objects if obj['name'] == 'person']
        
        if not person_objects:
            return expressions
        
        # Simple expression detection based on face position and context
        for person in person_objects:
            bbox = person['bbox']
            x1, y1, x2, y2 = map(int, bbox)
            
            # Extract face region (approximate)
            face_height = y2 - y1
            face_region = img_np[y1:y2, x1:x2]
            
            if face_region.size == 0:
                continue
            
            # Simple expression estimation based on face position and context
            expression = self.estimate_expression(face_region, bbox, img_np.shape)
            expressions.append({
                'person_bbox': bbox,
                'expression': expression,
                'confidence': 0.6  # Placeholder confidence
            })
        
        return expressions
    
    def estimate_expression(self, face_region, bbox, image_shape):
        """Estimate facial expression based on simple heuristics"""
        try:
            # Convert to grayscale for analysis
            if len(face_region.shape) == 3:
                gray_face = cv2.cvtColor(face_region, cv2.COLOR_RGB2GRAY)
            else:
                gray_face = face_region
            
            # Simple brightness and contrast analysis
            brightness = np.mean(gray_face)
            contrast = np.std(gray_face)
            
            # Face position in image
            x1, y1, x2, y2 = bbox
            img_height, img_width = image_shape[:2]
            face_center_y = (y1 + y2) / 2
            
            # Simple expression rules
            if brightness > 150 and contrast < 50:
                return "neutral"
            elif face_center_y < img_height * 0.3:  # Face in upper part
                return "surprised"
            elif contrast > 70:
                return "expressive"
            else:
                return "calm"
                
        except:
            return "neutral"
    
    def fallback_object_detection(self, image):
        """Simple fallback object detection based on color and composition"""
        img_np = np.array(image)
        height, width = img_np.shape[:2]
        
        objects = []
        
        # Detect based on dominant colors and regions
        hsv = cv2.cvtColor(img_np, cv2.COLOR_RGB2HSV)
        
        # Sky detection (blue regions at top)
        blue_mask = cv2.inRange(hsv, (100, 50, 50), (130, 255, 255))
        if np.sum(blue_mask) > height * width * 0.1:
            objects.append({'name': 'sky', 'confidence': 0.7, 'bbox': [0, 0, width, height//3]})
        
        # Green areas (grass, trees)
        green_mask = cv2.inRange(hsv, (35, 50, 50), (85, 255, 255))
        if np.sum(green_mask) > height * width * 0.1:
            objects.append({'name': 'nature', 'confidence': 0.6, 'bbox': [0, 2*height//3, width, height]})
        
        # Person-like shapes (skin tone detection)
        skin_mask = cv2.inRange(hsv, (0, 30, 60), (20, 150, 255))
        if np.sum(skin_mask) > 1000:
            objects.append({'name': 'person', 'confidence': 0.5, 'bbox': [width//4, height//4, 3*width//4, 3*height//4]})
        
        # Building-like structures (edges and lines)
        gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        edges = cv2.Canny(gray, 50, 150)
        if np.sum(edges) > height * width * 0.05:
            objects.append({'name': 'building', 'confidence': 0.5, 'bbox': [width//3, 0, 2*width//3, height]})
        
        return objects
    
    def generate_narrative(self, objects, expressions, image_size):
        """Generate a narrative story based on detected objects and expressions"""
        if not objects:
            return "In this serene scene, the world holds its breath in quiet contemplation. " \
                   "Though specific elements remain mysterious, the composition speaks of " \
                   "peaceful moments and unspoken stories waiting to be discovered."
        
        # Extract object names
        object_names = [obj['name'] for obj in objects]
        unique_objects = list(set(object_names))
        
        # Include expressions in narrative
        expression_text = ""
        if expressions:
            exp_descriptions = []
            for exp in expressions:
                exp_descriptions.append(f"{exp['expression']} expression")
            expression_text = f" with {', '.join(exp_descriptions)}"
        
        # Create prompt for story generation
        prompt = f"In an image containing {', '.join(unique_objects)}{expression_text}, "
        
        if self.story_pipeline is not None:
            try:
                story = self.story_pipeline(
                    prompt + "tell a beautiful narrative story about this scene:",
                    max_length=200,
                    num_return_sequences=1,
                    temperature=0.8,
                    do_sample=True,
                    pad_token_id=50256
                )[0]['generated_text']
                return story
            except Exception as e:
                print(f"Story generation failed: {e}")
        
        # Fallback narrative generation
        return self.fallback_narrative(unique_objects, expressions, image_size)
    
    def fallback_narrative(self, objects, expressions, image_size):
        """Fallback method for generating narratives"""
        width, height = image_size
        
        # Include expression information
        expression_context = ""
        if expressions:
            main_expression = expressions[0]['expression']
            expression_context = f" with a {main_expression} demeanor"
        
        if 'person' in objects:
            if 'nature' in objects or 'sky' in objects:
                return f"In this {width}x{height} frame, a solitary figure stands amidst nature's embrace{expression_context}. " \
                       "The person seems lost in thought, surrounded by the gentle whispers of the environment. " \
                       "Each element in the scene tells a story of connection between humanity and the natural world."
            else:
                return f"Within this {width}x{height} composition, a human presence captures our attention{expression_context}. " \
                       "Their story unfolds silently, inviting us to imagine their journey, their dreams, " \
                       "and the moments that led them to this precise point in time."
        
        elif 'sky' in objects and 'nature' in objects:
            return f"This breathtaking {width}x{height} landscape paints a picture of serene beauty. " \
                   "The sky stretches endlessly above while nature flourishes below, creating a harmonious " \
                   "balance that speaks of timeless cycles and peaceful existence."
        
        elif 'building' in objects:
            return f"Architectural marvels dominate this {width}x{height} scene, telling stories of human " \
                   "ingenuity and civilization. Each structure holds memories of lives lived, dreams built, " \
                   "and the relentless march of progress through the ages."
        
        else:
            object_str = ', '.join(objects[:3])
            return f"In this captivating {width}x{height} image, {object_str} come together to create " \
                   "a unique visual symphony. Each element contributes to a larger story, inviting " \
                   "the viewer to explore the hidden narratives and emotional landscapes within the frame."
    
    def generate_poetry(self, narrative):
        """Generate poetic verses based on the narrative"""
        if self.story_pipeline is not None:
            try:
                poetry_prompt = f"Based on this description: '{narrative}' create 6 beautiful poetic lines:"
                
                poetry = self.story_pipeline(
                    poetry_prompt,
                    max_length=200,
                    num_return_sequences=1,
                    temperature=0.9,
                    do_sample=True,
                    pad_token_id=50256
                )[0]['generated_text']
                
                # Extract just the poetic lines
                lines = poetry.split('\n')
                poetic_lines = [line for line in lines if line.strip() and len(line.strip()) > 10]
                if len(poetic_lines) >= 4:
                    return '\n'.join(poetic_lines[:6])
            except Exception as e:
                print(f"Poetry generation failed: {e}")
        
        # Fallback poetry generation
        return self.fallback_poetry(narrative)
    
    def fallback_poetry(self, narrative):
        """Fallback method for generating poetry"""
        poetry_templates = [
            "In quiet frames, stories unfold\nWhispering secrets, brave and bold\nEach pixel holds a memory\nOf what was and what will be\nThrough lens and light, truth is told\nIn every story, life takes hold",
            
            "The canvas speaks in silent tones\nOf weathered stones and ancient bones\nA moment captured, frozen still\nAgainst time's relentless will\nWhere every object finds its home\nNo longer lost, no more to roam",
            
            "Light and shadow dance as one\nUntil the setting of the sun\nEach element with voice unique\nIn visual language, they speak\nOf journeys started and begun\nBefore the final race is run",
            
            "Through the lens, the world appears\nWith all its hopes, its dreams, its fears\nA silent symphony of sight\nBathed in celestial light\nEchoing through the coming years\nIn memory, it perseveres"
        ]
        
        # Choose template based on narrative content
        if 'person' in narrative.lower():
            return "A figure stands where pathways meet\nWith silent stories, bittersweet\nTheir journey captured in this space\nA momentary resting place\nWhere past and future gently greet\nMaking life's pattern more complete"
        elif 'nature' in narrative.lower():
            return "Where nature's breath does softly blow\nAnd gentle streams forever flow\nThe trees stand tall, the flowers bloom\nDispelling sorrow, grief, and gloom\nIn every leaf, life's patterns grow\nA truth that only nature knows"
        elif 'building' in narrative.lower():
            return "Stone and steel reach for the sky\nWhere human dreams learn how to fly\nEach window holds a different tale\nOf moments fragile, strong, or frail\nBeneath the sun's watchful eye\nTime's relentless river flows by"
        else:
            return poetry_templates[0]
    
    def process_image(self, image):
        """Main processing function"""
        try:
            # Detect objects
            objects = self.detect_objects(image)
            
            # Detect facial expressions if people are present
            expressions = []
            if any(obj['name'] == 'person' for obj in objects):
                expressions = self.detect_facial_expressions(image, objects)
            
            # Generate narrative
            narrative = self.generate_narrative(objects, expressions, image.size)
            
            # Generate poetry
            poetry = self.generate_poetry(narrative)
            
            # Create detection visualization
            detection_image = self.draw_detections(image, objects)
            
            return narrative, poetry, detection_image
            
        except Exception as e:
            error_msg = f"An error occurred while processing the image: {str(e)}"
            return error_msg, "Unable to generate poetry due to processing error.", image

# Initialize the storyteller
storyteller = ImageStoryteller()

# Check for local example images
example_images = []
for i in range(1, 3):
    filename = f"obj_{i:02d}.jpg"
    if os.path.exists(filename):
        example_images.append([filename])
        print(f"Found example image: {filename}")

if not example_images:
    print("No local example images found, using placeholder")
    # Create a placeholder if no local images
    example_images = [[np.ones((300, 300, 3), dtype=np.uint8) * 100]]

# Create Gradio interface
with gr.Blocks(title="AI Image Storyteller Pro", theme="soft") as demo:
    gr.Markdown("# 📖 AI Image Storyteller Pro")
    gr.Markdown("**Upload any image and watch AI detect objects, analyze scenes, and create beautiful stories!**")
    
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                type="pil", 
                label="🖼️ Upload Your Image",
                height=300
            )
            process_btn = gr.Button("✨ Analyze Image & Create Story", variant="primary", size="lg")
        
        with gr.Column():
            detection_output = gr.Image(
                label="🔍 Object Detection",
                height=300,
                show_download_button=True
            )
    
    with gr.Row():
        with gr.Column():
            with gr.Tab("📖 Narrative Story"):
                narrative_output = gr.Textbox(
                    label="Image Narrative",
                    lines=5,
                    max_lines=8,
                    placeholder="Your image's story will appear here...",
                    show_copy_button=True
                )
            
            with gr.Tab("🎭 Poetic Verses"):
                poetry_output = gr.Textbox(
                    label="6-Line Poetry",
                    lines=6,
                    max_lines=7,
                    placeholder="Poetic interpretation will appear here...",
                    show_copy_button=True
                )
    
    # Examples section with local images
    gr.Markdown("### 🎯 Try These Examples")
    gr.Examples(
        examples=example_images,
        inputs=input_image,
        outputs=[narrative_output, poetry_output, detection_output],
        fn=storyteller.process_image,
        cache_examples=True
    )
    
    # How it works section
    with gr.Accordion("🔍 How It Works", open=False):
        gr.Markdown("""
        **The Magic Behind the Stories:**
        
        1. **Object Detection**: YOLOv8 AI model identifies objects in your image with bounding boxes
        2. **Facial Analysis**: Simple expression detection for human faces
        3. **Scene Analysis**: The system analyzes the composition and relationships between objects
        4. **Narrative Generation**: AI creates a compelling story based on the detected elements
        5. **Poetry Creation**: Transformers model converts the narrative into beautiful 6-line verses
        
        **Features:**
        - Real-time object detection with YOLOv8
        - Visual bounding box display
        - Facial expression estimation
        - Context-aware storytelling
        - Beautiful poetic interpretations
        
        **Perfect for:**
        - Personal photos
        - Landscape images  
        - Urban scenes
        - Group photos
        - Travel memories
        """)
    
    # Set up the processing
    process_btn.click(
        fn=storyteller.process_image,
        inputs=input_image,
        outputs=[narrative_output, poetry_output, detection_output]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )