Spaces:

sagar007
/

multimodal-gemma-270m-demo

Runtime error

App Files Files Community

sagar007 committed on Sep 20

Commit

f4f545d

verified ·

1 Parent(s): 18b63c5

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

app.py +44 -97
configs/config.yaml +12 -0
configs/data_config.yaml +66 -0
configs/gemma_270m_a100.yaml +147 -0
configs/model_config.yaml +40 -0
configs/multimodal_optimized.yaml +149 -0
configs/training_config.yaml +65 -0

app.py CHANGED Viewed

@@ -12,7 +12,19 @@ from PIL import Image
 import io
 import time
 import logging
-from huggingface_hub import hf_hub_download
 # Model imports
 from src.models import MultimodalGemmaLightning
@@ -39,103 +51,38 @@ def download_and_load_model():
             cache_dir="./model_cache"
         )
-        print("📁 Loading model from checkpoint...")
-        # Load checkpoint data to inspect what's inside
-        checkpoint = torch.load(checkpoint_path, map_location="cpu")
-        print(f"Checkpoint keys: {list(checkpoint.keys())}")
-        # Extract the saved hyperparameters if they exist
-        if "hyper_parameters" in checkpoint:
-            saved_config = checkpoint["hyper_parameters"].get("config", {})
-            print("Found saved config in checkpoint")
-            # Override any gated models in the saved config
-            if "model" in saved_config and "gemma_model_name" in saved_config["model"]:
-                if "google/gemma" in saved_config["model"]["gemma_model_name"]:
-                    print("Replacing gated Gemma model with accessible alternative")
-                    saved_config["model"]["gemma_model_name"] = "microsoft/DialoGPT-medium"
-                    saved_config["model"]["use_4bit"] = False  # Disable quantization for compatibility
-            config = saved_config
-        else:
-            print("No saved config found, creating minimal config")
-            # Create minimal config for loading
-            config = {
-                "model": {
-                    "gemma_model_name": "microsoft/DialoGPT-medium",  # Use non-gated model
-                    "vision_model_name": "openai/clip-vit-large-patch14",
-                    "use_4bit": False,  # Disable quantization for loading
-                    "projector_hidden_dim": 2048,
-                    "lora": {"r": 16, "alpha": 32, "dropout": 0.1}
-                },
-                "special_tokens": {"image_token": "<image>"},
-                "training": {"projector_lr": 1e-3, "lora_lr": 1e-4}
-            }
-        try:
-            # First try: Use the checkpoint's config if available
-            model = MultimodalGemmaLightning.load_from_checkpoint(
-                checkpoint_path,
-                config=config,
-                strict=False,
-                map_location="cuda" if torch.cuda.is_available() else "cpu"
-            )
-            print("✅ Loaded with checkpoint config")
-        except Exception as e1:
-            print(f"Failed with checkpoint config: {e1}")
-            try:
-                # Second try: Minimal config with no quantization
-                minimal_config = {
-                    "model": {
-                        "gemma_model_name": "microsoft/DialoGPT-small",  # Even smaller model
-                        "vision_model_name": "openai/clip-vit-base-patch32",  # Smaller CLIP
-                        "use_4bit": False,  # No quantization
-                        "projector_hidden_dim": 512,
-                        "lora": {"r": 8, "alpha": 16, "dropout": 0.1, "target_modules": ["q_proj", "v_proj"]}
-                    },
-                    "special_tokens": {"image_token": "<image>"},
-                    "training": {"projector_lr": 1e-3, "lora_lr": 1e-4}
-                }
-                model = MultimodalGemmaLightning.load_from_checkpoint(
-                    checkpoint_path,
-                    config=minimal_config,
-                    strict=False,
-                    map_location="cuda" if torch.cuda.is_available() else "cpu"
-                )
-                print("✅ Loaded with minimal config")
-            except Exception as e2:
-                print(f"Failed with minimal config: {e2}")
-                try:
-                    # Third try: Direct state dict loading
-                    print("Attempting direct state dict loading...")
-                    # Create a dummy model just to get the structure
-                    dummy_config = {
-                        "model": {
-                            "gemma_model_name": "microsoft/DialoGPT-small",
-                            "vision_model_name": "openai/clip-vit-base-patch32",
-                            "use_4bit": False,
-                            "projector_hidden_dim": 512,
-                        },
-                        "special_tokens": {"image_token": "<image>"},
-                        "training": {"projector_lr": 1e-3, "lora_lr": 1e-4}
-                    }
-                    model = MultimodalGemmaLightning(dummy_config)
-                    # Load only compatible weights
-                    checkpoint_state = checkpoint['state_dict']
-                    model_state = model.state_dict()
-                    # Filter and load compatible weights
-                    compatible_weights = {}
-                    for key, value in checkpoint_state.items():
-                        if key in model_state and model_state[key].shape == value.shape:
-                            compatible_weights[key] = value
-                    model.load_state_dict(compatible_weights, strict=False)
-                    print(f"✅ Loaded {len(compatible_weights)} compatible weights")
-                except Exception as e3:
-                    print(f"All loading methods failed: {e3}")
-                    return f"❌ Model loading failed - checkpoint incompatible. Last error: {str(e3)}"
         model.eval()
         # Move to appropriate device

 import io
 import time
 import logging
+import os
+from huggingface_hub import hf_hub_download, login
+# Try to log in with an HF token if one is available (e.g. from Spaces secrets)
+try:
+    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
+    if hf_token:
+        login(token=hf_token)
+        print("✅ Logged in to Hugging Face")
+    else:
+        print("⚠️ No HF token found - will try to load anyway")
+except Exception as e:
+    print(f"⚠️ HF login failed: {e}")
 # Model imports
 from src.models import MultimodalGemmaLightning
             cache_dir="./model_cache"
         )
+        # Download config files (same as local setup)
+        model_config_path = hf_hub_download(
+            repo_id="sagar007/multimodal-gemma-270m-llava",
+            filename="configs/model_config.yaml",
+            cache_dir="./model_cache"
+        )
+        training_config_path = hf_hub_download(
+            repo_id="sagar007/multimodal-gemma-270m-llava",
+            filename="configs/training_config.yaml",
+            cache_dir="./model_cache"
+        )
+        data_config_path = hf_hub_download(
+            repo_id="sagar007/multimodal-gemma-270m-llava",
+            filename="configs/data_config.yaml",
+            cache_dir="./model_cache"
+        )
+        # Load configs exactly like local gradio_app.py
+        print("📁 Loading configs...")
+        model_config = load_config(model_config_path)
+        training_config = load_config(training_config_path)
+        data_config = load_config(data_config_path)
+        config = merge_configs([model_config, training_config, data_config])
+        print("📁 Loading model from checkpoint...")
+        # Load model exactly like local gradio_app.py
+        model = MultimodalGemmaLightning.load_from_checkpoint(
+            checkpoint_path,
+            config=config,
+            strict=False,
+            map_location="cuda" if torch.cuda.is_available() else "cpu"
+        )
         model.eval()
         # Move to appropriate device

configs/config.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+# Main Hydra Configuration
+defaults:
+  - model_config
+  - training_config
+  - data_config
+# Override settings
+hydra:
+  run:
+    dir: ./logs/hydra/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  job:
+    name: multimodal_gemma_training

configs/data_config.yaml ADDED Viewed

	@@ -0,0 +1,66 @@

+# Data Configuration
+data:
+  # Dataset settings - using a more accessible multimodal dataset
+  dataset_name: "liuhaotian/LLaVA-Instruct-150K"
+  cache_dir: "./data/cache"
+  num_workers: 8  # Increased for faster loading
+  pin_memory: true
+  persistent_workers: true
+  # Data splits
+  train_split: "train"
+  val_split: "train"  # LLaVA doesn't have a separate val split
+  val_size: 0.02      # Use 2% of train data for validation
+  # Text processing - optimized for speed
+  max_length: 256     # Reduced from 512 for faster training
+  truncation: true
+  padding: true
+  # Speed optimizations
+  filter_long_conversations: true
+  max_conversation_turns: 6    # Limit to 6 turns (3 human + 3 assistant)
+  use_subset: false           # Set to true for quick testing
+  subset_size: 10000          # Use only 10K samples for testing
+  # Image processing
+  image_size: 224
+  image_mean: [0.48145466, 0.4578275, 0.40821073]  # CLIP normalization
+  image_std: [0.26862954, 0.26130258, 0.27577711]
+  # Data augmentation (for images)
+  augmentation:
+    enabled: false  # Start without augmentation
+    random_resized_crop: 0.9
+    color_jitter: 0.1
+    horizontal_flip: 0.5
+  # Conversation formatting
+  conversation:
+    system_message: ""
+    user_prefix: "Human: "
+    assistant_prefix: "Assistant: "
+    turn_separator: "\n"
+  # Data filtering - enhanced for speed
+  filtering:
+    min_length: 10      # Minimum text length
+    max_length: 800     # Reduced from 1000 for faster training
+    filter_empty_images: true
+    filter_corrupt_images: true
+    filter_long_conversations: true
+    max_tokens_per_sample: 256  # Skip samples that would exceed max_length
+    min_image_questions: 1      # Skip samples without image-related questions
+  # Preprocessing
+  preprocessing:
+    cache_processed_data: true
+    precompute_image_features: false  # Set to true to cache CLIP features
+# COCO Images
+coco:
+  base_url: "http://images.cocodataset.org/train2017/"
+  download_timeout: 30
+  retry_attempts: 3
+  fallback_image_size: [224, 224]
+  fallback_image_color: "white"

configs/gemma_270m_a100.yaml ADDED Viewed

	@@ -0,0 +1,147 @@

+# Optimized configuration for Gemma-270M on A100 GPU
+# This configuration maximizes the potential of the smaller 270M model
+# Model Configuration
+model:
+  gemma_model_name: "google/gemma-3-270m"  # 270M parameter model
+  vision_model_name: "openai/clip-vit-large-patch14"
+  audio_model_name: "openai/whisper-small"
+  enable_audio: false
+  projector_hidden_dim: 1024  # Optimized for 270M
+  audio_hidden_dim: 512
+  # LoRA configuration - can be more aggressive with smaller model
+  lora:
+    r: 32          # Higher rank for 270M model
+    alpha: 64      # Higher alpha for better learning
+    dropout: 0.1
+    target_modules:
+      - "q_proj"
+      - "v_proj"
+      - "k_proj"
+      - "o_proj"
+      - "gate_proj"
+      - "up_proj"
+      - "down_proj"
+  # Quantization (optional for 270M - could train in full precision)
+  use_4bit: false      # 270M is small enough for full precision
+  bnb_4bit_compute_dtype: "bfloat16"
+  bnb_4bit_quant_type: "nf4"
+  use_nested_quant: false
+# Training Configuration - Optimized for A100 + 270M
+training:
+  max_epochs: 5        # More epochs for smaller model
+  batch_size: 16       # Large batch size for 270M on A100
+  accumulate_grad_batches: 2  # Effective batch size = 16 * 2 = 32
+  gradient_clip_val: 1.0
+  # Learning rates - can be higher for smaller model
+  lora_lr: 5e-4        # Higher learning rate
+  projector_lr: 2e-3   # Higher learning rate
+  weight_decay: 0.01
+  warmup_ratio: 0.05   # More warmup
+  # Validation
+  val_check_interval: 0.25  # Check more frequently
+  limit_val_batches: 50
+  # Checkpointing
+  save_top_k: 5
+  monitor: "val/loss"
+  mode: "min"
+  # Precision
+  precision: "bf16-mixed"  # A100 optimized
+  strategy: "auto"
+  # Early stopping
+  patience: 3
+  min_delta: 0.0005
+# Data Configuration
+data:
+  dataset_name: "liuhaotian/LLaVA-Instruct-150K"
+  cache_dir: "./data/cache"
+  num_workers: 8       # More workers for A100
+  pin_memory: true
+  persistent_workers: true
+  train_split: "train"
+  val_split: "train"
+  val_size: 0.02
+  max_length: 512
+  truncation: true
+  padding: true
+  image_size: 224
+  image_mean: [0.48145466, 0.4578275, 0.40821073]
+  image_std: [0.26862954, 0.26130258, 0.27577711]
+  # No augmentation for initial training
+  augmentation:
+    enabled: false
+  conversation:
+    system_message: ""
+    user_prefix: "Human: "
+    assistant_prefix: "Assistant: "
+    turn_separator: "\n"
+  filtering:
+    min_length: 10
+    max_length: 1000
+    filter_empty_images: true
+    filter_corrupt_images: true
+  preprocessing:
+    cache_processed_data: true
+    precompute_image_features: false
+# Trainer settings
+trainer:
+  accelerator: "gpu"
+  devices: 1
+  num_nodes: 1
+  log_every_n_steps: 25
+  enable_checkpointing: true
+  enable_progress_bar: true
+  enable_model_summary: true
+  fast_dev_run: false
+  overfit_batches: 0
+  detect_anomaly: false
+  deterministic: false
+  benchmark: true
+# Optimization
+optimization:
+  compile_model: true   # Enable for PyTorch 2.0+ speedup
+  use_fused_adamw: true
+# Logging
+logging:
+  use_wandb: true
+  wandb_project: "multimodal-gemma-270m"
+  wandb_name: "gemma-270m-llava-a100-optimized"
+  log_model: true
+  use_tensorboard: true
+  tb_log_dir: "logs/tensorboard"
+# Special tokens
+special_tokens:
+  image_token: "<image>"
+  audio_token: "<audio>"
+  pad_token: "<pad>"
+# Tokenizer settings
+tokenizer:
+  padding_side: "right"
+  truncation: true
+  max_length: 512
+  add_special_tokens: true

configs/model_config.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+# Model Configuration
+model:
+  # Base models
+  gemma_model_name: "google/gemma-3-270m"  # 270M parameter model - fast training on A100
+  vision_model_name: "openai/clip-vit-large-patch14"
+  # Model settings - vision-language only
+  projector_hidden_dim: 2048  # Larger projection for better alignment
+  # LoRA configuration - optimized for multimodal
+  lora:
+    r: 64          # Higher rank for better multimodal understanding
+    alpha: 128     # Higher alpha for better learning
+    dropout: 0.1   # Slightly higher dropout for regularization
+    target_modules:
+      - "q_proj"
+      - "v_proj"
+      - "k_proj"
+      - "o_proj"
+      - "gate_proj"
+      - "up_proj"
+      - "down_proj"
+  # Quantization
+  use_4bit: true
+  bnb_4bit_compute_dtype: "bfloat16"
+  bnb_4bit_quant_type: "nf4"
+  use_nested_quant: false
+# Tokenizer settings
+tokenizer:
+  padding_side: "right"
+  truncation: true
+  max_length: 512
+  add_special_tokens: true
+# Special tokens
+special_tokens:
+  image_token: "<image>"
+  pad_token: "<pad>"

configs/multimodal_optimized.yaml ADDED Viewed

	@@ -0,0 +1,149 @@

+# Optimized configuration for Gemma-270M multimodal training
+# This addresses previous inference quality issues
+# Model Configuration - Optimized for Multimodal
+model:
+  gemma_model_name: "google/gemma-3-270m"
+  vision_model_name: "openai/clip-vit-large-patch14"
+  audio_model_name: "openai/whisper-small"
+  enable_audio: false
+  projector_hidden_dim: 2048  # Larger for better vision-language alignment
+  audio_hidden_dim: 512
+  # LoRA configuration - Higher capacity for multimodal
+  lora:
+    r: 128         # Much higher rank for complex multimodal relationships
+    alpha: 256     # Higher alpha for better adaptation
+    dropout: 0.1   # Regularization for better generalization
+    target_modules:
+      - "q_proj"
+      - "v_proj"
+      - "k_proj"
+      - "o_proj"
+      - "gate_proj"
+      - "up_proj"
+      - "down_proj"
+  # Keep 4-bit quantization for memory efficiency
+  use_4bit: true
+  bnb_4bit_compute_dtype: "bfloat16"
+  bnb_4bit_quant_type: "nf4"
+  use_nested_quant: false
+# Training Configuration - Optimized for Multimodal Quality
+training:
+  max_epochs: 15        # More epochs for better convergence
+  batch_size: 6         # Slightly smaller for stability
+  accumulate_grad_batches: 8  # Effective batch size = 6 * 8 = 48
+  gradient_clip_val: 1.0
+  # Better learning rate balance
+  lora_lr: 1e-3         # Higher for language adaptation
+  projector_lr: 5e-3    # Much higher for vision-language alignment
+  weight_decay: 0.01
+  warmup_ratio: 0.15    # More warmup for stable training
+  # Validation
+  val_check_interval: 0.33  # Check validation more frequently
+  limit_val_batches: 50
+  # Checkpointing
+  save_top_k: 5
+  monitor: "val/loss"
+  mode: "min"
+  # Precision and optimization
+  precision: "bf16-mixed"
+  strategy: "auto"
+  # Early stopping
+  patience: 5
+  min_delta: 0.0001
+# Data Configuration - Focus on quality
+data:
+  dataset_name: "liuhaotian/LLaVA-Instruct-150K"
+  cache_dir: "./data/cache"
+  num_workers: 6         # More workers for better data loading
+  pin_memory: true
+  persistent_workers: true
+  train_split: "train"
+  val_split: "train"
+  val_size: 0.05        # Larger validation set
+  max_length: 512
+  truncation: true
+  padding: true
+  image_size: 224
+  image_mean: [0.48145466, 0.4578275, 0.40821073]
+  image_std: [0.26862954, 0.26130258, 0.27577711]
+  augmentation:
+    enabled: true       # Enable augmentation for better generalization
+    random_resized_crop: 0.9
+    color_jitter: 0.2
+    horizontal_flip: 0.3
+  conversation:
+    system_message: ""
+    user_prefix: "Human: "
+    assistant_prefix: "Assistant: "
+    turn_separator: "\n"
+  filtering:
+    min_length: 20      # Filter very short conversations
+    max_length: 800     # Allow longer conversations
+    filter_empty_images: true
+    filter_corrupt_images: true
+  preprocessing:
+    cache_processed_data: true
+    precompute_image_features: false
+# Trainer settings
+trainer:
+  accelerator: "gpu"
+  devices: 1
+  num_nodes: 1
+  log_every_n_steps: 10
+  enable_checkpointing: true
+  enable_progress_bar: true
+  enable_model_summary: true
+  fast_dev_run: false
+  overfit_batches: 0
+  detect_anomaly: false
+  deterministic: false
+  benchmark: true
+# Optimization
+optimization:
+  compile_model: false
+  use_fused_adamw: true
+# Logging - Enable for monitoring
+logging:
+  use_wandb: true
+  wandb_project: "gemma-270m-multimodal-optimized"
+  wandb_name: "gemma-270m-llava-quality-training"
+  log_model: true
+  use_tensorboard: true
+  tb_log_dir: "logs/tensorboard"
+# Special tokens
+special_tokens:
+  image_token: "<image>"
+  audio_token: "<audio>"
+  pad_token: "<pad>"
+# Tokenizer settings
+tokenizer:
+  padding_side: "right"
+  truncation: true
+  max_length: 512
+  add_special_tokens: true

configs/training_config.yaml ADDED Viewed

	@@ -0,0 +1,65 @@

+# Training Configuration
+training:
+  # Training hyperparameters - optimized for speed
+  max_epochs: 8         # Reduced epochs (shorter sequences = faster convergence)
+  batch_size: 16        # Increased batch size (shorter sequences leave more GPU memory free)
+  accumulate_grad_batches: 2  # Effective batch size = 16 * 2 = 32
+  gradient_clip_val: 1.0
+  # Learning rates - better balance for multimodal
+  lora_lr: 5e-4         # Higher for better adaptation
+  projector_lr: 2e-3    # Higher for vision-language alignment
+  weight_decay: 0.01
+  warmup_ratio: 0.1     # More warmup for stability
+  # Validation
+  val_check_interval: 0.5  # Check validation every half epoch
+  limit_val_batches: 100   # Limit validation batches for speed
+  # Checkpointing
+  save_top_k: 3
+  monitor: "val/loss"
+  mode: "min"
+  # Precision and optimization
+  precision: "bf16-mixed"  # Use mixed precision for A100
+  strategy: "auto"  # Let Lightning choose the best strategy
+  # Early stopping
+  patience: 2
+  min_delta: 0.001
+# Lightning Trainer settings
+trainer:
+  accelerator: "gpu"
+  devices: 1  # Single GPU training
+  num_nodes: 1
+  log_every_n_steps: 10
+  enable_checkpointing: true
+  enable_progress_bar: true
+  enable_model_summary: true
+  # Debugging and profiling
+  fast_dev_run: false
+  overfit_batches: 0
+  detect_anomaly: false
+  # Reproducibility
+  deterministic: false  # Set to true for reproducible results (slower)
+  benchmark: true       # Enable cuDNN autotuning; helps most when input sizes stay constant
+# Optimization settings
+optimization:
+  compile_model: false  # Set to true for PyTorch 2.0+ compilation
+  use_fused_adamw: true # Use fused AdamW for better performance
+# Logging and monitoring
+logging:
+  use_wandb: false  # Disabled for now - requires a W&B API key
+  wandb_project: "multimodal-gemma"
+  wandb_name: "gemma-270m-llava-training"
+  log_model: false
+  # TensorBoard
+  use_tensorboard: true  # Use TensorBoard instead
+  tb_log_dir: "logs/tensorboard"