Spaces:

sagar007
/

multimodal-gemma-270m-demo

Runtime error

App Files Files Community

sagar007 commited on Sep 20

Commit

34b253d

verified ·

1 Parent(s): 085f1c9

Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

src/data/__init__.py +11 -0
src/data/__pycache__/__init__.cpython-311.pyc +0 -0
src/data/__pycache__/datamodule.cpython-311.pyc +0 -0
src/data/__pycache__/dataset.cpython-311.pyc +0 -0
src/data/__pycache__/processors.cpython-311.pyc +0 -0
src/data/datamodule.py +179 -0
src/data/dataset.py +543 -0
src/data/processors.py +181 -0
src/training/__init__.py +7 -0
src/training/callbacks.py +74 -0
src/training/utils.py +115 -0
src/utils/logging.py +73 -0

src/data/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from .dataset import LLaVADataset, MultimodalCollator
+from .datamodule import LLaVADataModule
+from .processors import ImageProcessor, TextProcessor
+__all__ = [
+    "LLaVADataset",
+    "MultimodalCollator",
+    "LLaVADataModule",
+    "ImageProcessor",
+    "TextProcessor"
+]

src/data/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (448 Bytes). View file

src/data/__pycache__/datamodule.cpython-311.pyc ADDED Viewed

Binary file (8.05 kB). View file

src/data/__pycache__/dataset.cpython-311.pyc ADDED Viewed

Binary file (25.2 kB). View file

src/data/__pycache__/processors.cpython-311.pyc ADDED Viewed

Binary file (8.77 kB). View file

src/data/datamodule.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""
+PyTorch Lightning DataModule for LLaVA dataset
+"""
+import lightning as L
+import torch
+from torch.utils.data import DataLoader, random_split
+from typing import Optional, Dict, Any
+import logging
+from .dataset import LLaVADataset, MultimodalCollator
+logger = logging.getLogger(__name__)
+class LLaVADataModule(L.LightningDataModule):
+    """Lightning DataModule for LLaVA dataset"""
+    def __init__(
+        self,
+        tokenizer,
+        vision_processor,
+        config: Dict[str, Any]
+    ):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.vision_processor = vision_processor
+        self.config = config
+        # Data configuration
+        data_config = config["data"]
+        self.batch_size = config["training"]["batch_size"]
+        self.num_workers = data_config.get("num_workers", 4)
+        self.pin_memory = data_config.get("pin_memory", True)
+        self.persistent_workers = data_config.get("persistent_workers", True)
+        # Dataset splits
+        self.train_split = data_config.get("train_split", "train")
+        self.val_split = data_config.get("val_split", "train")  # LLaVA doesn't have separate val
+        self.val_size = data_config.get("val_size", 0.02)
+        # Initialize datasets to None
+        self.train_dataset = None
+        self.val_dataset = None
+        # Create collator
+        self.collator = MultimodalCollator(
+            tokenizer=self.tokenizer,
+            vision_processor=self.vision_processor,
+            config=self.config
+        )
+        logger.info("LLaVADataModule initialized")
+    def prepare_data(self) -> None:
+        """Download and prepare data (called only on rank 0)"""
+        # This will download the dataset if not already cached
+        try:
+            LLaVADataset(
+                config=self.config,
+                split=self.train_split
+            )
+            logger.info("Dataset preparation completed")
+        except Exception as e:
+            logger.error(f"Failed to prepare dataset: {e}")
+            raise
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Setup datasets for training/validation/testing"""
+        if stage == "fit" or stage is None:
+            # Load full training dataset
+            full_dataset = LLaVADataset(
+                config=self.config,
+                split=self.train_split
+            )
+            # Split into train and validation
+            total_size = len(full_dataset)
+            val_size = int(total_size * self.val_size)
+            train_size = total_size - val_size
+            self.train_dataset, self.val_dataset = random_split(
+                full_dataset,
+                [train_size, val_size],
+                generator=torch.Generator().manual_seed(42)  # For reproducibility
+            )
+            logger.info(f"Dataset split: {train_size} train, {val_size} validation")
+        if stage == "test":
+            # For testing, we'll use a small subset of the training data
+            self.test_dataset = LLaVADataset(
+                config=self.config,
+                split=self.train_split
+            )
+        if stage == "predict":
+            # For prediction, setup can be done dynamically
+            pass
+    def train_dataloader(self) -> DataLoader:
+        """Create training dataloader"""
+        if self.train_dataset is None:
+            raise RuntimeError("Train dataset not initialized. Call setup() first.")
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            persistent_workers=self.persistent_workers and self.num_workers > 0,
+            collate_fn=self.collator,
+            drop_last=True  # Drop incomplete batches for consistent training
+        )
+    def val_dataloader(self) -> DataLoader:
+        """Create validation dataloader"""
+        if self.val_dataset is None:
+            raise RuntimeError("Validation dataset not initialized. Call setup() first.")
+        return DataLoader(
+            self.val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            persistent_workers=self.persistent_workers and self.num_workers > 0,
+            collate_fn=self.collator,
+            drop_last=False
+        )
+    def test_dataloader(self) -> DataLoader:
+        """Create test dataloader"""
+        if not hasattr(self, 'test_dataset') or self.test_dataset is None:
+            raise RuntimeError("Test dataset not initialized. Call setup() first.")
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            collate_fn=self.collator,
+            drop_last=False
+        )
+    def predict_dataloader(self) -> DataLoader:
+        """Create prediction dataloader"""
+        # This can be implemented based on specific prediction needs
+        return self.val_dataloader()
+    def teardown(self, stage: Optional[str] = None) -> None:
+        """Clean up after training/testing"""
+        # Log dataset statistics if available
+        if hasattr(self, 'train_dataset') and self.train_dataset is not None:
+            if hasattr(self.train_dataset.dataset, 'get_stats'):
+                stats = self.train_dataset.dataset.get_stats()
+                logger.info(f"Training dataset stats: {stats}")
+        if hasattr(self, 'val_dataset') and self.val_dataset is not None:
+            if hasattr(self.val_dataset.dataset, 'get_stats'):
+                stats = self.val_dataset.dataset.get_stats()
+                logger.info(f"Validation dataset stats: {stats}")
+    def get_dataset_info(self) -> Dict[str, Any]:
+        """Get information about the loaded datasets"""
+        info = {}
+        if self.train_dataset is not None:
+            info["train_size"] = len(self.train_dataset)
+        if self.val_dataset is not None:
+            info["val_size"] = len(self.val_dataset)
+        info["batch_size"] = self.batch_size
+        info["num_workers"] = self.num_workers
+        return info

src/data/dataset.py ADDED Viewed

	@@ -0,0 +1,543 @@

+"""
+Dataset implementation for LLaVA multimodal training
+"""
+import torch
+from torch.utils.data import Dataset
+from datasets import load_dataset
+import requests
+from PIL import Image
+import io
+from typing import Dict, Any, List, Optional, Union
+import logging
+import time
+from pathlib import Path
+from .processors import ImageProcessor, TextProcessor
+logger = logging.getLogger(__name__)
+class LLaVADataset(Dataset):
+    """LLaVA dataset for multimodal training"""
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        split: str = "train",
+        transform: Optional[Any] = None
+    ):
+        self.config = config
+        self.split = split
+        self.transform = transform
+        # Initialize processors
+        self.image_processor = ImageProcessor(config)
+        self.text_processor = TextProcessor(config)
+        # Dataset configuration
+        data_config = config["data"]
+        self.cache_dir = data_config.get("cache_dir", "./data/cache")
+        self.image_size = data_config["image_size"]
+        # COCO configuration
+        coco_config = config.get("coco", {})
+        self.coco_base_url = coco_config.get("base_url", "http://images.cocodataset.org/train2017/")
+        self.download_timeout = coco_config.get("download_timeout", 30)
+        self.retry_attempts = coco_config.get("retry_attempts", 3)
+        self.fallback_size = tuple(coco_config.get("fallback_image_size", [224, 224]))
+        self.fallback_color = coco_config.get("fallback_image_color", "white")
+        # Load dataset
+        self._load_dataset()
+        # Apply filtering optimizations
+        if config["data"].get("filter_long_conversations", True):
+            self._filter_dataset()
+        # Statistics
+        self.successful_images = 0
+        self.failed_images = 0
+        logger.info(f"Initialized LLaVADataset with {len(self.dataset)} samples for split '{split}'")
+    def _load_dataset(self):
+        """Load the LLaVA dataset from HuggingFace"""
+        dataset_name = self.config["data"]["dataset_name"]
+        # Create cache directory
+        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+        # Try different loading approaches
+        loading_strategies = [
+            # Strategy 1: Simple loading without problematic parameters
+            lambda: load_dataset(
+                dataset_name,
+                split=self.split,
+                cache_dir=self.cache_dir
+            ),
+            # Strategy 2: With streaming disabled
+            lambda: load_dataset(
+                dataset_name,
+                split=self.split,
+                cache_dir=self.cache_dir,
+                streaming=False
+            ),
+            # Strategy 3: Different data format approach
+            lambda: self._load_alternative_format(dataset_name),
+            # Strategy 4: Load from local files if available
+            lambda: self._load_local_dataset(dataset_name)
+        ]
+        for i, strategy in enumerate(loading_strategies):
+            try:
+                logger.info(f"Trying dataset loading strategy {i+1}...")
+                self.dataset = strategy()
+                # Validate dataset
+                if len(self.dataset) == 0:
+                    raise ValueError("Dataset is empty")
+                logger.info(f"Successfully loaded {len(self.dataset)} examples from {dataset_name}")
+                return
+            except Exception as e:
+                logger.warning(f"Strategy {i+1} failed: {e}")
+                # Continue to next strategy
+        # If all strategies fail, create a larger dummy dataset for development
+        logger.warning("All loading strategies failed, creating larger dummy dataset...")
+        self.dataset = self._create_development_dataset()
+    def _load_alternative_format(self, dataset_name):
+        """Try alternative loading format for LLaVA dataset"""
+        try:
+            # Try loading with explicit JSON format
+            from datasets import load_dataset, DownloadConfig
+            download_config = DownloadConfig(
+                resume_download=True,
+                force_download=False,
+                use_etag=False
+            )
+            return load_dataset(
+                "json",
+                data_files={
+                    "train": "hf://datasets/liuhaotian/LLaVA-Instruct-150K/llava_instruct_150k.json"
+                },
+                split=self.split,
+                cache_dir=self.cache_dir,
+                download_config=download_config
+            )
+        except Exception as e:
+            logger.warning(f"Alternative format loading failed: {e}")
+            raise
+    def _load_local_dataset(self, dataset_name):
+        """Try to load dataset from local files or alternative sources"""
+        try:
+            # Try loading with minimal parameters
+            return load_dataset(
+                dataset_name,
+                split=self.split,
+                cache_dir=self.cache_dir
+            )
+        except Exception:
+            # If local loading fails, create dummy data
+            logger.warning("Local loading failed, using dummy dataset")
+            return self._create_dummy_dataset()
+    def _create_dummy_dataset(self):
+        """Create a small dummy dataset for testing"""
+        from datasets import Dataset
+        dummy_data = []
+        for i in range(100):  # Small dataset for testing
+            # Use realistic COCO-style filenames that will trigger fallback
+            coco_filename = f"{str(i).zfill(12)}.jpg"
+            dummy_data.append({
+                "id": str(i),
+                "image": coco_filename,
+                "conversations": [
+                    {
+                        "from": "human",
+                        "value": f"What do you see in image {i}?"
+                    },
+                    {
+                        "from": "gpt",
+                        "value": f"I can see an image numbered {i}."
+                    }
+                ]
+            })
+        return Dataset.from_list(dummy_data)
+    def _create_development_dataset(self):
+        """Create a larger dummy dataset for development/testing"""
+        from datasets import Dataset
+        import random
+        # Create more realistic sample data for development
+        dummy_data = []
+        # Common visual questions and responses
+        questions = [
+            "What do you see in this image?",
+            "Describe the main objects in the picture.",
+            "What is the person doing?",
+            "What colors are prominent in this image?",
+            "Can you identify any animals in the picture?",
+            "What's the setting or location of this image?",
+            "Are there any vehicles visible?",
+            "What's the weather like in the image?",
+            "How many people are in the picture?",
+            "What objects are on the table?",
+        ]
+        responses = [
+            "I can see a person standing in a park with trees in the background.",
+            "The image shows a cat sitting on a windowsill, looking outside.",
+            "There's a red car parked on a street with buildings nearby.",
+            "I notice several people walking on a busy sidewalk.",
+            "The picture contains a bowl of fruit on a wooden table.",
+            "I can see a dog playing in a grassy field.",
+            "The image shows a bicycle leaning against a wall.",
+            "There's a group of children playing in a playground.",
+            "I can see mountains in the distance with a clear blue sky.",
+            "The picture shows a kitchen with modern appliances.",
+        ]
+        # Generate realistic sample size for development
+        num_samples = self.config["data"].get("subset_size", 10000) if self.config["data"].get("use_subset", False) else 50000
+        for i in range(num_samples):
+            # Use realistic COCO-style filenames
+            coco_filename = f"{str(i % 1000).zfill(12)}.jpg"
+            question = random.choice(questions)
+            response = random.choice(responses)
+            dummy_data.append({
+                "id": str(i),
+                "image": coco_filename,
+                "conversations": [
+                    {
+                        "from": "human",
+                        "value": question
+                    },
+                    {
+                        "from": "gpt",
+                        "value": response
+                    }
+                ]
+            })
+        logger.info(f"Created development dataset with {len(dummy_data)} samples")
+        return Dataset.from_list(dummy_data)
+    def _filter_dataset(self):
+        """Filter dataset for faster training"""
+        logger.info("Applying speed optimization filters...")
+        filtering_config = self.config["data"]["filtering"]
+        data_config = self.config["data"]
+        original_size = len(self.dataset)
+        filtered_indices = []
+        # Use subset for testing if enabled
+        if data_config.get("use_subset", False):
+            subset_size = data_config.get("subset_size", 10000)
+            indices = list(range(min(subset_size, original_size)))
+            logger.info(f"Using subset of {len(indices)} samples for testing")
+        else:
+            indices = list(range(original_size))
+        max_turns = data_config.get("max_conversation_turns", 6)
+        max_tokens = filtering_config.get("max_tokens_per_sample", 256)
+        max_length = filtering_config.get("max_length", 800)
+        for idx in indices:
+            try:
+                item = self.dataset[idx]
+                conversations = item.get("conversations", [])
+                # Filter by conversation length
+                if len(conversations) > max_turns:
+                    continue
+                # Estimate token count (rough approximation: 1 token ≈ 4 chars)
+                total_text = ""
+                for conv in conversations:
+                    total_text += conv.get("value", "")
+                estimated_tokens = len(total_text) // 4
+                if estimated_tokens > max_tokens:
+                    continue
+                # Check if it's image-related (has visual keywords)
+                has_visual_content = any(
+                    keyword in total_text.lower()
+                    for keyword in ["see", "image", "picture", "photo", "visual", "look", "show", "appear", "visible"]
+                )
+                if filtering_config.get("min_image_questions", 1) > 0 and not has_visual_content:
+                    continue
+                # Check final text length
+                if len(total_text) > max_length:
+                    continue
+                filtered_indices.append(idx)
+            except Exception as e:
+                logger.debug(f"Error filtering item {idx}: {e}")
+                continue
+        # Apply filtering
+        if filtered_indices:
+            self.dataset = self.dataset.select(filtered_indices)
+        filtered_size = len(self.dataset)
+        reduction_pct = (1 - filtered_size / original_size) * 100
+        logger.info(f"Dataset filtered: {original_size:,} → {filtered_size:,} samples")
+        logger.info(f"Reduction: {reduction_pct:.1f}% (faster training!)")
+        return self.dataset
+    def __len__(self) -> int:
+        return len(self.dataset)
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+        """Get a single sample from the dataset with improved error handling"""
+        try:
+            item = self.dataset[idx]
+            # Load and process image
+            image = self._load_image(item.get("image", ""))
+            # Process conversation text with robust handling
+            conversations = item.get("conversations", [])
+            if not conversations or not isinstance(conversations, list):
+                # Fallback if no valid conversations
+                conversations = [
+                    {"from": "human", "value": "What do you see in this image?"},
+                    {"from": "gpt", "value": "I can see an image that contains various visual elements."}
+                ]
+            formatted_text = self.text_processor.format_conversation(conversations)
+            # Add image token if image is present
+            formatted_text = self.text_processor.add_image_token(formatted_text, image is not None)
+            # More lenient validation - only reject if truly problematic
+            if not self.text_processor.validate_text(formatted_text):
+                # Create a better fallback based on original conversations
+                try:
+                    # Try to extract any usable content
+                    fallback_content = "What do you see in this image?"
+                    if conversations and len(conversations) > 0:
+                        first_conv = conversations[0]
+                        if isinstance(first_conv, dict) and "value" in first_conv:
+                            user_text = str(first_conv["value"]).strip()
+                            if user_text and len(user_text) > 5:
+                                fallback_content = user_text
+                    formatted_text = f"<image>\nHuman: {fallback_content}\nAssistant: I can see an image."
+                except Exception:
+                    formatted_text = "<image>\nHuman: What do you see?\nAssistant: I see an image."
+            return {
+                "image": image,
+                "text": formatted_text,
+                "conversations": conversations,
+                "id": item.get("id", f"sample_{idx}"),
+                "image_filename": item.get("image", ""),
+                "has_image": image is not None
+            }
+        except Exception as e:
+            logger.debug(f"Error processing item {idx}: {e}")
+            # Return a fallback sample (reduce logging level to debug)
+            return self._get_fallback_sample(idx)
+    def _load_image(self, image_filename: str) -> Optional[Image.Image]:
+        """Load image from COCO dataset with retry logic"""
+        if not image_filename or not image_filename.strip():
+            return None
+        # Check if it's a dummy image (contains "dummy_")
+        if "dummy_" in image_filename:
+            logger.debug(f"Using placeholder image for {image_filename}")
+            return self._create_fallback_image()
+        # For actual dummy filenames from our generated dataset (short numbers), use placeholder
+        filename_without_ext = image_filename.replace('.jpg', '').replace('.png', '')
+        if image_filename and filename_without_ext.isdigit() and len(filename_without_ext) <= 6:
+            logger.debug(f"Using placeholder image for dummy filename: {image_filename}")
+            return self._create_fallback_image()
+        # Check cache first
+        cache_path = Path(self.cache_dir) / "images" / image_filename
+        if cache_path.exists():
+            try:
+                image = Image.open(cache_path).convert('RGB')
+                self.successful_images += 1
+                return image
+            except Exception:
+                cache_path.unlink(missing_ok=True)  # Remove corrupted cache
+        image_url = f"{self.coco_base_url}{image_filename}"
+        for attempt in range(self.retry_attempts):
+            try:
+                response = requests.get(
+                    image_url,
+                    timeout=self.download_timeout,
+                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+                )
+                response.raise_for_status()
+                # Load and validate image
+                image = Image.open(io.BytesIO(response.content)).convert('RGB')
+                # Basic validation
+                if image.size[0] < 10 or image.size[1] < 10:
+                    raise ValueError("Image too small")
+                # Cache the image
+                cache_path.parent.mkdir(parents=True, exist_ok=True)
+                image.save(cache_path, "JPEG", quality=85)
+                logger.debug(f"Cached image: {cache_path}")
+                self.successful_images += 1
+                return image
+            except Exception as e:
+                if attempt == self.retry_attempts - 1:
+                    logger.debug(f"Failed to load image {image_filename} after {self.retry_attempts} attempts: {e}")
+                    self.failed_images += 1
+                    return self._create_fallback_image()
+                else:
+                    time.sleep(0.5)  # Brief pause before retry
+        return self._create_fallback_image()
+    def _create_fallback_image(self) -> Image.Image:
+        """Create a fallback image when loading fails"""
+        return Image.new('RGB', self.fallback_size, color=self.fallback_color)
+    def _get_fallback_sample(self, idx: int) -> Dict[str, Any]:
+        """Get a fallback sample when processing fails"""
+        fallback_image = self._create_fallback_image()
+        fallback_text = "Human: What do you see in this image?\nAssistant: I can see a simple image."
+        return {
+            "image": fallback_image,
+            "text": fallback_text,
+            "conversations": [
+                {"from": "human", "value": "What do you see in this image?"},
+                {"from": "gpt", "value": "I can see a simple image."}
+            ],
+            "id": f"fallback_{idx}",
+            "image_filename": "",
+            "has_image": True
+        }
+    def get_stats(self) -> Dict[str, int]:
+        """Get dataset statistics"""
+        return {
+            "total_samples": len(self),
+            "successful_images": self.successful_images,
+            "failed_images": self.failed_images,
+            "success_rate": self.successful_images / (self.successful_images + self.failed_images) * 100
+                           if (self.successful_images + self.failed_images) > 0 else 0
+        }
+class MultimodalCollator:
+    """Custom collator for multimodal data batching"""
+    def __init__(
+        self,
+        tokenizer,
+        vision_processor,
+        config: Dict[str, Any],
+        max_length: Optional[int] = None
+    ):
+        self.tokenizer = tokenizer
+        self.vision_processor = vision_processor
+        self.config = config
+        self.max_length = max_length or config["data"]["max_length"]
+        # Image token for processing
+        self.image_token = config.get("special_tokens", {}).get("image_token", "<image>")
+    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        """Collate a batch of samples"""
+        images = []
+        texts = []
+        has_images = []
+        for sample in batch:
+            # Collect images
+            if sample["image"] is not None:
+                images.append(sample["image"])
+                has_images.append(True)
+            else:
+                # Create placeholder image for samples without images
+                placeholder = Image.new('RGB', (224, 224), color='white')
+                images.append(placeholder)
+                has_images.append(False)
+            # Collect texts
+            texts.append(sample["text"])
+        # Process images using vision processor
+        try:
+            vision_inputs = self.vision_processor(
+                images=images,
+                return_tensors="pt"
+            )
+            pixel_values = vision_inputs["pixel_values"]
+        except Exception as e:
+            logger.error(f"Error processing images: {e}")
+            # Create dummy pixel values
+            pixel_values = torch.zeros(len(batch), 3, 224, 224)
+        # Tokenize texts
+        try:
+            text_inputs = self.tokenizer(
+                texts,
+                padding=True,
+                truncation=True,
+                max_length=self.max_length,
+                return_tensors="pt"
+            )
+        except Exception as e:
+            logger.error(f"Error tokenizing texts: {e}")
+            # Create dummy inputs
+            text_inputs = {
+                "input_ids": torch.zeros(len(batch), self.max_length, dtype=torch.long),
+                "attention_mask": torch.ones(len(batch), self.max_length, dtype=torch.long)
+            }
+        # Create labels (same as input_ids for causal LM)
+        labels = text_inputs["input_ids"].clone()
+        # Mask padding tokens in labels (-100 is ignored by loss function)
+        labels[labels == self.tokenizer.pad_token_id] = -100
+        batch_dict = {
+            "input_ids": text_inputs["input_ids"],
+            "attention_mask": text_inputs["attention_mask"],
+            "labels": labels,
+            "images": pixel_values,
+            "has_images": torch.tensor(has_images, dtype=torch.bool)
+        }
+        return batch_dict

src/data/processors.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""
+Data processors for images and text
+"""
+import torch
+from PIL import Image
+import torchvision.transforms as transforms
+from typing import List, Dict, Any, Optional
+import logging
+logger = logging.getLogger(__name__)
+class ImageProcessor:
+    """Image preprocessing for CLIP vision encoder"""
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.image_size = config["data"]["image_size"]
+        # CLIP normalization values
+        self.mean = config["data"]["image_mean"]
+        self.std = config["data"]["image_std"]
+        # Setup transforms
+        self.transform = self._setup_transforms()
+    def _setup_transforms(self):
+        """Setup image transformations"""
+        transform_list = [
+            transforms.Resize((self.image_size, self.image_size)),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=self.mean, std=self.std)
+        ]
+        # Add augmentations if enabled
+        if self.config["data"]["augmentation"]["enabled"]:
+            aug_transforms = []
+            # Random resized crop
+            if self.config["data"]["augmentation"].get("random_resized_crop"):
+                scale = self.config["data"]["augmentation"]["random_resized_crop"]
+                aug_transforms.append(
+                    transforms.RandomResizedCrop(
+                        self.image_size,
+                        scale=(scale, 1.0)
+                    )
+                )
+            # Color jitter
+            if self.config["data"]["augmentation"].get("color_jitter"):
+                brightness = self.config["data"]["augmentation"]["color_jitter"]
+                aug_transforms.append(
+                    transforms.ColorJitter(brightness=brightness)
+                )
+            # Horizontal flip
+            if self.config["data"]["augmentation"].get("horizontal_flip"):
+                prob = self.config["data"]["augmentation"]["horizontal_flip"]
+                aug_transforms.append(
+                    transforms.RandomHorizontalFlip(p=prob)
+                )
+            # Insert augmentations before normalization
+            transform_list = (
+                transform_list[:-2] +  # Resize, ToTensor
+                aug_transforms +
+                transform_list[-2:]    # Normalize
+            )
+        return transforms.Compose(transform_list)
+    def __call__(self, image: Image.Image) -> torch.Tensor:
+        """Process a single image"""
+        if not isinstance(image, Image.Image):
+            raise ValueError(f"Expected PIL Image, got {type(image)}")
+        return self.transform(image)
+    def process_batch(self, images: List[Image.Image]) -> torch.Tensor:
+        """Process a batch of images"""
+        processed = []
+        for img in images:
+            processed.append(self(img))
+        return torch.stack(processed)
+class TextProcessor:
+    """Text preprocessing for conversations"""
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.max_length = config["data"]["max_length"]
+        # Conversation formatting
+        conv_config = config["data"]["conversation"]
+        self.system_message = conv_config.get("system_message", "")
+        self.user_prefix = conv_config.get("user_prefix", "Human: ")
+        self.assistant_prefix = conv_config.get("assistant_prefix", "Assistant: ")
+        self.turn_separator = conv_config.get("turn_separator", "\n")
+    def format_conversation(self, conversations: List[Dict[str, str]]) -> str:
+        """Format conversation into training text with robust error handling"""
+        formatted_parts = []
+        # Add system message if present
+        if self.system_message:
+            formatted_parts.append(self.system_message)
+        # Ensure conversations is a valid list
+        if not isinstance(conversations, list):
+            conversations = []
+        # Process conversation turns with error handling
+        for turn in conversations:
+            try:
+                if not isinstance(turn, dict):
+                    continue
+                role = turn.get("from", "").lower().strip()
+                content = turn.get("value", "")
+                # Clean and validate content
+                if not isinstance(content, str):
+                    content = str(content) if content else ""
+                content = content.strip()
+                if not content:
+                    continue
+                # Remove problematic characters that might cause issues
+                content = content.replace('\x00', '').replace('\n\n\n', '\n\n')
+                if role in ["human", "user"]:
+                    formatted_parts.append(f"{self.user_prefix}{content}")
+                elif role in ["gpt", "assistant", "ai"]:
+                    formatted_parts.append(f"{self.assistant_prefix}{content}")
+                else:
+                    # Default to human if role is unclear
+                    formatted_parts.append(f"{self.user_prefix}{content}")
+            except Exception as e:
+                logger.debug(f"Error processing conversation turn: {e}")
+                continue
+        # Ensure we have at least some content
+        if not formatted_parts:
+            return f"{self.user_prefix}What do you see in this image?{self.turn_separator}{self.assistant_prefix}I can see an image."
+        return self.turn_separator.join(formatted_parts)
+    def add_image_token(self, text: str, has_image: bool = True) -> str:
+        """Add image token to text if image is present"""
+        if has_image:
+            image_token = self.config.get("special_tokens", {}).get("image_token", "<image>")
+            return f"{image_token}\n{text}"
+        return text
+    def validate_text(self, text: str) -> bool:
+        """Validate text meets filtering criteria - more lenient validation"""
+        if not isinstance(text, str):
+            return False
+        # Basic cleanup
+        text = text.strip()
+        # Check for completely empty content
+        if not text:
+            return False
+        # More lenient length check - just ensure it's not absurdly long or short
+        text_length = len(text)
+        if text_length < 5:  # Very short
+            return False
+        if text_length > 2000:  # Very long
+            return False
+        # Check for basic structure (should have some content)
+        if len(text.split()) < 2:  # Less than 2 words
+            return False
+        return True

src/training/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .callbacks import CustomCallback
+from .utils import TrainingUtils
+__all__ = [
+    "CustomCallback",
+    "TrainingUtils"
+]

src/training/callbacks.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""
+Custom Lightning callbacks
+"""
+import lightning as L
+from lightning.pytorch.callbacks import Callback
+import torch
+from typing import Any
+import logging
+logger = logging.getLogger(__name__)
+class CustomCallback(Callback):
+    """Custom callback for monitoring training progress"""
+    def __init__(self):
+        super().__init__()
+        self.start_time = None
+    def on_train_start(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
+        """Called when training starts"""
+        import time
+        self.start_time = time.time()
+        logger.info("Training started")
+        # Log model info
+        total_params = sum(p.numel() for p in pl_module.parameters())
+        trainable_params = sum(p.numel() for p in pl_module.parameters() if p.requires_grad)
+        logger.info(f"Total parameters: {total_params:,}")
+        logger.info(f"Trainable parameters: {trainable_params:,}")
+        logger.info(f"Trainable ratio: {trainable_params/total_params:.2%}")
+    def on_train_end(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
+        """Called when training ends"""
+        if self.start_time:
+            import time
+            duration = time.time() - self.start_time
+            logger.info(f"Training completed in {duration:.2f} seconds")
+    def on_train_epoch_start(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
+        """Called at the start of each training epoch"""
+        logger.info(f"Starting epoch {trainer.current_epoch + 1}/{trainer.max_epochs}")
+    def on_validation_epoch_end(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
+        """Called at the end of validation epoch"""
+        if trainer.logged_metrics:
+            val_loss = trainer.logged_metrics.get("val/loss", None)
+            if val_loss is not None:
+                logger.info(f"Validation loss: {val_loss:.4f}")
+class MemoryMonitorCallback(Callback):
+    """Monitor GPU memory usage during training"""
+    def __init__(self, log_every_n_steps: int = 100):
+        super().__init__()
+        self.log_every_n_steps = log_every_n_steps
+    def on_train_batch_end(
+        self,
+        trainer: L.Trainer,
+        pl_module: L.LightningModule,
+        outputs: Any,
+        batch: Any,
+        batch_idx: int
+    ) -> None:
+        """Log memory usage"""
+        if batch_idx % self.log_every_n_steps == 0 and torch.cuda.is_available():
+            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
+            memory_reserved = torch.cuda.memory_reserved() / 1024**3   # GB
+            pl_module.log("train/memory_allocated_gb", memory_allocated, on_step=True)
+            pl_module.log("train/memory_reserved_gb", memory_reserved, on_step=True)

src/training/utils.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+Training utilities
+"""
+import torch
+import logging
+from typing import Dict, Any, Optional
+from pathlib import Path
+logger = logging.getLogger(__name__)
+class TrainingUtils:
+    """Utility functions for training"""
+    @staticmethod
+    def count_parameters(model: torch.nn.Module) -> Dict[str, int]:
+        """Count model parameters"""
+        total_params = sum(p.numel() for p in model.parameters())
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        frozen_params = total_params - trainable_params
+        return {
+            "total": total_params,
+            "trainable": trainable_params,
+            "frozen": frozen_params,
+            "trainable_percentage": (trainable_params / total_params) * 100 if total_params > 0 else 0
+        }
+    @staticmethod
+    def print_model_summary(model: torch.nn.Module, model_name: str = "Model") -> None:
+        """Print detailed model summary"""
+        params = TrainingUtils.count_parameters(model)
+        logger.info(f"\n{model_name} Summary:")
+        logger.info(f"  Total parameters: {params['total']:,}")
+        logger.info(f"  Trainable parameters: {params['trainable']:,}")
+        logger.info(f"  Frozen parameters: {params['frozen']:,}")
+        logger.info(f"  Trainable percentage: {params['trainable_percentage']:.2f}%")
+    @staticmethod
+    def save_model_state(
+        model: torch.nn.Module,
+        path: str,
+        additional_info: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Save model state with additional information"""
+        save_path = Path(path)
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        state_dict = {
+            "model_state_dict": model.state_dict(),
+            "model_class": model.__class__.__name__,
+        }
+        if additional_info:
+            state_dict.update(additional_info)
+        torch.save(state_dict, save_path)
+        logger.info(f"Model state saved to: {save_path}")
+    @staticmethod
+    def load_model_state(model: torch.nn.Module, path: str, strict: bool = True) -> Dict[str, Any]:
+        """Load model state and return additional information"""
+        checkpoint = torch.load(path, map_location="cpu")
+        if "model_state_dict" in checkpoint:
+            model.load_state_dict(checkpoint["model_state_dict"], strict=strict)
+            logger.info(f"Model state loaded from: {path}")
+            # Return additional info
+            additional_info = {k: v for k, v in checkpoint.items() if k != "model_state_dict"}
+            return additional_info
+        else:
+            # Assume the checkpoint is just the state dict
+            model.load_state_dict(checkpoint, strict=strict)
+            logger.info(f"Model state loaded from: {path}")
+            return {}
+    @staticmethod
+    def get_device_info() -> Dict[str, Any]:
+        """Get information about available devices"""
+        info = {
+            "cuda_available": torch.cuda.is_available(),
+            "cuda_device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
+        }
+        if torch.cuda.is_available():
+            info["cuda_current_device"] = torch.cuda.current_device()
+            info["cuda_device_name"] = torch.cuda.get_device_name()
+            info["cuda_memory_total"] = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
+        return info
+    @staticmethod
+    def log_device_info() -> None:
+        """Log device information"""
+        info = TrainingUtils.get_device_info()
+        logger.info("\nDevice Information:")
+        logger.info(f"  CUDA Available: {info['cuda_available']}")
+        if info['cuda_available']:
+            logger.info(f"  CUDA Device Count: {info['cuda_device_count']}")
+            logger.info(f"  Current Device: {info['cuda_current_device']}")
+            logger.info(f"  Device Name: {info['cuda_device_name']}")
+            logger.info(f"  Total Memory: {info['cuda_memory_total']:.2f} GB")
+        else:
+            logger.info("  Using CPU for training")
+    @staticmethod
+    def cleanup_memory() -> None:
+        """Clean up GPU memory"""
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            logger.info("GPU memory cache cleared")

src/utils/logging.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+Logging utilities
+"""
+import logging
+import sys
+from pathlib import Path
+from typing import Optional
+from rich.logging import RichHandler
+from rich.console import Console
+def setup_logging(
+    level: int = logging.INFO,
+    log_file: Optional[str] = None,
+    use_rich: bool = True
+) -> None:
+    """Setup logging configuration"""
+    # Create logs directory if needed
+    if log_file:
+        log_path = Path(log_file)
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+    # Clear existing handlers
+    root_logger = logging.getLogger()
+    root_logger.handlers.clear()
+    # Setup formatters
+    if use_rich:
+        # Rich handler for console output
+        console_handler = RichHandler(
+            console=Console(stderr=True),
+            show_time=True,
+            show_path=True,
+            rich_tracebacks=True
+        )
+        console_handler.setLevel(level)
+        root_logger.addHandler(console_handler)
+    else:
+        # Standard console handler
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(level)
+        console_formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        console_handler.setFormatter(console_formatter)
+        root_logger.addHandler(console_handler)
+    # File handler if specified
+    if log_file:
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setLevel(level)
+        file_formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
+        )
+        file_handler.setFormatter(file_formatter)
+        root_logger.addHandler(file_handler)
+    # Set root logger level
+    root_logger.setLevel(level)
+    # Reduce noise from some libraries
+    logging.getLogger("transformers").setLevel(logging.WARNING)
+    logging.getLogger("datasets").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.info("Logging setup completed")
+def get_logger(name: str) -> logging.Logger:
+    """Get a logger with the specified name"""
+    return logging.getLogger(name)