Tas01 committed
Commit bc37146 · verified · 1 Parent(s): 1fa30cb

Update app.py

Files changed (1): app.py (+140, -55)
app.py CHANGED
@@ -145,8 +145,10 @@ class ImageStoryteller:
                 'success': False
             }
 
+
+
     # def generate_story(self, analysis_result, creativity_level=0.7):
-    #     """Generate a story based on detected objects and scene"""
+    #     """Generate a story based on detected objects and scene using Qwen"""
     #     if self.llm_model is None:
     #         return "Story generation model not available."
 
@@ -156,7 +158,7 @@ class ImageStoryteller:
     #         scenes = [scene['type'] for scene in analysis_result['scenes']]
 
     #         # Create a prompt for the LLM
-    #         objects_str = ", ".join(objects[:3])  # Use top 3 objects
+    #         objects_str = ", ".join(objects)  # Use top 3 objects
     #         scene_str = scenes[0] if scenes else "general scene"
 
     #         # FIXED: Convert creativity_level to float if it's a tuple
@@ -166,16 +168,19 @@ class ImageStoryteller:
     #         # Different prompt templates for creativity
     #         if creativity_level > 0.8:
     #             prompt = f"""Based on this image containing {objects_str} in a {scene_str}, write a creative and imaginative short story (3-4 paragraphs).
-    #             Make it engaging and add interesting details about the scene."""
+    #             Make it engaging and add interesting details about the scene. Story:"""
     #         elif creativity_level > 0.5:
     #             prompt = f"""Create a short story about an image with {objects_str} in a {scene_str}.
-    #             Write 2-3 paragraphs that describe what might be happening in this scene."""
+    #             Write 2-3 paragraphs that describe what might be happening in this scene. Story:"""
     #         else:
     #             prompt = f"""Describe what you see in an image containing {objects_str} in a {scene_str}.
-    #             Write a simple 1-2 paragraph description."""
+    #             Write a simple 1-2 paragraph description. Description:"""
 
-    #         # Format for the specific LLM
-    #         if "phi" in self.llm_model_id:
+    #         # QWEN 1.8B SPECIFIC FORMATTING - SIMPLE AND EFFECTIVE
+    #         if "qwen" in self.llm_model_id.lower():
+    #             # Qwen works best with this simple format
+    #             formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+    #         elif "phi" in self.llm_model_id:  # For Phi models
     #             # Phi-2 specific formatting
     #             formatted_prompt = f"Instruct: {prompt}\nOutput:"
     #         elif "gemma" in self.llm_model_id:
@@ -183,39 +188,82 @@ class ImageStoryteller:
     #             formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
     #         else:
     #             # Generic formatting
-    #             formatted_prompt = f"Write a story: {prompt}\n\nStory:"
+    #             formatted_prompt = f"{prompt}\n\n"
 
     #         # Tokenize and generate
     #         inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.llm_model.device)
 
     #         with torch.no_grad():
-    #             outputs = self.llm_model.generate(
-    #                 **inputs,
-    #                 max_new_tokens=250,  # Shorter for faster generation
-    #                 temperature=creativity_level,
-    #                 do_sample=True,
-    #                 top_p=0.9,
-    #                 repetition_penalty=1.1,
-    #                 pad_token_id=self.tokenizer.eos_token_id
-    #             )
+    #             # QWEN OPTIMIZED GENERATION PARAMETERS
+    #             if "qwen" in self.llm_model_id.lower():
+    #                 outputs = self.llm_model.generate(
+    #                     **inputs,
+    #                     max_new_tokens=300,  # Good length for stories
+    #                     temperature=creativity_level,
+    #                     do_sample=True,
+    #                     top_p=0.9,
+    #                     repetition_penalty=1.1,
+    #                     eos_token_id=self.tokenizer.eos_token_id,
+    #                     pad_token_id=self.tokenizer.eos_token_id,
+    #                     no_repeat_ngram_size=3  # Prevent repetition
+    #                 )
+    #             else:
+    #                 outputs = self.llm_model.generate(
+    #                     **inputs,
+    #                     max_new_tokens=250,
+    #                     temperature=creativity_level,
+    #                     do_sample=True,
+    #                     top_p=0.9,
+    #                     repetition_penalty=1.1,
+    #                     pad_token_id=self.tokenizer.eos_token_id
+    #                 )
 
     #         # Decode and clean up
     #         story = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    #         # Remove the prompt from the beginning if present
-    #         if story.startswith(formatted_prompt):
+    #         # Clean up Qwen specific tokens
+    #         if "qwen" in self.llm_model_id.lower():
+    #             # Remove the prompt and Qwen chat tokens
+    #             story = story.replace(formatted_prompt, "").strip()
+    #             story = story.replace("<|im_end|>", "").strip()
+    #             story = story.replace("<|im_start|>", "").strip()
+    #             story = story.replace("<|endoftext|>", "").strip()
+
+    #             # Sometimes Qwen repeats, clean that up
+    #             if "Story:" in story:
+    #                 story = story.split("Story:")[-1].strip()
+    #             if "Description:" in story:
+    #                 story = story.split("Description:")[-1].strip()
+    #         elif story.startswith(formatted_prompt):
     #             story = story[len(formatted_prompt):].strip()
 
-    #         return story
+    #         # Additional cleanup for any model
+    #         story = story.strip()
+
+    #         # If story is too short, try a simpler approach
+    #         if len(story.split()) < 10:
+    #             # Fallback: use a direct prompt
+    #             simple_prompt = f"Tell me a story about {objects_str} in {scene_str}."
+    #             simple_inputs = self.tokenizer(simple_prompt, return_tensors="pt").to(self.llm_model.device)
+    #             with torch.no_grad():
+    #                 simple_outputs = self.llm_model.generate(
+    #                     **simple_inputs,
+    #                     max_new_tokens=200,
+    #                     temperature=0.8,
+    #                     do_sample=True
+    #                 )
+    #             story = self.tokenizer.decode(simple_outputs[0], skip_special_tokens=True)
+    #             story = story.replace(simple_prompt, "").strip()
+
+    #         # return story
 
     #     except Exception as e:
     #         print(f"Story generation failed: {e}")
     #         objects_str = ", ".join(objects) if 'objects' in locals() else "unknown"
     #         scene_str = scenes[0] if 'scenes' in locals() and scenes else "unknown scene"
-    #         return f"Failed to generate story. Detected objects: {objects_str} in a {scene_str}."
-
+    #         return f"Failed to generate story. Detected objects: {objects_str} in a {scene_str}. Error: {str(e)}"
     def generate_story(self, analysis_result, creativity_level=0.7):
-        """Generate a story based on detected objects and scene using Qwen"""
+        """Generate a story with caption based on detected objects and scene using Qwen"""
         if self.llm_model is None:
             return "Story generation model not available."
 
@@ -232,52 +280,65 @@ class ImageStoryteller:
         if isinstance(creativity_level, (tuple, list)):
             creativity_level = float(creativity_level[0])
 
-        # Different prompt templates for creativity
+        # Enhanced prompt with caption generation
         if creativity_level > 0.8:
-            prompt = f"""Based on this image containing {objects_str} in a {scene_str}, write a creative and imaginative short story (3-4 paragraphs).
-            Make it engaging and add interesting details about the scene. Story:"""
+            prompt = f"""Based on this image containing {objects_str} in a {scene_str}:
+
+1. First, write a catchy 5-7 word YouTube-style caption (engaging, attention-grabbing)
+2. Then, write a creative and imaginative short story (3-4 paragraphs)
+
+Format exactly like this:
+CAPTION: [your catchy caption here]
+STORY: [your creative story here]"""
         elif creativity_level > 0.5:
-            prompt = f"""Create a short story about an image with {objects_str} in a {scene_str}.
-            Write 2-3 paragraphs that describe what might be happening in this scene. Story:"""
+            prompt = f"""For an image with {objects_str} in a {scene_str}:
+
+1. Create a short, interesting caption (5-7 words)
+2. Write a 2-3 paragraph story about what's happening in this scene
+
+Format:
+CAPTION: [your caption here]
+STORY: [your story here]"""
         else:
-            prompt = f"""Describe what you see in an image containing {objects_str} in a {scene_str}.
-            Write a simple 1-2 paragraph description. Description:"""
+            prompt = f"""Describe an image containing {objects_str} in a {scene_str}:
+
+1. Give a simple, descriptive caption
+2. Write a 1-2 paragraph description
+
+Format:
+CAPTION: [caption here]
+STORY: [description here]"""
 
-        # QWEN 1.8B SPECIFIC FORMATTING - SIMPLE AND EFFECTIVE
+        # QWEN 1.8B SPECIFIC FORMATTING
         if "qwen" in self.llm_model_id.lower():
-            # Qwen works best with this simple format
             formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        elif "phi" in self.llm_model_id:  # For Phi models
-            # Phi-2 specific formatting
+        elif "phi" in self.llm_model_id:
            formatted_prompt = f"Instruct: {prompt}\nOutput:"
         elif "gemma" in self.llm_model_id:
-            # Gemma specific formatting
             formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
         else:
-            # Generic formatting
             formatted_prompt = f"{prompt}\n\n"
 
         # Tokenize and generate
         inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.llm_model.device)
 
         with torch.no_grad():
-            # QWEN OPTIMIZED GENERATION PARAMETERS
             if "qwen" in self.llm_model_id.lower():
                 outputs = self.llm_model.generate(
                     **inputs,
-                    max_new_tokens=300,  # Good length for stories
+                    max_new_tokens=350,  # Increased for caption + story
                     temperature=creativity_level,
                     do_sample=True,
                     top_p=0.9,
                     repetition_penalty=1.1,
                     eos_token_id=self.tokenizer.eos_token_id,
                     pad_token_id=self.tokenizer.eos_token_id,
-                    no_repeat_ngram_size=3  # Prevent repetition
+                    no_repeat_ngram_size=3
                 )
             else:
                 outputs = self.llm_model.generate(
                     **inputs,
-                    max_new_tokens=250,
+                    max_new_tokens=300,
                     temperature=creativity_level,
                     do_sample=True,
                     top_p=0.9,
@@ -290,37 +351,61 @@ class ImageStoryteller:
 
         # Clean up Qwen specific tokens
         if "qwen" in self.llm_model_id.lower():
-            # Remove the prompt and Qwen chat tokens
             story = story.replace(formatted_prompt, "").strip()
             story = story.replace("<|im_end|>", "").strip()
             story = story.replace("<|im_start|>", "").strip()
             story = story.replace("<|endoftext|>", "").strip()
-
-            # Sometimes Qwen repeats, clean that up
-            if "Story:" in story:
-                story = story.split("Story:")[-1].strip()
-            if "Description:" in story:
-                story = story.split("Description:")[-1].strip()
         elif story.startswith(formatted_prompt):
             story = story[len(formatted_prompt):].strip()
 
-        # Additional cleanup for any model
+        # Additional cleanup
         story = story.strip()
 
-        # If story is too short, try a simpler approach
-        if len(story.split()) < 10:
-            # Fallback: use a direct prompt
-            simple_prompt = f"Tell me a story about {objects_str} in {scene_str}."
-            simple_inputs = self.tokenizer(simple_prompt, return_tensors="pt").to(self.llm_model.device)
+        # Ensure proper formatting for caption and story
+        lines = story.split('\n')
+        formatted_lines = []
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith('CAPTION:') and not line.startswith('STORY:'):
+                # If we have caption/story markers but missing the prefix
+                if 'caption:' in line.lower() and 'caption:' not in line:
+                    line = 'CAPTION: ' + line.split('caption:')[-1].strip()
+                elif 'story:' in line.lower() and 'story:' not in line:
+                    line = 'STORY: ' + line.split('story:')[-1].strip()
+            formatted_lines.append(line)
+
+        story = '\n'.join(formatted_lines)
+
+        # Add visual separator if not already present
+        if 'STORY:' in story:
+            parts = story.split('STORY:', 1)
+            if len(parts) == 2:
+                caption_part = parts[0].replace('CAPTION:', '').strip()
+                story_part = parts[1].strip()
+                # Format with separator
+                story = f"{caption_part}\n{'─' * 40}\n{story_part}"
+
+        # Fallback if generation is too short
+        if len(story.split()) < 15:
+            fallback_prompt = f"Create a caption and story for {objects_str} in {scene_str}."
+            simple_inputs = self.tokenizer(fallback_prompt, return_tensors="pt").to(self.llm_model.device)
             with torch.no_grad():
                 simple_outputs = self.llm_model.generate(
                     **simple_inputs,
-                    max_new_tokens=200,
+                    max_new_tokens=250,
                     temperature=0.8,
                     do_sample=True
                 )
             story = self.tokenizer.decode(simple_outputs[0], skip_special_tokens=True)
-            story = story.replace(simple_prompt, "").strip()
+            story = story.replace(fallback_prompt, "").strip()
+            # Add separator
+            sentences = story.split('. ')
+            if sentences:
+                caption = sentences[0].strip()
+                if not caption.endswith('.'):
+                    caption += '.'
+                rest_of_story = '. '.join(sentences[1:]) if len(sentences) > 1 else story
+                story = f"{caption}\n{'─' * 40}\n{rest_of_story}"
 
         return story
 
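
A note on consuming the new output (a minimal usage sketch, not part of the commit): when the model follows the CAPTION/STORY format, generate_story now returns the caption and the story joined by a 40-character "─" rule, so a caller can split on that rule to recover the two parts. The storyteller construction and the analysis_result shape below are assumptions inferred from the keys the method reads ('objects', plus 'scenes' entries with a 'type' field).

    # Hypothetical usage sketch; ImageStoryteller's constructor is not
    # shown in this diff, so the zero-argument call is an assumption.
    storyteller = ImageStoryteller()

    analysis_result = {
        "objects": ["dog", "frisbee", "person"],  # detected object labels
        "scenes": [{"type": "park"}],             # scene classification result
    }

    output = storyteller.generate_story(analysis_result, creativity_level=0.7)

    # When a CAPTION/STORY pair was produced, the two parts are joined
    # by a 40-character "─" rule; otherwise the output is a single block.
    parts = output.split("─" * 40)
    caption = parts[0].strip()
    story = parts[1].strip() if len(parts) > 1 else ""
    print("Caption:", caption)
    print("Story:", story)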
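
On the prompt formatting: the commit builds the Qwen chat markers (<|im_start|> ... <|im_end|>) by hand. As an aside, recent versions of transformers expose tokenizer.apply_chat_template for checkpoints that ship a chat template; the sketch below shows that route. The Qwen1.5 checkpoint name is an assumption for illustration, since the diff only checks for "qwen" in self.llm_model_id.

    from transformers import AutoTokenizer

    # Checkpoint name is assumed for illustration; any chat-tuned Qwen
    # model that ships a ChatML chat template should behave similarly.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-1.8B-Chat")

    messages = [{"role": "user", "content": "Describe a dog in a park."}]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,              # return a string rather than token ids
        add_generation_prompt=True,  # append the assistant turn header
    )
    # Yields the "<|im_start|>user ... <|im_end|>\n<|im_start|>assistant\n"
    # framing that the commit concatenates manually.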