Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

App Files Files Community

xiaoyuxi committed on Jun 24, 2025

Commit

488d6c3

1 Parent(s): 416aa34

backend

Browse files

Files changed (2) hide show

app.py +351 -365
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -12,46 +12,23 @@ hf_token = os.getenv("HF_TOKEN")  # Replace with your actual Hugging Face token
 # Flag to track if backend is available
 BACKEND_AVAILABLE = False
-backend_api = None
 def initialize_backend():
-    """Initialize backend connection"""
-    global backend_api, BACKEND_AVAILABLE
     try:
         print(f"Attempting to connect to backend: {BACKEND_SPACE_URL}")
-        backend_api = gr.load(f"spaces/{BACKEND_SPACE_URL}", token=hf_token)
-        # Test if the API object has the expected methods
-        print(f"🔧 Backend API object type: {type(backend_api)}")
-        print(f"🔧 Backend API object attributes: {dir(backend_api)}")
-        # Detailed debugging of key attributes
-        if hasattr(backend_api, 'predict'):
-            print(f"🔧 predict type: {type(backend_api.predict)}")
-            print(f"🔧 predict content: {backend_api.predict}")
-            if hasattr(backend_api.predict, '__call__'):
-                print("✅ predict is callable")
-            else:
-                print("❌ predict is not callable")
-        if hasattr(backend_api, 'fns'):
-            print(f"🔧 fns type: {type(backend_api.fns)}")
-            print(f"🔧 fns content: {backend_api.fns}")
-        if hasattr(backend_api, 'call_function'):
-            print(f"🔧 call_function type: {type(backend_api.call_function)}")
-        # Check if it's a Blocks object and has fns
-        if hasattr(backend_api, 'fns') and backend_api.fns:
-            print(f"✅ Backend connection successful!")
-            print(f"🔧 Available functions in fns: {list(backend_api.fns.keys())}")
-            BACKEND_AVAILABLE = True
-            return True
-        else:
-            print("❌ Backend API functions not found")
-            print(f"🔧 Available methods: {[attr for attr in dir(backend_api) if not attr.startswith('_')]}")
-            BACKEND_AVAILABLE = False
-            return False
     except Exception as e:
         print(f"❌ Backend connection failed: {e}")
@@ -112,14 +89,25 @@ def handle_video_upload(video):
         return None, None, [], 50, 756, 3
     try:
-        if BACKEND_AVAILABLE and backend_api and hasattr(backend_api, 'predict'):
             # Try to use backend API
             try:
                 print("🔧 Calling backend API for video upload...")
-                # Call process_video_with_points with empty points to get initial state
-                # The function expects: (video_path, points, grid_size, vo_points, fps)
-                result = backend_api.predict(video, [], 50, 756, 3)
                 print(f"✅ Backend video upload API call successful!")
                 print(f"🔧 Result type: {type(result)}")
@@ -127,10 +115,16 @@ def handle_video_upload(video):
                 # Parse the result - expect a dict with success status
                 if isinstance(result, dict) and result.get("success"):
-                    # Extract first frame locally for display
-                    display_image = extract_first_frame(video)
-                    original_image_state = json.dumps({"video_path": video, "frame": "backend_processing"})
-                    return original_image_state, display_image, [], 50, 756, 3
                 else:
                     print("Backend processing failed, using local fallback")
                     # Fallback to local processing
@@ -150,8 +144,9 @@ def handle_video_upload(video):
             "frame": "local_processing"
         })
-        # Default settings
-        grid_size_val, vo_points_val, fps_val = 50, 756, 3
         return original_image_state, display_image, [], grid_size_val, vo_points_val, fps_val
@@ -165,39 +160,37 @@ def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.Sele
         return None, []
     try:
-        if BACKEND_AVAILABLE and backend_api and hasattr(backend_api, 'predict'):
             # Try to use backend API
             try:
                 print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
-                # Parse original image state to get video path
-                try:
-                    state_data = json.loads(original_img)
-                    video_path = state_data.get("video_path")
-                except:
-                    video_path = None
-                if video_path:
-                    # Create points list with the new point
-                    points = [(evt.index[0], evt.index[1], point_type)]
-                    # Call process_video_with_points with the new point
-                    result = backend_api.predict(video_path, points, 50, 756, 3)
-                    print(f"✅ Backend select point API call successful!")
-                    print(f"🔧 Result type: {type(result)}")
-                    print(f"🔧 Result: {result}")
-                    # Parse the result - expect a dict with success status
-                    if isinstance(result, dict) and result.get("success"):
-                        # For now, use local processing for visualization
-                        # Fallback to local processing
-                        pass
-                    else:
-                        print("Backend processing failed, using local fallback")
-                        # Fallback to local processing
-                        pass
                 else:
-                    print("No video path found in state, using local fallback")
                     # Fallback to local processing
                     pass
             except Exception as e:
@@ -223,74 +216,58 @@ def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.Sele
                 x, y = evt.index[0], evt.index[1]
                 color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
-                # Draw a more visible point
                 cv2.circle(display_image, (x, y), 8, color, -1)
-                cv2.circle(display_image, (x, y), 10, (255, 255, 255), 2)
-                # Add a simple mask-like visualization (circle around the point)
-                mask_radius = 30
-                mask_color = (0, 255, 0, 50) if point_type == 'positive_point' else (255, 0, 0, 50)
-                # Create a simple mask visualization
-                mask_overlay = display_image.copy()
-                cv2.circle(mask_overlay, (x, y), mask_radius, color, -1)
-                # Blend the mask with the original image
-                alpha = 0.3
-                display_image = cv2.addWeighted(display_image, 1-alpha, mask_overlay, alpha, 0)
-                # Update selected points
-                new_sel_pix = sel_pix + [(x, y, point_type)]
-                # Add text label
-                label = "Positive" if point_type == 'positive_point' else "Negative"
-                cv2.putText(display_image, label, (x+15, y-15), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
                 return display_image, new_sel_pix
-        return None, sel_pix
     except Exception as e:
         print(f"Error in select_point: {e}")
-        return None, sel_pix
 def reset_points(original_img: str, sel_pix):
-    """Reset all points and clear the mask"""
     if original_img is None:
         return None, []
     try:
-        if BACKEND_AVAILABLE and backend_api and hasattr(backend_api, 'predict'):
             # Try to use backend API
             try:
-                print("🔧 Calling backend reset points API")
-                # Parse original image state to get video path
-                try:
-                    state_data = json.loads(original_img)
-                    video_path = state_data.get("video_path")
-                except:
-                    video_path = None
-                if video_path:
-                    # Call process_video_with_points with empty points to reset
-                    result = backend_api.predict(video_path, [], 50, 756, 3)
-                    print(f"✅ Backend reset points API call successful!")
-                    print(f"🔧 Result type: {type(result)}")
-                    print(f"🔧 Result: {result}")
-                    # Parse the result - expect a dict with success status
-                    if isinstance(result, dict) and result.get("success"):
-                        # For now, use local processing for visualization
-                        # Fallback to local processing
-                        pass
-                    else:
-                        print("Backend processing failed, using local fallback")
-                        # Fallback to local processing
-                        pass
                 else:
-                    print("No video path found in state, using local fallback")
                     # Fallback to local processing
                     pass
             except Exception as e:
@@ -299,7 +276,7 @@ def reset_points(original_img: str, sel_pix):
                 pass
         # Fallback: local processing
-        print("Using local point reset...")
         # Parse original image state
         try:
@@ -309,7 +286,7 @@ def reset_points(original_img: str, sel_pix):
             video_path = None
         if video_path:
-            # Re-extract frame without points
             display_image = extract_first_frame(video_path)
             return display_image, []
@@ -325,41 +302,39 @@ def launch_viz(grid_size, vo_points, fps, original_image_state):
         return None, None
     try:
-        if BACKEND_AVAILABLE and backend_api and hasattr(backend_api, 'predict'):
             # Try to use backend API
             try:
                 print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                 print(f"🔧 Original image state type: {type(original_image_state)}")
                 print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
-                # Parse original image state to get video path
-                try:
-                    state_data = json.loads(original_image_state)
-                    video_path = state_data.get("video_path")
-                except:
-                    video_path = None
-                if video_path:
-                    # Call process_video_with_points with current points and parameters
-                    # We need to get the current points from the state
-                    current_points = []  # For now, use empty points
-                    result = backend_api.predict(video_path, current_points, grid_size, vo_points, fps)
-                    print(f"✅ Backend API call successful!")
-                    print(f"🔧 Result type: {type(result)}")
-                    print(f"🔧 Result: {result}")
-                    # Parse the result
-                    if isinstance(result, dict) and result.get("success"):
-                        viz_html = result.get("viz_html_path", "")
-                        track_video_path = result.get("track_video_path", "")
-                        return viz_html, track_video_path
-                    else:
-                        print("Backend processing failed, showing error message")
-                        # Fallback to error message
-                        pass
                 else:
-                    print("No video path found in state, showing error message")
                     # Fallback to error message
                     pass
             except Exception as e:
@@ -385,10 +360,9 @@ def launch_viz(grid_size, vo_points, fps, original_image_state):
             <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 10px;'>
                 <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
                 <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Backend API Object: {backend_api is not None}</p>
                 <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Type: {type(backend_api) if backend_api else 'None'}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Available Functions: {list(backend_api.fns.keys()) if backend_api and hasattr(backend_api, 'fns') else 'None'}</p>
             </div>
             <p style='color: #2d3436; font-weight: bold; margin-top: 15px;'>
                 Current Status: Backend unavailable - Running in limited mode
@@ -403,48 +377,72 @@ def launch_viz(grid_size, vo_points, fps, original_image_state):
 def clear_all():
     """Clear all buffers and temporary files"""
-    return None, None, []
-def update_tracker_model(vo_points):
-    return None  # No output needed
-# Function to handle both manual upload and example selection
-def handle_video_change(video):
-    """Handle video change from both manual upload and example selection"""
-    if video is None:
-        return None, None, [], 50, 756, 3
-    # Handle video upload (extract first frame)
-    original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val = handle_video_upload(video)
-    return original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val
 def test_backend_connection():
     """Test if backend is actually working"""
     global BACKEND_AVAILABLE
-    if not backend_api:
         return False
     try:
-        # Try a simple API call to test connection
         print("Testing backend connection with a simple call...")
         # Check if we have fns available
-        if hasattr(backend_api, 'fns') and backend_api.fns:
             print("✅ Backend API functions are available")
-            print(f"🔧 Available function indices: {list(backend_api.fns.keys())}")
             return True
         else:
             print("❌ Backend API functions not found")
-            BACKEND_AVAILABLE = False
             return False
     except Exception as e:
         print(f"❌ Backend connection test failed: {e}")
-        BACKEND_AVAILABLE = False
         return False
 def test_backend_api():
     """Test specific backend API functions"""
-    if not BACKEND_AVAILABLE or not backend_api:
         print("❌ Backend not available for testing")
         return False
@@ -452,226 +450,214 @@ def test_backend_api():
         print("🧪 Testing backend API functions...")
         # Test if fns exist and show available indices
-        if hasattr(backend_api, 'fns') and backend_api.fns:
-            print(f"✅ Backend has {len(backend_api.fns)} functions available")
-            for idx in backend_api.fns.keys():
                 print(f"✅ Function {idx} is available")
         else:
-            print("❌ No functions available in backend")
         return True
     except Exception as e:
         print(f"❌ Backend API test failed: {e}")
         return False
-# Initialize backend connection
-print("🔧 Initializing backend connection...")
 initialize_backend()
-# Test the connection
-test_backend_connection()
-# Test specific API functions
-test_backend_api()
-# Build UI
-with gr.Blocks(css="""
-    #advanced_settings .wrap {
-        font-size: 14px !important;
-    }
-    #advanced_settings .gr-slider {
-        font-size: 13px !important;
-    }
-    #advanced_settings .gr-slider .gr-label {
-        font-size: 13px !important;
-        margin-bottom: 5px !important;
-    }
-    #advanced_settings .gr-slider .gr-info {
-        font-size: 12px !important;
-    }
-    #point_label_radio .gr-radio-group {
-        flex-direction: row !important;
-        gap: 15px !important;
-    }
-    #point_label_radio .gr-radio-group label {
-        margin-right: 0 !important;
-        margin-bottom: 0 !important;
-    }
-    /* Style for example videos label */
-    .gr-examples .gr-label {
-        font-weight: bold !important;
-        font-size: 16px !important;
-    }
-    /* Simple horizontal scroll for examples */
-    .gr-examples .gr-table-wrapper {
-        overflow-x: auto !important;
-        overflow-y: hidden !important;
-    }
-    .gr-examples .gr-table {
-        display: flex !important;
-        flex-wrap: nowrap !important;
-        min-width: max-content !important;
     }
-    .gr-examples .gr-table tbody {
-        display: flex !important;
-        flex-direction: row !important;
-        flex-wrap: nowrap !important;
     }
-    .gr-examples .gr-table tbody tr {
-        display: flex !important;
-        flex-direction: column !important;
-        min-width: 150px !important;
-        margin-right: 10px !important;
     }
-    .gr-examples .gr-table tbody tr td {
-        text-align: center !important;
-        padding: 5px !important;
-    }
-""") as demo:
-    # Initialize states inside Blocks
-    selected_points = gr.State([])
-    original_image_state = gr.State()  # Store original image in state
     with gr.Row():
-        # Show backend status with more detailed information
-        status_color = "#28a745" if BACKEND_AVAILABLE else "#dc3545"
-        status_text = "Connected" if BACKEND_AVAILABLE else "Disconnected"
-        status_icon = "✅" if BACKEND_AVAILABLE else "❌"
-        gr.Markdown(f"""
-        # ✨ SpaTrackV2 Frontend (Client)
-        <div style='background-color: #e6f3ff; padding: 20px; border-radius: 10px; margin: 10px 0;'>
-        <h2 style='color: #0066cc; margin-bottom: 15px;'>Instructions:</h2>
-        <ol style='font-size: 20px; line-height: 1.6;'>
-            <li>🎬 Upload a video or select from examples below</li>
-            <li>🎯 Select positive points (green) and negative points (red) on the first frame</li>
-            <li>⚡ Click 'Run Tracker and Visualize' when done</li>
-            <li>🔍 Iterative 3D result will be shown in the visualization</li>
-        </ol>
-        <div style='background-color: #fff3cd; border: 2px solid #ffc107; border-radius: 8px; padding: 15px; margin-top: 15px;'>
-            <p style='font-size: 18px; color: #856404; margin: 0 0 10px 0; font-weight: bold;'>
-                ⭐ If you like our work, please give us a star on GitHub!
-            </p>
-            <p style='font-size: 16px; color: #856404; margin: 0;'>
-                <a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank" style="color: #0066cc; text-decoration: none; font-weight: bold;">
-                    🌟 https://github.com/henry123-boy/SpaTrackerV2
-                </a>
-            </p>
-        </div>
-        <div style='background-color: {status_color}20; border: 2px solid {status_color}; border-radius: 8px; padding: 10px; margin-top: 15px;'>
-            <p style='font-size: 18px; color: {status_color}; margin: 0;'>
-                {status_icon} Backend Status: {status_text}
-            </p>
-            <p style='font-size: 14px; color: #666; margin: 5px 0 0 0;'>
-                {BACKEND_SPACE_URL}
-            </p>
-            <p style='font-size: 12px; color: #888; margin: 5px 0 0 0;'>
-                {'API methods available' if BACKEND_AVAILABLE else 'Connection failed - using local mode'}
-            </p>
-        </div>
-        </div>
-        """)
-    with gr.Row():
         with gr.Column(scale=1):
-            video_input = gr.Video(label="Upload Video", format="mp4", height=300)
-            # Move Interactive Frame and 2D Tracking under video upload
-            with gr.Row():
-                display_image = gr.Image(type="numpy", label="📸 Interactive Frame", height=250)
-                track_video = gr.Video(label="🎯 2D Tracking Result", height=250)
-            with gr.Row():
-                fg_bg_radio = gr.Radio(choices=['positive_point', 'negative_point'],
-                                       label='Point label',
-                                       value='positive_point',
-                                       elem_id="point_label_radio")
-                reset_button = gr.Button("Reset points")
-                clear_button = gr.Button("Clear All", variant="secondary")
-            with gr.Accordion("⚙️ Advanced Settings", open=True, elem_id="advanced_settings"):
-                grid_size = gr.Slider(minimum=10, maximum=100, value=50, step=1,
-                                      label="Grid Size", info="Size of the tracking grid")
-                vo_points = gr.Slider(minimum=256, maximum=4096, value=756, step=50,
-                                      label="VO Points", info="Number of points for solving camera pose")
-                fps_slider = gr.Slider(minimum=1, maximum=10, value=3, step=1,
-                                      label="FPS", info="FPS of the output video")
-            viz_button = gr.Button("🚀 Run Tracker and Visualize", variant="primary", size="lg")
-        with gr.Column(scale=2):
-            # Add example videos using gr.Examples
-            examples_component = gr.Examples(
-                examples=[
-                    "examples/kiss.mp4",
-                    "examples/backpack.mp4",
-                    "examples/pillow.mp4",
-                    "examples/hockey.mp4",
-                    "examples/drifting.mp4",
-                    "examples/ken_block_0.mp4",
-                    "examples/ball.mp4",
-                    "examples/kitchen.mp4",
-                    "examples/ego_teaser.mp4",
-                    "examples/ego_kc1.mp4",
-                    "examples/vertical_place.mp4",
-                    "examples/robot_unitree.mp4",
-                    "examples/droid_robot.mp4",
-                    "examples/robot_2.mp4",
-                    "examples/cinema_0.mp4",
-                ],
-                inputs=[video_input],
-                label="📁 Example Videos",
-                examples_per_page=20  # Show all examples on one page to enable scrolling
             )
-            # Initialize with a placeholder interface instead of static file
-            viz_iframe = gr.HTML("""
-                                <div style='border: 3px solid #667eea; border-radius: 10px; overflow: hidden; box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3); background: #f8f9fa; display: flex; align-items: center; justify-content: center; height: 950px;'>
-                                    <div style='text-align: center; color: #666;'>
-                                        <h3 style='margin-bottom: 20px; color: #667eea;'>🎮 Interactive 3D Tracking</h3>
-                                        <p style='font-size: 16px; margin-bottom: 10px;'>Upload a video and select points to start tracking</p>
-                                        <p style='font-size: 14px; color: #999;'>Powered by SpaTrackV2</p>
-                                    </div>
-                                </div>
-                                """)
-            # Simple description below the visualization
-            gr.HTML("""
-            <div style='text-align: center; margin-top: 15px; color: #666; font-size: 14px;'>
-                🎮 Interactive 3D visualization adapted from <a href="https://tapip3d.github.io/" target="_blank" style="color: #667eea;">TAPIP3D</a>
-            </div>
-            """)
-    # Bind events
     video_input.change(
-        handle_video_change,
-        inputs=[video_input],
-        outputs=[original_image_state, display_image, selected_points, grid_size, vo_points, fps_slider]
     )
-    reset_button.click(reset_points,
-                     inputs=[original_image_state, selected_points],
-                     outputs=[display_image, selected_points])
-    clear_button.click(clear_all,
-                      outputs=[video_input, display_image, selected_points])
-    display_image.select(select_point,
-                      inputs=[original_image_state, selected_points, fg_bg_radio],
-                      outputs=[display_image, selected_points])
-    # Update tracker model when vo_points changes
-    vo_points.change(update_tracker_model,
-                    inputs=[vo_points],
-                    outputs=[])
-    viz_button.click(launch_viz,
-                    inputs=[grid_size, vo_points, fps_slider, original_image_state],
-                    outputs=[viz_iframe, track_video],
-                    )
-# Launch the demo
 if __name__ == "__main__":
-    demo.launch()

 # Flag to track if backend is available
 BACKEND_AVAILABLE = False
+backend_client = None
 def initialize_backend():
+    """Initialize backend connection using gradio_client"""
+    global backend_client, BACKEND_AVAILABLE
     try:
         print(f"Attempting to connect to backend: {BACKEND_SPACE_URL}")
+        # Use gradio_client for proper API access
+        from gradio_client import Client
+        backend_client = Client(f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}", hf_token=hf_token)
+        print(f"✅ Backend connection successful!")
+        print(f"🔧 Backend client: {backend_client}")
+        BACKEND_AVAILABLE = True
+        return True
     except Exception as e:
         print(f"❌ Backend connection failed: {e}")
         return None, None, [], 50, 756, 3
     try:
+        if BACKEND_AVAILABLE and backend_client:
             # Try to use backend API
             try:
                 print("🔧 Calling backend API for video upload...")
+                # Call the unified API with upload_video function type
+                result = backend_client.predict(
+                    "upload_video",  # function_type
+                    video,           # video file
+                    "",              # original_image_state (not used for upload)
+                    [],              # selected_points (not used for upload)
+                    "positive_point", # point_type (not used for upload)
+                    0,               # point_x (not used for upload)
+                    0,               # point_y (not used for upload)
+                    50,              # grid_size (not used for upload)
+                    756,             # vo_points (not used for upload)
+                    3,               # fps (not used for upload)
+                    api_name="/predict"
+                )
                 print(f"✅ Backend video upload API call successful!")
                 print(f"🔧 Result type: {type(result)}")
                 # Parse the result - expect a dict with success status
                 if isinstance(result, dict) and result.get("success"):
+                    # Extract data from backend response
+                    original_image_state = result.get("original_image_state", "")
+                    display_image = result.get("display_image", None)
+                    selected_points = result.get("selected_points", [])
+                    # Get video settings based on video name
+                    video_name = get_video_name(video)
+                    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
+                    return original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val
                 else:
                     print("Backend processing failed, using local fallback")
                     # Fallback to local processing
             "frame": "local_processing"
         })
+        # Get video settings
+        video_name = get_video_name(video)
+        grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
         return original_image_state, display_image, [], grid_size_val, vo_points_val, fps_val
         return None, []
     try:
+        if BACKEND_AVAILABLE and backend_client:
             # Try to use backend API
             try:
                 print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
+                # Call the unified API with select_point function type
+                result = backend_client.predict(
+                    "select_point",  # function_type
+                    None,            # video file (not used for select_point)
+                    original_img,    # original_image_state
+                    sel_pix,         # selected_points
+                    point_type,      # point_type
+                    evt.index[0],    # point_x
+                    evt.index[1],    # point_y
+                    50,              # grid_size (not used for select_point)
+                    756,             # vo_points (not used for select_point)
+                    3,               # fps (not used for select_point)
+                    api_name="/predict"
+                )
+                print(f"✅ Backend select point API call successful!")
+                print(f"🔧 Result type: {type(result)}")
+                print(f"🔧 Result: {result}")
+                # Parse the result - expect a dict with success status
+                if isinstance(result, dict) and result.get("success"):
+                    display_image = result.get("display_image", None)
+                    new_sel_pix = result.get("selected_points", sel_pix)
+                    return display_image, new_sel_pix
                 else:
+                    print("Backend processing failed, using local fallback")
                     # Fallback to local processing
                     pass
             except Exception as e:
                 x, y = evt.index[0], evt.index[1]
                 color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
+                # Draw a larger, more visible point
                 cv2.circle(display_image, (x, y), 8, color, -1)
+                cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)
+                # Add point to selected points list
+                new_sel_pix = sel_pix.copy()
+                new_sel_pix.append([x, y, point_type])
                 return display_image, new_sel_pix
+        return None, []
     except Exception as e:
         print(f"Error in select_point: {e}")
+        return None, []
 def reset_points(original_img: str, sel_pix):
+    """Reset the selected points and restore the original image.
+
+    Returns ``(None, [])`` when ``original_img`` is None. When the backend
+    is flagged available and a client exists, the unified ``/predict``
+    endpoint is called with the ``"reset_points"`` function type; for a dict
+    result with a truthy ``"success"`` entry, its ``display_image`` and
+    ``selected_points`` values are returned. Otherwise execution falls
+    through to the local path, which re-extracts the first frame.
+
+    NOTE(review): this is a diff view with unchanged lines omitted, so the
+    handlers of the ``try`` statements and the way ``video_path`` is derived
+    from ``original_img`` are not visible here.
+    """
     if original_img is None:
         return None, []
     try:
+        if BACKEND_AVAILABLE and backend_client:
             # Try to use backend API
             try:
+                print("🔧 Calling backend reset points API...")
+                # Call the unified API with reset_points function type
+                result = backend_client.predict(
+                    "reset_points",  # function_type
+                    None,            # video file (not used for reset_points)
+                    original_img,    # original_image_state
+                    sel_pix,         # selected_points
+                    "positive_point", # point_type (not used for reset_points)
+                    0,               # point_x (not used for reset_points)
+                    0,               # point_y (not used for reset_points)
+                    50,              # grid_size (not used for reset_points)
+                    756,             # vo_points (not used for reset_points)
+                    3,               # fps (not used for reset_points)
+                    api_name="/predict"
+                )
+                print(f"✅ Backend reset points API call successful!")
+                print(f"🔧 Result: {result}")
+                # Parse the result
+                if isinstance(result, dict) and result.get("success"):
+                    display_image = result.get("display_image", None)
+                    new_sel_pix = result.get("selected_points", [])
+                    return display_image, new_sel_pix
                 else:
+                    print("Backend processing failed, using local fallback")
                     # Fallback to local processing
                     pass
             except Exception as e:
                 pass
         # Fallback: local processing
+        print("Using local reset points...")
         # Parse original image state
         try:
             video_path = None
         if video_path:
+            # Re-extract original frame
             display_image = extract_first_frame(video_path)
             return display_image, []
+        # NOTE(review): this exit yields None for the points list, whereas the
+        # early exit at the top of the function yields [] — confirm intended.
         return None, None
     try:
+        if BACKEND_AVAILABLE and backend_client:
             # Try to use backend API
             try:
                 print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                 print(f"🔧 Original image state type: {type(original_image_state)}")
                 print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
+                # Call the unified API with run_tracker function type
+                result = backend_client.predict(
+                    "run_tracker",        # function_type
+                    None,                 # video file (not used for run_tracker)
+                    original_image_state, # original_image_state
+                    [],                   # selected_points (not used for run_tracker)
+                    "positive_point",     # point_type (not used for run_tracker)
+                    0,                    # point_x (not used for run_tracker)
+                    0,                    # point_y (not used for run_tracker)
+                    grid_size,            # grid_size
+                    vo_points,            # vo_points
+                    fps,                  # fps
+                    api_name="/predict"
+                )
+                print(f"✅ Backend API call successful!")
+                print(f"🔧 Result type: {type(result)}")
+                print(f"🔧 Result: {result}")
+                # Parse the result
+                if isinstance(result, dict) and result.get("success"):
+                    viz_html = result.get("viz_html", "")
+                    track_video_path = result.get("track_video_path", "")
+                    return viz_html, track_video_path
                 else:
+                    print("Backend processing failed, showing error message")
                     # Fallback to error message
                     pass
             except Exception as e:
             <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 10px;'>
                 <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
                 <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
+                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
                 <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
+                <p style='color: #666; font-size: 12px; margin: 0;'>Client Type: {type(backend_client) if backend_client else 'None'}</p>
             </div>
             <p style='color: #2d3436; font-weight: bold; margin-top: 15px;'>
                 Current Status: Backend unavailable - Running in limited mode
 def clear_all():
+    """Return the reset values used by the "Clear All" button.
+
+    No buffers or files are touched here; the function only returns
+    ``(None, None, [], 50, 756, 3)``. The click handler maps these onto the
+    video input, the interactive frame, the selected points and the
+    grid-size / VO-points / FPS sliders, in that order.
+    """
+    return None, None, [], 50, 756, 3
+def update_tracker_model(model_name):
+    """Placeholder for switching the tracker model.
+
+    ``model_name`` is currently ignored and the function returns None.
+    """
+    return
+def get_video_settings(video_name):
+    """Look up the settings tuple registered for a video name.
+
+    Returns the 3-tuple stored for ``video_name``, or ``(50, 756, 3)`` when
+    the name is not in the table. Every entry currently holds that same
+    tuple, so the table only acts as a placeholder for per-video tuning.
+
+    NOTE(review): the values are presumably (grid_size, vo_points, fps),
+    matching the slider defaults — confirm against the caller.
+    """
+    video_settings = {
+        "blackswan": (50, 756, 3),
+        "bike-packing": (50, 756, 3),
+        "bmx-trees": (50, 756, 3),
+        "breakdance": (50, 756, 3),
+        "camel": (50, 756, 3),
+        "car-roundabout": (50, 756, 3),
+        "car-shadow": (50, 756, 3),
+        "cows": (50, 756, 3),
+        "dance-twirl": (50, 756, 3),
+        "dog": (50, 756, 3),
+        "dogs-jump": (50, 756, 3),
+        "drift-chicane": (50, 756, 3),
+        "drift-straight": (50, 756, 3),
+        "goat": (50, 756, 3),
+        "gold-fish": (50, 756, 3),
+        "horsejump-high": (50, 756, 3),
+        "india": (50, 756, 3),
+        "judo": (50, 756, 3),
+        "kite-surf": (50, 756, 3),
+        "lab-coat": (50, 756, 3),
+        "libby": (50, 756, 3),
+        "loading": (50, 756, 3),
+        "mbike-trick": (50, 756, 3),
+        "motocross-jump": (50, 756, 3),
+        "paragliding-launch": (50, 756, 3),
+        "parkour": (50, 756, 3),
+        "pigs": (50, 756, 3),
+        "scooter-black": (50, 756, 3),
+        "shooting": (50, 756, 3),
+        "soapbox": (50, 756, 3)
+    }
+    # Unknown names fall back to the same defaults as the table entries.
+    return video_settings.get(video_name, (50, 756, 3))
 def test_backend_connection():
+    """Check whether the backend client exposes any API functions.
+
+    Returns False when there is no client, when the client has no non-empty
+    ``fns`` attribute, or when the check raises; returns True otherwise.
+    """
+    # NOTE(review): BACKEND_AVAILABLE is declared global but is never
+    # assigned inside this function.
     global BACKEND_AVAILABLE
+    if not backend_client:
         return False
     try:
         print("Testing backend connection with a simple call...")
         # Check if we have fns available
+        # NOTE(review): `fns` was inspected on the object returned by
+        # gr.load() in the previous version — confirm the current client
+        # type exposes it too, otherwise this check always fails.
+        if hasattr(backend_client, 'fns') and backend_client.fns:
             print("✅ Backend API functions are available")
+            print(f"🔧 Available function indices: {list(backend_client.fns.keys())}")
             return True
         else:
             print("❌ Backend API functions not found")
             return False
     except Exception as e:
         print(f"❌ Backend connection test failed: {e}")
         return False
 def test_backend_api():
+    """List the function indices exposed by the backend client.
+
+    Returns False when the backend is flagged unavailable or no client
+    exists, when the client has no non-empty ``fns`` attribute, or when an
+    exception is raised; returns True otherwise.
+    """
+    if not BACKEND_AVAILABLE or not backend_client:
         print("❌ Backend not available for testing")
         return False
+    # NOTE(review): the `try:` paired with the `except` below is not shown
+    # in this diff view.
         print("🧪 Testing backend API functions...")
         # Test if fns exist and show available indices
+        if hasattr(backend_client, 'fns') and backend_client.fns:
+            print(f"✅ Backend has {len(backend_client.fns)} functions available")
+            for idx in backend_client.fns.keys():
                 print(f"✅ Function {idx} is available")
         else:
+            print("❌ No functions found in backend API")
+            return False
         return True
     except Exception as e:
         print(f"❌ Backend API test failed: {e}")
         return False
+# Initialize the backend connection
+# This runs at import time, before the UI is built, so the status banner can
+# reflect the outcome.
+print("🚀 Initializing frontend application...")
 initialize_backend()
+# Test backend connection if available
+if BACKEND_AVAILABLE:
+    print("🧪 Testing backend connection...")
+    test_result = test_backend_connection()
+    if test_result:
+        print("✅ Backend connection test passed!")
+        # The return value of the API test is not used; it only logs.
+        test_backend_api()
+    else:
+        print("❌ Backend connection test failed!")
+        # Downgrade to standalone mode when the connection test fails.
+        BACKEND_AVAILABLE = False
+# Create the Gradio interface
+# Layout: left column for upload and point selection, right column for
+# results, then advanced sliders, the launch button and example videos.
+print("🎨 Creating Gradio interface...")
+with gr.Blocks(
+    theme=gr.themes.Soft(),
+    title="SpatialTracker V2 - Frontend",
+    css="""
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
     }
+    .gr-button {
+        margin: 5px;
     }
+    .gr-form {
+        background: white;
+        border-radius: 10px;
+        padding: 20px;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
     }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🎯 SpatialTracker V2 - Frontend Interface
+    Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.
+    **Instructions:**
+    1. Upload a video file
+    2. Click on the object you want to track in the first frame
+    3. Adjust tracking parameters if needed
+    4. Click "Launch Visualization" to start tracking
+    """)
+    # Status indicator
+    # Computed once while the UI is built; it is not refreshed afterwards.
+    status_text = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Running in Standalone Mode"
+    gr.Markdown(f"**Status:** {status_text}")
     with gr.Row():
+        with gr.Column(scale=1):
+            # Video upload section
+            with gr.Group():
+                gr.Markdown("### 📹 Video Upload")
+                video_input = gr.Video(
+                    label="Upload Video",
+                    format="mp4"
+                )
+            # Interactive frame display
+            with gr.Group():
+                # The heading previously contained U+FFFD replacement
+                # characters in place of its emoji; a valid emoji is used.
+                gr.Markdown("### 🎯 Point Selection")
+                gr.Markdown("Click on the object you want to track in the frame below:")
+                interactive_frame = gr.Image(
+                    label="Click to select tracking points",
+                    type="numpy",
+                    interactive=True
+                )
+                with gr.Row():
+                    point_type = gr.Radio(
+                        choices=["positive_point", "negative_point"],
+                        value="positive_point",
+                        label="Point Type",
+                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
+                    )
+                with gr.Row():
+                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
+                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
         with gr.Column(scale=1):
+            # Tracking results
+            with gr.Group():
+                gr.Markdown("### 🎬 Tracking Results")
+                tracking_result_video = gr.Video(
+                    label="Tracking Result Video",
+                    interactive=False
+                )
+            # 3D Visualization
+            with gr.Group():
+                gr.Markdown("### 🌐 3D Visualization")
+                viz_html = gr.HTML(
+                    label="3D Trajectory Visualization",
+                    value="<p>Upload a video and select points to see 3D visualization here.</p>"
+                )
+    # Advanced settings section
+    # Slider defaults (50 / 756 / 3) match the values clear_all() returns.
+    with gr.Accordion("⚙️ Advanced Settings", open=False):
+        with gr.Row():
+            grid_size = gr.Slider(
+                minimum=10,
+                maximum=100,
+                step=10,
+                value=50,
+                label="Grid Size",
+                info="Size of the tracking grid"
             )
+            vo_points = gr.Slider(
+                minimum=100,
+                maximum=2000,
+                step=50,
+                value=756,
+                label="VO Points",
+                info="Number of visual odometry points"
+            )
+            fps = gr.Slider(
+                minimum=1,
+                maximum=30,
+                step=1,
+                value=3,
+                label="FPS",
+                info="Frames per second for processing"
+            )
+    # Launch button
+    with gr.Row():
+        launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")
+    # Example videos section
+    with gr.Accordion("📂 Example Videos", open=False):
+        gr.Examples(
+            examples=[
+                ["examples/blackswan.mp4"],
+                ["examples/bike-packing.mp4"],
+                ["examples/bmx-trees.mp4"],
+                ["examples/breakdance.mp4"],
+                ["examples/camel.mp4"],
+            ],
+            inputs=video_input,
+            label="Try these example videos"
+        )
+    # Hidden state variables
+    original_image_state = gr.State(None)
+    selected_points = gr.State([])
+    # Event handlers
     video_input.change(
+        fn=handle_video_upload,
+        inputs=[video_input],
+        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
     )
+    interactive_frame.select(
+        fn=select_point,
+        inputs=[original_image_state, selected_points, point_type],
+        outputs=[interactive_frame, selected_points]
+    )
+    reset_points_btn.click(
+        fn=reset_points,
+        inputs=[original_image_state, selected_points],
+        outputs=[interactive_frame, selected_points]
+    )
+    # clear_all takes no inputs; its six return values map onto these outputs.
+    clear_all_btn.click(
+        fn=clear_all,
+        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
+    )
+    launch_btn.click(
+        fn=launch_viz,
+        inputs=[grid_size, vo_points, fps, original_image_state],
+        outputs=[viz_html, tracking_result_video]
+    )
+# Launch the interface
 if __name__ == "__main__":
+    print("🌟 Launching SpatialTracker V2 Frontend...")
+    print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")
+    # Bind to all interfaces on port 7860.
+    # NOTE(review): share=True requests a public share link and debug=True
+    # keeps the main thread blocked — confirm both are wanted when this runs
+    # as a hosted Space rather than locally.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+        debug=True,
+        show_error=True
+    )

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 gradio==5.31.0
 opencv-python-headless

 gradio==5.31.0
+gradio_client
 opencv-python-headless