import gradio as gr
import pandas as pd
import vlai_template

# Import Linear Regression core
try:
    from src import vectorized_linear_regression

    LR_AVAILABLE = True
except ImportError as e:
    # The app keeps working without the core module by using the *_fallback
    # helpers defined below.
    print(f"❌ Linear Regression module failed to load: {str(e)}")
    LR_AVAILABLE = False
    vectorized_linear_regression = None

vlai_template.configure(
    project_name="Vectorized Linear Regression Demo",
    year="2025",
    module="05",
    description="Interactive demonstration of Vectorized Linear Regression using numpy and gradient descent. Learn how linear regression works from scratch with pure matrix operations and visualize the training process.",
    colors={
        "primary": "#2E7D32",
        "accent": "#66BB6A",
        "bg1": "#E8F5E9",
        "bg2": "#C8E6C9",
        "bg3": "#81C784",
    },
    font_family="'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif"
)

# Most recently loaded dataset; written by the load callbacks and read by the
# other UI callbacks through `global current_dataframe`.
current_dataframe = None


def load_sample_data_fallback(dataset_choice="Diabetes"):
    """Load a bundled scikit-learn dataset when the core module is unavailable.

    Args:
        dataset_choice: Either "Diabetes" or "California Housing".

    Returns:
        A DataFrame with one column per feature plus a "target" column.

    Raises:
        ValueError: If ``dataset_choice`` is not a known dataset name.
    """
    # Imported lazily so scikit-learn is only needed on the fallback path.
    from sklearn.datasets import load_diabetes, fetch_california_housing

    def sklearn_to_df(data):
        df = pd.DataFrame(data.data, columns=getattr(data, "feature_names", None))
        if df.columns.isnull().any():
            df.columns = [f"feature_{i}" for i in range(df.shape[1])]
        df["target"] = data.target
        return df

    # Lambdas defer the (potentially slow) load until a dataset is chosen.
    datasets = {
        "Diabetes": lambda: sklearn_to_df(load_diabetes()),
        "California Housing": lambda: sklearn_to_df(fetch_california_housing()),
    }
    if dataset_choice not in datasets:
        raise ValueError(f"Unknown dataset: {dataset_choice}")
    return datasets[dataset_choice]()


def create_input_components_fallback(df, target_col):
    """Describe one input widget per feature when the core module is unavailable.

    Args:
        df: The loaded dataset.
        target_col: Name of the target column; every other column is a feature.

    Returns:
        A list of dicts, one per feature. Numeric features produce
        ``{"name", "type": "number", "value", "minimum", "maximum"}`` with the
        column mean (rounded to 3 decimals) as the default value; all other
        features produce ``{"name", "type": "dropdown", "choices", "value"}``.
    """
    feature_cols = [c for c in df.columns if c != target_col]
    components = []
    for col in feature_cols:
        data = df[col]
        # Any non-numeric dtype (object, pandas string, categorical, ...) is
        # offered as a dropdown; checking only for "object" would turn string
        # or categorical columns into a meaningless 0.0 number input.
        if not pd.api.types.is_numeric_dtype(data):
            uniq = sorted(map(str, data.dropna().unique()))
            if not uniq:
                uniq = ["N/A"]
            components.append(
                {"name": col, "type": "dropdown", "choices": uniq, "value": uniq[0]}
            )
        else:
            val = pd.to_numeric(data, errors="coerce").dropna().mean()
            # An all-missing column has a NaN mean; default it to 0.0.
            val = 0.0 if pd.isna(val) else float(val)
            components.append(
                {
                    "name": col,
                    "type": "number",
                    "value": round(val, 3),
                    "minimum": None,
                    "maximum": None,
                }
            )
    return components


SAMPLE_DATA_CONFIG = {
    "Diabetes": {"target_column": "target", "problem_type": "regression"},
    "California Housing": {"target_column": "target", "problem_type": "regression"},
}

# JavaScript snippet that reloads the page with `__theme=light` in the query
# string when no theme parameter is present.
force_light_theme_js = """
() => {
    const params = new URLSearchParams(window.location.search);
    if (!params.has('__theme')) {
        params.set('__theme', 'light');
        window.location.search = params.toString();
    }
}
"""


def validate_config(df, target_col):
    """Check that ``target_col`` is usable as a regression target.

    Args:
        df: The loaded dataset.
        target_col: Name of the candidate target column.

    Returns:
        A tuple ``(is_valid, message, problem_type)``. ``problem_type`` is
        "regression" when valid and ``None`` otherwise; ``message`` is a
        user-facing string that already carries its own status icon.
    """
    if not target_col or target_col not in df.columns:
        return False, "❌ Please select a valid target column from the dropdown.", None

    target_series = df[target_col]
    unique_vals = target_series.nunique()

    # For linear regression, we only support continuous values
    problem_type = "regression"

    if target_series.isnull().any():
        return False, "⚠️ Target column has missing values. Please clean your data.", None
    # Reject every non-numeric dtype, not just "object": string, categorical
    # and datetime targets cannot be regressed on either.
    if not pd.api.types.is_numeric_dtype(target_series):
        return False, "⚠️ Target must be numeric for regression. Please select a numeric column.", None
    if unique_vals < 5:
        return False, f"⚠️ Too few unique values ({unique_vals}). This may not be suitable for regression.", None

    return True, f"\n✅ Configuration is valid! Ready for regression with {unique_vals} unique values.", problem_type


def get_status_message(is_sample, dataset_choice, target_col, problem_type, is_valid, validation_msg):
    """Build the markdown status line shown under the dataset selector.

    Args:
        is_sample: True when a bundled sample dataset is in use.
        dataset_choice: Name of the selected sample dataset.
        target_col: Selected target column, or ``None``.
        problem_type: Problem type label (e.g. "regression"), or ``None``.
        is_valid: Whether the configuration passed validation.
        validation_msg: Message returned by the validation step.

    Returns:
        A markdown-formatted status string.
    """
    if is_sample:
        # A sample dataset without a configured problem type must not crash
        # on `None.title()`.
        type_label = problem_type.title() if problem_type else "Unknown"
        return f"✅ **Selected Dataset**: {dataset_choice} | **Target**: {target_col} | **Type**: {type_label}"
    elif target_col and problem_type:
        status_icon = "✅" if is_valid else "⚠️"
        return f"{status_icon} **Custom Data** | **Target**: {target_col} | **Type**: {problem_type.title()} | {validation_msg}"
    else:
        return "📁 **Custom data uploaded!** 👆 Please select target column above to continue."
def load_and_configure_data_simple(dataset_choice="Diabetes"): global current_dataframe try: if not LR_AVAILABLE: # Fallback data loading without core module df = load_sample_data_fallback(dataset_choice) else: df = vectorized_linear_regression.load_data(None, dataset_choice) current_dataframe = df target_options = df.columns.tolist() cfg = SAMPLE_DATA_CONFIG.get(dataset_choice, {}) target_col = cfg.get("target_column") problem_type = cfg.get("problem_type") if target_col and target_col in target_options: is_valid, validation_msg, detected = validate_config(df, target_col) if detected: problem_type = detected status_msg = get_status_message(True, dataset_choice, target_col, problem_type, is_valid, validation_msg) else: # If target_col not in options, use first column as fallback target_col = target_options[0] if target_options else None status_msg = get_status_message(True, dataset_choice, target_col, problem_type, False, "") return [df.head(5).round(2), gr.Dropdown(choices=target_options, value=target_col), status_msg] except Exception as e: current_dataframe = None return [pd.DataFrame(), gr.Dropdown(choices=[], value=None), f"❌ **Error loading data**: {str(e)} | Please try a different dataset."] def load_and_configure_data(file_obj=None, dataset_choice="Diabetes"): global current_dataframe try: if not LR_AVAILABLE: # Fallback data loading without core module if file_obj is not None: # Handle file upload fallback if file_obj.name.endswith(".csv"): df = pd.read_csv(file_obj.name) elif file_obj.name.endswith((".xlsx", ".xls")): df = pd.read_excel(file_obj.name) else: raise ValueError("Unsupported format. 
Upload CSV or Excel files.") else: df = load_sample_data_fallback(dataset_choice) else: df = vectorized_linear_regression.load_data(file_obj, dataset_choice) current_dataframe = df target_options = df.columns.tolist() is_sample = file_obj is None if is_sample: cfg = SAMPLE_DATA_CONFIG.get(dataset_choice, {}) target_col = cfg.get("target_column") problem_type = cfg.get("problem_type") else: target_col, problem_type = None, None if target_col: is_valid, validation_msg, detected = validate_config(df, target_col) if detected: problem_type = detected status_msg = get_status_message(is_sample, dataset_choice, target_col, problem_type, is_valid, validation_msg) else: status_msg = get_status_message(is_sample, dataset_choice, target_col, problem_type, False, "") input_updates = [gr.update(visible=False)] * 40 inputs_visible = gr.update(visible=False) input_status = "⚙️ Configure target column above to enable feature inputs." if target_col and problem_type and (not is_sample or is_valid): try: if LR_AVAILABLE: components_info = vectorized_linear_regression.create_input_components(df, target_col) else: components_info = create_input_components_fallback(df, target_col) for i in range(min(20, len(components_info))): comp = components_info[i] number_idx, dropdown_idx = i * 2, i * 2 + 1 if comp["type"] == "number": upd = {"visible": True, "label": comp["name"], "value": comp["value"]} if comp["minimum"] is not None: upd["minimum"] = comp["minimum"] if comp["maximum"] is not None: upd["maximum"] = comp["maximum"] input_updates[number_idx] = gr.update(**upd) input_updates[dropdown_idx] = gr.update(visible=False) else: input_updates[number_idx] = gr.update(visible=False) input_updates[dropdown_idx] = gr.update( visible=True, label=comp["name"], choices=comp["choices"], value=comp["value"] ) inputs_visible = gr.update(visible=True) input_status = f"📝 **Ready!** Enter values for {len(components_info)} features below, then click Run prediction. 
| {validation_msg}" except Exception as e: input_status = f"❌ Error generating inputs: {str(e)}" return [df.head(5).round(2), gr.Dropdown(choices=target_options, value=target_col), status_msg] + input_updates + [inputs_visible, input_status] except Exception as e: current_dataframe = None empty = [pd.DataFrame(), gr.Dropdown(choices=[], value=None), f"❌ **Error loading data**: {str(e)} | Please try a different file or dataset."] return empty + [gr.update(visible=False)] * 40 + [gr.update(visible=False), "No data loaded."] def update_learning_rate_display(lr_power): """Update the display to show what the current learning rate slider value represents""" # Map slider value to actual learning rate lr_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0] lr_labels = ["1e-6", "1e-5", "1e-4", "1e-3", "1e-2", "1e-1", "1"] idx = int(lr_power) if 0 <= idx < len(lr_values): return f"**Current Learning Rate:** {lr_values[idx]} ({lr_labels[idx]})" else: return "**Current Learning Rate:** N/A" def update_batch_size_display(batch_size_power, train_split): """Update the display to show what the current batch size slider value represents""" global current_dataframe df = current_dataframe if df is None or df.empty: return "**Current Batch Size:** N/A" # Calculate training set size train_size = int(len(df) * train_split) # Determine max power of 2 that fits in training size import math max_power = int(math.log2(train_size)) if train_size > 0 else 0 # Convert slider value to batch size if batch_size_power >= max_power + 1: return f"**Current Batch Size:** Full Batch ({train_size} samples)" else: actual_batch_size = 2 ** int(batch_size_power) return f"**Current Batch Size:** {actual_batch_size} samples (2^{int(batch_size_power)})" def update_batch_size_slider(df_preview, target_col, train_split): """Update batch size slider max based on training data size""" global current_dataframe df = current_dataframe if df is None or df.empty: return gr.update(maximum=10, value=10) # 
Calculate training set size train_size = int(len(df) * train_split) # Determine max power of 2 that fits in training size import math max_power = int(math.log2(train_size)) if train_size > 0 else 0 # Slider goes from 0 to max_power+1 (where max_power+1 = Full Batch) new_max = max_power + 1 # Set value to Full Batch by default return gr.update(maximum=new_max, value=new_max) def update_configuration(df_preview, target_col): global current_dataframe df = current_dataframe if df is None or df.empty: return [gr.update(visible=False)] * 40 + [gr.update(visible=False), "No data available.", "No data available."] if not target_col: return [gr.update(visible=False)] * 40 + [gr.update(visible=False), "Select target column.", "Select target column."] try: is_valid, validation_msg, problem_type = validate_config(df, target_col) if not is_valid: return [gr.update(visible=False)] * 40 + [gr.update(visible=False), f"⚠️ {validation_msg}", f"⚠️ {validation_msg}"] if LR_AVAILABLE: components_info = vectorized_linear_regression.create_input_components(df, target_col) else: components_info = create_input_components_fallback(df, target_col) input_updates = [gr.update(visible=False)] * 40 for i in range(min(20, len(components_info))): comp = components_info[i] number_idx, dropdown_idx = i * 2, i * 2 + 1 if comp["type"] == "number": upd = {"visible": True, "label": comp["name"], "value": comp["value"]} if comp["minimum"] is not None: upd["minimum"] = comp["minimum"] if comp["maximum"] is not None: upd["maximum"] = comp["maximum"] input_updates[number_idx] = gr.update(**upd) input_updates[dropdown_idx] = gr.update(visible=False) else: input_updates[number_idx] = gr.update(visible=False) input_updates[dropdown_idx] = gr.update( visible=True, label=comp["name"], choices=comp["choices"], value=comp["value"] ) input_status = f"📝 Enter values for {len(components_info)} features | {validation_msg}" status_msg = f"✅ **Selected Dataset**: Custom Data | **Target**: {target_col} | **Type**: 
{problem_type.title()}" return input_updates + [gr.update(visible=True), input_status, status_msg] except Exception as e: return [gr.update(visible=False)] * 40 + [gr.update(visible=False), f"❌ Error: {str(e)}", f"❌ Error: {str(e)}"] # Linear Regression prediction function def execute_prediction(df_preview, target_col, epochs, learning_rate_power, batch_size_power, train_test_split_ratio, *input_values): global current_dataframe df = current_dataframe EMPTY_PLOT = None EMPTY_TABLE = "" error_style = "