yasser-alharbi committed on
Commit b083db4 · verified · 1 Parent(s): fb13328

Update app.py

Files changed (1)
app.py +23 -13
app.py CHANGED
@@ -6,22 +6,33 @@ from peft import PeftModel
 from bitsandbytes import BitsAndBytesConfig
 from tqdm import tqdm

-# Force BitsAndBytes to CPU Mode
-os.environ["BITSANDBYTES_NOWELCOME"] = "1"
+# Check if GPU is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # Define Hugging Face model paths
 base_model_path = "inceptionai/Jais-family-256m"  # Base model
 lora_model_path = "E-dd77/SHEFAA"  # Fine-tuned model with LoRA adapters

-# Quantization configuration for CPU
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=False,  # Disable GPU-specific 4-bit quantization
-)
-
-# Load the base model without 4-bit quantization
+# **Fix for bitsandbytes Import Error**
+# If no GPU, disable 4-bit quantization
+use_4bit = torch.cuda.is_available()  # Only enable 4-bit if GPU is present
+
+if use_4bit:
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype="float16",
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True
+    )
+else:
+    print("⚠ No GPU detected: Disabling 4-bit quantization")
+    bnb_config = BitsAndBytesConfig(load_in_4bit=False)  # **Disable 4-bit if on CPU**
+
+# Load the base model with the correct quantization settings
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_path,
-    device_map="cpu",  # Force CPU
+    quantization_config=bnb_config if use_4bit else None,  # Apply quantization only on GPU
+    device_map="auto" if use_4bit else "cpu",
     trust_remote_code=True
 )

@@ -31,8 +42,7 @@ tokenizer = AutoTokenizer.from_pretrained(lora_model_path, trust_remote_code=Tru
 # Load the LoRA adapters onto the base model
 model = PeftModel.from_pretrained(base_model, lora_model_path)

-# Move to CPU and set to evaluation mode
-device = torch.device("cpu")
+# Move to the correct device
 model.to(device)
 model.eval()

@@ -41,7 +51,7 @@ if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token

 # Define Text Generation Pipeline
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # -1 forces CPU
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1 if not use_4bit else 0)

 # Function for Generating Answers
 def generate_answer(question):
@@ -78,4 +88,4 @@ interface = gr.Interface(

 # Launch the Interface
 if __name__ == "__main__":
-    interface.launch(debug=True)
+    interface.launch()
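
Net effect of the commit: the hard-coded CPU path is replaced by a torch.cuda.is_available() check that drives both the quantization config and the device placement. The snippet below is a minimal consolidated sketch of that loading path, not part of the commit itself; it assumes BitsAndBytesConfig comes from transformers (app.py imports it from bitsandbytes), and it skips the explicit model.to(device)/pipeline device index on the GPU path, where device_map="auto" already handles placement.

# Consolidated sketch of the post-commit loading logic (assumptions noted above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

base_model_path = "inceptionai/Jais-family-256m"  # Base model
lora_model_path = "E-dd77/SHEFAA"                 # Fine-tuned LoRA adapters

# One GPU check drives both quantization and placement
use_4bit = torch.cuda.is_available()

bnb_config = None
if use_4bit:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,            # None on CPU, 4-bit NF4 on GPU
    device_map="auto" if use_4bit else "cpu",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(lora_model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(base_model, lora_model_path)
model.eval()

# Sanity check: confirm where the weights actually landed
print("use_4bit:", use_4bit, "| weights on:", next(model.parameters()).device)

# On GPU, accelerate has already placed the model, so no device index is passed
generator_kwargs = {} if use_4bit else {"device": -1}  # -1 = CPU
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, **generator_kwargs)

On a CPU-only Space this resolves to the fallback branch of the diff (no quantization, CPU placement); on a GPU Space the 4-bit NF4 branch is taken instead.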