yasser-alharbi committed on
Commit b083db4 · verified · 1 Parent(s): fb13328

Update app.py

Files changed (1)
app.py +23 -13
app.py CHANGED
@@ -6,22 +6,33 @@ from peft import PeftModel
 from bitsandbytes import BitsAndBytesConfig
 from tqdm import tqdm

-# Force BitsAndBytes to CPU Mode
-os.environ["BITSANDBYTES_NOWELCOME"] = "1"
+# Check if GPU is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # Define Hugging Face model paths
 base_model_path = "inceptionai/Jais-family-256m"  # Base model
 lora_model_path = "E-dd77/SHEFAA"  # Fine-tuned model with LoRA adapters

-# Quantization configuration for CPU
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=False,  # Disable GPU-specific 4-bit quantization
-)
-
-# Load the base model without 4-bit quantization
+# **Fix for bitsandbytes Import Error**
+# If no GPU, disable 4-bit quantization
+use_4bit = torch.cuda.is_available()  # Only enable 4-bit if GPU is present
+
+if use_4bit:
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype="float16",
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True
+    )
+else:
+    print("⚠ No GPU detected: Disabling 4-bit quantization")
+    bnb_config = BitsAndBytesConfig(load_in_4bit=False)  # **Disable 4-bit if on CPU**
+
+# Load the base model with the correct quantization settings
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_path,
-    device_map="cpu",  # Force CPU
+    quantization_config=bnb_config if use_4bit else None,  # Apply quantization only on GPU
+    device_map="auto" if use_4bit else "cpu",
     trust_remote_code=True
 )

@@ -31,8 +42,7 @@ tokenizer = AutoTokenizer.from_pretrained(lora_model_path, trust_remote_code=Tru
 # Load the LoRA adapters onto the base model
 model = PeftModel.from_pretrained(base_model, lora_model_path)

-# Move to CPU and set to evaluation mode
-device = torch.device("cpu")
+# Move to the correct device
 model.to(device)
 model.eval()

@@ -41,7 +51,7 @@ if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token

 # Define Text Generation Pipeline
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # -1 forces CPU
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1 if not use_4bit else 0)

 # Function for Generating Answers
 def generate_answer(question):
@@ -78,4 +88,4 @@ interface = gr.Interface(

 # Launch the Interface
 if __name__ == "__main__":
-    interface.launch(debug=True)
+    interface.launch()
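
Net effect of the commit: the hard-coded CPU path is replaced by a torch.cuda.is_available() check that drives both the quantization config and the device placement. The snippet below is a minimal consolidated sketch of that loading path, not part of the commit itself; it assumes BitsAndBytesConfig comes from transformers (app.py imports it from bitsandbytes), and it skips the explicit model.to(device)/pipeline device index on the GPU path, where device_map="auto" already handles placement.

# Consolidated sketch of the post-commit loading logic (assumptions noted above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

base_model_path = "inceptionai/Jais-family-256m"  # Base model
lora_model_path = "E-dd77/SHEFAA"                 # Fine-tuned LoRA adapters

# One GPU check drives both quantization and placement
use_4bit = torch.cuda.is_available()

bnb_config = None
if use_4bit:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,            # None on CPU, 4-bit NF4 on GPU
    device_map="auto" if use_4bit else "cpu",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(lora_model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(base_model, lora_model_path)
model.eval()

# Sanity check: confirm where the weights actually landed
print("use_4bit:", use_4bit, "| weights on:", next(model.parameters()).device)

# On GPU, accelerate has already placed the model, so no device index is passed
generator_kwargs = {} if use_4bit else {"device": -1}  # -1 = CPU
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, **generator_kwargs)

On a CPU-only Space this resolves to the fallback branch of the diff (no quantization, CPU placement); on a GPU Space the 4-bit NF4 branch is taken instead.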