swordfish7412 committed
Commit d3a0f1a · verified · 1 parent: ec97cab

Upload app.py with huggingface_hub

Files changed (1): app.py (+12 -3)
app.py CHANGED
@@ -29,11 +29,19 @@ base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     torch_dtype=torch.float16,
     device_map="auto",
-    trust_remote_code=True
+    trust_remote_code=True,
+    attn_implementation="eager"  # Fix: use eager attention (faster on T4)
 )
 
 # Load LoRA adapter
 model = PeftModel.from_pretrained(base_model, LORA_MODEL)
+model.eval()
+
+# Disable cache for faster inference
+model.config.use_cache = False
+if hasattr(model.generation_config, 'use_cache'):
+    model.generation_config.use_cache = False
+
 print("✅ Amigo 1.0 loaded successfully!")
 
 @spaces.GPU
@@ -52,11 +60,12 @@ def chat(message, history):
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=512,
+            max_new_tokens=384,  # Reduced from 512 for faster responses
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
+            pad_token_id=tokenizer.eos_token_id,
+            use_cache=False  # Explicitly disable cache
         )
 
     # Decode
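For context, below is a minimal, self-contained sketch of the load-and-generate path as it stands after this commit. Only the arguments visible in the diff are taken from the commit; the imports, the tokenizer setup, the prompt, and the BASE_MODEL / LORA_MODEL values are placeholder assumptions filled in for illustration.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "base-model-id"    # placeholder; the real value is not in the diff
LORA_MODEL = "lora-adapter-id"  # placeholder; the real value is not in the diff

# Load base model with the settings introduced by this commit
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",  # per the commit: eager attention on T4
)

# Attach the LoRA adapter and put the model in inference mode
model = PeftModel.from_pretrained(base_model, LORA_MODEL)
model.eval()
model.config.use_cache = False

# Generate with the post-commit sampling settings
prompt = "Hello"  # placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=384,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=False,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

One caveat worth benchmarking: use_cache=False forces the model to recompute attention over the full sequence at every decoding step, which lowers memory use but typically slows autoregressive generation, so the "faster inference" comment in the commit should be verified against the actual hardware.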