iimran committed on
Commit
d611803
·
verified ·
1 Parent(s): 4af501b

Create inference.py

Browse files
Files changed (1) hide show
  1. inference.py +54 -0
inference.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Inference script: load the MedicalReasoner model through Unsloth's
# vLLM-backed fast path, attach its LoRA adapter, and answer one sample
# medical question using the <reasoning>/<answer> response format.
from unsloth import FastLanguageModel, is_bfloat16_supported
from vllm import SamplingParams
from huggingface_hub import snapshot_download

# fast_inference=True is what makes model.fast_generate available below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.5,
)

# Wrap the base model with a LoRA configuration (rank 64 on the attention and
# MLP projection layers) before the adapter is loaded.
lora_rank = 64
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Fetch the adapter repository from the Hugging Face Hub; snapshot_download
# returns the local directory it was stored in.
lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# Keep the handle returned by load_lora: it has to be handed to fast_generate
# as lora_request, otherwise generation runs without the adapter.
lora_request = model.load_lora(lora_path)

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)
USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Render the conversation to a plain prompt string (tokenize=False) ending
# with the assistant generation prompt, since fast_generate takes text.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)

outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,
)

# One prompt was submitted, so print the first completion of the first result.
print(outputs[0].outputs[0].text)