iimran
/

Qwen2.5-3B-R1-MedicalReasoner

Text Generation

text-generation-inference

Model card Files Files and versions

iimran committed on Apr 9

Commit

d611803

·

verified ·

1 Parent(s): 4af501b

Create inference.py

Files changed (1) hide show

inference.py +54 -0

inference.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from unsloth import FastLanguageModel, is_bfloat16_supported
+from vllm import SamplingParams
+from huggingface_hub import snapshot_download
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
+    load_in_4bit=True,
+    fast_inference=True,
+    gpu_memory_utilization=0.5
+)
+lora_rank = 64
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=lora_rank,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=lora_rank,
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+)
+lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
+print("LoRA adapter downloaded to:", lora_path)
+model.load_lora(lora_path)
+SYSTEM_PROMPT = (
+    "Respond in the following format:\n"
+    "<reasoning>\n"
+    "...\n"
+    "</reasoning>\n"
+    "<answer>\n"
+    "...\n"
+    "</answer>"
+)
+USER_PROMPT = (
+    "In the context of disseminated intravascular coagulation (DIC), "
+    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
+)
+text = tokenizer.apply_chat_template(
+    [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": USER_PROMPT},
+    ],
+    tokenize=False,
+    add_generation_prompt=True
+)
+sampling_params = SamplingParams(
+    temperature=0.1,
+    top_p=0.95,
+    max_tokens=4096,
+)
+outputs = model.fast_generate(
+    text,
+    sampling_params=sampling_params,
+    lora_request=None
+)
+print(outputs[0].outputs[0].text)