from unsloth import FastLanguageModel
from vllm import SamplingParams
from huggingface_hub import snapshot_download

# LoRA rank the adapter was trained with; also passed to vLLM so it can
# accept an adapter of this rank.
lora_rank = 64

# Load the 4-bit base model with Unsloth's vLLM-backed fast inference path.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.5,
)

# Attach a PEFT wrapper matching the adapter's configuration.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Download the trained LoRA adapter from the Hugging Face Hub.
lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# load_lora returns a LoRA request object; keep it so generation
# actually applies the adapter.
lora_request = model.load_lora(lora_path)

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)

USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Build the chat-formatted prompt as a raw string (vLLM takes untokenized text).
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# Low temperature keeps the medical reasoning close to deterministic.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)

outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,
)

print(outputs[0].outputs[0].text)
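
# --- Optional helper (an illustrative sketch, not part of the original snippet). ---
# Assuming the completion follows the <reasoning>/<answer> format requested in
# SYSTEM_PROMPT, this pulls out just the final answer; it falls back to the raw
# completion if the tags are missing.
import re

def extract_answer(completion: str) -> str:
    match = re.search(r"<answer>\s*(.*?)\s*</answer>", completion, re.DOTALL)
    return match.group(1) if match else completion

print("Answer only:", extract_answer(outputs[0].outputs[0].text))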