LLaMA2

Runtime error

File size: 1,208 Bytes

d0f20d3
 
 
 
e06bfd0
9f1f2e7
 
 
 
 
d0f20d3
90b0f0b
d0f20d3
35cd09c
d0f20d3
e06bfd0
 
 
 
 
d0f20d3
 
e06bfd0
d0f20d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a00f794
d0f20d3

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces
import gradio as gr
from transformers import BitsAndBytesConfig
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. The token is read once and reused for
# every download below; a missing HUGGINGFACE_TOKEN variable raises KeyError.
hf_token = os.environ["HUGGINGFACE_TOKEN"]
login(token=hf_token)

# Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-hf"  # Change to 13B or 70B if needed

# `token=` is the supported keyword; `use_auth_token=` is deprecated.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit weights; for 8-bit use `load_in_8bit=True` instead
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the matmuls
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # Let the loader pick device placement automatically
    token=hf_token,
)

# Inference Function
@spaces.GPU
def generate_text(prompt: str) -> str:
    """Generate a continuation of *prompt* with the module-level model.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The decoded first sequence returned by ``model.generate`` with
        special tokens stripped. At most 100 new tokens are generated.
    """
    # Follow the model's actual placement (chosen by ``device_map="auto"``)
    # instead of hard-coding "cuda", so inputs and weights never end up on
    # different devices.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Gradio callback

@spaces.GPU
def chat_with_llama(prompt):
    """Gradio handler: forward *prompt* to ``generate_text`` and return its reply."""
    reply = generate_text(prompt)
    return reply

# Build a text-in / text-out UI around chat_with_llama and start serving it.
demo = gr.Interface(
    fn=chat_with_llama,
    inputs="text",
    outputs="text",
    title="LLaMA 2 Chatbot",
)
demo.launch()