import io

import torch
import soundfile as sf
from nemo.collections.speechlm2.models import SALM


class EndpointHandler:
    """Hugging Face Inference Endpoint handler wrapping a NeMo SALM model.

    The request payload selects one of two modes via ``data["mode"]``:

    * ``"asr"`` — transcribe audio. ``data["audio"]`` is either a list of
      float samples or raw encoded audio bytes (e.g. a WAV file's contents).
      NOTE(review): the original comment also mentioned base64 input, but no
      decoding was ever implemented — confirm the client contract before
      adding it.
    * ``"llm"`` — generate text from the prompt in ``data["inputs"]``.
    """

    def __init__(self, path: str = ""):
        # Load the combined ASR + LLM model from the given local directory
        # or Hugging Face repo, then move it to GPU when one is available.
        self.model = SALM.from_pretrained(path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def __call__(self, data: dict) -> dict:
        """Dispatch an inference request.

        Returns a dict with either a ``"text"`` result or an ``"error"``
        message describing what was wrong with the request.
        """
        mode = data.get("mode")
        if mode == "asr":
            audio = data.get("audio")
            if audio is None:
                # Fail with a structured payload instead of letting
                # sf.read(None) raise an opaque TypeError.
                return {"error": "Missing 'audio' field for mode 'asr'."}
            if isinstance(audio, list):
                # A bare list of float samples; be explicit about float32 so
                # the tensor dtype matches the model's weights.
                audio_tensor = torch.tensor(audio, dtype=torch.float32)
            else:
                # Raw encoded bytes. soundfile.read expects a path or a
                # file-like object, so wrap the bytes in BytesIO; it also
                # returns float64 samples, hence the explicit float32 cast.
                audio_data, _sample_rate = sf.read(io.BytesIO(audio))
                audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
            # Add a batch dimension and move to the model's device.
            audio_tensor = audio_tensor.unsqueeze(0).to(self.device)
            transcripts = self.model.transcribe(audio_tensor)
            return {"text": transcripts}
        elif mode == "llm":
            prompt = data.get("inputs", "")
            outputs = self.model.generate([prompt])
            return {"text": outputs[0]}
        else:
            return {"error": "Please specify mode as 'asr' or 'llm'."}