Please update model card and upload the sample codes.
#1
by jarvishacks - opened
import time

import onnxruntime
import onnxruntime_genai as og
import soundfile as sf
import torch
from snac import SNAC

# Machine-specific: point onnxruntime at a local CUDA install on Windows.
# Redundant if a CUDA-enabled torch build is imported first, since torch
# already loads the required CUDA DLLs (see the note at the end of the thread).
onnxruntime.preload_dlls(directory="D:\\Cuda\\12\\bin")
# Initialize the SNAC decoder (24 kHz model; matches the 24000 sample rate passed to sf.write below)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().cuda()
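# A device-agnostic variant of the line above (an untested sketch: SNAC decode
# also runs on CPU, just more slowly, when no GPU is present):
# device = "cuda" if torch.cuda.is_available() else "cpu"
# snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)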
# Original transformers-based generation path, kept commented for reference:
#
# def generate_speech(text, speaker="kavya", temperature=0.4, top_p=0.9):
#     """Generate speech from text using specified speaker voice"""
#
#     # Prepare input with speaker token
#     prompt = f"<spk_{speaker}> {text}"
#     prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
#
#     # Construct full sequence: [HUMAN] <spk_speaker> text [/HUMAN] [AI] [SPEECH]
#     input_tokens = [
#         START_OF_HUMAN_TOKEN,
#         *prompt_tokens,
#         END_OF_HUMAN_TOKEN,
#         START_OF_AI_TOKEN,
#         START_OF_SPEECH_TOKEN,
#     ]
#
#     input_ids = torch.tensor([input_tokens], device=model.device)
#
#     # Calculate max tokens based on text length
#     max_tokens = min(int(len(text) * 1.3) * 7 + 21, 700)
#
#     # Generate audio tokens
#     with torch.no_grad():
#         output = model.generate(
#             input_ids,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             repetition_penalty=1.05,
#             pad_token_id=tokenizer.pad_token_id,
#             eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN],
#         )
#
#     # Extract SNAC tokens
#     generated_ids = output[0][len(input_tokens):].tolist()
#     snac_tokens = [
#         token_id for token_id in generated_ids
#         if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
#     ]
#
#     if not snac_tokens:
#         raise ValueError("No audio tokens generated")
#
#     # Decode audio
#     audio = decode_snac_tokens(snac_tokens, snac_model)
#     return audio
def decode_snac_tokens(snac_tokens, snac_model):
    """De-interleave and decode SNAC tokens to audio."""
    if not snac_tokens or len(snac_tokens) % 7 != 0:
        return None

    # Get the device of the SNAC model. Fixed by Shresth to run on colab notebook :)
    snac_device = next(snac_model.parameters()).device

    # De-interleave tokens into 3 hierarchical levels
    codes_lvl = [[] for _ in range(3)]
    llm_codebook_offsets = [AUDIO_CODE_BASE_OFFSET + i * 4096 for i in range(7)]
    for i in range(0, len(snac_tokens), 7):
        # Level 0: coarse (1 token per frame)
        codes_lvl[0].append(snac_tokens[i] - llm_codebook_offsets[0])
        # Level 1: medium (2 tokens per frame)
        codes_lvl[1].append(snac_tokens[i + 1] - llm_codebook_offsets[1])
        codes_lvl[1].append(snac_tokens[i + 4] - llm_codebook_offsets[4])
        # Level 2: fine (4 tokens per frame)
        codes_lvl[2].append(snac_tokens[i + 2] - llm_codebook_offsets[2])
        codes_lvl[2].append(snac_tokens[i + 3] - llm_codebook_offsets[3])
        codes_lvl[2].append(snac_tokens[i + 5] - llm_codebook_offsets[5])
        codes_lvl[2].append(snac_tokens[i + 6] - llm_codebook_offsets[6])

    # Convert to tensors for the SNAC decoder
    hierarchical_codes = []
    for lvl_codes in codes_lvl:
        tensor = torch.tensor(lvl_codes, dtype=torch.int32, device=snac_device).unsqueeze(0)
        if torch.any((tensor < 0) | (tensor > 4095)):
            raise ValueError("Invalid SNAC token values")
        hierarchical_codes.append(tensor)

    # Decode with SNAC
    with torch.no_grad():
        audio_hat = snac_model.decode(hierarchical_codes)
    return audio_hat.squeeze().clamp(-1, 1).cpu().numpy()
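# For reference, a hypothetical inverse of the de-interleaving above, shown only to
# document the 7-token frame layout the decoder expects:
#   [lvl0[i], lvl1[2i], lvl2[4i], lvl2[4i+1], lvl1[2i+1], lvl2[4i+2], lvl2[4i+3]]
# with each slot shifted by its per-position codebook offset. Illustration only;
# not used anywhere below.
def interleave_snac_codes(codes_lvl):
    """codes_lvl = [lvl0, lvl1, lvl2] lists with lengths n, 2n, 4n."""
    offsets = [AUDIO_CODE_BASE_OFFSET + i * 4096 for i in range(7)]
    tokens = []
    for i in range(len(codes_lvl[0])):
        frame = [
            codes_lvl[0][i],
            codes_lvl[1][2 * i],
            codes_lvl[2][4 * i],
            codes_lvl[2][4 * i + 1],
            codes_lvl[1][2 * i + 1],
            codes_lvl[2][4 * i + 2],
            codes_lvl[2][4 * i + 3],
        ]
        tokens.extend(code + off for code, off in zip(frame, offsets))
    return tokens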
# Control token IDs (fixed for Veena)
START_OF_SPEECH_TOKEN = 128257
END_OF_SPEECH_TOKEN = 128258
START_OF_HUMAN_TOKEN = 128259
END_OF_HUMAN_TOKEN = 128260
START_OF_AI_TOKEN = 128261
END_OF_AI_TOKEN = 128262
AUDIO_CODE_BASE_OFFSET = 128266
# Available speakers
speakers: list[str] = ["kavya", "agastya", "maitri", "vinaya"]
def main():
    text = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"
    # <custom_token_n> maps to token id 128256 + n, so the prompt spells out:
    # [START_OF_HUMAN] <spk_...> text [END_OF_HUMAN] [START_OF_AI] [START_OF_SPEECH]
    prompt = f"<custom_token_3><spk_{speakers[2]}> {text}<custom_token_4><custom_token_5><custom_token_1>"
    model_path = "onnx_model"

    # Equivalent token sequence:
    # input_tokens: list[int] = [
    #     START_OF_HUMAN_TOKEN,   # <custom_token_3> = 128259
    #     *prompt_tokens,
    #     END_OF_HUMAN_TOKEN,     # <custom_token_4> = 128260
    #     START_OF_AI_TOKEN,      # <custom_token_5> = 128261
    #     START_OF_SPEECH_TOKEN,  # <custom_token_1> = 128257
    # ]
    batch_size = 1
    config = og.Config(model_path)
    config.append_provider("cuda")
    # config.overlay(f'{{"search": {{"batch_size": {batch_size}}}}}')

    model = og.Model(config)
    print("Model loaded")

    tokenizer = og.Tokenizer(model)
    print("Tokenizer created")

    input_tokens = tokenizer.encode(prompt)
    print(f"Prompt encoded: {prompt}")
    print(f"Tokenized: {input_tokens}")

    params = og.GeneratorParams(model)
    search_options = {}
    # search_options["num_beams"] = 3
    # ~1.3 tokens per character of text, 7 SNAC tokens per frame, plus headroom for
    # control tokens, capped at 700. Note: max_length counts prompt + generated tokens.
    search_options["max_length"] = min(int(len(text) * 1.3) * 7 + 21, 700)
    print(f"Search options: {search_options}")
    params.set_search_options(**search_options)

    generator = og.Generator(model, params)
    generator.append_tokens(input_tokens)
    print("Generating tokens ...\n")

    start_time = time.time()
    while not generator.is_done():
        generator.generate_next_token()
    run_time = time.time() - start_time

    dec = generator.get_sequence(0)  # prompt + generated token ids
    total_tokens = dec.shape[0]
    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens / run_time:.2f}")
    print()

    # Keep only SNAC audio tokens and trim to whole 7-token frames
    snac_tokens = [
        token_id
        for token_id in dec
        if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
    ]
    print(f"SNAC tokens: {len(snac_tokens)}")
    if not snac_tokens:
        raise ValueError("No audio tokens generated")

    # Decode audio and write a 24 kHz wav
    audio = decode_snac_tokens(snac_tokens[: 7 * (len(snac_tokens) // 7)], snac_model)
    sf.write(f"output_hindi_{speakers[2]}.wav", audio, 24000)
# --- Example Usage ---
# Hindi
# text_hindi = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"
# audio = generate_speech(text_hindi, speaker="kavya")
# sf.write("output_hindi_kavya.wav", audio, 24000)
#
# # English
# text_english = "Today I learned about a new technology that uses artificial intelligence to generate human-like voices."
# audio = generate_speech(text_english, speaker="agastya")
# sf.write("output_english_agastya.wav", audio, 24000)
#
# # Code-mixed
# text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"
# audio = generate_speech(text_mixed, speaker="maitri")
# sf.write("output_mixed_maitri.wav", audio, 24000)
if __name__ == "__main__":
    main()
# pyproject.toml
[project]
name = "venna"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"accelerate>=1.12.0",
"bitsandbytes>=0.48.2",
"hf-xet==1.2.0",
"onnx>=1.19.1",
"onnxruntime-genai-cuda==0.11.2",
"onnxruntime-gpu>=1.23.2",
"snac>=1.2.1",
"soundfile>=0.13.1",
]
[tool.uv.sources]
torch = { index = "pytorch" }
#torchvision = { index = "pytorch" }
#xformers = { url = "https://download.pytorch.org/whl/cu130/xformers-0.0.33.post1-cp39-abi3-win_amd64.whl" }
[[tool.uv.index]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cu129"
explicit = true
[dependency-groups]
dev = [
"torch==2.9.0+cu129",
"transformers>=4.57.1",
]
A CUDA-enabled torch build already ships the required CUDA DLLs and loads them on import, so there is no need to preload them again for onnxruntime.
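For what it's worth, a minimal sketch of that setup (assuming a CUDA-enabled torch wheel, such as the cu129 build pinned in the pyproject above): import torch before onnxruntime and the preload_dlls(...) call can be dropped.

import torch          # importing the CUDA build loads cudart/cublas/cudnn into the process
import onnxruntime    # resolves the already-loaded CUDA DLLs

print(torch.cuda.is_available())              # confirm this is a CUDA build of torch
print(onnxruntime.get_available_providers())  # CUDAExecutionProvider should be listed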