Please update the model card and upload the sample code.

#1
by jarvishacks - opened

Please update the model card and upload the sample code.


import torch

from snac import SNAC

import onnx
import onnxruntime 
import onnxruntime_genai as og
import time
import json
# NOTE(review): hardcoded, machine-specific DLL path. A CUDA-enabled torch
# build already ships the required CUDA DLLs (see discussion note below), so
# this preload is likely redundant — confirm and consider removing.
onnxruntime.preload_dlls(directory="D:\\Cuda\\12\\bin")

import soundfile as sf

# Load the pretrained 24 kHz SNAC neural codec, switch it to eval mode,
# and move it onto the GPU for decoding.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.eval().cuda()



# def generate_speech(text, speaker="kavya", temperature=0.4, top_p=0.9):
#     """Generate speech from text using specified speaker voice"""
#
#     # Prepare input with speaker token
#     prompt = f"<spk_{speaker}> {text}"
#     prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
#
#     # Construct full sequence: [HUMAN] <spk_speaker> text [/HUMAN] [AI] [SPEECH]
#     input_tokens = [
#         START_OF_HUMAN_TOKEN,
#         *prompt_tokens,
#         END_OF_HUMAN_TOKEN,
#         START_OF_AI_TOKEN,
#         START_OF_SPEECH_TOKEN
#     ]
#
#     input_ids = torch.tensor([input_tokens], device=model.device)
#
#     # Calculate max tokens based on text length
#     max_tokens = min(int(len(text) * 1.3) * 7 + 21, 700)
#
#     # Generate audio tokens
#     with torch.no_grad():
#         output = model.generate(
#             input_ids,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             repetition_penalty=1.05,
#             pad_token_id=tokenizer.pad_token_id,
#             eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN]
#         )
#
#     # Extract SNAC tokens
#     generated_ids = output[0][len(input_tokens):].tolist()
#     snac_tokens = [
#         token_id for token_id in generated_ids
#         if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
#     ]
#
#     if not snac_tokens:
#         raise ValueError("No audio tokens generated")
#
#     # Decode audio
#     audio = decode_snac_tokens(snac_tokens, snac_model)
#     return audio

def decode_snac_tokens(snac_tokens, snac_model):
    """De-interleave a flat stream of LLM audio tokens and decode them with SNAC.

    The model emits audio tokens in frames of 7, interleaved across SNAC's
    three hierarchical codebook levels (1 coarse + 2 medium + 4 fine per
    frame). Each position within a frame uses its own 4096-wide codebook
    offset in the LLM vocabulary.

    Args:
        snac_tokens: Flat list of LLM token ids in the audio-code range.
        snac_model: Loaded SNAC model; its parameters' device is used for
            the code tensors.

    Returns:
        1-D numpy float array of audio samples clamped to [-1, 1], or
        ``None`` when the token list is empty or not a multiple of 7
        (i.e. contains an incomplete frame).

    Raises:
        ValueError: If any de-offset code falls outside [0, 4095].
    """
    if not snac_tokens or len(snac_tokens) % 7 != 0:
        return None

    # Run the decode on whatever device the SNAC model lives on.
    # (Fixed by Shresth to run on colab notebook :))
    snac_device = next(snac_model.parameters()).device

    # Per-frame-position codebook offsets within the LLM vocabulary.
    llm_codebook_offsets = [AUDIO_CODE_BASE_OFFSET + i * 4096 for i in range(7)]

    # De-interleave each 7-token frame into the 3 hierarchical levels.
    codes_lvl = [[] for _ in range(3)]
    for i in range(0, len(snac_tokens), 7):
        # Level 0: coarse (1 token per frame)
        codes_lvl[0].append(snac_tokens[i] - llm_codebook_offsets[0])
        # Level 1: medium (2 tokens per frame)
        codes_lvl[1].append(snac_tokens[i + 1] - llm_codebook_offsets[1])
        codes_lvl[1].append(snac_tokens[i + 4] - llm_codebook_offsets[4])
        # Level 2: fine (4 tokens per frame)
        codes_lvl[2].append(snac_tokens[i + 2] - llm_codebook_offsets[2])
        codes_lvl[2].append(snac_tokens[i + 3] - llm_codebook_offsets[3])
        codes_lvl[2].append(snac_tokens[i + 5] - llm_codebook_offsets[5])
        codes_lvl[2].append(snac_tokens[i + 6] - llm_codebook_offsets[6])

    # Convert each level to a (1, T) int32 tensor and validate the code range.
    hierarchical_codes = []
    for lvl_codes in codes_lvl:
        tensor = torch.tensor(lvl_codes, dtype=torch.int32, device=snac_device).unsqueeze(0)
        if torch.any((tensor < 0) | (tensor > 4095)):
            raise ValueError("Invalid SNAC token values")
        hierarchical_codes.append(tensor)

    # Decode with SNAC.
    with torch.no_grad():
        audio_hat = snac_model.decode(hierarchical_codes)

    return audio_hat.squeeze().clamp(-1, 1).cpu().numpy()

# Control token IDs (fixed for Veena).
# NOTE(review): values taken as-is from this script — confirm against the
# model's tokenizer config before reuse.
START_OF_SPEECH_TOKEN = 128257
END_OF_SPEECH_TOKEN = 128258
START_OF_HUMAN_TOKEN = 128259
END_OF_HUMAN_TOKEN = 128260
START_OF_AI_TOKEN = 128261
END_OF_AI_TOKEN = 128262
# First id of the audio-code region; 7 codebooks x 4096 codes follow it.
AUDIO_CODE_BASE_OFFSET = 128266

# Available speaker voices; index selects the voice used in main().
speakers: list[str] = ["kavya", "agastya", "maitri", "vinaya"]

def main():
    """Generate speech for a fixed Hindi sentence with the Veena ONNX model
    and write the decoded audio to a 24 kHz WAV file.

    Raises:
        ValueError: If the model generates no tokens in the audio-code range.
    """
    text = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"

    # Prompt wraps the text in control tokens; presumably <custom_token_3/4/5/1>
    # map to the HUMAN/AI/SPEECH control ids above — TODO confirm against the
    # tokenizer's added-tokens mapping.
    prompt = f"<custom_token_3><spk_{speakers[2]}> {text}<custom_token_4><custom_token_5><custom_token_1>"
    model_path = "onnx_model"

    config = og.Config(model_path)
    config.append_provider("cuda")
    model = og.Model(config)
    print("Model loaded")

    tokenizer = og.Tokenizer(model)
    print("Tokenizer created")

    input_tokens = tokenizer.encode(prompt)
    print(f'Prompt(s) encoded: {prompt}')
    print(f"tokenized: {input_tokens}")

    params = og.GeneratorParams(model)

    # Token budget heuristic: ~1.3 tokens/char of text, 7 audio codes per text
    # token plus a small constant, capped at 700.
    search_options = {'max_length': min(int(len(text) * 1.3) * 7 + 21, 700)}
    print(f'Search options: {search_options}')

    params.set_search_options(**search_options)
    print("GeneratorParams created")

    generator = og.Generator(model, params)
    print("Generator created")

    generator.append_tokens(input_tokens)
    print("Input tokens added")

    print("Generating tokens ...\n")
    start_time = time.time()
    while not generator.is_done():
        generator.generate_next_token()
    run_time = time.time() - start_time

    # BUG FIX: this line was commented out, so `dec` was undefined and the
    # script crashed with a NameError below.
    dec = generator.get_sequence(0)

    total_tokens = dec.shape[0]
    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
    print()

    print("Hello from venna!")

    # Keep only tokens inside the SNAC audio-code range (7 codebooks x 4096).
    snac_tokens = [
        token_id for token_id in dec
        if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
    ]
    print(len(snac_tokens))
    if not snac_tokens:
        raise ValueError("No audio tokens generated")

    print(snac_tokens)

    # Truncate to a whole number of 7-token frames before decoding.
    # NOTE(review): filename says "english" but the text is Hindi — verify.
    audio = decode_snac_tokens(snac_tokens[:7 * (len(snac_tokens) // 7)], snac_model)
    print(audio)
    sf.write(f"output_english_{speakers[2]}.wav", audio, 24000)

# --- Example Usage ---

# Hindi
# text_hindi = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"
# audio = generate_speech(text_hindi, speaker="kavya")
# sf.write("output_hindi_kavya.wav", audio, 24000)
#
# # English
# text_english = "Today I learned about a new technology that uses artificial intelligence to generate human-like voices."
# audio = generate_speech(text_english, speaker="agastya")
# sf.write("output_english_agastya.wav", audio, 24000)
#
# # Code-mixed
# text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"
# audio = generate_speech(text_mixed, speaker="maitri")
# sf.write("output_mixed_maitri.wav", audio, 24000)

# Script entry point.
if __name__ == "__main__":
    main()
[project]
name = "venna"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "accelerate>=1.12.0",
    "bitsandbytes>=0.48.2",
    "hf-xet==1.2.0",
    "onnx>=1.19.1",
    "onnxruntime-genai-cuda==0.11.2",
    "onnxruntime-gpu>=1.23.2",
    "snac>=1.2.1",
    "soundfile>=0.13.1",
]

[tool.uv.sources]
torch = { index = "pytorch" }
#torchvision = { index = "pytorch" }
#xformers = { url = "https://download.pytorch.org/whl/cu130/xformers-0.0.33.post1-cp39-abi3-win_amd64.whl" }


[[tool.uv.index]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cu129"
explicit = true

[dependency-groups]
dev = [
    "torch==2.9.0+cu129",
    "transformers>=4.57.1",
]

Loading a CUDA-enabled torch build already provides the required CUDA DLLs, so there is no need to preload them again for ONNX Runtime.

Sign up or log in to comment