diff --git "a/vibevoice-clone-test-1.ipynb" "b/vibevoice-clone-test-1.ipynb" new file mode 100644 --- /dev/null +++ "b/vibevoice-clone-test-1.ipynb" @@ -0,0 +1,1247 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# VibeVoice Voice Cloning Test\n", + "\n", + "**IMPORTANT:** Voice cloning with custom audio ONLY works through the Gradio interface!\n", + "\n", + "The command-line script only uses the built-in voices (Alice, Frank, etc.)." + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# Setup\n", + "import torch\n", + "if torch.cuda.is_available():\n", + " print(f\"GPU: {torch.cuda.get_device_name(0)}\")" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "GPU: NVIDIA L40S\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# Install VibeVoice\n", + "![ -d /root/VibeVoice ] || git clone --quiet https://github.com/cseti007/VibeVoice.git /root/VibeVoice\n", + "%uv pip install --quiet -e /root/VibeVoice\n", + "print(\"Installed\")" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "Installed\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# Download models\n", + "!huggingface-cli download aoi-ot/VibeVoice-Large --local-dir /root/models/VibeVoice-Large --quiet\n", + "!huggingface-cli download ABDALLALSWAITI/vibevoice-arabic-Z --local-dir /root/models/vibevoice-arabic-Z --quiet\n", + "print(\"Models ready\")" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[33m\u26a0\ufe0f Warning: 'huggingface-cli download' is deprecated. Use 'hf download' instead.\u001b[0m\r\n", + "/root/models/VibeVoice-Large\r\n", + "\u001b[33m\u26a0\ufe0f Warning: 'huggingface-cli download' is deprecated. 
Use 'hf download' instead.\u001b[0m\r\n", + "/root/models/vibevoice-arabic-Z\r\n", + "Models ready\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# Launch Gradio with Arabic LoRA\n", + "!python /root/VibeVoice/demo/gradio_demo.py \\\n", + " --model_path /root/models/VibeVoice-Large \\\n", + " --checkpoint_path /root/models/vibevoice-arabic-Z \\\n", + " --share" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "APEX FusedRMSNorm not available, using native implementation\r\n", + "\ud83c\udf99\ufe0f Initializing VibeVoice Demo with Streaming Support...\r\n", + "Loading processor & model from /root/models/VibeVoice-Large\r\n", + "Using device: cuda\r\n", + "\rtokenizer_config.json: 0.00B [00:00, ?B/s]\rtokenizer_config.json: 7.23kB [00:00, 25.5MB/s]\r\n", + "\rvocab.json: 0.00B [00:00, ?B/s]\rvocab.json: 2.78MB [00:00, 134MB/s]\r\n", + "\rmerges.txt: 0.00B [00:00, ?B/s]\rmerges.txt: 1.67MB [00:00, 148MB/s]\r\n", + "\rtokenizer.json: 0.00B [00:00, ?B/s]\rtokenizer.json: 7.03MB [00:00, 175MB/s]\r\n", + "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", + "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", + "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", + "loading file added_tokens.json from cache at None\r\n", + "loading file special_tokens_map.json from cache at None\r\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", + "loading file chat_template.jinja from cache at None\r\n", + "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \r\n", + "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
\r\n", + "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", + "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " 
\"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "Traceback (most recent call last):\r\n", + " File \"/root/VibeVoice/demo/gradio_demo.py\", line 86, in load_model\r\n", + " self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", + " return func(*args, **kwargs)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", + " config = cls._autoset_attn_implementation(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", + " cls._check_and_enable_flash_attn_2(\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", + " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", + "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. 
Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "\r\n", + "Falling back to attention implementation: sdpa\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " 
\"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "Generate config GenerationConfig {}\r\n", + "\r\n", + "Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", + "\rLoading checkpoint shards: 0%| | 0/10 [00:00 https://89c767c53e806c2545.gradio.live\r\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Alternative: Test Built-in Voices\n", + "\n", + "If you want to test the Arabic LoRA with built-in voices (not your custom voice):" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# Create test text\n", + "import os\n", + "text = \"\"\"Speaker 1: \u0645\u0631\u062d\u0628\u0627\u064b \u0628\u0643\u0645\u060c \u0627\u0633\u0645\u064a \u0633\u0627\u0645\u064a.\n", + "\u0623\u0646\u0627 \u0627\u0644\u0622\u0646 \u0623\u062e\u062a\u0628\u0631 \u062a\u0642\u0646\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645.\n", + "\n", + "\u0643\u064a\u0641 \u064a\u0628\u062f\u0648 \u0635\u0648\u062a\u064a\u061f\n", + "\u0647\u0644 \u062a\u0633\u0645\u0639 \u0627\u0644\u0646\u0628\u0631\u0629 \u0627\u0644\u0637\u0628\u064a\u0639\u064a\u0629 \u0641\u064a \u062d\u062f\u064a\u062b\u064a\u061f\n", + "\n", + "\u0627\u0644\u0623\u0631\u062f\u0646 \u0628\u0644\u062f \u0627\u0644\u062c\u0628\u0627\u0644 \u0648\u0627\u0644\u0628\u062d\u0631 \u0648\u0627\u0644\u0635\u062d\u0631\u0627\u0621\u060c\n", + "\u0648\u0641\u064a \u0643\u0644 \u0645\u062f\u064a\u0646\u0629\u064d \u0642\u0635\u0629\u060c \u0648\u0641\u064a \u0643\u0644 \u0634\u0627\u0631\u0639\u064d \u062d\u0643\u0627\u064a\u0629.\n", + "\n", + "\u0627\u0644\u062d\u064a\u0627\u0629 \u0631\u062d\u0644\u0629 \u0646\u062a\u0639\u0644\u0651\u0645 \u0645\u0646\u0647\u0627 \u0643\u0644 \u064a\u0648\u0645\u060c\n", + "\u0641\u0644\u0646\u0628\u062a\u0633\u0645 \u0627\u0644\u0622\u0646\u2026 \u0648\u0644\u0646\u0628\u062f\u0623 \u0645\u0646 \u062c\u062f\u064a\u062f.\\nSpeaker 2: \u0623\u0646\u0627 \u0628\u062e\u064a\u0631 \u0634\u0643\u0631\u0627\"\"\"\n", + "with open('/root/test.txt', 'w', encoding='utf-8') as f:\n", + " f.write(text)" + ], + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "scrolled": true + }, + "source": [ + "# WITH LoRA (built-in Alice voice)\n", + "os.makedirs('/root/outputs/builtin_with_lora', exist_ok=True)\n", + "!python /root/VibeVoice/demo/inference_from_file.py \\\n", + " --model_path /root/models/VibeVoice-Large 
\\\n", + " --txt_path /root/test.txt \\\n", + " --speaker_names Alice Frank \\\n", + " --checkpoint_path /root/models/vibevoice-arabic-Z \\\n", + " --output_dir /root/outputs/builtin_with_lora" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "APEX FusedRMSNorm not available, using native implementation\r\n", + "Using device: cuda\r\n", + "Found 9 voice files in /root/VibeVoice/demo/voices\r\n", + "Available voices: en-Alice_woman, en-Carter_man, en-Frank_man, en-Mary_woman_bgm, en-Maya_woman, in-Samuel_man, zh-Anchen_man_bgm, zh-Bowen_man, zh-Xinran_woman\r\n", + "Reading script from: /root/test.txt\r\n", + "Found 2 speaker segments:\r\n", + " 1. Speaker 1\r\n", + " Text preview: Speaker 1: \u0645\u0631\u062d\u0628\u0627\u064b \u0628\u0643\u0645\u060c \u0627\u0633\u0645\u064a \u0633\u0627\u0645\u064a. \u0623\u0646\u0627 \u0627\u0644\u0622\u0646 \u0623\u062e\u062a\u0628\u0631 \u062a\u0642\u0646\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0646\u0635 \u0625\u0644\u0649 \u0643\u0644\u0627\u0645. \u0643\u064a\u0641 \u064a\u0628\u062f\u0648 \u0635\u0648\u062a\u064a\u061f \u0647\u0644...\r\n", + " 2. Speaker 2\r\n", + " Text preview: Speaker 2: \u0623\u0646\u0627 \u0628\u062e\u064a\u0631 \u0634\u0643\u0631\u0627...\r\n", + "\r\n", + "Speaker mapping:\r\n", + " Speaker 2 -> Frank\r\n", + " Speaker 1 -> Alice\r\n", + "Speaker 1 ('Alice') -> Voice: en-Alice_woman.wav\r\n", + "Speaker 2 ('Frank') -> Voice: en-Frank_man.wav\r\n", + "Loading processor & model from /root/models/VibeVoice-Large\r\n", + "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", + "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", + "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", + "loading file added_tokens.json from cache at None\r\n", + "loading file special_tokens_map.json from cache at None\r\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", + "loading file chat_template.jinja from cache at None\r\n", + "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \r\n", + "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
\r\n", + "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", + "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " 
\"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "Traceback (most recent call last):\r\n", + " File \"/root/VibeVoice/demo/inference_from_file.py\", line 305, in main\r\n", + " model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", + " return func(*args, **kwargs)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", + " config = cls._autoset_attn_implementation(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", + " cls._check_and_enable_flash_attn_2(\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", + " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", + "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "\r\n", + "Error loading the model. Trying to use SDPA. 
However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": 
\"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "Generate config GenerationConfig {}\r\n", + "\r\n", + "Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", + "\rLoading checkpoint shards: 0%| | 0/10 [00:00 Frank\r\n", + " Speaker 1 -> Alice\r\n", + "Speaker 1 ('Alice') -> Voice: en-Alice_woman.wav\r\n", + "Speaker 2 ('Frank') -> Voice: en-Frank_man.wav\r\n", + "Loading processor & model from /root/models/VibeVoice-Large\r\n", + "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/vocab.json\r\n", + "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/merges.txt\r\n", + "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer.json\r\n", + "loading file added_tokens.json from cache at None\r\n", + "loading file special_tokens_map.json from cache at None\r\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796/tokenizer_config.json\r\n", + "loading file chat_template.jinja from cache at None\r\n", + "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \r\n", + "The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
\r\n", + "The class this function is called from is 'VibeVoiceTextTokenizerFast'.\r\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\r\n", + "Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " 
\"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "[ERROR] : ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "Traceback (most recent call last):\r\n", + " File \"/root/VibeVoice/demo/inference_from_file.py\", line 305, in main\r\n", + " model = VibeVoiceForConditionalGenerationInference.from_pretrained(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 279, in _wrapper\r\n", + " return func(*args, **kwargs)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 4336, in from_pretrained\r\n", + " config = cls._autoset_attn_implementation(\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2109, in _autoset_attn_implementation\r\n", + " cls._check_and_enable_flash_attn_2(\r\n", + " File \"/usr/local/lib/python3.12/site-packages/transformers/modeling_utils.py\", line 2252, in _check_and_enable_flash_attn_2\r\n", + " raise ImportError(f\"{preface} the package flash_attn seems to be not installed. {install_message}\")\r\n", + "ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\r\n", + "\r\n", + "Error loading the model. Trying to use SDPA. 
However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.\r\n", + "loading configuration file /root/models/VibeVoice-Large/config.json\r\n", + "Model config VibeVoiceConfig {\r\n", + " \"acostic_vae_dim\": 64,\r\n", + " \"acoustic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"decoder_depths\": null,\r\n", + " \"decoder_n_filters\": 32,\r\n", + " \"decoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0.5,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": \"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_acoustic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"gaussian\",\r\n", + " \"vae_dim\": 64,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"acoustic_vae_dim\": 64,\r\n", + " \"architectures\": [\r\n", + " \"VibeVoiceForConditionalGeneration\"\r\n", + " ],\r\n", + " \"decoder_config\": {\r\n", + " \"attention_dropout\": 0.0,\r\n", + " \"hidden_act\": \"silu\",\r\n", + " \"hidden_size\": 3584,\r\n", + " \"initializer_range\": 0.02,\r\n", + " \"intermediate_size\": 18944,\r\n", + " \"max_position_embeddings\": 32768,\r\n", + " \"max_window_layers\": 28,\r\n", + " \"model_type\": \"qwen2\",\r\n", + " \"num_attention_heads\": 28,\r\n", + " \"num_hidden_layers\": 28,\r\n", + " \"num_key_value_heads\": 4,\r\n", + " \"rms_norm_eps\": 1e-06,\r\n", + " \"rope_scaling\": null,\r\n", + " \"rope_theta\": 1000000.0,\r\n", + " \"sliding_window\": null,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"use_cache\": true,\r\n", + " \"use_mrope\": false,\r\n", + " \"use_sliding_window\": false,\r\n", + " \"vocab_size\": 152064\r\n", + " },\r\n", + " \"diffusion_head_config\": {\r\n", + " \"ddpm_batch_mul\": 4,\r\n", + " \"ddpm_beta_schedule\": \"cosine\",\r\n", + " \"ddpm_num_inference_steps\": 20,\r\n", + " \"ddpm_num_steps\": 1000,\r\n", + " \"diffusion_type\": \"ddpm\",\r\n", + " \"head_ffn_ratio\": 3.0,\r\n", + " \"head_layers\": 4,\r\n", + " \"hidden_size\": 3584,\r\n", + " \"latent_size\": 64,\r\n", + " \"model_type\": \"vibevoice_diffusion_head\",\r\n", + " \"prediction_type\": \"v_prediction\",\r\n", + " \"rms_norm_eps\": 1e-05,\r\n", + " \"speech_vae_dim\": 64\r\n", + " },\r\n", + " \"model_type\": \"vibevoice\",\r\n", + " \"semantic_tokenizer_config\": {\r\n", + " \"causal\": true,\r\n", + " \"channels\": 1,\r\n", + " \"conv_bias\": true,\r\n", + " \"conv_norm\": \"none\",\r\n", + " \"corpus_normalize\": 0.0,\r\n", + " \"disable_last_norm\": true,\r\n", + " \"encoder_depths\": \"3-3-3-3-3-3-8\",\r\n", + " \"encoder_n_filters\": 32,\r\n", + " \"encoder_ratios\": [\r\n", + " 8,\r\n", + " 5,\r\n", + " 5,\r\n", + " 4,\r\n", + " 2,\r\n", + " 2\r\n", + " ],\r\n", + " \"fix_std\": 0,\r\n", + " \"layer_scale_init_value\": 1e-06,\r\n", + " \"layernorm\": \"RMSNorm\",\r\n", + " \"layernorm_elementwise_affine\": true,\r\n", + " \"layernorm_eps\": 1e-05,\r\n", + " \"mixer_layer\": 
\"depthwise_conv\",\r\n", + " \"model_type\": \"vibevoice_semantic_tokenizer\",\r\n", + " \"pad_mode\": \"constant\",\r\n", + " \"std_dist_type\": \"none\",\r\n", + " \"vae_dim\": 128,\r\n", + " \"weight_init_value\": 0.01\r\n", + " },\r\n", + " \"semantic_vae_dim\": 128,\r\n", + " \"tie_word_embeddings\": false,\r\n", + " \"torch_dtype\": \"bfloat16\",\r\n", + " \"transformers_version\": \"4.51.3\"\r\n", + "}\r\n", + "\r\n", + "loading weights file /root/models/VibeVoice-Large/model.safetensors.index.json\r\n", + "Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.\r\n", + "Generate config GenerationConfig {}\r\n", + "\r\n", + "Instantiating Qwen2Model model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.\r\n", + "Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.\r\n", + "\rLoading checkpoint shards: 0%| | 0/10 [00:00WITH LoRA (Alice)\"))\n", + "display(Audio(\"/root/outputs/builtin_with_lora/test_generated.wav\"))\n", + "display(HTML(\"
WITHOUT LoRA (Alice)
\"))\n", + "display(Audio(\"/root/outputs/builtin_without_lora/test_generated.wav\"))" + ], + "execution_count": 16, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": "
WITH LoRA (Alice)
", + "text/plain": "" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/html": "\n \n ", + "text/plain": "" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/html": "
WITHOUT LoRA (Alice)
", + "text/plain": "" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/html": "\n \n ", + "text/plain": "" + }, + "metadata": {} + } + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file