{ "architectures": [ "NemotronFlashForCausalLM" ], "attention_dropout": 0.0, "attn_hidden_size": -1, "attn_implementation": "fused_mha", "attn_implementation_new": "fused_mha", "auto_map": { "AutoConfig": "configuration_nemotron_flash.NemotronFlashConfig", "AutoModelForCausalLM": "modeling_nemotron_flash.NemotronFlashForCausalLM" }, "bos_token_id": 1, "calc_logits_for_entire_prompt": false, "d_conv": 4, "dtype": "bfloat16", "eos_token_id": 2, "ffn_expand_ratio": 3, "global_attn_idx": [], "hidden_act": "silu", "hidden_size": 3072, "hybrid_decoder_layer": "mamba", "initializer_range": 0.02, "intermediate_size": 0, "kq_head_dim": -1, "kq_norm": "none", "layer_type": [ "m", "a", "m", "a", "a", "a", "m", "a", "m", "a", "m", "a", "a", "a", "m", "a", "m", "a", "m", "a", "a", "a", "m", "a", "m", "a", "m", "a", "m", "a", "m", "a", "m", "a", "m", "a" ], "layer_types": [ "deltanet", "f", "m2", "f", "a", "f", "m2", "f", "deltanet", "f", "m2", "f", "a", "f", "m2", "f", "deltanet", "f", "m2", "f", "a", "f", "m2", "f", "deltanet", "f", "m2", "f", "deltanet", "f", "m2", "f", "deltanet", "f", "m2", "f" ], "mamba2_headdim": 64, "mamba_conv_bias": true, "mamba_d_conv": 4, "mamba_d_state": 128, "mamba_dt_rank": 192, "mamba_expand": 2, "mamba_inner_layernorms": true, "mamba_proj_bias": false, "max_position_embeddings": 29000, "mlp_hidden_act": "silu", "model_type": "nemotron_flash", "new_seq_length": 2048, "num_attention_heads": 24, "num_experts": 1, "num_experts_per_tok": 1, "num_hidden_layers": 36, "num_key_value_heads": 6, "num_memory_tokens": 256, "orig_max_position_embeddings": 4096, "output_router_logits": false, "pad_token_id": 0, "rms_norm_eps": 1e-06, "rope": true, "rope_theta": 10000.0, "rope_type": "ntk", "router_aux_loss_coef": 0.001, "sliding_window": null, "tie_word_embeddings": true, "transformers_version": "4.56.2", "use_cache": false, "use_mamba_kernels": true, "v_head_dim": -1, "vocab_size": 131075 }