{ "d_model": 256, "num_heads": 8, "num_encoder_layers": 6, "num_decoder_layers": 6, "d_ff": 1024, "num_experts": 8, "top_k": 2, "dropout": 0.1, "max_input_length": 512, "max_target_length": 64, "batch_size": 64, "learning_rate": 0.0001, "early_stopping_patience": 3, "architectures": [ "MoETransformer" ], "model_type": "moe_transformer", "vocab_size": 32100, "auto_map": { "AutoModel": "moe_transformer.MoETransformer" } }