Upload config.json
config.json  ADDED  (+19 -0)
@@ -0,0 +1,19 @@
+{
+  "model_type": "llama",
+  "hidden_size": 5120,                // Dimensionality of hidden layers
+  "num_attention_heads": 40,          // Number of attention heads
+  "num_hidden_layers": 40,            // Number of transformer layers
+  "intermediate_size": 13824,         // Feed-forward layer size (~2.7x hidden_size)
+  "hidden_act": "silu",               // Activation function
+  "rotary_emb_base": 10000,           // Base for rotary position embeddings
+  "max_position_embeddings": 2048,    // Maximum sequence length
+  "initializer_range": 0.02,          // Std. dev. for weight initialization
+  "rms_norm_eps": 1e-6,               // Epsilon for RMSNorm
+  "use_parallel_residual": true,      // Compute attention and MLP residual branches in parallel
+  "vocab_size": 32000,                // Vocabulary size for the tokenizer
+  "pad_token_id": 0,                  // Padding token ID
+  "eos_token_id": 2,                  // End-of-sequence token ID
+  "bos_token_id": 1,                  // Beginning-of-sequence token ID
+  "torch_dtype": "float16",           // Floating point precision for weights (fp16)
+  "use_cache": true                   // Enable key/value caching during inference
+}
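For reference, a minimal sketch of how this file could be loaded, assuming the transformers library is installed and the file above is saved locally as config.json. Note that the inline "//" comments make it non-standard JSON, so they would have to be stripped before parsing; strip_json_comments below is a hypothetical helper introduced only for illustration, not part of any library.

    # Minimal sketch (assumptions: transformers is installed; the annotated
    # file above is saved locally as config.json; strip_json_comments is a
    # hypothetical helper, not a library function).
    import json
    import re

    from transformers import LlamaConfig

    def strip_json_comments(text: str) -> str:
        # Drop "//" comments to the end of each line; safe here because no
        # string value in this config contains "//".
        return re.sub(r"//[^\n]*", "", text)

    with open("config.json", "r", encoding="utf-8") as f:
        cfg_dict = json.loads(strip_json_comments(f.read()))

    # LlamaConfig covers the standard keys; keys it does not define, such as
    # "rotary_emb_base" and "use_parallel_residual", should simply be kept as
    # extra attributes on the config object.
    config = LlamaConfig(**cfg_dict)

    print(config.hidden_size)         # 5120
    print(config.num_hidden_layers)   # 40

Once the comments are removed, the same dictionary could also be written back out with json.dump to produce a standard config.json that transformers can read directly via LlamaConfig.from_pretrained.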