Update configuration_rwkv5.py
configuration_rwkv5.py CHANGED (+4 -6)
@@ -53,11 +53,9 @@ class Rwkv5Config(PretrainedConfig):
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0
-            as GPTNeoX.
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0
-            GPTNeoX.
+            The id of the end of sentence token in the vocabulary. Defaults to 0.
         rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
@@ -90,8 +88,8 @@ class Rwkv5Config(PretrainedConfig):
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
-        num_attention_heads=64,
         head_size=64,
+        head_size_divisor=8,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         bos_token_id=0,
@@ -105,8 +103,8 @@ class Rwkv5Config(PretrainedConfig):
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
-        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
+        self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.rescale_every = rescale_every