Update blahblahtron_1_1B.py
blahblahtron_1_1B.py  +8 -4

@@ -131,7 +131,7 @@ def apply_rope(q: torch.Tensor, k: torch.Tensor, rope_cos: torch.Tensor, rope_si
     k_embed = (k * rope_cos) + (rotate_half(k) * rope_sin)
     return q_embed, k_embed
 
-
+
 class Attention(nn.Module):
     def __init__(self, cfg: HFWrapperConfig):
         super().__init__()
@@ -313,12 +313,16 @@ class MyCustomModelForCausalLM(PreTrainedModel):
         rope_cos = self.rope_cos[T_past:T].to(x.dtype)
         rope_sin = self.rope_sin[T_past:T].to(x.dtype)
 
-
+
         present_key_values = [] if use_cache else None
 
         for i, block in enumerate(self.blocks):
-            past_kv = past_key_values[i] if past_key_values is not None else None
-
+            #past_kv = past_key_values[i] if past_key_values is not None else None
+            if past_key_values is not None and i < len(past_key_values):
+                past_kv = past_key_values[i]
+            else:
+                past_kv = None
+
             if self.training and self._gradient_checkpointing:
                 def block_only_x(x_, rc, rs):
                     out_x, _ = block(x_, rc, rs, past_kv=None, use_cache=False)
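
The first hunk is effectively a whitespace-only change on the blank line between apply_rope and the Attention class; the surrounding context lines show the rotary-embedding application itself. As a stand-alone illustration of what those context lines compute, here is a minimal sketch. The rotate_half body and the placeholder cos/sin tensors are assumptions made for the sketch (the file's own definitions are not part of this diff); only the q_embed/k_embed lines mirror the committed code.

import torch

# Assumed helper for the sketch: the file's rotate_half is not shown in this diff;
# this is the common "negate the second half, then swap halves" formulation.
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope_sketch(q, k, rope_cos, rope_sin):
    # Mirrors the hunk-1 context: mix each tensor with its half-rotated copy
    # using precomputed cos/sin tables.
    q_embed = (q * rope_cos) + (rotate_half(q) * rope_sin)
    k_embed = (k * rope_cos) + (rotate_half(k) * rope_sin)
    return q_embed, k_embed

# Placeholder shapes only; in the model the tables come from self.rope_cos / self.rope_sin.
q = k = torch.randn(1, 4, 8, 64)   # (batch, heads, seq_len, head_dim)
cos = torch.rand(8, 64)            # broadcasts over batch and heads
sin = torch.rand(8, 64)
q_rot, k_rot = apply_rope_sketch(q, k, cos, sin)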
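
The second hunk carries the substantive change: the unguarded one-liner past_kv = past_key_values[i] is commented out and replaced with a bounds-checked lookup, so a past_key_values list that is shorter than self.blocks (or empty) now yields past_kv = None for the remaining layers instead of raising IndexError. A minimal sketch of that guarded lookup in isolation, with plain Python lists standing in for the model's blocks and its per-layer key/value cache:

# Placeholder data for the sketch: plain lists stand in for the transformer blocks
# and for the per-layer (key, value) cache entries.
blocks = ["block_0", "block_1", "block_2", "block_3"]
past_key_values = [("k0", "v0"), ("k1", "v1")]   # cache deliberately shorter than blocks

for i, block in enumerate(blocks):
    # Guarded lookup from the diff: only index the cache when entry i exists;
    # the replaced one-liner would raise IndexError once i >= len(past_key_values).
    if past_key_values is not None and i < len(past_key_values):
        past_kv = past_key_values[i]
    else:
        past_kv = None
    print(i, block, past_kv)

The behaviour for a full-length cache is unchanged; only short or empty caches take the new None fallback.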