duzx16 committed
Commit f81daa3 · Parent(s): 7bcdc71

Fix batch generation for vision model

modeling_chatglm.py  CHANGED  (+20 -5)
@@ -692,16 +692,16 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         """Initialize the weights."""
         return
 
-    def get_masks(self, input_ids, past_key_values, padding_mask=None):
-        batch_size, seq_length = input_ids.shape
-        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
+    def get_masks(self, input_embeds, past_key_values, padding_mask=None):
+        batch_size, seq_length, embed_size = input_embeds.shape
+        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_embeds.device)
         full_attention_mask.tril_()
         past_length = 0
         if past_key_values:
             past_length = past_key_values[0][0].shape[2]
         if past_length:
             full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
-                                                         device=input_ids.device), full_attention_mask), dim=-1)
+                                                         device=input_embeds.device), full_attention_mask), dim=-1)
         if padding_mask is not None:
             full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
         if not past_length and padding_mask is not None:
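The signature change above is the core of the fix: the mask dimensions are now taken from input_embeds rather than input_ids, because for the vision model the embedded sequence (with image patch slots spliced in between the BOI and EOI tokens) is longer than the raw token-id sequence. A minimal standalone sketch of the updated construction, with illustrative tensors (the helper name and demo shapes are not from the source):

import torch

def build_full_attention_mask(input_embeds, past_key_values=None, padding_mask=None):
    # Shapes come from the embeddings, so the mask matches the expanded sequence.
    batch_size, seq_length, _ = input_embeds.shape
    full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_embeds.device)
    full_attention_mask.tril_()  # causal: position i attends to positions <= i
    past_length = past_key_values[0][0].shape[2] if past_key_values else 0
    if past_length:
        # Prepend all-ones columns so new tokens attend to every cached position.
        full_attention_mask = torch.cat(
            (torch.ones(batch_size, seq_length, past_length, device=input_embeds.device),
             full_attention_mask), dim=-1)
    if padding_mask is not None:
        full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)  # zero out padded keys
    return full_attention_mask

embeds = torch.randn(2, 5, 8)  # batch 2, sequence length 5, hidden size 8
print(build_full_attention_mask(embeds).shape)  # torch.Size([2, 5, 5])

The tail of the real method (cut off in the hunk) goes on to finish the padding handling; the sketch stops at the line shown.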
@@ -887,7 +887,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
 
         if full_attention_mask is None:
             if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
-                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
+                full_attention_mask = self.get_masks(inputs_embeds, past_key_values, padding_mask=attention_mask)
 
         # Rotary positional embeddings
         rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
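The call site is updated to match the new signature, passing inputs_embeds where it previously passed input_ids. Note that the full 2-D mask is only materialized when it is actually needed; a small sketch of that gating condition (function name and demo values are illustrative):

import torch

def needs_full_mask(attention_mask, past_key_values, seq_length):
    # Build the full mask only if some positions are padded, or if more than
    # one new token is being processed on top of a cache.
    return (attention_mask is not None and not attention_mask.all()) or \
           (bool(past_key_values) and seq_length != 1)

print(needs_full_mask(torch.tensor([1, 1, 1]), None, 3))  # False: no padding, no cache
print(needs_full_mask(torch.tensor([0, 1, 1]), None, 3))  # True: padded batch needs the mask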
@@ -976,6 +976,21 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         # only last token for input_ids if past is not None
         if position_ids is None:
             position_ids = self.get_position_ids(input_ids, device=input_ids.device)
+        if attention_mask is not None:
+            image_size: int = self.config.vision_config['image_size']
+            patch_size: int = self.config.vision_config['patch_size']
+            num_patches = (image_size // patch_size // 2) ** 2
+            new_attention_masks = []
+            for i in range(len(input_ids)):
+                input_id = input_ids[i].tolist()
+                boi_token_pos, eoi_token_pos = input_id.index(self.config.boi_token_id), input_id.index(
+                    self.config.eoi_token_id)
+                assert eoi_token_pos - boi_token_pos == 2
+                new_attention_masks.append(torch.cat(
+                    (attention_mask[i, :boi_token_pos + 1], attention_mask.new_ones(num_patches),
+                     attention_mask[i, eoi_token_pos:])
+                ))
+            attention_mask = torch.stack(new_attention_masks, dim=0)
         if not is_first_forward:
             if past_key_values is not None:
                 position_ids = position_ids[..., -1:]
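This added block is what makes batched generation work for the vision model: each sample's attention mask is expanded with num_patches ones between its BOI and EOI tokens, so the mask lines up with the embedded sequence once the image patches are spliced in. Because the loop locates BOI/EOI per sample, the fix holds even when padding puts them at different offsets across the batch. For example, with image_size 1120 and patch_size 14 (illustrative config values), num_patches = (1120 // 14 // 2) ** 2 = 1600. A standalone sketch with dummy token ids (the helper name and the ids 90001/90002 are made up for the demo):

import torch

def expand_attention_mask(input_ids, attention_mask, boi_token_id, eoi_token_id, num_patches):
    # Insert `num_patches` ones right after each sample's BOI token so the mask
    # covers the image patch embeddings spliced in between BOI and EOI.
    new_masks = []
    for i in range(len(input_ids)):
        ids = input_ids[i].tolist()
        boi_pos, eoi_pos = ids.index(boi_token_id), ids.index(eoi_token_id)
        assert eoi_pos - boi_pos == 2  # exactly one image placeholder between BOI and EOI
        new_masks.append(torch.cat((
            attention_mask[i, :boi_pos + 1],       # text up to and including BOI
            attention_mask.new_ones(num_patches),  # image patches are always attended
            attention_mask[i, eoi_pos:],           # EOI and the remaining text
        )))
    # Every row grows by num_patches - 1, so the batch still stacks cleanly.
    return torch.stack(new_masks, dim=0)

# Two left-padded samples; 90001/90002 stand in for the BOI/EOI token ids.
input_ids = torch.tensor([[0, 90001, 7, 90002, 11],
                          [5, 90001, 7, 90002, 12]])
attention_mask = torch.tensor([[0, 1, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
print(expand_attention_mask(input_ids, attention_mask, 90001, 90002, num_patches=4).shape)
# torch.Size([2, 8])  ->  5 + 4 - 1 per row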
|