gemma-3-270m-it

Running on Zero

anakin87 committed on Aug 18

Commit

8bfc45f

verified ·

1 Parent(s): 6a159c9

choose fa2 if GPU available

Files changed (1) hide show

app.py CHANGED Viewed

@@ -27,16 +27,15 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model_id = "google/gemma-3-270m-it"
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
 )
 model.config.sliding_window = 4096
 model.eval()

 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 model_id = "google/gemma-3-270m-it"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+attn_impl = "flash_attention_2" if torch.cuda.is_available() else "eager"
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
+    attn_implementation=attn_impl,
 )
 model.config.sliding_window = 4096
 model.eval()