|
|
--- |
|
|
language: |
|
|
- en |
|
|
- pt |
|
|
base_model: |
|
|
- NeuML/bert-hash-femto |
|
|
pipeline_tag: fill-mask |
|
|
--- |
|
|
An experiment on the ridiculously tiny model [NeuML/bert-hash-femto](https://huggingface.co/NeuML/bert-hash-femto), fine-tuned for masked language modeling (MLM) with whole-word masking (WWM):
|
|
|
|
|
```python |
|
|
from transformers import AutoTokenizer
from transformers import pipeline

# The published checkpoint this model card describes.
model_checkpoint = "cnmoro/bert-hash-femto-mlm"

# trust_remote_code=True: the checkpoint ships custom model/tokenizer code.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, trust_remote_code=True)

mask_filler = pipeline(
    "fill-mask",
    model=model_checkpoint,  # was a local training dir ("bert-femto-wwm-final") readers don't have
    tokenizer=tokenizer,     # reuse the tokenizer loaded above instead of loading it a second time
    trust_remote_code=True,
)

result = mask_filler(
    f"predict the next {tokenizer.mask_token}",  # EN
    top_k=10,
)
print(result)
# Example output (from the author's run; exact scores may vary):
# [
# {'score': 0.00577017106115818, 'token': 2709, 'token_str': 'return', 'sequence': 'predict the next return'}, {'score': 0.004855221603065729, 'token': 1996, 'token_str': 'the', 'sequence': 'predict the next the'},
# {'score': 0.004020849708467722, 'token': 13366, 'token_str': 'def', 'sequence': 'predict the next def'},
# {'score': 0.0034529557451605797, 'token': 2065, 'token_str': 'if', 'sequence': 'predict the next if'},
# {'score': 0.002598398830741644, 'token': 5164, 'token_str': 'string', 'sequence': 'predict the next string'}, {'score': 0.002328184898942709, 'token': 2013, 'token_str': 'from', 'sequence': 'predict the next from'},
# {'score': 0.0022902805358171463, 'token': 2193, 'token_str': 'number', 'sequence': 'predict the next number'}, {'score': 0.0021068700589239597, 'token': 2171, 'token_str': 'name', 'sequence': 'predict the next name'},
# {'score': 0.0020654958207160234, 'token': 1997, 'token_str': 'of', 'sequence': 'predict the next of'},
# {'score': 0.001995558850467205, 'token': 12324, 'token_str': 'import', 'sequence': 'predict the next import'}
# ]

result = mask_filler(
    f"prever o próximo {tokenizer.mask_token}",  # PT-BR
    top_k=10,
)
print(result)
# Example output (from the author's run; exact scores may vary):
# [
# {'score': 0.10908675193786621, 'token': 10861, 'token_str': 'que', 'sequence': 'prever o proximo que'},
# {'score': 0.07482825964689255, 'token': 2139, 'token_str': 'de', 'sequence': 'prever o proximo de'},
# {'score': 0.050046466290950775, 'token': 7861, 'token_str': 'em', 'sequence': 'prever o proximo em'},
# {'score': 0.029970934614539146, 'token': 11498, 'token_str': 'para', 'sequence': 'prever o proximo para'},
# {'score': 0.0208846777677536, 'token': 18609, 'token_str': 'como', 'sequence': 'prever o proximo como'},
# {'score': 0.016479674726724625, 'token': 16137, 'token_str': 'mas', 'sequence': 'prever o proximo mas'},
# {'score': 0.01430923119187355, 'token': 4830, 'token_str': 'da', 'sequence': 'prever o proximo da'},
# {'score': 0.013099807314574718, 'token': 18499, 'token_str': 'por', 'sequence': 'prever o proximo por'},
# {'score': 0.012438337318599224, 'token': 21934, 'token_str': 'sim', 'sequence': 'prever o proximo sim'},
# {'score': 0.012361743487417698, 'token': 6583, 'token_str': 'na', 'sequence': 'prever o proximo na'}
# ]
|
|
``` |