Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
6f8ad2f
1
Parent(s):
9066f73
Caches model metadata cards to a temporary file to speed up initialization
Browse files- .gitignore +2 -1
- app.py +21 -10
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
*.pyc
|
|
|
|
|
|
| 1 |
+
*.pyc
|
| 2 |
+
model_infos.json
|
app.py
CHANGED
|
@@ -151,10 +151,14 @@ def add_rank(df):
|
|
| 151 |
df.fillna("", inplace=True)
|
| 152 |
return df
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
|
| 157 |
-
global
|
| 158 |
api = API
|
| 159 |
models = api.list_models(filter="mteb")
|
| 160 |
# Initialize list to models that we cannot fetch metadata from
|
|
@@ -181,11 +185,13 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
| 181 |
for model in models:
|
| 182 |
if model.modelId in MODELS_TO_SKIP: continue
|
| 183 |
print("MODEL", model.modelId)
|
| 184 |
-
if model.modelId not in
|
| 185 |
readme_path = hf_hub_download(model.modelId, filename="README.md")
|
| 186 |
meta = metadata_load(readme_path)
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
| 189 |
if "model-index" not in meta:
|
| 190 |
continue
|
| 191 |
# meta['model-index'][0]["results"] is list of elements like:
|
|
@@ -217,14 +223,19 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
| 217 |
if add_emb_dim:
|
| 218 |
try:
|
| 219 |
# Fails on gated repos, so we only include scores for them
|
| 220 |
-
if
|
| 221 |
-
|
| 222 |
-
out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] =
|
| 223 |
except:
|
| 224 |
-
|
| 225 |
df_list.append(out)
|
| 226 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
| 227 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
df = pd.DataFrame(df_list)
|
| 229 |
# If there are any models that are the same, merge them
|
| 230 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|
|
|
|
| 151 |
df.fillna("", inplace=True)
|
| 152 |
return df
|
| 153 |
|
| 154 |
+
model_infos_path = "model_infos.json"
|
| 155 |
+
MODEL_INFOS = {}
|
| 156 |
+
if os.path.exists(model_infos_path):
|
| 157 |
+
with open(model_infos_path) as f:
|
| 158 |
+
MODEL_INFOS = json.load(f)
|
| 159 |
+
|
| 160 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
|
| 161 |
+
global MODEL_INFOS
|
| 162 |
api = API
|
| 163 |
models = api.list_models(filter="mteb")
|
| 164 |
# Initialize list to models that we cannot fetch metadata from
|
|
|
|
| 185 |
for model in models:
|
| 186 |
if model.modelId in MODELS_TO_SKIP: continue
|
| 187 |
print("MODEL", model.modelId)
|
| 188 |
+
if model.modelId not in MODEL_INFOS or refresh:
|
| 189 |
readme_path = hf_hub_download(model.modelId, filename="README.md")
|
| 190 |
meta = metadata_load(readme_path)
|
| 191 |
+
MODEL_INFOS[model.modelId] = {
|
| 192 |
+
"metadata": meta
|
| 193 |
+
}
|
| 194 |
+
meta = MODEL_INFOS[model.modelId]["metadata"]
|
| 195 |
if "model-index" not in meta:
|
| 196 |
continue
|
| 197 |
# meta['model-index'][0]["results"] is list of elements like:
|
|
|
|
| 223 |
if add_emb_dim:
|
| 224 |
try:
|
| 225 |
# Fails on gated repos, so we only include scores for them
|
| 226 |
+
if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
|
| 227 |
+
MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
|
| 228 |
+
out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
|
| 229 |
except:
|
| 230 |
+
MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
|
| 231 |
df_list.append(out)
|
| 232 |
if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
|
| 233 |
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
|
| 234 |
+
|
| 235 |
+
# Save & cache MODEL_INFOS
|
| 236 |
+
with open("model_infos.json", "w") as f:
|
| 237 |
+
json.dump(MODEL_INFOS, f)
|
| 238 |
+
|
| 239 |
df = pd.DataFrame(df_list)
|
| 240 |
# If there are any models that are the same, merge them
|
| 241 |
# E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
|