Spaces:
Running
Running
| { | |
| "time": "2025-06-25 18:17:55", | |
| "results": { | |
| "IO": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 37.83, | |
| "Cost($)": 0.3328 | |
| }, | |
| "AQuA": { | |
| "Score": 38.98, | |
| "Cost($)": 0.038 | |
| }, | |
| "MATH-500": { | |
| "Score": 17.2, | |
| "Cost($)": 0.2436 | |
| } | |
| }, | |
| "ReAct-Pro*": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 74.91, | |
| "Cost($)": 3.4633 | |
| }, | |
| "AQuA": { | |
| "Score": 64.57, | |
| "Cost($)": 0.4928 | |
| }, | |
| "MATH-500": { | |
| "Score": 23.8, | |
| "Cost($)": 2.0406 | |
| } | |
| }, | |
| "PoT": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 76.88, | |
| "Cost($)": 0.6902 | |
| }, | |
| "AQuA": { | |
| "Score": 59.45, | |
| "Cost($)": 0.1748 | |
| }, | |
| "MATH-500": { | |
| "Score": 28.8, | |
| "Cost($)": 0.168 | |
| } | |
| }, | |
| "CoT": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 78.7, | |
| "Cost($)": 0.6788 | |
| }, | |
| "AQuA": { | |
| "Score": 61.02, | |
| "Cost($)": 0.0957 | |
| }, | |
| "MATH-500": { | |
| "Score": 39.8, | |
| "Cost($)": 0.3189 | |
| } | |
| }, | |
| "SC-CoT": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 69.29, | |
| "Cost($)": 2.5203 | |
| }, | |
| "AQuA": { | |
| "Score": 58.66, | |
| "Cost($)": 0.3277 | |
| }, | |
| "MATH-500": { | |
| "Score": 40.8, | |
| "Cost($)": 1.2308 | |
| } | |
| }, | |
| "ToT": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "gpt-3.5-turbo", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 67.93, | |
| "Cost($)": 9.1707 | |
| }, | |
| "AQuA": { | |
| "Score": 57.09, | |
| "Cost($)": 1.1513 | |
| }, | |
| "MATH-500": { | |
| "Score": 9.8, | |
| "Cost($)": 5.2914 | |
| } | |
| }, | |
| "IO-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 72.02, | |
| "Cost($)": 0.0354 | |
| }, | |
| "AQuA": { | |
| "Score": 79.13, | |
| "Cost($)": 0.0058 | |
| }, | |
| "MATH-500": { | |
| "Score": 37.4, | |
| "Cost($)": 0.0187 | |
| } | |
| }, | |
| "ReAct-Pro*-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 85.6, | |
| "Cost($)": 0.2512 | |
| }, | |
| "AQuA": { | |
| "Score": 77.56, | |
| "Cost($)": 0.0445 | |
| }, | |
| "MATH-500": { | |
| "Score": 47.2, | |
| "Cost($)": 0.186 | |
| } | |
| }, | |
| "PoT-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 79.61, | |
| "Cost($)": 0.0576 | |
| }, | |
| "AQuA": { | |
| "Score": 71.65, | |
| "Cost($)": 0.0147 | |
| }, | |
| "MATH-500": { | |
| "Score": 32.6, | |
| "Cost($)": 0.0144 | |
| } | |
| }, | |
| "CoT-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 89.31, | |
| "Cost($)": 0.0558 | |
| }, | |
| "AQuA": { | |
| "Score": 82.68, | |
| "Cost($)": 0.0066 | |
| }, | |
| "MATH-500": { | |
| "Score": 59.0, | |
| "Cost($)": 0.0255 | |
| } | |
| }, | |
| "SC-CoT-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 91.58, | |
| "Cost($)": 0.1118 | |
| }, | |
| "AQuA": { | |
| "Score": 76.37, | |
| "Cost($)": 0.0279 | |
| }, | |
| "MATH-500": { | |
| "Score": 65.8, | |
| "Cost($)": 0.0734 | |
| } | |
| }, | |
| "ToT-Doubao-lite-32k": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Doubao-lite-32k", | |
| "Eval Date": "2025/1/7" | |
| }, | |
| "gsm8k": { | |
| "Score": 37.83, | |
| "Cost($)": 0.8739 | |
| }, | |
| "AQuA": { | |
| "Score": 45.28, | |
| "Cost($)": 0.0881 | |
| }, | |
| "MATH-500": { | |
| "Score": 1.2, | |
| "Cost($)": 0.2371 | |
| } | |
| }, | |
| "IO-gpt-4o": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 88.4, | |
| "Cost($)": 3.3463 | |
| }, | |
| "AQuA": { | |
| "Score": 75.59, | |
| "Cost($)": 1.1453 | |
| }, | |
| "MATH-500": { | |
| "Score": 41.8, | |
| "Cost($)": 2.7907 | |
| } | |
| }, | |
| "ReAct-Pro*-gpt-4o": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 63.31, | |
| "Cost($)": 39.0751 | |
| }, | |
| "AQuA": { | |
| "Score": 57.48, | |
| "Cost($)": 2.304 | |
| }, | |
| "MATH-500": { | |
| "Score": 54.0, | |
| "Cost($)": 17.7735 | |
| } | |
| }, | |
| "PoT-gpt-4o": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 93.1, | |
| "Cost($)": 4.2166 | |
| }, | |
| "AQuA": { | |
| "Score": 75.2, | |
| "Cost($)": 1.6087 | |
| }, | |
| "MATH-500": { | |
| "Score": 46.2, | |
| "Cost($)": 1.5994 | |
| } | |
| }, | |
| "CoT-gpt-4o": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 94.09, | |
| "Cost($)": 4.5367 | |
| }, | |
| "AQuA": { | |
| "Score": 82.68, | |
| "Cost($)": 1.0417 | |
| }, | |
| "MATH-500": { | |
| "Score": 68.0, | |
| "Cost($)": 3.0569 | |
| } | |
| }, | |
| "SC-CoT-gpt-4o": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 94.77, | |
| "Cost($)": 18.2044 | |
| }, | |
| "AQuA": { | |
| "Score": 85.83, | |
| "Cost($)": 5.2456 | |
| }, | |
| "MATH-500": { | |
| "Score": 74.6, | |
| "Cost($)": 12.3611 | |
| } | |
| }, | |
| "ToT-gpt-4o": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "gpt-4o", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 91.13, | |
| "Cost($)": 86.8581 | |
| }, | |
| "AQuA": { | |
| "Score": 81.5, | |
| "Cost($)": 8.5295 | |
| }, | |
| "MATH-500": { | |
| "Score": 3.2, | |
| "Cost($)": 40.8094 | |
| } | |
| }, | |
| "IO-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 86.58, | |
| "Cost($)": 0.4899 | |
| }, | |
| "AQuA": { | |
| "Score": 84.25, | |
| "Cost($)": 0.0742 | |
| }, | |
| "MATH-500": { | |
| "Score": 70.2, | |
| "Cost($)": 0.2506 | |
| } | |
| }, | |
| "ReAct-Pro*-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 87.26, | |
| "Cost($)": 10.5479 | |
| }, | |
| "AQuA": { | |
| "Score": 73.23, | |
| "Cost($)": 0.3177 | |
| }, | |
| "MATH-500": { | |
| "Score": 62.8, | |
| "Cost($)": 3.4541 | |
| } | |
| }, | |
| "PoT-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 92.34, | |
| "Cost($)": 0.7054 | |
| }, | |
| "AQuA": { | |
| "Score": 75.2, | |
| "Cost($)": 0.1645 | |
| }, | |
| "MATH-500": { | |
| "Score": 47.2, | |
| "Cost($)": 0.233 | |
| } | |
| }, | |
| "CoT-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 92.87, | |
| "Cost($)": 0.7195 | |
| }, | |
| "AQuA": { | |
| "Score": 86.22, | |
| "Cost($)": 0.0808 | |
| }, | |
| "MATH-500": { | |
| "Score": 80.2, | |
| "Cost($)": 0.349 | |
| } | |
| }, | |
| "SC-CoT-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 94.77, | |
| "Cost($)": 4.045 | |
| }, | |
| "AQuA": { | |
| "Score": 85.43, | |
| "Cost($)": 0.4186 | |
| }, | |
| "MATH-500": { | |
| "Score": 79.8, | |
| "Cost($)": 1.8504 | |
| } | |
| }, | |
| "ToT-Qwen2.5-72B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Qwen2.5-72B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 88.88, | |
| "Cost($)": 23.5911 | |
| }, | |
| "AQuA": { | |
| "Score": 81.1, | |
| "Cost($)": 3.7389 | |
| }, | |
| "MATH-500": { | |
| "Score": 10.8, | |
| "Cost($)": 9.0421 | |
| } | |
| }, | |
| "IO-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 92.27, | |
| "Cost($)": 0.4709 | |
| }, | |
| "AQuA": { | |
| "Score": 82.68, | |
| "Cost($)": 0.0798 | |
| }, | |
| "MATH-500": { | |
| "Score": 69.4, | |
| "Cost($)": 0.2386 | |
| } | |
| }, | |
| "ReAct-Pro*-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 87.64, | |
| "Cost($)": 10.1124 | |
| }, | |
| "AQuA": { | |
| "Score": 79.13, | |
| "Cost($)": 0.768 | |
| }, | |
| "MATH-500": { | |
| "Score": 64.6, | |
| "Cost($)": 3.1806 | |
| } | |
| }, | |
| "PoT-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 73.09, | |
| "Cost($)": 0.9736 | |
| }, | |
| "AQuA": { | |
| "Score": 79.53, | |
| "Cost($)": 0.1746 | |
| }, | |
| "MATH-500": { | |
| "Score": 42.6, | |
| "Cost($)": 0.2839 | |
| } | |
| }, | |
| "CoT-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 93.93, | |
| "Cost($)": 0.687 | |
| }, | |
| "AQuA": { | |
| "Score": 83.46, | |
| "Cost($)": 0.0927 | |
| }, | |
| "MATH-500": { | |
| "Score": 71.2, | |
| "Cost($)": 0.3463 | |
| } | |
| }, | |
| "SC-CoT-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 95.22, | |
| "Cost($)": 3.7895 | |
| }, | |
| "AQuA": { | |
| "Score": 84.65, | |
| "Cost($)": 0.4438 | |
| }, | |
| "MATH-500": { | |
| "Score": 72.4, | |
| "Cost($)": 1.7845 | |
| } | |
| }, | |
| "ToT-Llama-3.3-70B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Llama-3.3-70B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 91.89, | |
| "Cost($)": 20.8753 | |
| }, | |
| "AQuA": { | |
| "Score": 83.07, | |
| "Cost($)": 2.9404 | |
| }, | |
| "MATH-500": { | |
| "Score": 1.4, | |
| "Cost($)": 8.2699 | |
| } | |
| }, | |
| "IO-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 57.24, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 78.74, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 59.4, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 82.87, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 74.41, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 48.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 58.83, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 68.11, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 39.6, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 85.67, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 80.71, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 69.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 90.98, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 79.53, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 71.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-Qwen2.5-7B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Qwen2.5-7B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 72.21, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 53.94, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 1.4, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "IO-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 57.16, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 51.18, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 38.6, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 67.78, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 55.51, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 28.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 38.67, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 36.61, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 25.4, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 75.44, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 60.63, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 25.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 54.36, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 59.45, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 19.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-Llama-3.1-8B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Llama-3.1-8B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 65.05, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 59.06, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 1.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "IO-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 11.6, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 47.64, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 22.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 33.51, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 40.94, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 14.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 38.21, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 36.61, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 15.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 77.71, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 52.76, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 46.6, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 44.66, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 38.58, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 9.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-Internllm2_5-7B": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Internllm2_5-7B", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 20.85, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 35.83, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "IO-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 16.68, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 29.13, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 7.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 24.87, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 25.59, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 8.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 18.5, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 30.71, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 55.5, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 40.55, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 15.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 8.19, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 10.63, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 2.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-Qwen2-1.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Qwen2-1.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 19.64, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 31.5, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "IO-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 14.71, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 27.17, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 2.6, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 7.66, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 24.02, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.6, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 9.63, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 17.32, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 35.94, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 33.07, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 6.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 4.17, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 17.32, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 2.2, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-Qwen2-0.5B-Instruct": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "Qwen2-0.5B-Instruct", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 0.0, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 29.92, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "IO-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "IO", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/1/22" | |
| }, | |
| "gsm8k": { | |
| "Score": 64.14, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 68.9, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 43.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ReAct-Pro*-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "ReAct-Pro*", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/2/10" | |
| }, | |
| "gsm8k": { | |
| "Score": 35.94, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 54.33, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 24.4, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "PoT-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "PoT", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/2/10" | |
| }, | |
| "gsm8k": { | |
| "Score": 11.9, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 54.72, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 1.0, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "CoT-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "CoT", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/1/23" | |
| }, | |
| "gsm8k": { | |
| "Score": 70.66, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 71.65, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 49.4, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "SC-CoT-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "SC-CoT", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/2/10" | |
| }, | |
| "gsm8k": { | |
| "Score": 69.07, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 57.87, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 46.8, | |
| "Cost($)": 0.0 | |
| } | |
| }, | |
| "ToT-deepseek-r1:1.5b": { | |
| "META": { | |
| "Algorithm": "ToT", | |
| "LLM": "deepseek-r1:1.5b", | |
| "Eval Date": "2025/2/10" | |
| }, | |
| "gsm8k": { | |
| "Score": 23.12, | |
| "Cost($)": 0.0 | |
| }, | |
| "AQuA": { | |
| "Score": 24.8, | |
| "Cost($)": 0.0 | |
| }, | |
| "MATH-500": { | |
| "Score": 0.4, | |
| "Cost($)": 0.0 | |
| } | |
| } | |
| } | |
| } |