Upload run_gpt3_125M_edu_hermes.sh with huggingface_hub
run_gpt3_125M_edu_hermes.sh  ADDED  +46 -0
@@ -0,0 +1,46 @@
# GPT-3 (125M) repro, but using FineWeb
# 125M parameter model on 300B tokens
# note context length: 1024 -> 2048 for GPT-3
# => 6 * 125e6 * 300e9 ~= 2.25e20 FLOPs capability model
# 572,204 steps of 524,288 tokens/step => 300B tokens
# on 8X A100 80GB SXM ($14/hr), each step takes ~150ms
# => training time 572,204 * 150ms ~= 24 hours ~= $336
# (this variant stops after 19,622 steps ~= 10.3B tokens of the edu_fineweb hermes data; see -x below)
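# optional sanity check of the arithmetic above (a sketch using the standard
# 6*N*D compute heuristic; safe to delete, it only prints one line)
awk 'BEGIN {
    flops = 6 * 125e6 * 300e9          # 6*N*D ~= 2.25e20 FLOPs
    steps = 300e9 / 524288             # ~= 572,204 steps
    hours = steps * 0.150 / 3600       # at ~150ms/step ~= 23.8 hours
    printf "flops=%.3g steps=%.0f hours=%.1f cost=$%.0f\n", flops, steps, hours, 14 * hours
}'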
make train_gpt2cu USE_CUDNN=1
out_dir="log_gpt3_125M_edu_hermes_v5"
done_file="$out_dir/DONE_00019622"
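# train_gpt2cu writes a DONE_<step> marker file into the log dir after its final
# step (here matching -x 19622 below); if a run crashes before that, the loop
# below simply relaunches it and -y 1 resumes from the latest checkpoint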
while true; do

    # exit condition is that optimization has finished
    if [ -f "$done_file" ]; then
        echo "File $done_file exists. Exiting the loop."
        break
    fi
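    # a rough gloss of the train_gpt2cu flags used below (best-effort summary of
    # llm.c's usage text, not authoritative; see train_gpt2.cu for the full list):
    #   -i/-j train/val data shard patterns, -o log/checkpoint dir
    #   -v 250 val loss every 250 steps; -s 1000 sample every 1000 steps, -g 144 tokens
    #   -h 1 run the HellaSwag eval; -b 16 micro-batch, -t 2048 context length
    #   -d 524288 total batch size in tokens per step; with 2 ranks this implies
    #      524288 / (16 * 2048 * 2) = 8 gradient accumulation micro-steps
    #   -l 0.0006 max LR, -q 0.1 decay to 10% of max, -u 700 warmup steps, -c 0.1 weight decay
    #   -z 1 ZeRO-1 optimizer state sharding, -y 1 resume from the latest checkpoint
    #   -sl/-sg 5.0 skip outlier updates (z-score thresholds on loss / grad norm)
    #   -x 19622 total steps, -e "gpt3:c768" GPT-3-style config with 768 channels (125M)
    #   note: -np 2 launches 2 GPU ranks; the ~150ms/step estimate above assumed 8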
    mpirun -np 2 ./train_gpt2cu \
        -i "dev/data/edu_fineweb10B_hermes/edu_fineweb_hermes_train_*.bin" \
        -j "dev/data/edu_fineweb10B_hermes/edu_fineweb_hermes_val_*.bin" \
        -o $out_dir \
        -v 250 -s 1000 -g 144 \
        -h 1 \
        -b 16 -t 2048 \
        -d 524288 \
        -r 0 \
        -z 1 \
        -c 0.1 \
        -l 0.0006 \
        -q 0.1 \
        -u 700 \
        -n 500 \
        -nk 5 \
        -nm 2000 \
        -ge 1 \
        -sl 5.0 \
        -sg 5.0 \
        -y 1 \
        -x 19622 \
        -e "gpt3:c768"
    sleep 1
done
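A sketch of how the script would be launched, assuming it sits in the root of an llm.c checkout with the edu_fineweb hermes shards already generated under dev/data/ (both assumptions):

chmod +x run_gpt3_125M_edu_hermes.sh
./run_gpt3_125M_edu_hermes.sh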