Upload 2 files
Browse files- train/sorc.toml +154 -0
- train/sorc_ds.json +6 -0
train/sorc.toml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Paths
|
| 2 |
+
model = '/workspace/model'
|
| 3 |
+
output_dir = '/workspace/out'
|
| 4 |
+
|
| 5 |
+
# Lora configuration
|
| 6 |
+
# can use full_fine_tune=true and no quantization to train the whole model instead of a LoRA
|
| 7 |
+
#full_fine_tune = true
|
| 8 |
+
lora_rank = 16
|
| 9 |
+
lora_alpha = 32
|
| 10 |
+
lora_dropout = 0.05
|
| 11 |
+
|
| 12 |
+
# Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
|
| 13 |
+
# If not set, adapt all linear modules.
|
| 14 |
+
# Note, this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
|
| 15 |
+
# of these keys as a substring will have requires_grad. If not set, everything is trained.
|
| 16 |
+
#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
|
| 17 |
+
|
| 18 |
+
# can specify layers to adapt with LoRA if you want
|
| 19 |
+
#layers_to_transform = '16:31'
|
| 20 |
+
|
| 21 |
+
# for Mixtral, set the load balancing coefficient
|
| 22 |
+
# load_balancing_loss_coef = 0.02
|
| 23 |
+
|
| 24 |
+
# Optimization configuration
|
| 25 |
+
epochs = 2
|
| 26 |
+
lr_scheduler = 'cosine' # can also be 'constant'
|
| 27 |
+
warmup_steps = 50
|
| 28 |
+
|
| 29 |
+
# might be useful if resuming from a checkpoint and you want to change the LR and force it to something
|
| 30 |
+
#force_constant_lr = 5e-5
|
| 31 |
+
|
| 32 |
+
# hard clamp the magnitude of the LoRA weights
|
| 33 |
+
#scale_weight_norms = 1.0
|
| 34 |
+
|
| 35 |
+
# dynamic batch size, targeting this many tokens per batch, per device
|
| 36 |
+
# if set, completely ignores the batch size in the deepspeed JSON config file
|
| 37 |
+
# can be thought of as a replacement for sample packing
|
| 38 |
+
batch_size_tokens = 10000
|
| 39 |
+
|
| 40 |
+
# Performance settings
|
| 41 |
+
pipeline_stages = 8 # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
|
| 42 |
+
logging_steps = 10 # how often to log in Tensorboard
|
| 43 |
+
eval_steps = 500
|
| 44 |
+
save_steps = 500
|
| 45 |
+
checkpoint_every_n_minutes = 60
|
| 46 |
+
eval_before_first_step = false # do an eval before any training happens
|
| 47 |
+
# dtype to load the underlying model weights in
|
| 48 |
+
model_weight_dtype = 'bfloat16'
|
| 49 |
+
# dtype for the LoRA weights
|
| 50 |
+
lora_weight_dtype = 'bfloat16'
|
| 51 |
+
# Can have the saved weights be different dtype. Don't need to set this. Could be useful for
|
| 52 |
+
# training in float32 but saving with float16.
|
| 53 |
+
#save_dtype = 'bfloat16'
|
| 54 |
+
# Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
|
| 55 |
+
# (this only applies to the current training session, and resumed training sessions will not touch
|
| 56 |
+
# old saves)
|
| 57 |
+
keep_states = 5
|
| 58 |
+
|
| 59 |
+
# sort examples by length before dividing them into batches
|
| 60 |
+
# this makes all examples in a batch approximately the same length, to minimize padding
|
| 61 |
+
# the batches are still shuffled after that
|
| 62 |
+
# you should probably always have this set to true
|
| 63 |
+
group_by_length = true
|
| 64 |
+
|
| 65 |
+
# This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
|
| 66 |
+
# for a minor performance hit.
|
| 67 |
+
# Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
|
| 68 |
+
# true: 75s step time, 19.7G peak per-GPU VRAM usage.
|
| 69 |
+
# 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
|
| 70 |
+
activation_checkpointing = 'unsloth'
|
| 71 |
+
|
| 72 |
+
# Keep MLP weights on system RAM until they are needed. Can save a ton of VRAM with a
|
| 73 |
+
# moderate hit to performance. If using an MoE model, this can also be an integer, in
|
| 74 |
+
# which case only that many experts are offloaded (tradeoff between VRAM and speed).
|
| 75 |
+
offload_mlp_to_cpu = 2
|
| 76 |
+
|
| 77 |
+
# Resume a prior run
|
| 78 |
+
# if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
|
| 79 |
+
# so, to resume, just run the exact same command but set this to true first
|
| 80 |
+
resume_from_checkpoint = false
|
| 81 |
+
|
| 82 |
+
# Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
|
| 83 |
+
# It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
|
| 84 |
+
# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
|
| 85 |
+
# this to not load the optimizer states and hopefully the resumption won't OOM.
|
| 86 |
+
#load_optimizer_states = false
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# Dataset configuration
|
| 90 |
+
|
| 91 |
+
# How to combine multiple datasets if you have more than one.
|
| 92 |
+
# Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
|
| 93 |
+
dataset_combination_mode = 'interleave'
|
| 94 |
+
# When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
|
| 95 |
+
# Default if not set: 'first_exhausted'
|
| 96 |
+
dataset_interleave_stopping_strategy = 'all_exhausted'
|
| 97 |
+
# Can set this lower than the training GAS, so we don't drop as many examples when trying to make equal-sized batches.
|
| 98 |
+
# Default if not set: same as training GAS.
|
| 99 |
+
eval_gradient_accumulation_steps = 1
|
| 100 |
+
|
| 101 |
+
# bitsandbytes 4 bit quantization. The parameters here become arguments to Transformers BitsAndBytesConfig.
|
| 102 |
+
#[quantization.bnb]
|
| 103 |
+
#load_in_4bit = true
|
| 104 |
+
#bnb_4bit_use_double_quant = false
|
| 105 |
+
#bnb_4bit_compute_dtype = 'bfloat16'
|
| 106 |
+
|
| 107 |
+
# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
|
| 108 |
+
# [quantization.hqq]
|
| 109 |
+
# nbits = 4
|
| 110 |
+
# group_size = 64
|
| 111 |
+
# compute_dtype = 'bfloat16'
|
| 112 |
+
|
| 113 |
+
# (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
|
| 114 |
+
# is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
|
| 115 |
+
# [quantization.hqq.dynamic_config]
|
| 116 |
+
# gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
|
| 117 |
+
# up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
|
| 118 |
+
# down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
|
| 119 |
+
|
| 120 |
+
[optimizer]
|
| 121 |
+
# options: adamw_kahan, AdamW, AdamW8bit
|
| 122 |
+
type = 'adamw_kahan'
|
| 123 |
+
lr = 5e-5
|
| 124 |
+
beta1 = 0.9
|
| 125 |
+
beta2 = 0.99
|
| 126 |
+
weight_decay = 0.1
|
| 127 |
+
|
| 128 |
+
[[datasets]]
|
| 129 |
+
# Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc if not set.
|
| 130 |
+
name = 'c2'
|
| 131 |
+
dataset_type = 'axolotl'
|
| 132 |
+
dataset_path = '../axolotl/sorc.yml'
|
| 133 |
+
sequence_len = 8192
|
| 134 |
+
eval_size = 0.01
|
| 135 |
+
# Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
|
| 136 |
+
sample_weight = 1
|
| 137 |
+
|
| 138 |
+
#[[datasets]]
|
| 139 |
+
#name = 'capybara'
|
| 140 |
+
#dataset_type = 'axolotl'
|
| 141 |
+
#dataset_path = 'examples/capybara.yml'
|
| 142 |
+
#sequence_len = 2048
|
| 143 |
+
#eval_size = 0.02
|
| 144 |
+
#sample_weight = 1.5
|
| 145 |
+
|
| 146 |
+
# In addition to using eval_size which splits off some of the dataset, we can have completely separate datasets for eval.
|
| 147 |
+
# This can be useful if you're training on raw text data, so that the eval set remains completely fixed, even if
|
| 148 |
+
# you change training sequence_len, etc.
|
| 149 |
+
# This is just an example, typically you wouldn't have this overlap a training dataset.
|
| 150 |
+
# [[eval_datasets]]
|
| 151 |
+
# name = 'capybara'
|
| 152 |
+
# dataset_type = 'axolotl'
|
| 153 |
+
# dataset_path = 'examples/capybara.yml'
|
| 154 |
+
# sequence_len = 2048
|
train/sorc_ds.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train_micro_batch_size_per_gpu": 1,
|
| 3 |
+
"gradient_accumulation_steps": 2,
|
| 4 |
+
"gradient_clipping": 1.0,
|
| 5 |
+
"steps_per_print": 1
|
| 6 |
+
}
|