tvosch committed on
Commit
20e8864
1 Parent(s): ce00c3b

Fix mismatch between the default value and the choices list of a Gradio dropdown

Browse files
Files changed (1) hide show
  1. app.py +35 -31
app.py CHANGED
@@ -16,6 +16,7 @@ PRECISION_TO_BYTES = {"float32": 4,
16
  "int8": 1}
17
 
18
  ZERO_STAGES = [0, 1, 2, 3]
 
19
  OPTIMIZERS = ["adam", "adamw", "sgd"]
20
  HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
21
 
@@ -151,33 +152,32 @@ def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_siz
151
 
152
  def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
153
  # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
 
154
  model_vram = model_memory(model_size, mixed_precision=mixed_precision)
155
  gradients_vram = gradients_memory(model_size)
156
  optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
157
 
158
  # Baseline
159
  if zero_stage == 0:
160
- aggregated_vram = model_vram + gradients_vram + optimizer_vram
161
  # Optimizer state partitioning
162
- if zero_stage == 1:
163
- aggregated_vram = model_vram + gradients_vram + (optimizer_vram / num_gpus)
164
  # Gradient + Optimzer state partitioning
165
- if zero_stage == 2:
166
- aggregated_vram = model_vram + ((gradients_vram + optimizer_vram) / num_gpus)
167
  # Parameter partitioning + Gradient + Optimizer partitioning
168
  if zero_stage == 3:
169
- aggregated_vram = (model_vram / num_gpus) + (gradients_vram / num_gpus) + (optimizer_vram / num_gpus)
170
 
171
- print(f"ZeRO stage {zero_stage} takes {aggregated_vram} GB")
172
 
173
  activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
174
  if gradient_checkpointing:
175
- activations_vram = activations_vram ** 0.5
176
 
177
- print(f"Activations require {activations_vram} GB with gradient checkpointing: {gradient_checkpointing}")
178
  total_vram = aggregated_vram + activations_vram
179
- print(f"Estimated 'minimal' VRAM requirement on {num_gpus} GPUs per GPU is {total_vram} GB")
180
- return total_vram
181
 
182
  def build_interface(estimate_vram_fn):
183
  training_params = []
@@ -190,11 +190,11 @@ def build_interface(estimate_vram_fn):
190
 
191
 
192
  with gr.Row(visible=False) as model_params_row:
193
- model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=1000, step=0.1, value=7, info="Model size (in billion parameters)"),
194
  gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
195
- gr.Slider(label="Sequence length", minimum=256, maximum=1_000_000, step=256, value=8192, info="Sequence length"),
196
- gr.Slider(label="Num layers", minimum=1, maximum=64, step=1, value=32, info="Number of layers"),
197
- gr.Slider(label="Num heads", minimum=1, maximum=64, step=1, value=32, info="Number of attention heads")
198
  ]
199
 
200
 
@@ -212,16 +212,17 @@ def build_interface(estimate_vram_fn):
212
 
213
 
214
  with gr.Row(equal_height=True):
215
- training_params = [gr.Dropdown(label="Micro batch size", choices=[1,2,4,8,16,32,64], value=4, info="Micro batch size (batch size per device/GPU)"),
216
  gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
217
- gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=False, info="Enable gradient checkpointing"),
218
- gr.Dropdown(label="Mixed preision", choices=[True, False], value=False, info="Enable mixed precision for model training"),
219
  gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
220
  gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
221
  gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
222
  ]
223
 
224
  submit_btn = gr.Button("Estimate!")
 
225
  output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
226
 
227
  submit_btn.click(
@@ -235,22 +236,24 @@ def build_interface(estimate_vram_fn):
235
 
236
  def estimate_vram(arg_keys, *args):
237
  params = dict(zip(arg_keys, args))
238
- print(params)
239
 
240
  model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
241
  training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
242
- if params["repo_id"]:
243
- # If cache directory set, then download config
244
- if params["cache_dir"]:
245
- config = scrape_config_from_hub(params["repo_id"])
246
- model_config.overwrite_with_hf_config(config)
247
- # By default, scrape config.json from hub
248
- else:
249
- config = download_config_from_hub(params["repo_id"], params["cache_dir"])
250
- model_config.overwrite_with_hf_config(config.to_dict())
 
251
 
252
- total_vram = vram_required(**vars(model_config), **vars(training_config))
253
- return total_vram
 
254
 
255
  if __name__ == "__main__":
256
  parser = parse_args()
@@ -276,4 +279,5 @@ if __name__ == "__main__":
276
  config = scrape_config_from_hub(args.repo_id)
277
  model_config.overwrite_with_hf_config(config)
278
 
279
- total_vram = vram_required(**vars(model_config), **vars(training_config))
 
 
16
  "int8": 1}
17
 
18
  ZERO_STAGES = [0, 1, 2, 3]
19
+ BATCH_SIZES = [1,2,4,8,16,32,64]
20
  OPTIMIZERS = ["adam", "adamw", "sgd"]
21
  HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
22
 
 
152
 
153
  def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
154
  # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
155
+
156
  model_vram = model_memory(model_size, mixed_precision=mixed_precision)
157
  gradients_vram = gradients_memory(model_size)
158
  optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
159
 
160
  # Baseline
161
  if zero_stage == 0:
162
+ pass
163
  # Optimizer state partitioning
164
+ if zero_stage >= 1:
165
+ optimizer_vram = optimizer_vram / num_gpus
166
  # Gradient + Optimzer state partitioning
167
+ if zero_stage >= 2:
168
+ gradients_vram = gradients_vram / num_gpus
169
  # Parameter partitioning + Gradient + Optimizer partitioning
170
  if zero_stage == 3:
171
+ aggregated_vram = model_vram / num_gpus
172
 
173
+ aggregated_vram = round(model_vram, 2) + gradients_vram + optimizer_vram
174
 
175
  activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
176
  if gradient_checkpointing:
177
+ activations_vram = round(activations_vram ** 0.5, 2)
178
 
 
179
  total_vram = aggregated_vram + activations_vram
180
+ return {"total": total_vram, "model": model_vram, "gradients": gradients_vram, "optimizer": optimizer_vram, "activations": activations_vram}
 
181
 
182
  def build_interface(estimate_vram_fn):
183
  training_params = []
 
190
 
191
 
192
  with gr.Row(visible=False) as model_params_row:
193
+ model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
194
  gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
195
+ gr.Slider(label="Sequence length", minimum=256, maximum=128_000, step=256, value=8192, info="Sequence length"),
196
+ gr.Slider(label="Num layers", minimum=8, maximum=64, step=1, value=32, info="Number of layers"),
197
+ gr.Slider(label="Num heads", minimum=8, maximum=64, step=1, value=32, info="Number of attention heads")
198
  ]
199
 
200
 
 
212
 
213
 
214
  with gr.Row(equal_height=True):
215
+ training_params = [gr.Dropdown(label="Micro batch size", choices=BATCH_SIZES, value=4, info="Micro batch size (batch size per device/GPU)"),
216
  gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
217
+ gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
218
+ gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
219
  gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
220
  gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
221
  gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
222
  ]
223
 
224
  submit_btn = gr.Button("Estimate!")
225
+
226
  output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
227
 
228
  submit_btn.click(
 
236
 
237
  def estimate_vram(arg_keys, *args):
238
  params = dict(zip(arg_keys, args))
239
+ print("Parameters: ", params)
240
 
241
  model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
242
  training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
243
+ if not params["repo_id"]:
244
+ return "No model selected!"
245
+ # If cache directory set, then download config
246
+ if params["cache_dir"]:
247
+ config = scrape_config_from_hub(params["repo_id"])
248
+ model_config.overwrite_with_hf_config(config)
249
+ # By default, scrape config.json from hub
250
+ else:
251
+ config = download_config_from_hub(params["repo_id"], params["cache_dir"])
252
+ model_config.overwrite_with_hf_config(config.to_dict())
253
 
254
+ total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
255
+ output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
256
+ return output_str
257
 
258
  if __name__ == "__main__":
259
  parser = parse_args()
 
279
  config = scrape_config_from_hub(args.repo_id)
280
  model_config.overwrite_with_hf_config(config)
281
 
282
+ total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
283
+ print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations")