Echo9Zulu committed
Commit 89f94a2 · verified · 1 Parent(s): 7a4e720

Update app.py

Files changed (1)
  1. app.py +296 -237

app.py CHANGED
@@ -1,54 +1,21 @@
1
  import gradio as gr
2
-
3
-
4
- INTRODUCTION="""
5
-
6
- # Optimum CLI Export Tool
7
-
8
- This tool helps organize conversion commands when using Optimum Intel and respects the order of positional arguments; these commands can otherwise be tricky to keep track of.
9
-
10
- My goal was to make it easier to construct commands for the [Optimum CLI conversion tool](https://huggingface.co/docs/optimum/main/en/intel/openvino/export), which converts models to the OpenVINO Intermediate Representation
11
- outside of the from_pretrained method used in Transformers with OpenVINO-related classes like OVModelForCausalLM, OVModelForSeq2SeqLM, OVModelForQuestionAnswering, etc., which interface with the OpenVINO runtime.
12
-
13
- ## Usage
14
- Here I'm assuming you have followed the instructions in the documentation and have all your dependencies in order.
15
-
16
- Run the following to get the latest version of the necessary extension for Optimum:
17
- ```
18
- pip install --upgrade --upgrade-strategy eager optimum[openvino]
19
- ```
20
-
21
- Intended workflow:
22
- - Select conversion parameters.
23
- - Hit "Submit".
24
- - Copy the command.
25
- - Execute it in your environment.
26
-
27
- Note: Conversion can take a while and will be resource-intensive.
28
-
29
-
30
- OpenVINO supports Intel CPUs from 6th gen forward, so you can squeeze performance out of older hardware with
31
- different accuracy/performance tradeoffs than popular GGUF quants.
32
-
33
- ## Discussion
34
-
35
- Leveraging CPU, GPU and NPU hardware acceleration from OpenVINO requires converting a model into an Intermediate format derived from ONNX.
36
- The command we execute rebuilds the model graph from its source so that it is optimized for how OpenVINO uses this graph in memory.
37
-
38
- Using OpenVINO effectively requires considering facts about your Intel hardware. Visit the [Intel Ark](https://www.intel.com/content/www/us/en/products/details/processors.html) product database to find this information.
39
-
40
- Here are some hardware questions you should be able to answer before using this tool:
41
-
42
- - What data types does my CPU support?
43
- - What instruction sets?
44
- - How will I be using the model?
45
- - Do I have enough system memory for this task?
46
-
47
-
48
-
49
- It's *the* ground truth for Intel Hardware specs. Even so, when testing with different model architectures
50
-
51
- """
52
 
53
  class ConversionTool:
54
  def __init__(self):
@@ -56,184 +23,256 @@ class ConversionTool:
56
  self.model_input = gr.Textbox(
57
  label='Model',
58
  placeholder='Model ID on huggingface.co or path on disk',
59
- info="The model to convert. This can be a model ID on Hugging Face or a path on disk."
60
  )
61
 
62
  self.output_path = gr.Textbox(
63
  label='Output Directory',
64
  placeholder='Path to store the generated OV model',
65
- info="We are storing some text here"
66
  )
67
 
68
  self.task = gr.Dropdown(
69
  label='Task',
70
- choices=['auto'] + [
71
- 'image-to-image',
72
- 'image-segmentation',
73
- 'inpainting',
74
- 'sentence-similarity',
75
- 'text-to-audio',
76
- 'image-to-text',
77
- 'automatic-speech-recognition',
78
- 'token-classification',
79
- 'text-to-image',
80
- 'audio-classification',
81
- 'feature-extraction',
82
- 'semantic-segmentation',
83
- 'masked-im',
84
- 'audio-xvector',
85
- 'audio-frame-classification',
86
- 'text2text-generation',
87
- 'multiple-choice',
88
- 'depth-estimation',
89
- 'image-classification',
90
- 'fill-mask', 'zero-shot-object-detection', 'object-detection',
91
- 'question-answering', 'zero-shot-image-classification',
92
- 'mask-generation', 'text-generation', 'text-classification',
93
- 'text-generation-with-past'
94
- ],
95
- value=None
96
  )
97
 
98
  self.framework = gr.Dropdown(
99
  label='Framework',
100
- choices=['pt', 'tf'],
101
- value=None
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
 
104
  self.weight_format = gr.Dropdown(
105
  label='Weight Format',
106
- choices=['fp32', 'fp16', 'int8', 'int4', 'mxfp4', 'nf4'],
 
 
 
 
 
 
 
107
  value=None,
108
- info="The level of compression we apply to the intermediate representation."
 
 
109
  )
110
-
111
  self.library = gr.Dropdown(
112
  label='Library',
113
  choices=[
114
- 'auto',
115
- 'transformers',
116
- 'diffusers',
117
  'timm',
118
- 'sentence_transformers',
119
  'open_clip'
120
  ],
121
- value=None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  )
123
 
124
  self.ratio = gr.Number(
125
  label='Ratio',
126
- value=None,
127
  minimum=0.0,
128
- maximum=1.0,
129
- step=0.1
 
 
 
 
 
 
 
 
 
 
 
130
  )
131
 
132
  self.group_size = gr.Number(
133
  label='Group Size',
134
- value=None,
135
- step=1
 
136
  )
137
 
138
  self.backup_precision = gr.Dropdown(
139
  label='Backup Precision',
140
- choices=['', 'int8_sym', 'int8_asym'],
141
- # value=None
 
 
 
 
 
 
 
142
  )
143
 
144
  self.dataset = gr.Dropdown(
145
  label='Dataset',
146
- choices=['none',
147
- 'auto',
148
- 'wikitext2',
149
- 'c4',
150
- 'c4-new',
151
  'contextual',
152
- 'conceptual_captions',
153
- 'laion/220k-GPT4Vision-captions-from-LIVIS',
154
- 'laion/filtered-wit'],
155
- value=None
 
 
 
 
 
 
 
 
 
 
156
  )
157
 
158
- self.trust_remote_code = gr.Checkbox(
159
- label='Trust Remote Code',
160
- value=False)
161
-
162
- self.disable_stateful = gr.Checkbox(
163
- label='Disable Stateful',
164
- value=False,
165
- info="Disables stateful inference. This is required for multi GPU inference due to how OpenVINO uses the KV cache. ")
166
-
167
- self.disable_convert_tokenizer = gr.Checkbox(
168
- label='Disable Convert Tokenizer',
169
- value=False,
170
- info="Disables the tokenizer conversion. Use when models have custom tokenizers which might have formatting Optimum does not expect."
171
- )
172
-
173
  self.all_layers = gr.Checkbox(
174
- label='All Layers',
175
- value=False)
176
-
 
 
 
 
 
177
  self.awq = gr.Checkbox(
178
- label='AWQ',
179
- value=False,
180
- info="Activation aware quantization algorithm from NNCF. Requires a dataset, which can also be a path. ")
181
-
182
- self.scale_estimation = gr.Checkbox(
183
- label='Scale Estimation',
184
- value=False)
185
-
186
- self.gptq = gr.Checkbox(
187
- label='GPTQ',
188
- value=False)
189
-
190
- self.lora_correction = gr.Checkbox(
191
- label='LoRA Correction',
192
- value=False)
193
-
194
- self.sym = gr.Checkbox(
195
- label='Symmetric Quantization',
196
- value=False,
197
- info="Symmetric quantization is faster and uses less memory. It is recommended for most use cases."
198
  )
199
-
200
- self.quant_mode = gr.Dropdown(
201
- label='Quantization Mode',
202
- choices=['sym', 'asym'],
203
- value=None
 
 
 
 
204
  )
205
 
206
- self.cache_dir = gr.Textbox(
207
- label='Cache Directory',
208
- placeholder='Path to cache directory'
 
 
 
 
 
209
  )
210
 
211
- self.pad_token_id = gr.Number(
212
- label='Pad Token ID',
213
- value=None,
214
- step=1,
215
- info="Will try to infer from tokenizer if not provided."
 
 
 
216
  )
217
 
218
- self.sensitivity_metric = gr.Dropdown(
219
  label='Sensitivity Metric',
220
- choices=['weight_quantization_error', 'hessian_input_activation',
221
- 'mean_activation_variance', 'max_activation_variance', 'mean_activation_magnitude'],
222
- value=None
 
 
 
 
 
223
  )
224
 
225
- self.num_samples = gr.Number(
226
  label='Number of Samples',
227
  value=None,
228
- step=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  )
230
 
231
- self.smooth_quant_alpha = gr.Number(
232
  label='Smooth Quant Alpha',
233
  value=None,
234
  minimum=0.0,
235
  maximum=1.0,
236
- step=0.1
 
 
 
 
237
  )
238
 
239
  self.command_output = gr.TextArea(
@@ -244,128 +283,148 @@ class ConversionTool:
244
  lines=5 # Adjust height
245
  )
246
 
247
- def construct_command(self, model_input, output_path, task, framework, weight_format, library,
248
- ratio, group_size, backup_precision, dataset,
249
- trust_remote_code, disable_stateful, disable_convert_tokenizer,
250
- all_layers, awq, scale_estimation, gptq, lora_correction, sym,
251
- quant_mode, cache_dir, pad_token_id, sensitivity_metric, num_samples,
252
- smooth_quant_alpha):
253
  """Construct the command string"""
254
  if not model_input or not output_path:
255
  return ''
256
-
257
  cmd_parts = ['optimum-cli export openvino']
258
  cmd_parts.append(f'-m "{model_input}"')
259
 
260
  if task and task != 'auto':
261
  cmd_parts.append(f'--task {task}')
262
-
263
  if framework:
264
  cmd_parts.append(f'--framework {framework}')
265
-
266
- if weight_format and weight_format != 'fp32':
 
 
 
267
  cmd_parts.append(f'--weight-format {weight_format}')
268
-
269
- if library and library != 'auto':
 
 
 
270
  cmd_parts.append(f'--library {library}')
271
-
272
- if ratio is not None and ratio != 0:
 
 
 
 
 
 
 
 
 
 
273
  cmd_parts.append(f'--ratio {ratio}')
274
-
275
- if group_size is not None and group_size != 0:
276
- cmd_parts.append(f'--group-size {group_size}')
277
-
278
- if backup_precision:
 
 
 
279
  cmd_parts.append(f'--backup-precision {backup_precision}')
280
-
281
- if dataset and dataset != 'none':
282
  cmd_parts.append(f'--dataset {dataset}')
283
-
284
- # Boolean flags - only add if True
285
- if trust_remote_code:
286
- cmd_parts.append('--trust-remote-code')
287
- if disable_stateful:
288
- cmd_parts.append('--disable-stateful')
289
- if disable_convert_tokenizer:
290
- cmd_parts.append('--disable-convert-tokenizer')
291
- if all_layers:
292
  cmd_parts.append('--all-layers')
293
- if awq:
 
294
  cmd_parts.append('--awq')
295
- if scale_estimation:
 
296
  cmd_parts.append('--scale-estimation')
297
- if gptq:
 
298
  cmd_parts.append('--gptq')
299
- if lora_correction:
 
300
  cmd_parts.append('--lora-correction')
301
- if sym:
302
- cmd_parts.append('--sym')
303
-
304
- # Additional optional arguments - only add if they have values
305
- if quant_mode:
306
- cmd_parts.append(f'--quant-mode {quant_mode}')
307
- if cache_dir:
308
- cmd_parts.append(f'--cache_dir "{cache_dir}"')
309
- if pad_token_id is not None and pad_token_id != 0:
310
- cmd_parts.append(f'--pad-token-id {pad_token_id}')
311
- if sensitivity_metric:
312
  cmd_parts.append(f'--sensitivity-metric {sensitivity_metric}')
313
- if num_samples is not None and num_samples != 0:
314
- cmd_parts.append(f'--num-samples {num_samples}')
315
- if smooth_quant_alpha is not None and smooth_quant_alpha != 0:
 
 
316
  cmd_parts.append(f'--smooth-quant-alpha {smooth_quant_alpha}')
317
 
318
- cmd_parts.append(f'"{output_path}"')
 
 
 
 
 
 
 
319
 
320
  constructed_command = ' '.join(cmd_parts)
321
  return constructed_command
322
 
323
  def gradio_app(self):
324
  """Create and run the Gradio interface."""
 
325
  inputs = [
326
  self.model_input,
327
  self.output_path,
328
  self.task,
329
  self.framework,
 
330
  self.weight_format,
 
331
  self.library,
 
 
 
 
332
  self.ratio,
 
333
  self.group_size,
334
  self.backup_precision,
335
  self.dataset,
336
- self.trust_remote_code,
337
- self.disable_stateful,
338
- self.disable_convert_tokenizer,
339
  self.all_layers,
340
  self.awq,
341
- self.scale_estimation,
342
- self.gptq,
343
- self.lora_correction,
344
- self.sym,
345
- self.quant_mode,
346
- self.cache_dir,
347
- self.pad_token_id,
348
- self.sensitivity_metric,
349
- self.num_samples,
350
- self.smooth_quant_alpha,
351
  ]
352
  interface = gr.Interface(
353
  fn=self.construct_command,
354
  inputs=inputs,
355
  outputs=self.command_output,
356
- title="OpenVINO Conversion Tool",
357
- description="Enter model information to generate an `optimum-cli` export command.",
358
- # article=INTRODUCTION,
359
- allow_flagging='auto'
 
 
 
 
360
  )
361
 
362
-
363
  return interface
364
 
365
- if __name__ == "__main__":
366
- tool = ConversionTool()
367
- app = tool.gradio_app()
368
- app.launch(share = False)
369
-
370
-
371
-
 
1
  import gradio as gr
2
+ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # Needed for the cache_dir placeholder default below
3
+
4
+ # Dynamically get tasks (approximation, as original script uses TasksManager)
5
+ # In a real scenario, this might need a more robust way to get tasks if TasksManager is available
6
+ # For now, using the list from the previous file content + info from the script
7
+ AVAILABLE_TASKS = [
8
+ 'image-to-image', 'image-segmentation', 'image-text-to-text', 'inpainting',
9
+ 'sentence-similarity', 'text-to-audio', 'image-to-text',
10
+ 'automatic-speech-recognition', 'token-classification', 'text-to-image',
11
+ 'audio-classification', 'feature-extraction', 'semantic-segmentation',
12
+ 'masked-im', 'audio-xvector', 'audio-frame-classification',
13
+ 'text2text-generation', 'multiple-choice', 'depth-estimation',
14
+ 'image-classification', 'fill-mask', 'zero-shot-object-detection',
15
+ 'object-detection', 'question-answering', 'zero-shot-image-classification',
16
+ 'mask-generation', 'text-generation', 'text-classification',
17
+ 'text-generation-with-past'
18
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  class ConversionTool:
21
  def __init__(self):
 
23
  self.model_input = gr.Textbox(
24
  label='Model',
25
  placeholder='Model ID on huggingface.co or path on disk',
26
+ info="Model ID on huggingface.co or path on disk to load model from." # Updated info
27
  )
28
 
29
  self.output_path = gr.Textbox(
30
  label='Output Directory',
31
  placeholder='Path to store the generated OV model',
32
+ info="Path indicating the directory where to store the generated OV model." # Updated info
33
  )
34
 
35
  self.task = gr.Dropdown(
36
  label='Task',
37
+ choices=['auto'] + AVAILABLE_TASKS,
38
+ value='auto', # Default value is 'auto'
39
+ info=( # Updated info
40
+ "The task to export the model for. If not specified, the task will be auto-inferred based on metadata in the model repository."
41
+
42
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  )
44
 
45
  self.framework = gr.Dropdown(
46
  label='Framework',
47
+ choices=[None, 'pt', 'tf'], # Added None option
48
+ value=None,
49
+ info=( # Updated info
50
+ "The framework to use for the export. If not provided, will attempt to use the local checkpoint's "
51
+ "original framework or what is available in the environment."
52
+ )
53
+ )
54
+
55
+ self.trust_remote_code = gr.Checkbox( # Added trust_remote_code
56
+ label='Trust Remote Code',
57
+ value=False,
58
+ info=(
59
+ "Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which "
60
+ "you have read the code, as it will execute on your local machine arbitrary code present in the model repository."
61
+ )
62
  )
63
 
64
  self.weight_format = gr.Dropdown(
65
  label='Weight Format',
66
+ choices=['fp32', 'fp16', 'int8', 'int4', 'mxfp4', 'nf4'],
67
+ value=None,
68
+ info="The weight format of the exported model." # Updated info
69
+ )
70
+
71
+ self.quant_mode = gr.Dropdown( # Added quant_mode
72
+ label='Quantization Mode',
73
+ choices=[None, 'int8', 'f8e4m3', 'f8e5m2', 'nf4_f8e4m3', 'nf4_f8e5m2', 'int4_f8e4m3', 'int4_f8e5m2'],
74
  value=None,
75
+ info=(
76
+ "Quantization precision mode. This is used for applying full model quantization including activations. "
77
+ )
78
  )
79
+
80
  self.library = gr.Dropdown(
81
  label='Library',
82
  choices=[
83
+ None, # Added None option
84
+ 'transformers',
85
+ 'diffusers',
86
  'timm',
87
+ 'sentence_transformers',
88
  'open_clip'
89
  ],
90
+ value=None, # Default is None, inferred later
91
+ info="The library used to load the model before export. If not provided, will attempt to infer the local checkpoint's library" # Updated info
92
+ )
93
+
94
+ self.cache_dir = gr.Textbox( # Added cache_dir
95
+ label='Cache Directory',
96
+ placeholder=f'Default: {HUGGINGFACE_HUB_CACHE}', # Use imported default
97
+ value=None, # Default to None, let the script handle the default path
98
+ info="The path to a directory in which the downloaded model should be cached if the standard cache should not be used."
99
+ )
100
+
101
+ self.pad_token_id = gr.Number( # Added pad_token_id
102
+ label='Pad Token ID',
103
+ value=None,
104
+ step=1,
105
+ info=(
106
+ "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
107
+ )
108
+ )
109
+
110
+ self.variant = gr.Textbox( # Added variant
111
+ label='Variant',
112
+ value=None,
113
+ info="If specified load weights from variant filename."
114
  )
115
 
116
  self.ratio = gr.Number(
117
  label='Ratio',
118
+ value=None, # Default is None
119
  minimum=0.0,
120
+ maximum=1.0, # Max is 1.0 according to help text
121
+ step=0.1,
122
+ info=( # Updated info
123
+ "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
124
+ "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
125
+ "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
126
+ )
127
+ )
128
+
129
+ self.sym = gr.Checkbox( # Moved sym higher to group with quantization params
130
+ label='Symmetric Quantization',
131
+ value=None, # Default is None in script
132
+ info=("Whether to apply symmetric quantization") # Updated info
133
  )
134
 
135
  self.group_size = gr.Number(
136
  label='Group Size',
137
+ value=None, # Default is None
138
+ step=1,
139
+ info=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.") # Updated info
140
  )
141
 
142
  self.backup_precision = gr.Dropdown(
143
  label='Backup Precision',
144
+ choices=[None, 'none', 'int8_sym', 'int8_asym'], # Added None and 'none'
145
+ value=None, # Default is None
146
+ info=( # Updated info
147
+ "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. "
148
+ "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of "
149
+ "the model weights, in this case weights are retained in their original precision without any "
150
+ "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' "
151
+ "stands for 8-bit integer asymmetric quantization with zero points per each quantization group."
152
+ )
153
  )
154
 
155
  self.dataset = gr.Dropdown(
156
  label='Dataset',
157
+ choices=[None, # Added None option
158
+ 'auto',
159
+ 'wikitext2',
160
+ 'c4',
161
+ 'c4-new',
162
  'contextual',
163
+ 'conceptual_captions',
164
+ 'laion/220k-GPT4Vision-captions-from-LIVIS',
165
+ 'laion/filtered-wit'],
166
+ value=None,
167
+ info=( # Updated info
168
+ "The dataset used for data-aware compression or quantization with NNCF. "
169
+ "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the "
170
+ "dataset will be collected from model's generations. "
171
+ "For diffusion models it should be on of ['conceptual_captions',"
172
+ "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
173
+ "For visual language models the dataset must be set to 'contextual'. "
174
+ "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
175
+ "equals 1.0, the dataset argument will not have an effect on the resulting model."
176
+ )
177
  )
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  self.all_layers = gr.Checkbox(
180
+ label='All Layers',
181
+ value=None, # Default is None in script
182
+ info=( # Updated info
183
+ "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight "
184
+ "compression is applied, they are compressed to INT8."
185
+ )
186
+ )
187
+
188
  self.awq = gr.Checkbox(
189
+ label='AWQ',
190
+ value=None, # Default is None in script
191
+ info=( # Updated info
192
+ "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
193
+ "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
194
+ "argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such "
195
+ "case it will be skipped."
196
+ )
 
 
 
 
 
 
 
 
 
 
 
 
197
  )
198
+
199
+ self.scale_estimation = gr.Checkbox( # Added scale_estimation
200
+ label='Scale Estimation',
201
+ value=None, # Default is None in script
202
+ info=(
203
+ "Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original "
204
+ "and compressed layers. Providing a dataset is required to run scale estimation. Please note, that "
205
+ "applying scale estimation takes additional memory and time."
206
+ )
207
  )
208
 
209
+ self.gptq = gr.Checkbox( # Added gptq
210
+ label='GPTQ',
211
+ value=None, # Default is None in script
212
+ info=(
213
+ "Indicates whether to apply GPTQ algorithm that optimizes compressed weights in a layer-wise fashion to "
214
+ "minimize the difference between activations of a compressed and original layer. Please note, that "
215
+ "applying GPTQ takes additional memory and time."
216
+ )
217
  )
218
 
219
+ self.lora_correction = gr.Checkbox( # Added lora_correction
220
+ label='LoRA Correction',
221
+ value=None, # Default is None in script
222
+ info=(
223
+ "Indicates whether to apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank "
224
+ "adaptation layers in the model that can recover accuracy after weight compression at some cost of "
225
+ "inference latency. Please note, that applying LoRA Correction algorithm takes additional memory and time."
226
+ )
227
  )
228
 
229
+ self.sensitivity_metric = gr.Dropdown( # Added sensitivity_metric
230
  label='Sensitivity Metric',
231
+ choices=[None, 'weight_quantization_error', 'hessian_input_activation',
232
+ 'mean_activation_variance', 'max_activation_variance', 'mean_activation_magnitude'],
233
+ value=None,
234
+ info=(
235
+ "The sensitivity metric for assigning quantization precision to layers. It can be one of the following: "
236
+ "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
237
+ "'max_activation_variance', 'mean_activation_magnitude']."
238
+ )
239
  )
240
 
241
+ self.num_samples = gr.Number( # Added num_samples
242
  label='Number of Samples',
243
  value=None,
244
+ step=1,
245
+ info="The maximum number of samples to take from the dataset for quantization." # Updated info
246
+ )
247
+
248
+ self.disable_stateful = gr.Checkbox(
249
+ label='Disable Stateful',
250
+ value=False, # Default is False (stateful is enabled by default)
251
+ info=( # Updated info
252
+ "Disable stateful converted models, stateless models will be generated instead. Stateful models are produced by default when this key is not used. "
253
+ "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. "
254
+ "If --disable-stateful option is used, it may result in sub-optimal inference performance. "
255
+ "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing "
256
+ "OpenVINO native inference code that expects KV-cache inputs and outputs in the model."
257
+ )
258
+ )
259
+
260
+ self.disable_convert_tokenizer = gr.Checkbox(
261
+ label='Disable Convert Tokenizer',
262
+ value=False, # Default is False (conversion is enabled by default)
263
+ info="Do not add converted tokenizer and detokenizer OpenVINO models." # Updated info
264
  )
265
 
266
+ self.smooth_quant_alpha = gr.Number( # Added smooth_quant_alpha
267
  label='Smooth Quant Alpha',
268
  value=None,
269
  minimum=0.0,
270
  maximum=1.0,
271
+ step=0.1,
272
+ info=(
273
+ "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and "
274
+ "reduces quantization error. Valid only when activations quantization is enabled."
275
+ )
276
  )
277
 
278
  self.command_output = gr.TextArea(
 
283
  lines=5 # Adjust height
284
  )
285
 
286
+ def construct_command(self, model_input, output_path, task, framework, trust_remote_code, # Added trust_remote_code
287
+ weight_format, quant_mode, library, cache_dir, pad_token_id, variant, # Added new args
288
+ ratio, sym, group_size, backup_precision, dataset, all_layers, # Added sym
289
+ awq, scale_estimation, gptq, lora_correction, sensitivity_metric, num_samples, # Added new args
290
+ disable_stateful, disable_convert_tokenizer, smooth_quant_alpha): # Added smooth_quant_alpha
 
291
  """Construct the command string"""
292
  if not model_input or not output_path:
293
  return ''
294
+
295
  cmd_parts = ['optimum-cli export openvino']
296
  cmd_parts.append(f'-m "{model_input}"')
297
 
298
  if task and task != 'auto':
299
  cmd_parts.append(f'--task {task}')
300
+
301
  if framework:
302
  cmd_parts.append(f'--framework {framework}')
303
+
304
+ if trust_remote_code: # Added trust_remote_code flag
305
+ cmd_parts.append('--trust-remote-code')
306
+
307
+ if weight_format: # Check if not None/empty
308
  cmd_parts.append(f'--weight-format {weight_format}')
309
+
310
+ if quant_mode: # Added quant_mode
311
+ cmd_parts.append(f'--quant-mode {quant_mode}')
312
+
313
+ if library: # Check if not None/empty
314
  cmd_parts.append(f'--library {library}')
315
+
316
+ if cache_dir: # Added cache_dir
317
+ cmd_parts.append(f'--cache_dir "{cache_dir}"')
318
+
319
+ if pad_token_id is not None: # Added pad_token_id; 0 is a valid token ID
320
+ cmd_parts.append(f'--pad-token-id {int(pad_token_id)}') # Ensure int
321
+
322
+ if variant: # Added variant
323
+ cmd_parts.append(f'--variant "{variant}"')
324
+
325
+ # Compression/Quantization specific args
326
+ if ratio: # Truthy check: skips None and 0
327
  cmd_parts.append(f'--ratio {ratio}')
328
+
329
+ if sym: # Only add the flag when True
330
+ cmd_parts.append('--sym')
331
+
332
+ if group_size: # Truthy check: skips None and 0
333
+ cmd_parts.append(f'--group-size {int(group_size)}') # Ensure int
334
+
335
+ if backup_precision: # Check if not None/empty
336
  cmd_parts.append(f'--backup-precision {backup_precision}')
337
+
338
+ if dataset: # Check if not None/empty
339
  cmd_parts.append(f'--dataset {dataset}')
340
+
341
+ if all_layers: # Only add the flag when True
 
 
 
 
 
 
 
342
  cmd_parts.append('--all-layers')
343
+
344
+ if awq: # Only add the flag when True
345
  cmd_parts.append('--awq')
346
+
347
+ if scale_estimation: # Added scale_estimation flag
348
  cmd_parts.append('--scale-estimation')
349
+
350
+ if gptq: # Only add the flag when True
351
  cmd_parts.append('--gptq')
352
+
353
+ if lora_correction: # Added lora_correction flag
354
  cmd_parts.append('--lora-correction')
355
+
356
+ if sensitivity_metric: # Added sensitivity_metric
 
 
 
 
 
 
 
 
 
357
  cmd_parts.append(f'--sensitivity-metric {sensitivity_metric}')
358
+
359
+ if num_samples: # Added num_samples
360
+ cmd_parts.append(f'--num-samples {int(num_samples)}') # Ensure int
361
+
362
+ if smooth_quant_alpha: # Added smooth_quant_alpha
363
  cmd_parts.append(f'--smooth-quant-alpha {smooth_quant_alpha}')
364
 
365
+ # Other boolean flags
366
+ if disable_stateful: # Default is False, only add if True
367
+ cmd_parts.append('--disable-stateful')
368
+ if disable_convert_tokenizer: # Default is False, only add if True
369
+ cmd_parts.append('--disable-convert-tokenizer')
370
+
371
+ # Output path is always last and required
372
+ cmd_parts.append(f'"{output_path}"')
373
 
374
  constructed_command = ' '.join(cmd_parts)
375
  return constructed_command
376
 
377
  def gradio_app(self):
378
  """Create and run the Gradio interface."""
379
+ # Define inputs in the order they appear visually (or logically)
380
  inputs = [
381
  self.model_input,
382
  self.output_path,
383
  self.task,
384
  self.framework,
385
+ self.trust_remote_code, # Added
386
  self.weight_format,
387
+ self.quant_mode, # Added
388
  self.library,
389
+ self.cache_dir, # Added
390
+ self.pad_token_id, # Added
391
+ self.variant, # Added
392
+ # Quantization/Compression Group
393
  self.ratio,
394
+ self.sym, # Added
395
  self.group_size,
396
  self.backup_precision,
397
  self.dataset,
 
 
 
398
  self.all_layers,
399
  self.awq,
400
+ self.scale_estimation, # Added
401
+ self.gptq, # Added
402
+ self.lora_correction, # Added
403
+ self.sensitivity_metric, # Added
404
+ self.num_samples, # Added
405
+ self.smooth_quant_alpha, # Added
406
+ # Other Flags
407
+ self.disable_stateful,
408
+ self.disable_convert_tokenizer,
 
409
  ]
410
  interface = gr.Interface(
411
  fn=self.construct_command,
412
  inputs=inputs,
413
  outputs=self.command_output,
414
+ title="OpenVINO IR Model Conversion Tool",
415
+ description="""
416
+ Enter model information to generate an `optimum-cli export openvino` command.
417
+ Use the arguments below to configure the export process based on the OpenVINO exporter documentation.
418
+ Then run the generated command in the terminal where your OpenArc environment is activated.
419
+ """,
420
+ # article=INTRODUCTION, # Assuming INTRODUCTION is defined elsewhere or commented out
421
+ flagging_mode='auto' # Keep or remove based on preference
422
  )
423
 
 
424
  return interface
425
 
426
+ # Example usage (optional, keep commented out for library use)
427
+ # if __name__ == "__main__":
428
+ # tool = ConversionTool()
429
+ # app = tool.gradio_app()
430
+ # app.launch(share=False)
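
For reference, here is a minimal sketch of driving the command builder directly from Python instead of through the Gradio UI. It assumes the file above is saved as `app.py` and importable, with `gradio` installed; the model ID, output directory, and quantization settings are hypothetical placeholders, not recommendations.

```python
# Minimal sketch: build an export command without launching the UI.
# Assumes app.py (the file in this commit) is importable and gradio is installed.
from app import ConversionTool

tool = ConversionTool()

# Hypothetical model/output values; every other argument mirrors a UI control
# and is left at its "unset" value (None/False) so it is omitted from the command.
cmd = tool.construct_command(
    model_input="microsoft/Phi-3-mini-4k-instruct",   # hypothetical model ID
    output_path="phi3-mini-int4-ov",                  # hypothetical output directory
    task="text-generation-with-past",
    framework=None,
    trust_remote_code=True,
    weight_format="int4",
    quant_mode=None,
    library=None,
    cache_dir=None,
    pad_token_id=None,
    variant=None,
    ratio=None,
    sym=True,
    group_size=128,
    backup_precision=None,
    dataset=None,
    all_layers=None,
    awq=None,
    scale_estimation=None,
    gptq=None,
    lora_correction=None,
    sensitivity_metric=None,
    num_samples=None,
    disable_stateful=False,
    disable_convert_tokenizer=False,
    smooth_quant_alpha=None,
)
print(cmd)
# Printed as a single line, roughly:
# optimum-cli export openvino -m "microsoft/Phi-3-mini-4k-instruct" --task text-generation-with-past
#   --trust-remote-code --weight-format int4 --sym --group-size 128 "phi3-mini-int4-ov"
```

The printed string is the same text the interface shows in its command output box; it can be copied into a terminal where the `optimum[openvino]` dependencies are installed.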