Update app.py
app.py CHANGED

@@ -306,82 +306,83 @@ Name for your new endpoint""")
             )

             with gr.Column(elem_classes=["group-border"]):
-                with gr.
-                    gr.Markdown("""### Container Type
-
-                    Text Generation Inference is an optimized container for text generation task""")
-                    _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
-
-                    with gr.Row():
-                        with gr.Column():
-                            gr.Markdown("""### Custom Cuda Kernels
-
-                            TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
-                            _ = gr.Dropdown(
-                                value="Enabled",
-                                choices=["Enabled", "Disabled"],
-                                interactive=True,
-                                show_label=False,
-                                elem_classes=["no-label", "small-big"]
-                            )
-
-                        with gr.Column():
-                            gr.Markdown("""### Quantization
-
-                            Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
-                            _ = gr.Dropdown(
-                                value="None",
-                                choices=["None", "Bitsandbytes", "GPTQ"],
-                                interactive=True,
-                                show_label=False,
-                                elem_classes=["no-label", "small-big"]
-                            )
-
-                    with gr.Row():
-                        with gr.Column():
-                            gr.Markdown("""### Max Input Length (per Query)
-
-                            Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
-                            _ = gr.Number(
-                                value=1024,
-                                interactive=True,
-                                show_label=False,
-                                elem_classes=["no-label", "small-big"]
-                            )
-
-                        with gr.Column():
-                            gr.Markdown("""### Max Number of Tokens (per Query)
-
-                            The larger this value, the more memory each request will consume and the less effective batching can be.""")
-                            _ = gr.Number(
-                                value=1512,
-                                interactive=True,
-                                show_label=False,
-                                elem_classes=["no-label", "small-big"]
-                            )
-
-                    with gr.Row():
-                        with gr.Column():
-                            gr.Markdown("""###
-
-
-                            _ = gr.
+                with gr.Accordion("Serving Container", open=False):
+                    with gr.Column():
+                        gr.Markdown("""### Container Type
+
+                        Text Generation Inference is an optimized container for text generation task""")
+                        _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+                        with gr.Row():
+                            with gr.Column():
+                                gr.Markdown("""### Custom Cuda Kernels
+
+                                TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                                _ = gr.Dropdown(
+                                    value="Enabled",
+                                    choices=["Enabled", "Disabled"],
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )
+
+                            with gr.Column():
+                                gr.Markdown("""### Quantization
+
+                                Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                                _ = gr.Dropdown(
+                                    value="None",
+                                    choices=["None", "Bitsandbytes", "GPTQ"],
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )
+
+                        with gr.Row():
+                            with gr.Column():
+                                gr.Markdown("""### Max Input Length (per Query)
+
+                                Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                                _ = gr.Number(
+                                    value=1024,
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )
+
+                            with gr.Column():
+                                gr.Markdown("""### Max Number of Tokens (per Query)
+
+                                The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                                _ = gr.Number(
+                                    value=1512,
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )
+
+                        with gr.Row():
+                            with gr.Column():
+                                gr.Markdown("""### Max Batch Prefill Tokens
+
+                                Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                                _ = gr.Number(
+                                    value=2048,
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )
+
+                            with gr.Column():
+                                gr.Markdown("""### Max Batch Total Tokens
+
+                                Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                                _ = gr.Number(
+                                    value=None,
+                                    interactive=True,
+                                    show_label=False,
+                                    elem_classes=["no-label", "small-big"]
+                                )

             submit_button = gr.Button(
                 value="Submit",
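
For orientation, the commit wraps the serving options in a collapsed gr.Accordion instead of laying them directly in the group-border column, and exposes two continuous-batching fields (Max Batch Prefill Tokens, Max Batch Total Tokens). The snippet below is a minimal standalone sketch of that layout, not the Space's actual app.py: the Blocks scaffolding, variable names, labels, and the launch call are assumptions; the choices and default values mirror the diff.

# Minimal illustrative sketch (assumed scaffolding, not the real app.py):
# serving options grouped under a collapsed Accordion so the main form stays compact.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column():
        with gr.Accordion("Serving Container", open=False):
            gr.Markdown("Text Generation Inference is an optimized container for text generation task")
            with gr.Row():
                # Kernel and quantization choices, as in the diff
                kernels = gr.Dropdown(choices=["Enabled", "Disabled"], value="Enabled", label="Custom CUDA Kernels")
                quantization = gr.Dropdown(choices=["None", "Bitsandbytes", "GPTQ"], value="None", label="Quantization")
            with gr.Row():
                # Per-query limits
                max_input_length = gr.Number(value=1024, label="Max Input Length (per Query)")
                max_total_tokens = gr.Number(value=1512, label="Max Number of Tokens (per Query)")
            with gr.Row():
                # Continuous-batching limits added by this commit
                max_batch_prefill_tokens = gr.Number(value=2048, label="Max Batch Prefill Tokens")
                max_batch_total_tokens = gr.Number(value=None, label="Max Batch Total Tokens")
        submit_button = gr.Button(value="Submit")

if __name__ == "__main__":
    demo.launch()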