Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Runtime error

App Files Files Community

Luigi committed on Jun 18

Commit

22b94a2

1 Parent(s): 427620d

Expose n_threads so it can be set by the user

Browse files

Files changed (1) hide show

app.py +27 -15

app.py CHANGED Viewed

@@ -49,6 +49,7 @@ model_cache = {
     'model_file': None,
     'clip_file': None,
     'verbose': None,
     'llm': None
 }
@@ -83,9 +84,10 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
         "{% if add_generation_prompt %}Assistant:{% endif %}"
     )
-# Load and cache LLM (only on dropdown or verbose change)
-def update_llm(size, model_file, clip_file, verbose_mode):
-    if (model_cache['size'], model_cache['model_file'], model_cache['clip_file'], model_cache['verbose']) != (size, model_file, clip_file, verbose_mode):
         mf, cf = ensure_weights(MODELS[size], model_file, clip_file)
         handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
         llm = Llama(
@@ -93,9 +95,9 @@ def update_llm(size, model_file, clip_file, verbose_mode):
             chat_handler=handler,
             n_ctx=512,
             verbose=verbose_mode,
-            n_threads=max(2, os.cpu_count())
         )
-        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'verbose': verbose_mode, 'llm': llm})
     return None
 # Build weight filename lists
@@ -155,6 +157,8 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
     t_start = time.time()
     buf = io.StringIO()
@@ -188,6 +192,7 @@ def main():
     logging.basicConfig(level=logging.INFO)
     default = '256M'
     default_verbose = True
     mf, cf = get_weight_files(default)
     with gr.Blocks() as demo:
@@ -197,33 +202,40 @@ def main():
             model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
             clip_dd   = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
             verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
-        def on_size_change(sz, verbose):
             mlist, clist = get_weight_files(sz)
-            update_llm(sz, mlist[0], clist[0], verbose)
             return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
         size_dd.change(
             fn=on_size_change,
-            inputs=[size_dd, verbose_cb],
             outputs=[model_dd, clip_dd]
         )
         model_dd.change(
-            fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
-            inputs=[size_dd, model_dd, clip_dd, verbose_cb],
             outputs=[]
         )
         clip_dd.change(
-            fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
-            inputs=[size_dd, model_dd, clip_dd, verbose_cb],
             outputs=[]
         )
         verbose_cb.change(
-            fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
-            inputs=[size_dd, model_dd, clip_dd, verbose_cb],
             outputs=[]
         )
-        update_llm(default, mf[0], cf[0], default_verbose)
         interval   = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
         sys_p      = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')

     'model_file': None,
     'clip_file': None,
     'verbose': None,
+    'n_threads': None,
     'llm': None
 }
         "{% if add_generation_prompt %}Assistant:{% endif %}"
     )
+# Load and cache LLM (only on dropdown or verbose or thread change)
+def update_llm(size, model_file, clip_file, verbose_mode, n_threads):
+    # Only reload if any of the parameters changed
+    if (model_cache['size'], model_cache['model_file'], model_cache['clip_file'], model_cache['verbose'], model_cache['n_threads']) != (size, model_file, clip_file, verbose_mode, n_threads):
         mf, cf = ensure_weights(MODELS[size], model_file, clip_file)
         handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
         llm = Llama(
             chat_handler=handler,
             n_ctx=512,
             verbose=verbose_mode,
+            n_threads=n_threads
         )
+        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'verbose': verbose_mode, 'n_threads': n_threads, 'llm': llm})
     return None
 # Build weight filename lists
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
+    if model_cache.get('n_threads') is not None:
+        debug_msgs.append(f"[{timestamp}] llama_cpp n_threads = {model_cache['n_threads']}")
     t_start = time.time()
     buf = io.StringIO()
     logging.basicConfig(level=logging.INFO)
     default = '256M'
     default_verbose = True
+    default_threads = os.cpu_count() or 1
     mf, cf = get_weight_files(default)
     with gr.Blocks() as demo:
             model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
             clip_dd   = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
             verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
+            thread_dd = gr.Slider(minimum=1, maximum=default_threads, step=1, value=default_threads, label='CPU Threads (n_threads)')
+        def on_size_change(sz, verbose, n_threads):
             mlist, clist = get_weight_files(sz)
+            update_llm(sz, mlist[0], clist[0], verbose, n_threads)
             return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
         size_dd.change(
             fn=on_size_change,
+            inputs=[size_dd, verbose_cb, thread_dd],
             outputs=[model_dd, clip_dd]
         )
         model_dd.change(
+            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
+            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
             outputs=[]
         )
         clip_dd.change(
+            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
+            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
             outputs=[]
         )
         verbose_cb.change(
+            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
+            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
             outputs=[]
         )
+        thread_dd.change(
+            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
+            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
+            outputs=[]
+        )
+        # Initial load
+        update_llm(default, mf[0], cf[0], default_verbose, default_threads)
         interval   = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
         sys_p      = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')