Bradarr committed on
Commit 37e38a4 · verified · 1 Parent(s): 2542cd8

Update app.py

Files changed (1)
  1. app.py +44 -26
app.py CHANGED
@@ -1,6 +1,6 @@
+
 # Importing required libraries
 import warnings
-
 warnings.filterwarnings("ignore")
 
 import os
@@ -16,7 +16,6 @@ import spaces
 # Download gguf model files (Simplified for the specified models)
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN") # Ensure token is set
 
-
 def download_model(repo_id, filename):
     try:
         hf_hub_download(
@@ -37,17 +36,16 @@ if not os.path.exists("./models/google.gemma-3-1b-pt.Q4_K_M.gguf"):
 if not os.path.exists("./models/google.gemma-3-12b-pt.Q4_K_M.gguf"):
     download_model("DevQuasar/google.gemma-3-12b-pt-GGUF", "google.gemma-3-12b-pt.Q4_K_M.gguf")
 if not os.path.exists("./models/google.gemma-3-4b-pt.Q4_K_M.gguf"): # Example from original, in case needed.
-    download_model("DevQuasar/google.gemma-3-4b-pt-GGUF", "google.gemma-3-4b-pt.Q4_K_M.gguf")
+    download_model("DevQuasar/google.gemma-3-4b-pt-GGUF", "google.gemma-3-4b-pt.Q4_K_M.gguf")
 
 
 # Set the title and description
-title = "Gemma Markdown Generation"
-description = """Gemma models for markdown text generation and notebook continuation. This interface is designed for generating markdown text continuations, not for interactive chat."""
+title = "Gemma Text Generation"
+description = """Gemma models for text generation and notebook continuation. This interface is designed for generating text continuations, not for interactive chat."""
 
 llm = None
 llm_model = None
 
-
 @spaces.GPU
 def generate_text(
     prompt: str,
@@ -60,6 +58,18 @@ def generate_text(
 ) -> Generator[str, None, None]:
     """
     Generates text based on a prompt, using the specified Gemma model.
+
+    Args:
+        prompt (str): The initial text to continue.
+        model (str): The model file to use (without path).
+        max_tokens (int): Maximum number of tokens to generate.
+        temperature (float): Controls randomness.
+        top_p (float): Nucleus sampling parameter.
+        top_k (int): Top-k sampling parameter.
+        repeat_penalty (float): Penalty for repeating tokens.
+
+    Yields:
+        str: Generated text chunks, streamed as they become available.
     """
     try:
         global llm
@@ -68,7 +78,7 @@ def generate_text(
         model_path = os.path.join("models", model)
         if not os.path.exists(model_path):
             raise FileNotFoundError(f"Model file not found: {model_path}")
-
+
         # Load the model (only if it's a new model)
         if llm is None or llm_model != model:
             logging.info(f"Loading model: {model}")
@@ -76,10 +86,10 @@ def generate_text(
                 model_path=model_path,
                 flash_attn=True,
                 n_gpu_layers=999, # Adjust based on your GPU availability
-                n_ctx=4096, # Context window size. Can increase.
-                n_threads=4, # Adjust as needed for performance.
+                n_ctx=4096, # Context window size. Can increase.
+                n_threads=4, # Adjust as needed for performance.
                 n_threads_batch=4,
-                verbose=False, # Reduce unnecessary verbosity
+                verbose=False # Reduce unnecessary verbosity
             )
             llm_model = model
 
@@ -91,8 +101,8 @@ def generate_text(
             top_p=top_p,
             top_k=top_k,
             repeat_penalty=repeat_penalty,
-            stream=True, # Ensure streaming is on
-            stop=["<|im_end|>", "<|endoftext|>", "<|file_separator|>"], # Add appropriate stop tokens
+            stream=True, # Ensure streaming is on
+            stop=["<|im_end|>", "<|endoftext|>", "<|file_separator|>"], # Add appropriate stop tokens
         ):
             text_chunk = token["choices"][0]["text"]
             yield text_chunk
@@ -102,8 +112,8 @@
 
 
 def clear_history():
-    """Clears the text input."""
-    return ""
+    """Clears the text input."""
+    return ""
 
 
 with gr.Blocks(theme="Ocean", title=title) as demo:
@@ -118,8 +128,13 @@ with gr.Blocks(theme="Ocean", title=title) as demo:
                 lines=10,
             )
             clear_button = gr.Button("Clear Input")
-
-            output_markdown = gr.Markdown(label="Generated Markdown")
+
+            output_textbox = gr.Textbox( # Changed to Textbox for streaming
+                label="Generated Text",
+                lines=10, # Added lines for better display of longer outputs
+                interactive=False # Output shouldn't be editable
+            )
+
 
         with gr.Column(scale=1):
             submit_button = gr.Button("Generate", variant="primary")
@@ -177,15 +192,16 @@ with gr.Blocks(theme="Ocean", title=title) as demo:
                 info="Penalize repeated words (higher = less repetition)",
            )
 
-    def streaming_markdown_output(prompt, model, max_tokens, temperature, top_p, top_k, repeat_penalty):
-        """Wraps the generator for Gradio and renders as Markdown."""
-        generated_markdown = ""
-        for text_chunk in generate_text(prompt, model, max_tokens, temperature, top_p, top_k, repeat_penalty):
-            generated_markdown += text_chunk
-            yield generated_markdown
+
+    def streaming_output(prompt, model, max_tokens, temperature, top_p, top_k, repeat_penalty):
+        """Wraps the generator for Gradio."""
+        generated_text = ""
+        for text_chunk in generate_text(prompt, model, max_tokens, temperature, top_p, top_k, repeat_penalty):
+            generated_text += text_chunk
+            yield generated_text
 
     submit_button.click(
-        streaming_markdown_output,
+        streaming_output,
         [
             input_textbox,
             model_dropdown,
@@ -195,10 +211,12 @@ with gr.Blocks(theme="Ocean", title=title) as demo:
             top_k_slider,
             repeat_penalty_slider,
         ],
-        output_markdown,
+        output_textbox,
     )
-
+
     clear_button.click(clear_history, [], input_textbox)
 
+
+
 if __name__ == "__main__":
-    demo.launch(debug=False, share=False)
+    demo.launch(debug=False, share=False) # Added share=False for clearer local-only run.
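
For reference, a minimal, self-contained sketch of the generator-streaming pattern this revision adopts: a plain Python generator accumulates text chunks and yields the growing string, and Gradio re-renders the bound Textbox on every yield. The llama.cpp call is replaced by a dummy token source here (fake_token_stream is illustrative, not part of app.py), so the snippet runs without a GPU or downloaded models.

# Sketch: stream text into a Gradio Textbox via a generator function.
import time
import gradio as gr

def fake_token_stream(prompt: str):
    # Stand-in for the llm(...) call: echo the prompt back one word at a time.
    for word in prompt.split():
        time.sleep(0.1)
        yield word + " "

def streaming_output(prompt: str):
    """Accumulate chunks and yield the growing string; Gradio updates the output on each yield."""
    generated_text = ""
    for chunk in fake_token_stream(prompt):
        generated_text += chunk
        yield generated_text

with gr.Blocks() as demo:
    input_textbox = gr.Textbox(label="Prompt", lines=3)
    output_textbox = gr.Textbox(label="Generated Text", lines=5, interactive=False)
    gr.Button("Generate").click(streaming_output, input_textbox, output_textbox)

if __name__ == "__main__":
    demo.launch()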