Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Aug 6

Commit

0def226

verified ·

1 Parent(s): 60ed470

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -9

app.py CHANGED Viewed

@@ -8,31 +8,39 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
 captioning_pipeline = pipeline(
     "image-to-text",
-    model="Salesforce/blip-image-captioning-large", #
     device=device_id
 )
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-        results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
@@ -42,8 +50,10 @@ def analyze_image_with_free_model(image_file):
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
@@ -64,6 +74,7 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 css = """
 #col-container{
     margin: 0 auto;
@@ -116,9 +127,11 @@ with gr.Blocks(css=css) as demo:
     This app is a testament to the creative possibilities that emerge when technology meets art.
     Enjoy exploring the auditory landscape of your images!
     """)
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
         return description
     def generate_sound(description):

 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
+from PIL import Image  # <-- ADDED THIS IMPORT
+import io               # <-- ADDED THIS IMPORT
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
+# Correctly initialize the modern, reliable captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
+    model="Salesforce/blip-image-captioning-large",
     device=device_id
 )
+# Initialize the audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
+# === THIS IS THE CORRECTED FUNCTION ===
 @spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file_bytes):
     try:
+        # No more temp files!
+        # Open the image data directly from memory using Pillow
+        image = Image.open(io.BytesIO(image_file_bytes))
+        # Pass the Pillow Image object directly to the pipeline. This is the robust method.
+        results = captioning_pipeline(image)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
         return caption, False
     except Exception as e:
+        print(f"ERROR in analyze_image_with_free_model: {e}") # Print error to logs
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
         print(f"Error generating audio from caption: {e}")
         return None
+# --- Gradio UI (No changes needed here) ---
 css = """
 #col-container{
     margin: 0 auto;
     This app is a testament to the creative possibilities that emerge when technology meets art.
     Enjoy exploring the auditory landscape of your images!
     """)
+    # --- Gradio event handlers (I've updated the function called here) ---
+    def update_caption(image_file_bytes):
+        # We pass the bytes from the uploader directly to our corrected function
+        description, _ = analyze_image_with_free_model(image_file_bytes)
         return description
     def generate_sound(description):