Spaces:

rahul7star
/

Image2Video

Paused

App Files Files Community

rahul7star committed 1 day ago

Commit

4679639

verified ·

1 Parent(s): 1eead2e

Update app_quant_latent.py

Browse files

Files changed (1) hide show

app_quant_latent.py +18 -16

app_quant_latent.py CHANGED Viewed

@@ -691,7 +691,6 @@ def generate_image_all_latents(prompt, height, width, steps, seed, guidance_scal
 def generate_image(prompt, height, width, steps, seed, guidance_scale=0.0):
     LOGS = []
     device = "cuda"
-    cpu_device = "cpu"
     generator = torch.Generator(device).manual_seed(int(seed))
     placeholder = Image.new("RGB", (width, height), color=(255, 255, 255))
@@ -701,31 +700,31 @@ def generate_image(prompt, height, width, steps, seed, guidance_scale=0.0):
     last_latents = []
     try:
-        # --- Initial latents (noise) ---
         latents = safe_get_latents(pipe, height, width, generator, device, LOGS)
         latents = latents.float().to(device)
-        # --- Run 1-2 diffusion steps to inject prompt info ---
-        with torch.no_grad():
-            partial_output = pipe(
-                prompt=prompt,
-                num_inference_steps=min(2, steps),
-                guidance_scale=guidance_scale,
-                generator=generator,
-                output_type="latent"
-            )
-        latents = partial_output  # latents now contain partial image info
-        # --- Last 5 previews: interpolate from partial to final latent ---
         num_previews = 5
         for i, alpha in enumerate(np.linspace(0.2, 1.0, num_previews)):
             try:
-                preview_latent = latents * alpha
                 preview_latent = preview_latent.to(pipe.vae.device).to(pipe.vae.dtype)
                 decoded = pipe.vae.decode(preview_latent, return_dict=False)[0]
                 decoded = (decoded / 2 + 0.5).clamp(0, 1)
-                decoded = decoded.cpu().permute(0, 2, 3, 1).float().numpy()
                 decoded = (decoded * 255).round().astype("uint8")
                 latent_img = Image.fromarray(decoded[0])
@@ -742,7 +741,7 @@ def generate_image(prompt, height, width, steps, seed, guidance_scale=0.0):
         latent_gallery.append(placeholder)
         yield None, latent_gallery[-5:], LOGS
-    # --- Final image: full pipeline ---
     try:
         output = pipe(
             prompt=prompt,
@@ -763,6 +762,9 @@ def generate_image(prompt, height, width, steps, seed, guidance_scale=0.0):
         final_gallery.append(placeholder)
         latent_gallery.append(placeholder)
         yield placeholder, latent_gallery[-5:] + [placeholder], LOGS
 # This is a stable version that can generate the final image and noise-to-latent previews
 @spaces.GPU
 def generate_image_verygood_realnoise(prompt, height, width, steps, seed, guidance_scale=0.0):

 def generate_image(prompt, height, width, steps, seed, guidance_scale=0.0):
     LOGS = []
     device = "cuda"
     generator = torch.Generator(device).manual_seed(int(seed))
     placeholder = Image.new("RGB", (width, height), color=(255, 255, 255))
     last_latents = []
     try:
+        # --- Step 1: generate initial noise latents ---
         latents = safe_get_latents(pipe, height, width, generator, device, LOGS)
         latents = latents.float().to(device)
+        # --- Step 2: partially denoise latents using a few diffusion steps ---
+        partial_latents = pipe(
+            prompt=prompt,
+            num_inference_steps=min(3, steps),  # 1-3 steps to inject image info
+            guidance_scale=guidance_scale,
+            generator=generator,
+            output_type="latent"
+        )
+        # --- Step 3: produce 5 previews by interpolating initial noise latents -> partial latents ---
         num_previews = 5
         for i, alpha in enumerate(np.linspace(0.2, 1.0, num_previews)):
             try:
+                # Linear interpolation between partial_latents and original
+                preview_latent = partial_latents * alpha + latents * (1 - alpha)
                 preview_latent = preview_latent.to(pipe.vae.device).to(pipe.vae.dtype)
+                # Decode preview
                 decoded = pipe.vae.decode(preview_latent, return_dict=False)[0]
                 decoded = (decoded / 2 + 0.5).clamp(0, 1)
+                decoded = decoded.cpu().permute(0, 2, 3, 1).numpy()
                 decoded = (decoded * 255).round().astype("uint8")
                 latent_img = Image.fromarray(decoded[0])
         latent_gallery.append(placeholder)
         yield None, latent_gallery[-5:], LOGS
+    # --- Step 4: generate final image ---
     try:
         output = pipe(
             prompt=prompt,
         final_gallery.append(placeholder)
         latent_gallery.append(placeholder)
         yield placeholder, latent_gallery[-5:] + [placeholder], LOGS
 # This is a stable version that can generate the final image and noise-to-latent previews
 @spaces.GPU
 def generate_image_verygood_realnoise(prompt, height, width, steps, seed, guidance_scale=0.0):