Spaces:

fuvty
/

C2C_demo

Runtime error

App Files Community

fuvty committed on Nov 5

Commit

8672996

1 Parent(s): f926ba4

[debug] zeroGPU

Browse files

Files changed (1) hide show

app.py +30 -23

app.py CHANGED Viewed

@@ -7,8 +7,8 @@ This creates a web interface to compare three inference modes simultaneously:
 3. C2C: Rosetta model with projectors
 ZeroGPU Support:
-- Models are loaded to CPU at startup
-- @spaces.GPU decorator moves models to GPU on-demand for each inference
 - Works seamlessly on both ZeroGPU and regular GPU environments
 """
@@ -221,9 +221,9 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
-        # Move model to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
-        if ZEROGPU_AVAILABLE and self.single_model.device.type != "cuda":
             self.single_model.to(device)
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
@@ -252,23 +252,23 @@ class ModelManager:
         thread.start()
         # Stream tokens
-        generated_text = ""
         for token in streamer:
-            generated_text += token
-            yield generated_text
     @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
-        # Move models to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
         if ZEROGPU_AVAILABLE:
-            if self.t2t_model.context_model.device.type != "cuda":
-                self.t2t_model.context_model.to(device)
-            if self.t2t_model.answer_model.device.type != "cuda":
-                self.t2t_model.answer_model.to(device)
         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
             self.t2t_model.context_tokenizer,
@@ -349,13 +349,18 @@ class ModelManager:
         for token in answer_streamer:
             answer_text += token
             yield context_text, answer_text
     @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from C2C model with streaming."""
-        # Move model to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
-        if ZEROGPU_AVAILABLE and self.c2c_model.device.type != "cuda":
             self.c2c_model.to(device)
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
@@ -397,10 +402,12 @@ class ModelManager:
         thread.start()
         # Stream tokens
-        generated_text = ""
         for token in streamer:
-            generated_text += token
-            yield generated_text
 def create_demo(model_manager: ModelManager):

 3. C2C: Rosetta model with projectors
 ZeroGPU Support:
+- Models are loaded to CUDA at startup
+- @spaces.GPU decorator handles GPU allocation automatically for each inference
 - Works seamlessly on both ZeroGPU and regular GPU environments
 """
     @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if ZEROGPU_AVAILABLE:
             self.single_model.to(device)
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         thread.start()
         # Stream tokens
         for token in streamer:
+            yield token
+        thread.join()
+        if ZEROGPU_AVAILABLE:
+            self.single_model.to("cpu")
     @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if ZEROGPU_AVAILABLE:
+            self.t2t_model.context_model.to(device)
+            self.t2t_model.answer_model.to(device)
         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
             self.t2t_model.context_tokenizer,
         for token in answer_streamer:
             answer_text += token
             yield context_text, answer_text
+        thread.join()
+        if ZEROGPU_AVAILABLE:
+            self.t2t_model.context_model.to("cpu")
+            self.t2t_model.answer_model.to("cpu")
     @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from C2C model with streaming."""
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if ZEROGPU_AVAILABLE:
             self.c2c_model.to(device)
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         thread.start()
         # Stream tokens
         for token in streamer:
+            yield token
+        thread.join()
+        if ZEROGPU_AVAILABLE:
+            self.c2c_model.to("cpu")
 def create_demo(model_manager: ModelManager):