Spaces:

AvtnshM
/

ShukaAI_ASR

Running

App Files Community

AvtnshM committed on Sep 3

Commit

771dc21

verified ·

1 Parent(s): d0747b6

Lite-V3

Browse files

Files changed (1) hide show

app.py +145 -110

app.py CHANGED Viewed

@@ -2,206 +2,241 @@ import gradio as gr
 import torch
 import librosa
 import numpy as np
-from transformers import pipeline
 import gc
 import warnings
 warnings.filterwarnings("ignore")
-class OptimizedShukaASR:
     def __init__(self):
         self.pipe = None
-        self.load_model()
-    def load_model(self):
-        """Load model with optimizations for CPU inference"""
         try:
-            # Force CPU usage and optimize for inference
             self.pipe = pipeline(
                 model='sarvamai/shuka_v1',
                 trust_remote_code=True,
-                device=-1,  # Force CPU
                 model_kwargs={
-                    "torch_dtype": torch.float32,  # Use float32 for CPU
                     "low_cpu_mem_usage": True,
-                    "use_cache": True,
                 }
             )
-            # Set to eval mode and optimize
-            if hasattr(self.pipe.model, 'eval'):
-                self.pipe.model.eval()
-            # Compile for faster inference (PyTorch 2.0+)
-            try:
-                self.pipe.model = torch.compile(self.pipe.model, mode="reduce-overhead")
-            except:
-                pass  # Skip if torch.compile not available
-            print("Model loaded successfully with optimizations")
         except Exception as e:
-            print(f"Error loading model: {e}")
-            self.pipe = None
-    def preprocess_audio(self, audio_input, target_sr=16000, max_duration=30):
-        """Preprocess audio with length limiting and optimization"""
         try:
             if isinstance(audio_input, tuple):
                 sr, audio_data = audio_input
                 audio_data = audio_data.astype(np.float32)
                 if len(audio_data.shape) > 1:
-                    audio_data = audio_data.mean(axis=1)  # Convert to mono
-                audio_data = audio_data / np.max(np.abs(audio_data))  # Normalize
             else:
-                audio_data, sr = librosa.load(audio_input, sr=target_sr)
-            # Limit audio duration to reduce processing time
-            max_samples = int(max_duration * target_sr)
-            if len(audio_data) > max_samples:
-                audio_data = audio_data[:max_samples]
-                print(f"Audio truncated to {max_duration} seconds")
-            # Resample if needed
-            if sr != target_sr:
-                audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=target_sr)
             return audio_data, target_sr
         except Exception as e:
             raise Exception(f"Audio preprocessing failed: {e}")
-    def transcribe(self, audio_input, language="auto"):
-        """Transcribe audio to text"""
-        if self.pipe is None:
-            return "Model not loaded. Please check the setup."
         try:
-            # Preprocess audio
-            audio, sr = self.preprocess_audio(audio_input)
-            # Prepare system prompt for ASR only
-            if language == "auto":
-                system_prompt = "Transcribe the following audio accurately. Only provide the transcription, nothing else."
-            else:
-                system_prompt = f"Transcribe the following audio in {language}. Only provide the transcription, nothing else."
             turns = [
-                {'role': 'system', 'content': system_prompt},
                 {'role': 'user', 'content': '<|audio|>'}
             ]
-            # Run inference with memory optimization
-            with torch.no_grad():
                 result = self.pipe(
                     {
                         'audio': audio,
                         'turns': turns,
                         'sampling_rate': sr
                     },
-                    max_new_tokens=256,  # Reduced for ASR only
-                    do_sample=False,     # Deterministic output
-                    temperature=0.1,     # Low temperature for accuracy
-                    pad_token_id=self.pipe.tokenizer.eos_token_id
                 )
-            # Clean up memory
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
             gc.collect()
-            # Extract transcription
             if isinstance(result, list) and len(result) > 0:
-                transcription = result[0].get('generated_text', '').strip()
             elif isinstance(result, dict):
-                transcription = result.get('generated_text', '').strip()
             else:
-                transcription = str(result).strip()
-            return transcription
         except Exception as e:
-            return f"Transcription failed: {str(e)}"
-# Initialize the ASR system
-asr_system = OptimizedShukaASR()
-def transcribe_audio(audio, language):
-    """Gradio interface function"""
     if audio is None:
-        return "Please provide an audio file."
-    result = asr_system.transcribe(audio, language)
-    return result
-# Language options
-languages = [
-    ("Auto-detect", "auto"),
     ("English", "english"),
     ("Hindi", "hindi"),
     ("Bengali", "bengali"),
     ("Gujarati", "gujarati"),
     ("Kannada", "kannada"),
     ("Malayalam", "malayalam"),
     ("Marathi", "marathi"),
-    ("Oriya", "oriya"),
     ("Punjabi", "punjabi"),
-    ("Tamil", "tamil"),
-    ("Telugu", "telugu")
 ]
-# Create Gradio interface
-with gr.Blocks(title="Shuka v1 ASR - Multilingual Speech Recognition") as demo:
-    gr.Markdown("# 🎙️ Shuka v1 ASR - Fast Multilingual Transcription")
-    gr.Markdown("Upload an audio file or record directly to get transcription in multiple Indic languages.")
     with gr.Row():
-        with gr.Column():
             audio_input = gr.Audio(
-                label="Audio Input",
                 type="filepath",
-                format="wav"
             )
-            language_dropdown = gr.Dropdown(
-                choices=languages,
                 value="auto",
-                label="Language (optional)"
             )
-            transcribe_btn = gr.Button("🚀 Transcribe", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="Transcription",
-                placeholder="Transcription will appear here...",
-                lines=10
             )
-    # Event handlers
-    transcribe_btn.click(
-        fn=transcribe_audio,
-        inputs=[audio_input, language_dropdown],
-        outputs=output_text
     )
-    # Auto-transcribe on audio upload
-    audio_input.change(
-        fn=transcribe_audio,
-        inputs=[audio_input, language_dropdown],
-        outputs=output_text
     )
-    # Examples section
-    gr.Markdown("## 📝 Tips for best results:")
-    gr.Markdown("""
-    - Audio files are automatically limited to 30 seconds for faster processing
-    - Supported formats: WAV, MP3, M4A, WEBM
-    - For best accuracy, use clear audio with minimal background noise
-    - The model supports 11 Indic languages + English
     """)
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        show_error=True
     )

 import torch
 import librosa
 import numpy as np
+from transformers import pipeline, AutoConfig
 import gc
 import warnings
+import os
 warnings.filterwarnings("ignore")
+# Set environment variables for optimization
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+class UltraLightShukaASR:
     def __init__(self):
         self.pipe = None
+        self.model_loaded = False
+    def load_model_lazy(self):
+        """Lazy load model only when needed"""
+        if self.model_loaded:
+            return True
         try:
+            print("Loading Shuka v1 model...")
+            # Try with minimal configuration first
             self.pipe = pipeline(
                 model='sarvamai/shuka_v1',
                 trust_remote_code=True,
+                device=-1,  # CPU only
                 model_kwargs={
                     "low_cpu_mem_usage": True,
+                    "use_cache": False,  # Disable cache to save memory
+                    "torch_dtype": torch.float32,
                 }
             )
+            print("✅ Model loaded successfully!")
+            self.model_loaded = True
+            return True
         except Exception as e:
+            print(f"❌ Model loading failed: {e}")
+            return False
+    def preprocess_audio_minimal(self, audio_input, target_sr=16000, max_duration=15):
+        """Minimal audio preprocessing for speed"""
         try:
             if isinstance(audio_input, tuple):
                 sr, audio_data = audio_input
                 audio_data = audio_data.astype(np.float32)
                 if len(audio_data.shape) > 1:
+                    audio_data = np.mean(audio_data, axis=1)
             else:
+                audio_data, sr = librosa.load(audio_input, sr=target_sr, duration=max_duration)
+            # Quick normalization
+            if np.max(np.abs(audio_data)) > 0:
+                audio_data = audio_data / np.max(np.abs(audio_data))
+            # Trim silence from start and end
+            audio_data, _ = librosa.effects.trim(audio_data, top_db=20)
             return audio_data, target_sr
         except Exception as e:
             raise Exception(f"Audio preprocessing failed: {e}")
+    def transcribe_fast(self, audio_input, language_hint=""):
+        """Fast transcription with minimal overhead"""
+        # Lazy load model
+        if not self.load_model_lazy():
+            return "❌ Model failed to load. Please check your setup."
         try:
+            # Quick audio processing
+            audio, sr = self.preprocess_audio_minimal(audio_input)
+            # Minimal system prompt for speed
+            system_content = "Transcribe audio to text."
+            if language_hint and language_hint != "auto":
+                system_content += f" Language: {language_hint}."
             turns = [
+                {'role': 'system', 'content': system_content},
                 {'role': 'user', 'content': '<|audio|>'}
             ]
+            # Fast inference settings
+            with torch.inference_mode():  # More efficient than no_grad
                 result = self.pipe(
                     {
                         'audio': audio,
                         'turns': turns,
                         'sampling_rate': sr
                     },
+                    max_new_tokens=128,    # Reduced further
+                    do_sample=False,       # Deterministic
+                    num_beams=1,          # No beam search
+                    early_stopping=True,   # Stop as soon as possible
+                    pad_token_id=self.pipe.tokenizer.eos_token_id if hasattr(self.pipe, 'tokenizer') else None
                 )
+            # Immediate cleanup
+            del audio
             gc.collect()
+            # Extract result
             if isinstance(result, list) and len(result) > 0:
+                text = result[0].get('generated_text', '').strip()
             elif isinstance(result, dict):
+                text = result.get('generated_text', '').strip()
             else:
+                text = str(result).strip()
+            # Clean up the output (remove system prompts if they appear)
+            if "Transcribe audio to text" in text:
+                text = text.replace("Transcribe audio to text", "").strip()
+            if text.startswith("Language:"):
+                text = text.split(".", 1)[-1].strip() if "." in text else text
+            return text if text else "No speech detected"
         except Exception as e:
+            return f"❌ Transcription error: {str(e)}"
+# Initialize ASR system
+print("Initializing Ultra-Light Shuka ASR...")
+asr_system = UltraLightShukaASR()
+def process_audio(audio, language):
+    """Main processing function"""
     if audio is None:
+        return "Please upload or record an audio file."
+    return asr_system.transcribe_fast(audio, language)
+# Simple language options
+LANGUAGES = [
+    ("Auto", "auto"),
     ("English", "english"),
     ("Hindi", "hindi"),
     ("Bengali", "bengali"),
+    ("Tamil", "tamil"),
+    ("Telugu", "telugu"),
     ("Gujarati", "gujarati"),
     ("Kannada", "kannada"),
     ("Malayalam", "malayalam"),
     ("Marathi", "marathi"),
     ("Punjabi", "punjabi"),
+    ("Oriya", "oriya")
 ]
+# Ultra-minimal Gradio interface
+css = """
+.gradio-container {
+    max-width: 800px !important;
+}
+.output-text textarea {
+    font-size: 16px !important;
+}
+"""
+with gr.Blocks(css=css, title="Fast Shuka ASR") as demo:
+    gr.HTML("""
+    <div style='text-align: center; margin-bottom: 20px;'>
+        <h1>🚀 Ultra-Fast Shuka v1 ASR</h1>
+        <p>Optimized for speed • Multilingual • 15-second max clips</p>
+    </div>
+    """)
     with gr.Row():
+        with gr.Column(scale=1):
             audio_input = gr.Audio(
+                label="🎙️ Audio Input",
                 type="filepath",
+                format="wav",
+                elem_id="audio-input"
             )
+            language_select = gr.Dropdown(
+                choices=LANGUAGES,
                 value="auto",
+                label="🌍 Language Hint",
+                info="Optional - helps with accuracy"
+            )
+        with gr.Column(scale=2):
+            output_box = gr.Textbox(
+                label="📝 Transcription",
+                placeholder="Upload audio to see transcription here...",
+                lines=8,
+                elem_classes=["output-text"]
             )
+            gr.Button("🔄 Clear", size="sm").click(
+                lambda: ("", None),
+                outputs=[output_box, audio_input]
             )
+    # Auto-transcribe on upload
+    audio_input.change(
+        fn=process_audio,
+        inputs=[audio_input, language_select],
+        outputs=output_box,
+        show_progress=True
     )
+    # Also trigger on language change
+    language_select.change(
+        fn=process_audio,
+        inputs=[audio_input, language_select],
+        outputs=output_box,
+        show_progress=True
     )
+    gr.HTML("""
+    <div style='margin-top: 20px; padding: 15px; background: #f0f0f0; border-radius: 10px;'>
+        <h4>⚡ Speed Optimizations Active:</h4>
+        <ul style='margin: 10px 0;'>
+            <li>✅ Auto audio trimming (15s max)</li>
+            <li>✅ CPU-optimized inference</li>
+            <li>✅ Minimal token generation</li>
+            <li>✅ Memory cleanup after each request</li>
+        </ul>
+        <p><strong>Tip:</strong> For fastest results, use short, clear audio clips in WAV format.</p>
+    </div>
     """)
 if __name__ == "__main__":
+    demo.queue(max_size=3)  # Limit concurrent requests
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
+        show_error=True,
+        quiet=False
     )