import gradio as gr
import torch
import librosa
import numpy as np
from transformers import pipeline
import gc
import warnings
import os

warnings.filterwarnings("ignore")

# Set environment variables for optimization
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class UltraLightShukaASR:
    def __init__(self):
        self.pipe = None
        self.model_loaded = False

    def load_model_lazy(self):
        """Lazy-load the model only when first needed."""
        if self.model_loaded:
            return True
        try:
            print("Loading Shuka v1 model...")
            # Try with a minimal configuration first
            self.pipe = pipeline(
                model='sarvamai/shuka_v1',
                trust_remote_code=True,
                device=-1,  # CPU only
                model_kwargs={
                    "low_cpu_mem_usage": True,
                    "use_cache": False,  # Disable KV cache to save memory
                    "torch_dtype": torch.float32,
                }
            )
            print("✅ Model loaded successfully!")
            self.model_loaded = True
            return True
        except Exception as e:
            print(f"❌ Model loading failed: {e}")
            return False
    def preprocess_audio_minimal(self, audio_input, target_sr=16000, max_duration=15):
        """Minimal audio preprocessing for speed."""
        try:
            if isinstance(audio_input, tuple):
                # Gradio numpy input: (sample_rate, samples)
                sr, audio_data = audio_input
                audio_data = audio_data.astype(np.float32)
                if audio_data.ndim > 1:
                    audio_data = np.mean(audio_data, axis=1)  # Downmix to mono
                if sr != target_sr:
                    audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=target_sr)
                audio_data = audio_data[: target_sr * max_duration]  # Enforce max duration
            else:
                # Filepath input: librosa resamples and truncates on load
                audio_data, _ = librosa.load(audio_input, sr=target_sr, duration=max_duration)

            # Quick peak normalization
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak

            # Trim silence from start and end
            audio_data, _ = librosa.effects.trim(audio_data, top_db=20)
            return audio_data, target_sr
        except Exception as e:
            raise RuntimeError(f"Audio preprocessing failed: {e}") from e
    def transcribe_fast(self, audio_input, language_hint=""):
        """Fast transcription with minimal overhead."""
        # Lazy-load model on first use
        if not self.load_model_lazy():
            return "❌ Model failed to load. Please check your setup."
        try:
            # Quick audio processing
            audio, sr = self.preprocess_audio_minimal(audio_input)

            # Minimal system prompt for speed
            system_content = "Transcribe audio to text."
            if language_hint and language_hint != "auto":
                system_content += f" Language: {language_hint}."

            turns = [
                {'role': 'system', 'content': system_content},
                {'role': 'user', 'content': '<|audio|>'}
            ]

            # Fast inference settings
            with torch.inference_mode():  # More efficient than no_grad
                result = self.pipe(
                    {
                        'audio': audio,
                        'turns': turns,
                        'sampling_rate': sr
                    },
                    max_new_tokens=128,  # Keep generation short
                    do_sample=False,     # Deterministic greedy decoding
                    num_beams=1,         # No beam search
                    pad_token_id=self.pipe.tokenizer.eos_token_id if hasattr(self.pipe, 'tokenizer') else None
                )

            # Immediate cleanup
            del audio
            gc.collect()

            # Extract result
            if isinstance(result, list) and len(result) > 0:
                text = result[0].get('generated_text', '').strip()
            elif isinstance(result, dict):
                text = result.get('generated_text', '').strip()
            else:
                text = str(result).strip()

            # Clean up the output (remove echoed system prompt if it appears)
            if "Transcribe audio to text" in text:
                text = text.replace("Transcribe audio to text", "").strip()
            if text.startswith("Language:"):
                text = text.split(".", 1)[-1].strip() if "." in text else text

            return text if text else "No speech detected"
        except Exception as e:
            return f"❌ Transcription error: {e}"
# Initialize ASR system (the model itself is loaded lazily on first request)
print("Initializing Ultra-Light Shuka ASR...")
asr_system = UltraLightShukaASR()
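# Optional: pre-load the model at startup instead of on the first request.
# This trades a slower cold start for a faster first transcription; it is
# left commented out here to keep Space startup quick:
# asr_system.load_model_lazy()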
def process_audio(audio, language):
    """Main processing function wired to the Gradio inputs."""
    if audio is None:
        return "Please upload or record an audio file."
    return asr_system.transcribe_fast(audio, language)
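# Quick smoke test outside the UI, if needed. The file name `sample.wav`
# below is illustrative and not shipped with this Space:
#
#   print(process_audio("sample.wav", "english"))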
# Simple language options: (display label, value passed as the hint)
LANGUAGES = [
    ("Auto", "auto"),
    ("English", "english"),
    ("Hindi", "hindi"),
    ("Bengali", "bengali"),
    ("Tamil", "tamil"),
    ("Telugu", "telugu"),
    ("Gujarati", "gujarati"),
    ("Kannada", "kannada"),
    ("Malayalam", "malayalam"),
    ("Marathi", "marathi"),
    ("Punjabi", "punjabi"),
    ("Oriya", "oriya")
]
# Ultra-minimal Gradio interface
css = """
.gradio-container {
    max-width: 800px !important;
}
.output-text textarea {
    font-size: 16px !important;
}
"""
with gr.Blocks(css=css, title="Fast Shuka ASR") as demo:
    gr.HTML("""
    <div style='text-align: center; margin-bottom: 20px;'>
        <h1>🚀 Ultra-Fast Shuka v1 ASR</h1>
        <p>Optimized for speed • Multilingual • 15-second max clips</p>
    </div>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="🎙️ Audio Input",
                type="filepath",
                format="wav",
                elem_id="audio-input"
            )
            language_select = gr.Dropdown(
                choices=LANGUAGES,
                value="auto",
                label="🌐 Language Hint",
                info="Optional - helps with accuracy"
            )
        with gr.Column(scale=2):
            output_box = gr.Textbox(
                label="📝 Transcription",
                placeholder="Upload audio to see the transcription here...",
                lines=8,
                elem_classes=["output-text"]
            )
            gr.Button("🗑️ Clear", size="sm").click(
                lambda: ("", None),
                outputs=[output_box, audio_input]
            )
    # Auto-transcribe on upload
    audio_input.change(
        fn=process_audio,
        inputs=[audio_input, language_select],
        outputs=output_box,
        show_progress=True
    )

    # Also re-transcribe when the language hint changes
    language_select.change(
        fn=process_audio,
        inputs=[audio_input, language_select],
        outputs=output_box,
        show_progress=True
    )
    gr.HTML("""
    <div style='margin-top: 20px; padding: 15px; background: #f0f0f0; border-radius: 10px;'>
        <h4>⚡ Speed Optimizations Active:</h4>
        <ul style='margin: 10px 0;'>
            <li>✅ Auto audio trimming (15s max)</li>
            <li>✅ CPU-optimized inference</li>
            <li>✅ Minimal token generation</li>
            <li>✅ Memory cleanup after each request</li>
        </ul>
        <p><strong>Tip:</strong> For fastest results, use short, clear audio clips in WAV format.</p>
    </div>
    """)
if __name__ == "__main__":
    demo.queue(max_size=3)  # Limit queued requests
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        quiet=False
    )
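# Run locally with `python app.py`, then open http://localhost:7860.
# Port 7860 is the default port a Hugging Face Space exposes, so the
# same file should work unchanged when deployed as a Space.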