anaszil committed on
Commit
557d6ce
·
1 Parent(s): aa571e1
Files changed (7) hide show
  1. .gitattributes +0 -1
  2. .gitignore +2 -0
  3. README.md +5 -5
  4. app.py +185 -0
  5. packages.txt +1 -0
  6. record.py +29 -0
  7. requirements.txt +7 -0
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ gradio_cached_examples
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
  title: Whisper Darija
3
- emoji: 🐠
4
- colorFrom: red
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Whisper Darija
3
+ emoji: 🌍
4
+ colorFrom: pink
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 3.29.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
+ import gradio as gr
5
+ import torch
6
+ from dotenv import load_dotenv
7
+ from huggingface_hub import login
8
+ from peft import PeftModel
9
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
10
+ from transformers.pipelines.base import Pipeline
11
+
12
+ load_dotenv()
13
+
14
+
15
def ensure_hf_login() -> None:
    """Log in to the Hugging Face Hub when an ``HF_TOKEN`` is configured.

    The token is read from the ``HF_TOKEN`` environment variable. A missing
    or empty token is reported and skipped. A failed login is reported and
    swallowed, so this function never raises because of authentication.
    """
    hf_token = os.getenv("HF_TOKEN")

    if hf_token:
        try:
            login(token=hf_token)
        except Exception as exc:
            # Best effort: report the failure and carry on without a login.
            print(f"Failed to login to Hugging Face Hub: {exc}")
    else:
        print("HF_TOKEN not set; skipping Hugging Face login.")
25
+
26
+
27
+ ensure_hf_login()
28
+
29
+
30
+ LANGUAGE = "Arabic"
31
+ BATCH_SIZE = 1
32
+ DEVICE = 0 if torch.cuda.is_available() else "cpu"
33
+ BASE_MODEL_PATH = "openai/whisper-large-v3-turbo"
34
+ LORA_PATH = "anaszil/whisper-large-v3-turbo-darija"
35
+
36
+
37
+ PIPELINE: Optional[Pipeline] = None
38
+
39
+
40
def _build_pipeline() -> Pipeline:
    """Assemble the speech-recognition pipeline around the LoRA-adapted model.

    Loads the base checkpoint named by ``BASE_MODEL_PATH``, wraps it with the
    adapter at ``LORA_PATH``, pins language and task on the generation
    config, and returns an ``automatic-speech-recognition`` pipeline placed
    on ``DEVICE`` with a 30-second chunk length.
    """
    # Half precision only when CUDA is available; full precision otherwise.
    dtype = torch.float32
    if torch.cuda.is_available():
        dtype = torch.float16

    whisper = WhisperForConditionalGeneration.from_pretrained(
        BASE_MODEL_PATH, torch_dtype=dtype
    )
    lora_model = PeftModel.from_pretrained(whisper, LORA_PATH)

    processor = WhisperProcessor.from_pretrained(
        BASE_MODEL_PATH, language=LANGUAGE, task="transcribe"
    )

    generation_config = lora_model.generation_config
    generation_config.language = LANGUAGE
    generation_config.task = "transcribe"
    # NOTE(review): presumably cleared so the language/task settings above
    # drive decoding instead of legacy forced ids — confirm.
    generation_config.forced_decoder_ids = None
    lora_model.eval()

    asr = pipeline(
        task="automatic-speech-recognition",
        model=lora_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,
        device=DEVICE,
    )
    return asr
67
+
68
+
69
def get_pipeline() -> Pipeline:
    """Return the shared ASR pipeline, building it on the first call.

    The instance is cached in the module-level ``PIPELINE`` variable so that
    later calls reuse it instead of loading the model again.
    """
    global PIPELINE

    # Fast path: the pipeline has already been built.
    if PIPELINE is not None:
        return PIPELINE

    print("Loading Darija LoRA model...")
    PIPELINE = _build_pipeline()
    return PIPELINE
77
+
78
+
79
def format_timestamp(
    seconds: Optional[float],
    always_include_hours: bool = False,
    decimal_marker: str = ".",
) -> Optional[str]:
    """Format a duration in seconds as ``[HH:]MM:SS<marker>mmm``.

    Args:
        seconds: Duration in seconds, or ``None`` when no timestamp exists.
        always_include_hours: Emit the ``HH:`` field even when it is zero.
        decimal_marker: Separator placed between seconds and milliseconds.

    Returns:
        The formatted timestamp, or ``None`` when ``seconds`` is ``None``.
        Negative durations are rendered as the formatted magnitude prefixed
        with ``-``.
    """
    if seconds is None:
        return None

    total_ms = round(seconds * 1000.0)

    # Split off the sign first: floor division on a negative value would
    # otherwise wrap into a misleading positive timestamp.
    sign = "-" if total_ms < 0 else ""
    total_ms = abs(total_ms)

    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    whole_seconds, milliseconds = divmod(total_ms, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""

    return (
        f"{sign}{hours_marker}{minutes:02d}:{whole_seconds:02d}"
        f"{decimal_marker}{milliseconds:03d}"
    )
104
+
105
+
106
def transcribe(audio_input, return_timestamps: bool):
    """Transcribe ``audio_input`` with the shared ASR pipeline.

    Args:
        audio_input: Audio handed straight to the pipeline, or ``None`` when
            the user supplied nothing.
        return_timestamps: When true, return one line per chunk in the form
            ``[start -> end] text``.

    Returns:
        The transcription text, the per-chunk listing, or a prompt asking
        for audio when ``audio_input`` is ``None``.
    """
    if audio_input is None:
        return "Please provide audio input either via microphone or file upload."

    asr_pipeline = get_pipeline()

    outputs = asr_pipeline(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe", "language": LANGUAGE},
        return_timestamps=return_timestamps,
    )

    text = outputs["text"]
    if not return_timestamps:
        return text

    chunks = outputs.get("chunks") or []
    if not chunks:
        # No chunk data came back: keep the plain transcription instead of
        # replacing it with an empty string.
        return text

    # A chunk bound can be None (format_timestamp then returns None); show
    # "?" rather than the literal string "None".
    # NOTE(review): presumably this occurs when the model predicts no end
    # timestamp for the final chunk — confirm against the pipeline docs.
    return "\n".join(
        f"[{format_timestamp(chunk['timestamp'][0]) or '?'} -> "
        f"{format_timestamp(chunk['timestamp'][1]) or '?'}] {chunk['text']}"
        for chunk in chunks
    )
129
+
130
+
131
def process_audio(audio_file, uploaded_file, timestamps):
    """Gradio click handler: transcribe the recorded or the uploaded audio.

    ``audio_file`` is used when it is truthy, otherwise ``uploaded_file``.

    Returns:
        A ``(transcription_text, status_message)`` pair. Any exception raised
        during transcription is turned into an error pair instead of being
        propagated.
    """
    selected = audio_file if audio_file else uploaded_file
    if selected is None:
        return "Please provide audio input.", "No audio input detected."

    try:
        result = transcribe(selected, timestamps)
    except Exception as exc:
        # UI boundary: report the failure in both output widgets.
        return f"Error: {exc}", f"Transcription failed: {exc}"

    return result, "Transcription completed with the Darija LoRA model."
141
+
142
+
143
# Gradio UI: inputs (timestamp toggle, microphone recorder, file upload) in
# the left column; transcription text and a status line in the right column.
with gr.Blocks(title="Darija Speech Transcription") as demo:
    gr.Markdown("# Darija Speech Transcription Demo")
    gr.Markdown("Transcribe Darija audio with the fine-tuned Whisper LoRA model.")

    with gr.Row():
        with gr.Column(scale=1):
            timestamps_checkbox = gr.Checkbox(
                label="Return timestamps",
                value=False,
            )

            # This widget only records from the microphone; uploads go through
            # the separate widget below, so the label says "Record" only.
            audio_input = gr.Audio(
                source="microphone",
                type="filepath",
                label="Record Audio",
            )

            file_input = gr.Audio(
                source="upload",
                type="filepath",
                label="Upload Audio File",
            )

            transcribe_button = gr.Button("Transcribe", variant="primary")

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=10,
                show_copy_button=True,
            )

            status_message = gr.Markdown("")

    transcribe_button.click(
        fn=process_audio,
        inputs=[audio_input, file_input, timestamps_checkbox],
        outputs=[output_text, status_message],
    )

    # Trigger model loading when the page loads rather than on the first
    # transcription request.
    demo.load(fn=get_pipeline, inputs=None, outputs=None)

# `launch(enable_queue=True)` is the deprecated spelling; enable the queue
# through the dedicated method and then start the server.
demo.queue()
demo.launch()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
record.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sounddevice as sd
2
+ import numpy as np
3
+ import scipy.io.wavfile as wav
4
+
5
+ # Configuration
6
+ DURATION = 5 # Recording duration in seconds
7
+ SAMPLERATE = 16000 # Whisper expects 16kHz sample rate
8
+ OUTPUT_FILENAME = "recorded_audio.wav" # Output WAV file
9
+
10
+
11
def record_audio(duration=DURATION, samplerate=SAMPLERATE, filename=OUTPUT_FILENAME):
    """Record audio from the microphone and save it as a WAV file.

    Args:
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz, used for both capture and the file.
        filename: Path of the WAV file to write.

    The recording is single-channel with 16-bit integer samples.
    """
    print(f"🎤 Recording for {duration} seconds... Speak now!")

    # Capture duration * samplerate frames from one input channel.
    frame_count = int(duration * samplerate)
    audio_data = sd.rec(
        frame_count, samplerate=samplerate, channels=1, dtype=np.int16
    )
    sd.wait()  # Wait for recording to complete

    # Save as WAV file
    wav.write(filename, samplerate, audio_data)

    print(f"✅ Recording complete! Audio saved as '{filename}'")
26
+
27
+
28
+ if __name__ == "__main__":
29
+ record_audio()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ gradio==3.29.0
4
+ python-dotenv
5
+ sounddevice
6
+ scipy
7
+ peft==0.14.0