Spaces:

GenSEC-LLM
/

Post-ASR-LLM-Transcription-Correction

Running

App Files Community

huckiyang committed on Mar 14, 2025

Commit

44ea2d4

1 Parent(s): 4e73867

optz the data loading

Browse files

Files changed (1) hide show

app.py +109 -27

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import jiwer
 import numpy as np
 from functools import lru_cache
 import traceback
 # Cache the dataset loading to avoid reloading on refresh
 @lru_cache(maxsize=1)
@@ -24,31 +25,69 @@ def load_data():
             print(f"Error loading with explicit path: {str(e2)}")
             raise
 # Calculate WER for a group of examples
 def calculate_wer(examples):
     if not examples:
         return 0.0
     try:
         # Filter valid examples in a single pass
         valid_pairs = []
         for ex in examples:
             try:
-                # Print a sample example to debug
-                if len(valid_pairs) == 0:
-                    print(f"Sample example keys: {ex.keys()}")
-                transcription = ex.get("transcription", "")
-                input1 = ex.get("input1", "")
-                # Only add valid pairs with non-empty strings
-                if transcription and input1 and isinstance(transcription, str) and isinstance(input1, str):
-                    # Limit text length to avoid potential issues
-                    transcription = transcription.strip()[:1000]  # Limit to 1000 chars
-                    input1 = input1.strip()[:1000]
-                    valid_pairs.append((transcription, input1))
             except Exception as ex_error:
-                # Skip problematic examples but continue processing
                 print(f"Error processing example: {str(ex_error)}")
                 continue
@@ -57,20 +96,55 @@ def calculate_wer(examples):
             return np.nan
         # Print sample pairs for debugging
-        print(f"Sample pair for WER calculation: {valid_pairs[0]}")
         print(f"Total valid pairs: {len(valid_pairs)}")
-        # Unzip the pairs in one operation
-        references, hypotheses = zip(*valid_pairs) if valid_pairs else ([], [])
-        # Calculate WER
         try:
-            wer = jiwer.wer(references, hypotheses)
-            print(f"Calculated WER: {wer}")
             return wer
         except Exception as wer_error:
-            print(f"Error calculating WER: {str(wer_error)}")
-            return np.nan
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
@@ -80,6 +154,11 @@ def calculate_wer(examples):
 # Get WER metrics by source
 def get_wer_metrics(dataset):
     try:
         # Group examples by source
         examples_by_source = {}
@@ -96,6 +175,7 @@ def get_wer_metrics(dataset):
         # Get all unique sources
         all_sources = sorted(examples_by_source.keys())
         # Calculate metrics for each source
         results = []
@@ -105,8 +185,8 @@ def get_wer_metrics(dataset):
                 count = len(examples)
                 if count > 0:
-                    print(f"Calculating WER for source {source} with {count} examples")
-                    wer = calculate_wer(examples)
                 else:
                     wer = np.nan
@@ -123,11 +203,13 @@ def get_wer_metrics(dataset):
                     "WER": np.nan
                 })
-        # Calculate overall metrics once
         try:
             total_count = len(dataset)
-            print(f"Calculating overall WER for {total_count} examples")
-            overall_wer = calculate_wer(dataset)
             results.append({
                 "Source": "OVERALL",
@@ -187,7 +269,7 @@ with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
         refresh_btn = gr.Button("Refresh Leaderboard")
     with gr.Row():
-        error_output = gr.Textbox(label="Debug Information", visible=True)
     with gr.Row():
         try:
@@ -202,7 +284,7 @@ with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
     def refresh_and_report():
         try:
             df = create_leaderboard()
-            debug_info = "Leaderboard refreshed successfully."
             return df, debug_info
         except Exception as e:
             error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"

 import numpy as np
 from functools import lru_cache
 import traceback
+import re
 # Cache the dataset loading to avoid reloading on refresh
 @lru_cache(maxsize=1)
             print(f"Error loading with explicit path: {str(e2)}")
             raise
+# Normalize text before WER scoring: lowercase, strip punctuation, collapse whitespace; returns "" for unusable input
+def preprocess_text(text):
+    if not text or not isinstance(text, str):  # None, empty string, or any non-str value
+        return ""
+    # Convert to lowercase
+    text = text.lower()
+    # Delete (not replace with a space) every character that is neither \w nor whitespace, so "don't" -> "dont"; "_" is part of \w and is kept
+    text = re.sub(r'[^\w\s]', '', text)
+    # Collapse each run of whitespace to a single space, then trim both ends
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
 # Calculate WER for a group of examples
 def calculate_wer(examples):
     if not examples:
         return 0.0
     try:
+        # First, let's examine the first example in detail
+        if examples and len(examples) > 0:
+            example = examples[0]
+            print("\n===== EXAMPLE DATA INSPECTION =====")
+            print(f"Keys in example: {example.keys()}")
+            # Try different possible field names
+            possible_reference_fields = ["transcription", "reference", "ground_truth", "target"]
+            possible_hypothesis_fields = ["input1", "hypothesis", "asr_output", "source_text"]
+            for field in possible_reference_fields:
+                if field in example:
+                    print(f"Reference field '{field}' found with value: {str(example[field])[:100]}...")
+            for field in possible_hypothesis_fields:
+                if field in example:
+                    print(f"Hypothesis field '{field}' found with value: {str(example[field])[:100]}...")
         # Filter valid examples in a single pass
         valid_pairs = []
         for ex in examples:
             try:
+                # First try the expected field names
+                if "transcription" in ex and "input1" in ex:
+                    reference = ex["transcription"]
+                    hypothesis = ex["input1"]
+                # Try alternate field pairs if the standard ones don't exist
+                elif "transcription" in ex and "hypothesis_concatenated" in ex and ex["hypothesis_concatenated"]:
+                    reference = ex["transcription"]
+                    hypothesis = ex["hypothesis_concatenated"].split('.')[0]  # Take first sentence
+                elif "reference" in ex and "hypothesis" in ex:
+                    reference = ex["reference"]
+                    hypothesis = ex["hypothesis"]
+                else:
+                    continue  # Skip this example if we can't find matching fields
+                # Clean and preprocess the text
+                reference = preprocess_text(reference)
+                hypothesis = preprocess_text(hypothesis)
+                # Only add if both have valid content
+                if reference and hypothesis:
+                    valid_pairs.append((reference, hypothesis))
             except Exception as ex_error:
                 print(f"Error processing example: {str(ex_error)}")
                 continue
             return np.nan
         # Print sample pairs for debugging
+        print(f"\nSample pair for WER calculation:")
+        print(f"Reference: '{valid_pairs[0][0]}'")
+        print(f"Hypothesis: '{valid_pairs[0][1]}'")
         print(f"Total valid pairs: {len(valid_pairs)}")
+        # Make sure we have enough valid examples
+        if len(valid_pairs) < 5:
+            print("WARNING: Very few valid pairs for WER calculation")
+            if len(valid_pairs) < 2:
+                print("Not enough data for reliable WER calculation")
+                return np.nan
+        # Unzip the pairs
+        references, hypotheses = zip(*valid_pairs)
+        # Calculate WER with additional transforms
         try:
+            # Set up transformation pipeline for jiwer
+            transformation = jiwer.Compose([
+                jiwer.ToLowerCase(),
+                jiwer.RemoveMultipleSpaces(),
+                jiwer.Strip(),
+                jiwer.RemovePunctuation(),
+                jiwer.ReduceToListOfWords()
+            ])
+            # Calculate WER with transformations
+            wer = jiwer.wer(
+                references,
+                hypotheses,
+                truth_transform=transformation,
+                hypothesis_transform=transformation
+            )
+            print(f"Successfully calculated WER: {wer}")
             return wer
         except Exception as wer_error:
+            print(f"Error calculating WER with jiwer: {str(wer_error)}")
+            # Fallback: Calculate character error rate manually for one sample
+            try:
+                if valid_pairs:
+                    ref = valid_pairs[0][0]
+                    hyp = valid_pairs[0][1]
+                    distance = jiwer.transforms.cer(ref, hyp)
+                    print(f"Fallback CER for first sample: {distance}")
+                return np.nan
+            except:
+                return np.nan
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
 # Get WER metrics by source
 def get_wer_metrics(dataset):
     try:
+        # Print dataset info
+        print(f"\n===== DATASET INFO =====")
+        print(f"Dataset size: {len(dataset)}")
+        print(f"Dataset features: {dataset.features}")
         # Group examples by source
         examples_by_source = {}
         # Get all unique sources
         all_sources = sorted(examples_by_source.keys())
+        print(f"Found sources: {all_sources}")
         # Calculate metrics for each source
         results = []
                 count = len(examples)
                 if count > 0:
+                    print(f"\nCalculating WER for source {source} with {count} examples")
+                    wer = calculate_wer(examples[:100])  # Start with a sample for debugging
                 else:
                     wer = np.nan
                     "WER": np.nan
                 })
+        # Calculate overall metrics with a sample
         try:
             total_count = len(dataset)
+            print(f"\nCalculating overall WER with a sample of examples")
+            # Use a sample for overall calculation to avoid overloading
+            sample_size = min(1000, total_count)
+            overall_wer = calculate_wer(dataset[:sample_size])
             results.append({
                 "Source": "OVERALL",
         refresh_btn = gr.Button("Refresh Leaderboard")
     with gr.Row():
+        error_output = gr.Textbox(label="Debug Information", visible=True, lines=10)
     with gr.Row():
         try:
     def refresh_and_report():
         try:
             df = create_leaderboard()
+            debug_info = "Leaderboard refreshed successfully. Check console for detailed debug information."
             return df, debug_info
         except Exception as e:
             error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"