Spaces:

DataQuests
/

DeepCritical

Running

App Files Files Community

SeasonalFall84 committed on 5 days ago

Commit

8e9e85e

1 Parent(s): 8fcb613

Merge exp/tts-modal-env: Add TTS AudioRefiner with LLM polish, Modal deployment, and async fixes

Browse files

Files changed (10) hide show

.gitignore +3 -0
deployments/README.md +46 -0
deployments/modal_tts.py +101 -0
dev/__init__.py +0 -1
src/agents/audio_refiner.py +397 -0
src/app.py +28 -10
src/services/audio_processing.py +14 -2
src/services/tts_modal.py +144 -95
src/utils/config.py +4 -0
tests/unit/agents/test_audio_refiner.py +306 -0

.gitignore CHANGED Viewed

@@ -57,6 +57,9 @@ reference_repos/DeepCritical/
 # Keep the README in reference_repos
 !reference_repos/README.md
 # OS
 .DS_Store
 Thumbs.db

 # Keep the README in reference_repos
 !reference_repos/README.md
+# Development directory
+dev/
 # OS
 .DS_Store
 Thumbs.db

deployments/README.md ADDED Viewed

	@@ -0,0 +1,46 @@

+# Deployments
+This directory contains infrastructure deployment scripts for DeepCritical services.
+## Modal Deployments
+### TTS Service (`modal_tts.py`)
+Deploys the Kokoro TTS (Text-to-Speech) function to Modal's GPU infrastructure.
+**Deploy:**
+```bash
+modal deploy deployments/modal_tts.py
+```
+**Features:**
+- Kokoro 82M TTS model
+- GPU-accelerated (T4)
+- Voice options: af_heart, af_bella, am_michael, etc.
+- Configurable speech speed
+**Requirements:**
+- Modal account and credentials (`MODAL_TOKEN_ID`, `MODAL_TOKEN_SECRET` in `.env`)
+- GPU quota on Modal
+**After Deployment:**
+The function will be available at:
+- App: `deepcritical-tts`
+- Function: `kokoro_tts_function`
+The main application (`src/services/tts_modal.py`) will call this deployed function.
+---
+## Adding New Deployments
+When adding new deployment scripts:
+1. Create a new file: `deployments/<service_name>.py`
+2. Use Modal's app pattern:
+   ```python
+   import modal
+   app = modal.App("deepcritical-<service-name>")
+   ```
+3. Document in this README
+4. Test deployment: `modal deploy deployments/<service_name>.py`

deployments/modal_tts.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""Deploy Kokoro TTS function to Modal.
+This script deploys the TTS function to Modal so it can be called
+from the main DeepCritical application.
+Usage:
+    modal deploy deployments/modal_tts.py
+After deployment, the function will be available at:
+    App: deepcritical-tts
+    Function: kokoro_tts_function
+"""
+import modal
+import numpy as np
+# Modal app that owns the TTS function; deployed under the name "deepcritical-tts"
+app = modal.App("deepcritical-tts")
+# Python packages installed into the container image before Kokoro itself
+KOKORO_DEPENDENCIES = [
+    "torch>=2.0.0",
+    "transformers>=4.30.0",
+    "numpy<2.0",
+]
+# Container image: Debian slim + Python 3.11, the pinned dependencies above,
+# then Kokoro installed straight from its GitHub repository
+tts_image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .apt_install("git")  # Install git first for pip install from github
+    .pip_install(*KOKORO_DEPENDENCIES)
+    .pip_install("git+https://github.com/hexgrad/kokoro.git")
+)
+@app.function(
+    image=tts_image,
+    gpu="T4",
+    timeout=60,
+)
+def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
+    """Modal GPU function for Kokoro TTS.
+    This function runs on Modal's GPU infrastructure.
+    Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
+    Args:
+        text: Text to synthesize
+        voice: Voice ID (e.g., af_heart, af_bella, am_michael); its first
+            character is used as the Kokoro language code
+        speed: Speech speed multiplier (0.5-2.0)
+    Returns:
+        Tuple of (sample_rate, audio_array). The sample rate is always 24000.
+        When the pipeline yields no chunks, a single zero sample is returned.
+    Raises:
+        RuntimeError: If Kokoro cannot be imported or synthesis fails for any
+            reason (the original exception is chained as the cause).
+    """
+    try:
+        # Imported lazily so the module can be loaded where Kokoro is not installed
+        from kokoro import KModel, KPipeline
+        if not voice:
+            # voice[0] below would otherwise fail with an opaque IndexError
+            raise ValueError("voice must be a non-empty voice ID such as 'af_heart'")
+        # NOTE: the model and pipeline are built on every invocation; nothing
+        # in this function keeps them alive between calls.
+        model = KModel().to("cuda").eval()
+        pipeline = KPipeline(lang_code=voice[0])
+        pack = pipeline.load_voice(voice)
+        # Generate audio - accumulate all chunks
+        audio_chunks = []
+        for _, ps, _ in pipeline(text, voice, speed):
+            ref_s = pack[len(ps) - 1]
+            audio = model(ps, ref_s, speed)
+            # detach()/cpu() are no-ops for a CPU tensor and make the NumPy
+            # conversion safe if the model hands back a CUDA tensor
+            audio_chunks.append(audio.detach().cpu().numpy())
+        # Concatenate all audio chunks
+        if audio_chunks:
+            return (24000, np.concatenate(audio_chunks))
+        # If no audio generated, return empty
+        return (24000, np.zeros(1, dtype=np.float32))
+    except ImportError as e:
+        raise RuntimeError(
+            f"Kokoro not installed: {e}. "
+            "Install with: pip install git+https://github.com/hexgrad/kokoro.git"
+        ) from e
+    except Exception as e:
+        raise RuntimeError(f"TTS synthesis failed: {e}") from e
+# Optional: Add a test entrypoint
+@app.local_entrypoint()
+def test():
+    """Smoke-test the TTS function with one short, fixed phrase."""
+    print("Testing Modal TTS function...")
+    # (text, voice, speed) passed positionally to the remote function
+    call_args = ("Hello, this is a test.", "af_heart", 1.0)
+    sample_rate, audio = kokoro_tts_function.remote(*call_args)
+    print(f"Generated audio: {sample_rate}Hz, shape={audio.shape}")
+    print("✓ TTS function works!")

dev/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Development utilities and plugins."""

src/agents/audio_refiner.py ADDED Viewed

	@@ -0,0 +1,397 @@

+"""Audio Refiner Agent - Cleans markdown reports for TTS audio clarity.
+This agent transforms markdown-formatted research reports into clean,
+audio-friendly plain text suitable for text-to-speech synthesis.
+"""
+import re
+from typing import Optional
+import structlog
+from pydantic_ai import Agent
+from src.utils.llm_factory import get_pydantic_ai_model
+logger = structlog.get_logger(__name__)
+class AudioRefiner:
+    """Refines markdown reports for optimal TTS audio output.
+    Handles common formatting issues that make text difficult to listen to:
+    - Markdown syntax (headers, bold, italic, links, images)
+    - Citations and reference markers
+    - Roman numerals in medical contexts
+    - Multiple References sections
+    - Special characters and formatting artifacts
+    """
+    # Roman numeral to integer mapping
+    ROMAN_VALUES = {
+        'I': 1, 'V': 5, 'X': 10, 'L': 50,
+        'C': 100, 'D': 500, 'M': 1000
+    }
+    # Number to word mapping (1-20, common in medical literature)
+    NUMBER_TO_WORD = {
+        1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five',
+        6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten',
+        11: 'Eleven', 12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen',
+        15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen',
+        19: 'Nineteen', 20: 'Twenty'
+    }
+    # Canonical roman numerals (1-3999); rejects look-alikes such as "IIII" or "IC"
+    _CANONICAL_ROMAN = re.compile(
+        r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
+    )
+    # A References heading occupying a whole line ("# References", "**References:**", ...)
+    _REFERENCES_HEADING = re.compile(
+        r'^(?:#+[ \t]*References?:?|\*\*\s*(?:Additional\s+)?References?:?\s*\*\*)[ \t\r]*$',
+        re.IGNORECASE | re.MULTILINE,
+    )
+    # Any section heading at the start of a line: "# Header" or "**Header**"
+    _SECTION_HEADING = re.compile(r'^(?:#+\s+\w+|\*\*[A-Z][^*]+\*\*)', re.MULTILINE)
+    # Single-character substitutions applied in one pass by _clean_special_characters
+    _CHAR_TABLE = str.maketrans({
+        '\u2014': '-',  # em dash
+        '\u2013': '-',  # en dash
+        '\u201c': '"',  # left double quote
+        '\u201d': '"',  # right double quote
+        '\u2018': "'",  # left single quote
+        '\u2019': "'",  # right single quote
+        '#': None,      # hash symbols left over from headers
+    })
+    async def refine_for_audio(self, markdown_text: str, use_llm_polish: bool = False) -> str:
+        """Transform markdown report into audio-friendly plain text.
+        Args:
+            markdown_text: Markdown-formatted research report
+            use_llm_polish: If True, apply LLM-based final polish (optional)
+        Returns:
+            Clean plain text optimized for TTS audio (empty string for empty input)
+        """
+        logger.info("Refining report for audio output", use_llm_polish=use_llm_polish)
+        # Nothing to clean: skip the regex pipeline (and a pointless LLM call)
+        if not markdown_text:
+            return ""
+        text = markdown_text
+        # Step 1: Drop every References section, keeping the content around it
+        text = self._remove_references_sections(text)
+        # Step 2: Remove markdown formatting
+        text = self._remove_markdown_syntax(text)
+        # Step 3: Convert roman numerals to words
+        text = self._convert_roman_numerals(text)
+        # Step 4: Remove citations
+        text = self._remove_citations(text)
+        # Step 5: Clean up special characters and artifacts
+        text = self._clean_special_characters(text)
+        # Step 6: Normalize whitespace
+        text = self._normalize_whitespace(text)
+        # Step 7 (Optional): LLM polish for edge cases
+        if use_llm_polish:
+            text = await self._llm_polish(text)
+        # Strip before logging so the reported length is the returned length
+        text = text.strip()
+        logger.info(
+            "Audio refinement complete",
+            original_length=len(markdown_text),
+            refined_length=len(text),
+            llm_polish_applied=use_llm_polish
+        )
+        return text
+    def _remove_references_sections(self, text: str) -> str:
+        """Remove References sections while preserving other content.
+        Removes each References heading and its content up to the next section
+        heading or the end of the document. Handles multiple References
+        sections, a References heading on the first line, and a section heading
+        that follows the References heading directly.
+        Matches various References heading formats:
+        - # References
+        - ## References
+        - **References:**
+        - **Additional References:**
+        """
+        while True:
+            heading = self._REFERENCES_HEADING.search(text)
+            if heading is None:
+                break
+            # Search from the end of the heading line; '^' only matches after a
+            # newline, so the heading itself can never be its own terminator
+            following = self._SECTION_HEADING.search(text, heading.end())
+            section_end = following.start() if following else len(text)
+            section_start = heading.start()
+            text = text[:section_start] + text[section_end:]
+            logger.debug(
+                "Removed References section",
+                removed_chars=section_end - section_start
+            )
+        return text
+    def _remove_markdown_syntax(self, text: str) -> str:
+        """Remove markdown formatting syntax, keeping the visible text."""
+        # Headers (# ## ###)
+        text = re.sub(r'^\s*#+\s+', '', text, flags=re.MULTILINE)
+        # Bold (**text** or __text__)
+        text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
+        text = re.sub(r'__([^_]+)__', r'\1', text)
+        # Italic (*text* or _text_)
+        text = re.sub(r'\*([^*]+)\*', r'\1', text)
+        text = re.sub(r'_([^_]+)_', r'\1', text)
+        # Images ![alt](url) → alt (must run before links, or a stray "!" is left)
+        text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'\1', text)
+        # Links [text](url) → text
+        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
+        # Inline code `code` → code
+        text = re.sub(r'`([^`]+)`', r'\1', text)
+        # Strikethrough ~~text~~
+        text = re.sub(r'~~([^~]+)~~', r'\1', text)
+        # Blockquotes (> text)
+        text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)
+        # Horizontal rules (---, ***, ___)
+        text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
+        # List markers (-, *, 1., 2.)
+        text = re.sub(r'^\s*[-*]\s+', '', text, flags=re.MULTILINE)
+        text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
+        return text
+    def _roman_to_int(self, roman: str) -> Optional[int]:
+        """Convert roman numeral string to integer.
+        Args:
+            roman: Roman numeral string (e.g., 'IV', 'XII'); case-insensitive
+        Returns:
+            Integer value, or None if the string is empty or not a canonical
+            roman numeral (e.g., 'IIII', 'IC', or any non-numeral letter)
+        """
+        roman = roman.upper()
+        if not roman or not self._CANONICAL_ROMAN.fullmatch(roman):
+            return None
+        result = 0
+        prev_value = 0
+        # Walk right-to-left: a symbol smaller than its right neighbour is subtracted
+        for char in reversed(roman):
+            value = self.ROMAN_VALUES[char]
+            # Subtractive notation (IV = 4, IX = 9)
+            if value < prev_value:
+                result -= value
+            else:
+                result += value
+            prev_value = value
+        return result
+    def _int_to_word(self, num: int) -> str:
+        """Convert integer to word representation.
+        Args:
+            num: Integer to convert (1-20 supported)
+        Returns:
+            Word representation (e.g., 'One', 'Twelve'); any number outside
+            1-20 is returned as its decimal digits
+        """
+        return self.NUMBER_TO_WORD.get(num, str(num))
+    def _convert_roman_numerals(self, text: str) -> str:
+        """Convert roman numerals to words for better TTS pronunciation.
+        Handles patterns like:
+        - Phase I, Phase II, Phase III
+        - Trial I, Trial II
+        - Type I, Type II
+        - Stage I, Stage II
+        - Standalone I, II, III (with word boundaries)
+        Only canonical numerals with a value of 1-20 are converted. Anything
+        else is left untouched, so letters such as the 'D' in 'vitamin D'
+        (500) or 'MD' (1500) are not rewritten as numbers.
+        """
+        def replace_roman(match):
+            """Callback to replace matched roman numeral."""
+            prefix = match.group(1)  # Word before roman numeral (if any)
+            num = self._roman_to_int(match.group(2))
+            # Invalid numeral, or a value with no word form: keep the original text
+            if num is None or num not in self.NUMBER_TO_WORD:
+                return match.group(0)
+            word = self.NUMBER_TO_WORD[num]
+            return f"{prefix} {word}" if prefix else word
+        # Pattern: optional "<keyword> " + roman numeral.
+        # The whitespace belongs to the optional prefix group, so a standalone
+        # numeral never consumes the space that separates it from the previous word.
+        pattern = r'\b(?:(Phase|Trial|Type|Stage|Class|Group|Arm|Cohort)\s+)?([IVXLCDM]+)\b'
+        return re.sub(pattern, replace_roman, text)
+    def _remove_citations(self, text: str) -> str:
+        """Remove citation markers and references.
+        Horizontal whitespace directly before a bracketed or parenthesised
+        citation is removed with it, so "finding [1]." becomes "finding."
+        """
+        # Numbered citations [1], [2], [1,2], [1, 2], [1-3], [1–3]
+        text = re.sub(r'[ \t]*\[\d+(?:\s*[-,\u2013]\s*\d+)*\]', '', text)
+        # Author citations (Smith et al., 2023) or (Smith et al. 2023)
+        text = re.sub(r'[ \t]*\([A-Z][a-z]+\s+et\s+al\.?,?\s+\d{4}\)', '', text)
+        # Simple year citations (2023)
+        text = re.sub(r'[ \t]*\(\d{4}\)', '', text)
+        # Author-year (Smith, 2023)
+        text = re.sub(r'[ \t]*\([A-Z][a-z]+,?\s+\d{4}\)', '', text)
+        # Footnote markers (¹, ², ³)
+        text = re.sub(r'[¹²³⁴⁵⁶⁷⁸⁹⁰]+', '', text)
+        return text
+    def _clean_special_characters(self, text: str) -> str:
+        """Clean up special characters and formatting artifacts."""
+        # Dashes and smart quotes → ASCII, hash symbols removed (single pass)
+        text = text.translate(self._CHAR_TABLE)
+        # Remove excessive punctuation (!!!, ???)
+        text = re.sub(r'([!?]){2,}', r'\1', text)
+        # Remove asterisks used for footnotes
+        text = re.sub(r'\*+', '', text)
+        # Shorten runs of four or more dots to an ellipsis (...)
+        text = re.sub(r'\.{4,}', '...', text)
+        return text
+    def _normalize_whitespace(self, text: str) -> str:
+        """Normalize whitespace for clean audio output."""
+        # Collapse runs of spaces/tabs and trim every line first, so that
+        # whitespace-only lines become truly empty before newlines are counted
+        lines = [re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n')]
+        text = '\n'.join(lines)
+        # Replace multiple newlines with double newline (paragraph break)
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        # Remove empty lines at start/end
+        return text.strip()
+    async def _llm_polish(self, text: str) -> str:
+        """Apply LLM-based final polish to catch edge cases.
+        This is a lightweight pass that removes any remaining formatting
+        artifacts the rule-based methods might have missed.
+        Args:
+            text: Pre-cleaned text from rule-based methods
+        Returns:
+            Final polished text ready for TTS; the input text unchanged if the
+            LLM call fails or returns nothing
+        """
+        try:
+            # Create a simple agent for text cleanup
+            model = get_pydantic_ai_model()
+            polish_agent = Agent(
+                model=model,
+                system_prompt=(
+                    "You are a text cleanup assistant. Your ONLY job is to remove "
+                    "any remaining formatting artifacts (markdown, citations, special "
+                    "characters) that make text unsuitable for text-to-speech audio. "
+                    "DO NOT rewrite, improve, or change the content. "
+                    "DO NOT add explanations. "
+                    "ONLY output the cleaned text."
+                ),
+            )
+            # Run asynchronously
+            result = await polish_agent.run(
+                f"Clean this text for audio (remove any formatting artifacts):\n\n{text}"
+            )
+            polished_text = result.output.strip()
+            if not polished_text:
+                # An empty reply would silence the whole report; keep the rule-based text
+                logger.warning(
+                    "llm_polish_failed",
+                    error="empty LLM output",
+                    message="Falling back to rule-based output"
+                )
+                return text
+            logger.info(
+                "llm_polish_applied",
+                original_length=len(text),
+                polished_length=len(polished_text)
+            )
+            return polished_text
+        except Exception as e:
+            logger.warning(
+                "llm_polish_failed",
+                error=str(e),
+                message="Falling back to rule-based output"
+            )
+            # Graceful fallback: return original text if LLM fails
+            return text
+# Shared module-level refiner so callers do not need to build their own
+audio_refiner = AudioRefiner()
+async def refine_text_for_audio(markdown_text: str, use_llm_polish: bool = False) -> str:
+    """Refine markdown text for audio using the shared module-level refiner.
+    Args:
+        markdown_text: Markdown-formatted text
+        use_llm_polish: If True, apply LLM-based final polish (optional)
+    Returns:
+        Audio-friendly plain text
+    """
+    refined = await audio_refiner.refine_for_audio(
+        markdown_text,
+        use_llm_polish=use_llm_polish,
+    )
+    return refined

src/app.py CHANGED Viewed

@@ -18,6 +18,7 @@ import structlog
 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
 from src.services.multimodal_processing import get_multimodal_service
 from src.utils.config import settings
 from src.utils.models import AgentEvent, OrchestratorConfig
@@ -446,6 +447,7 @@ async def research_agent(
     enable_audio_input: bool = True,
     tts_voice: str = "af_heart",
     tts_speed: float = 1.0,
     web_search_provider: str = "auto",
     oauth_token: gr.OAuthToken | None = None,
     oauth_profile: gr.OAuthProfile | None = None,
@@ -465,6 +467,7 @@ async def research_agent(
         enable_audio_input: Whether to process audio inputs
         tts_voice: TTS voice selection
         tts_speed: TTS speech speed
         web_search_provider: Web search provider selection
         oauth_token: Gradio OAuth token (None if user not logged in)
         oauth_profile: Gradio OAuth profile (None if user not logged in)
@@ -585,17 +588,23 @@ async def research_agent(
         # Optional: Generate audio output if enabled
         if settings.enable_audio_output and settings.modal_available:
             try:
-                from src.services.tts_modal import get_tts_service
-                tts_service = get_tts_service()
                 # Get the last message from history for TTS
                 last_message = history[-1].get("content", "") if history else processed_text
                 if last_message:
-                    await tts_service.synthesize_async(
-                        text=last_message,
-                        voice=tts_voice,
-                        speed=tts_speed,
-                    )
             except Exception as e:
                 logger.warning("audio_synthesis_failed", error=str(e))
                 # Continue without audio output
@@ -1081,6 +1090,13 @@ def create_demo() -> gr.Blocks:
                     interactive=False,  # GPU type set at function definition time, requires restart
                 )
                 # Audio output component (for TTS response) - moved to sidebar
                 audio_output = gr.Audio(
                     label="🔊 Audio Response",
@@ -1091,18 +1107,19 @@ def create_demo() -> gr.Blocks:
         # This must be after audio_output is defined
         def update_tts_visibility(
             enabled: bool,
-        ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
             """Update visibility of TTS components based on enable checkbox."""
             return (
                 gr.update(visible=enabled),
                 gr.update(visible=enabled),
                 gr.update(visible=enabled),
             )
         enable_audio_output_checkbox.change(
             fn=update_tts_visibility,
             inputs=[enable_audio_output_checkbox],
-            outputs=[tts_voice_dropdown, tts_speed_slider, audio_output],
         )
         # Chat interface with multimodal support
@@ -1196,6 +1213,7 @@ def create_demo() -> gr.Blocks:
                 enable_audio_input_checkbox,
                 tts_voice_dropdown,
                 tts_speed_slider,
                 web_search_provider_dropdown,
                 # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
             ],

 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
+from src.services.audio_processing import get_audio_service
 from src.services.multimodal_processing import get_multimodal_service
 from src.utils.config import settings
 from src.utils.models import AgentEvent, OrchestratorConfig
     enable_audio_input: bool = True,
     tts_voice: str = "af_heart",
     tts_speed: float = 1.0,
+    tts_use_llm_polish: bool = False,
     web_search_provider: str = "auto",
     oauth_token: gr.OAuthToken | None = None,
     oauth_profile: gr.OAuthProfile | None = None,
         enable_audio_input: Whether to process audio inputs
         tts_voice: TTS voice selection
         tts_speed: TTS speech speed
+        tts_use_llm_polish: Apply LLM-based final polish to audio text (costs API calls)
         web_search_provider: Web search provider selection
         oauth_token: Gradio OAuth token (None if user not logged in)
         oauth_profile: Gradio OAuth profile (None if user not logged in)
         # Optional: Generate audio output if enabled
         if settings.enable_audio_output and settings.modal_available:
             try:
+                audio_service = get_audio_service()
                 # Get the last message from history for TTS
                 last_message = history[-1].get("content", "") if history else processed_text
                 if last_message:
+                    # Temporarily override tts_use_llm_polish setting from UI
+                    original_llm_polish = settings.tts_use_llm_polish
+                    try:
+                        settings.tts_use_llm_polish = tts_use_llm_polish
+                        # Use UI-configured voice and speed, fallback to settings defaults
+                        await audio_service.generate_audio_output(
+                            text=last_message,
+                            voice=tts_voice or settings.tts_voice,
+                            speed=tts_speed if tts_speed else settings.tts_speed,
+                        )
+                    finally:
+                        # Restore original setting
+                        settings.tts_use_llm_polish = original_llm_polish
             except Exception as e:
                 logger.warning("audio_synthesis_failed", error=str(e))
                 # Continue without audio output
                     interactive=False,  # GPU type set at function definition time, requires restart
                 )
+                tts_use_llm_polish_checkbox = gr.Checkbox(
+                    value=settings.tts_use_llm_polish,
+                    label="Use LLM Polish for Audio",
+                    info="Apply LLM-based final polish to remove remaining formatting artifacts (costs API calls)",
+                    visible=settings.enable_audio_output,
+                )
                 # Audio output component (for TTS response) - moved to sidebar
                 audio_output = gr.Audio(
                     label="🔊 Audio Response",
         # This must be after audio_output is defined
         def update_tts_visibility(
             enabled: bool,
+        ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], dict[str, Any]]:
             """Update visibility of TTS components based on enable checkbox."""
             return (
                 gr.update(visible=enabled),
                 gr.update(visible=enabled),
                 gr.update(visible=enabled),
+                gr.update(visible=enabled),
             )
         enable_audio_output_checkbox.change(
             fn=update_tts_visibility,
             inputs=[enable_audio_output_checkbox],
+            outputs=[tts_voice_dropdown, tts_speed_slider, tts_use_llm_polish_checkbox, audio_output],
         )
         # Chat interface with multimodal support
                 enable_audio_input_checkbox,
                 tts_voice_dropdown,
                 tts_speed_slider,
+                tts_use_llm_polish_checkbox,
                 web_search_provider_dropdown,
                 # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
             ],

src/services/audio_processing.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Any
 import numpy as np
 import structlog
 from src.services.stt_gradio import STTService, get_stt_service
 from src.utils.config import settings
@@ -85,7 +86,7 @@ class AudioService:
         """Generate audio output from text.
         Args:
-            text: Text to synthesize
             voice: Voice ID (default: settings.tts_voice)
             speed: Speech speed (default: settings.tts_speed)
@@ -101,11 +102,22 @@ class AudioService:
             return None
         try:
             # Use provided voice/speed or fallback to settings defaults
             voice = voice if voice else settings.tts_voice
             speed = speed if speed is not None else settings.tts_speed
-            audio_output = await self.tts.synthesize_async(text, voice, speed)  # type: ignore[misc]
             if audio_output:
                 logger.info(

 import numpy as np
 import structlog
+from src.agents.audio_refiner import audio_refiner
 from src.services.stt_gradio import STTService, get_stt_service
 from src.utils.config import settings
         """Generate audio output from text.
         Args:
+            text: Text to synthesize (markdown will be cleaned for audio)
             voice: Voice ID (default: settings.tts_voice)
             speed: Speech speed (default: settings.tts_speed)
             return None
         try:
+            # Refine text for audio (remove markdown, citations, etc.)
+            # Use LLM polish if enabled in settings
+            refined_text = await audio_refiner.refine_for_audio(
+                text,
+                use_llm_polish=settings.tts_use_llm_polish
+            )
+            logger.info("text_refined_for_audio",
+                       original_length=len(text),
+                       refined_length=len(refined_text),
+                       llm_polish_enabled=settings.tts_use_llm_polish)
             # Use provided voice/speed or fallback to settings defaults
             voice = voice if voice else settings.tts_voice
             speed = speed if speed is not None else settings.tts_speed
+            audio_output = await self.tts.synthesize_async(refined_text, voice, speed)  # type: ignore[misc]
             if audio_output:
                 logger.info(

src/services/tts_modal.py CHANGED Viewed

@@ -1,12 +1,18 @@
 """Text-to-Speech service using Kokoro 82M via Modal GPU."""
 import asyncio
 from functools import lru_cache
 from typing import Any
 import numpy as np
 import structlog
 from src.utils.config import settings
 from src.utils.exceptions import ConfigurationError
@@ -24,39 +30,52 @@ KOKORO_DEPENDENCIES = [
 # Modal app and function definitions (module-level for Modal)
 _modal_app: Any | None = None
 _tts_function: Any | None = None
 def _get_modal_app() -> Any:
-    """Get or create Modal app instance."""
     global _modal_app
     if _modal_app is None:
         try:
             import modal
-            # Validate Modal credentials before attempting lookup
-            if not settings.modal_available:
                 raise ConfigurationError(
-                    "Modal credentials not configured. Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables."
                 )
             # Validate token ID format (Modal token IDs are typically UUIDs or specific formats)
-            token_id = settings.modal_token_id
-            if token_id:
-                # Basic validation: token ID should not be empty and should be a reasonable length
-                if len(token_id.strip()) < 10:
-                    raise ConfigurationError(
-                        f"Modal token ID appears malformed (too short: {len(token_id)} chars). "
-                        "Token ID should be a valid Modal token identifier."
-                    )
             try:
-                _modal_app = modal.App.lookup("deepcritical-tts", create_if_missing=True)
             except Exception as e:
                 error_msg = str(e).lower()
                 if "token" in error_msg or "malformed" in error_msg or "invalid" in error_msg:
                     raise ConfigurationError(
                         f"Modal token validation failed: {e}. "
-                        "Please check that MODAL_TOKEN_ID and MODAL_TOKEN_SECRET are correctly set."
                     ) from e
                 raise
         except ImportError as e:
@@ -69,23 +88,92 @@ def _get_modal_app() -> Any:
 # Define Modal image with Kokoro dependencies (module-level)
 def _get_tts_image() -> Any:
     """Get Modal image with Kokoro dependencies."""
     try:
         import modal
-        return (
             modal.Image.debian_slim(python_version="3.11")
             .pip_install(*KOKORO_DEPENDENCIES)
             .pip_install("git+https://github.com/hexgrad/kokoro.git")
         )
     except ImportError:
         return None
 def _setup_modal_function() -> None:
     """Setup Modal GPU function for TTS (called once, lazy initialization).
-    Note: GPU type is set at function definition time. Changes to settings.tts_gpu
-    require app restart to take effect.
     """
     global _tts_function
@@ -93,80 +181,27 @@ def _setup_modal_function() -> None:
         return  # Already set up
     try:
-        app = _get_modal_app()
-        tts_image = _get_tts_image()
-        if tts_image is None:
-            raise ConfigurationError("Modal image setup failed")
-        # Get GPU and timeout from settings (with defaults)
-        # Note: These are evaluated at function definition time, not at call time
-        # Changes to settings require app restart
-        gpu_type = getattr(settings, "tts_gpu", None) or "T4"
-        timeout_seconds = getattr(settings, "tts_timeout", None) or 60
-        # Define GPU function at module level (required by Modal)
-        # Modal functions are immutable once defined, so GPU changes require restart
-        @app.function(  # type: ignore[misc]
-            image=tts_image,
-            gpu=gpu_type,
-            timeout=timeout_seconds,
-        )
-        def kokoro_tts_function(
-            text: str, voice: str, speed: float
-        ) -> tuple[int, np.ndarray[Any, Any]]:  # type: ignore[type-arg]
-            """Modal GPU function for Kokoro TTS.
-            This function runs on Modal's GPU infrastructure.
-            Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
-            Reference: https://huggingface.co/spaces/hexgrad/Kokoro-TTS/raw/main/app.py
-            """
-            import numpy as np
-            # Import Kokoro inside function (lazy load)
-            try:
-                from kokoro import KModel, KPipeline
-                # Initialize model (cached on GPU)
-                model = KModel().to("cuda").eval()
-                pipeline = KPipeline(lang_code=voice[0])
-                pack = pipeline.load_voice(voice)
-                # Generate audio
-                for _, ps, _ in pipeline(text, voice, speed):
-                    ref_s = pack[len(ps) - 1]
-                    audio = model(ps, ref_s, speed)
-                    return (24000, audio.numpy())
-                # If no audio generated, return empty
-                return (24000, np.zeros(1, dtype=np.float32))
-            except ImportError as e:
-                raise ConfigurationError(
-                    "Kokoro not installed. Install with: pip install git+https://github.com/hexgrad/kokoro.git"
-                ) from e
-            except Exception as e:
-                raise ConfigurationError(f"TTS synthesis failed: {e}") from e
-        # Store function reference for remote calls
-        _tts_function = kokoro_tts_function
-        # Verify function is properly attached to app
-        if not hasattr(app, kokoro_tts_function.__name__):
-            logger.warning(
-                "modal_function_not_attached", function_name=kokoro_tts_function.__name__
-            )
         logger.info(
-            "modal_tts_function_setup_complete",
-            gpu=gpu_type,
-            timeout=timeout_seconds,
-            function_name=kokoro_tts_function.__name__,
         )
     except Exception as e:
         logger.error("modal_tts_function_setup_failed", error=str(e))
-        raise ConfigurationError(f"Failed to setup Modal TTS function: {e}") from e
 class ModalTTSExecutor:
@@ -180,13 +215,17 @@ class ModalTTSExecutor:
         """Initialize Modal TTS executor.
         Note:
-            Logs a warning if Modal credentials are not configured.
-            Execution will fail at runtime without valid credentials.
         """
-        # Check for Modal credentials
-        if not settings.modal_available:
             logger.warning(
-                "Modal credentials not found. TTS will not be available unless modal setup is run."
             )
     def synthesize(
@@ -195,7 +234,7 @@ class ModalTTSExecutor:
         voice: str = "af_heart",
         speed: float = 1.0,
         timeout: int = 60,
-    ) -> tuple[int, np.ndarray[Any, Any]]:  # type: ignore[type-arg]
         """Synthesize text to speech using Kokoro on Modal GPU.
         Args:
@@ -226,7 +265,7 @@ class ModalTTSExecutor:
                 "tts_synthesis_complete", sample_rate=result[0], audio_shape=result[1].shape
             )
-            return result  # type: ignore[no-any-return]
         except Exception as e:
             logger.error("tts_synthesis_failed", error=str(e), error_type=type(e).__name__)
@@ -237,9 +276,19 @@ class TTSService:
     """TTS service wrapper for async usage."""
     def __init__(self) -> None:
-        """Initialize TTS service."""
-        if not settings.modal_available:
-            raise ConfigurationError("Modal credentials required for TTS")
         self.executor = ModalTTSExecutor()
     async def synthesize_async(
@@ -247,7 +296,7 @@ class TTSService:
         text: str,
         voice: str = "af_heart",
         speed: float = 1.0,
-    ) -> tuple[int, np.ndarray[Any, Any]] | None:  # type: ignore[type-arg]
         """Async wrapper for TTS synthesis.
         Args:

 """Text-to-Speech service using Kokoro 82M via Modal GPU."""
 import asyncio
+import os
 from functools import lru_cache
 from typing import Any
 import numpy as np
 import structlog
+# Load .env file BEFORE importing Modal SDK
+# Modal SDK reads MODAL_TOKEN_ID and MODAL_TOKEN_SECRET from environment on import
+from dotenv import load_dotenv
+load_dotenv()
 from src.utils.config import settings
 from src.utils.exceptions import ConfigurationError
 # Modal app and function definitions (module-level for Modal)
 _modal_app: Any | None = None
 _tts_function: Any | None = None
+_tts_image: Any | None = None
 def _get_modal_app() -> Any:
+    """Get or create Modal app instance.
+    Retrieves Modal credentials directly from environment variables (.env file)
+    instead of relying on settings configuration.
+    """
     global _modal_app
     if _modal_app is None:
         try:
             import modal
+            # Get credentials directly from environment variables
+            token_id = os.getenv("MODAL_TOKEN_ID")
+            token_secret = os.getenv("MODAL_TOKEN_SECRET")
+            # Validate Modal credentials
+            if not token_id or not token_secret:
                 raise ConfigurationError(
+                    "Modal credentials not found in environment. "
+                    "Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in .env file."
                 )
             # Validate token ID format (Modal token IDs are typically UUIDs or specific formats)
+            if len(token_id.strip()) < 10:
+                raise ConfigurationError(
+                    f"Modal token ID appears malformed (too short: {len(token_id)} chars). "
+                    "Token ID should be a valid Modal token identifier."
+                )
+            logger.info(
+                "modal_credentials_loaded",
+                token_id_prefix=token_id[:8] + "...",  # Log prefix for debugging
+                has_secret=bool(token_secret),
+            )
             try:
+                _modal_app = modal.App("deepcritical-tts")
             except Exception as e:
                 error_msg = str(e).lower()
                 if "token" in error_msg or "malformed" in error_msg or "invalid" in error_msg:
                     raise ConfigurationError(
                         f"Modal token validation failed: {e}. "
+                        "Please check that MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in .env are correctly set."
                     ) from e
                 raise
         except ImportError as e:
 # Define Modal image with Kokoro dependencies (module-level)
 def _get_tts_image() -> Any:
     """Get Modal image with Kokoro dependencies."""
+    # The image definition is memoized in the module-level _tts_image so that
+    # repeated calls return the same object instead of rebuilding the chain.
+    global _tts_image
+    if _tts_image is not None:
+        return _tts_image
     try:
+        # Imported lazily: if the modal package is missing, the ImportError is
+        # caught below and None is returned instead of failing at import time.
         import modal
+        _tts_image = (
             modal.Image.debian_slim(python_version="3.11")
             .pip_install(*KOKORO_DEPENDENCIES)
             .pip_install("git+https://github.com/hexgrad/kokoro.git")
         )
+        return _tts_image
     except ImportError:
+        # Callers must handle None; nothing is cached in this case.
         return None
+# Modal TTS function - Using serialized=True to allow dynamic creation
+# This will be initialized lazily when _setup_modal_function() is called
+def _create_tts_function() -> Any:
+    """Create the Modal TTS function using serialized=True.
+    The serialized=True parameter allows the function to be defined outside
+    of global scope, which is necessary for dynamic initialization.
+    Returns:
+        The object produced by decorating kokoro_tts_function with app.function.
+    Raises:
+        ConfigurationError: If _get_tts_image() returns None, plus whatever
+            _get_modal_app() raises.
+    """
+    app = _get_modal_app()
+    tts_image = _get_tts_image()
+    if tts_image is None:
+        raise ConfigurationError("Modal image setup failed")
+    # Get GPU and timeout from settings (with defaults)
+    # The `or` fallback also replaces empty/zero values, not only a missing attribute.
+    gpu_type = getattr(settings, "tts_gpu", None) or "T4"
+    timeout_seconds = getattr(settings, "tts_timeout", None) or 60
+    @app.function(
+        image=tts_image,
+        gpu=gpu_type,
+        timeout=timeout_seconds,
+        serialized=True,  # Allow function to be defined outside global scope
+    )
+    def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
+        """Modal GPU function for Kokoro TTS.
+        This function runs on Modal's GPU infrastructure.
+        Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
+        Reference: https://huggingface.co/spaces/hexgrad/Kokoro-TTS/raw/main/app.py
+        Returns a (24000, audio) tuple; callers in this file log the first
+        element as the sample rate.
+        Raises ConfigurationError for a missing Kokoro/torch import and for any
+        other failure during synthesis.
+        """
+        # Re-imported locally; presumably so the name resolves when the
+        # serialized function body runs remotely - TODO confirm.
+        import numpy as np
+        # Import Kokoro inside function (lazy load)
+        try:
+            # NOTE(review): torch is not referenced anywhere in this body; it may
+            # only serve as an early availability check - confirm or drop.
+            import torch
+            from kokoro import KModel, KPipeline
+            # Build the model on the CUDA device in eval mode.
+            # NOTE(review): the model is constructed on every call; no caching
+            # is visible in this function.
+            model = KModel().to("cuda").eval()
+            # The first character of the voice name is used as the language code
+            # (e.g. "af_heart" -> "a"). An empty voice raises IndexError here,
+            # which the generic handler below wraps in ConfigurationError.
+            pipeline = KPipeline(lang_code=voice[0])
+            pack = pipeline.load_voice(voice)
+            # Generate audio
+            for _, ps, _ in pipeline(text, voice, speed):
+                # The voice pack is indexed by the length of ps minus one.
+                ref_s = pack[len(ps) - 1]
+                audio = model(ps, ref_s, speed)
+                # NOTE(review): returning inside the loop stops after the first
+                # item the pipeline yields. If the pipeline yields several
+                # segments for long text, the rest is never synthesized -
+                # confirm this is intended.
+                return (24000, audio.numpy())
+            # If no audio generated, return empty
+            # (a single float32 zero sample, not a zero-length array)
+            return (24000, np.zeros(1, dtype=np.float32))
+        except ImportError as e:
+            raise ConfigurationError(
+                "Kokoro not installed. Install with: pip install git+https://github.com/hexgrad/kokoro.git"
+            ) from e
+        except Exception as e:
+            raise ConfigurationError(f"TTS synthesis failed: {e}") from e
+    return kokoro_tts_function
 def _setup_modal_function() -> None:
     """Setup Modal GPU function for TTS (called once, lazy initialization).
+    Looks up the deployed Modal function instead of creating a new one.
+    This requires the 'deepcritical-tts' app to be deployed on Modal.
+    To deploy: modal deploy deployments/modal_tts.py
     """
     global _tts_function
         return  # Already set up
     try:
+        import modal
+        # Look up the deployed function from the Modal server
+        # This requires the app to be deployed: modal deploy deployments/modal_tts.py
+        _tts_function = modal.Function.from_name(
+            "deepcritical-tts",
+            "kokoro_tts_function"
+        )
         logger.info(
+            "modal_tts_function_lookup_complete",
+            app_name="deepcritical-tts",
+            function_name="kokoro_tts_function",
         )
     except Exception as e:
         logger.error("modal_tts_function_setup_failed", error=str(e))
+        raise ConfigurationError(
+            f"Failed to lookup Modal TTS function: {e}. "
+            "Make sure the 'deepcritical-tts' app is deployed on Modal."
+        ) from e
 class ModalTTSExecutor:
         """Initialize Modal TTS executor.
         Note:
+            Logs a warning if Modal credentials are not configured in environment.
+            Execution will fail at runtime without valid credentials in .env file.
         """
+        # Check for Modal credentials directly from environment
+        token_id = os.getenv("MODAL_TOKEN_ID")
+        token_secret = os.getenv("MODAL_TOKEN_SECRET")
+        if not token_id or not token_secret:
             logger.warning(
+                "Modal credentials not found in environment. "
+                "TTS will not be available. Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in .env file."
             )
     def synthesize(
         voice: str = "af_heart",
         speed: float = 1.0,
         timeout: int = 60,
+    ) -> tuple[int, np.ndarray]:
         """Synthesize text to speech using Kokoro on Modal GPU.
         Args:
                 "tts_synthesis_complete", sample_rate=result[0], audio_shape=result[1].shape
             )
+            return result
         except Exception as e:
             logger.error("tts_synthesis_failed", error=str(e), error_type=type(e).__name__)
     """TTS service wrapper for async usage."""
     def __init__(self) -> None:
+        """Initialize TTS service.
+        Validates Modal credentials from environment variables (.env file).
+        """
+        # Check credentials directly from environment
+        token_id = os.getenv("MODAL_TOKEN_ID")
+        token_secret = os.getenv("MODAL_TOKEN_SECRET")
+        if not token_id or not token_secret:
+            raise ConfigurationError(
+                "Modal credentials required for TTS. "
+                "Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in .env file."
+            )
         self.executor = ModalTTSExecutor()
     async def synthesize_async(
         text: str,
         voice: str = "af_heart",
         speed: float = 1.0,
+    ) -> tuple[int, np.ndarray] | None:
         """Async wrapper for TTS synthesis.
         Args:

src/utils/config.py CHANGED Viewed

@@ -172,6 +172,10 @@ class Settings(BaseSettings):
         le=2.0,
         description="TTS speech speed multiplier (0.5x to 2.0x)",
     )
     tts_gpu: str | None = Field(
         default=None,
         description="Modal GPU type for TTS (T4, A10, A100, L4, L40S). None uses default T4.",

         le=2.0,
         description="TTS speech speed multiplier (0.5x to 2.0x)",
     )
+    tts_use_llm_polish: bool = Field(
+        default=False,
+        description="Use LLM for final text polish before TTS (optional, costs API calls)",
+    )
     tts_gpu: str | None = Field(
         default=None,
         description="Modal GPU type for TTS (T4, A10, A100, L4, L40S). None uses default T4.",

tests/unit/agents/test_audio_refiner.py ADDED Viewed

	@@ -0,0 +1,306 @@

+"""Unit tests for AudioRefiner agent."""
+import pytest
+from unittest.mock import AsyncMock, Mock, patch
+from src.agents.audio_refiner import AudioRefiner, refine_text_for_audio
+class TestAudioRefiner:
+    """Test suite for AudioRefiner functionality."""
+    @pytest.fixture
+    def refiner(self):
+        """Create AudioRefiner instance."""
+        return AudioRefiner()
+    def test_remove_markdown_headers(self, refiner):
+        """Test removal of markdown headers."""
+        text = """# Main Title
+## Subtitle
+### Section
+Content here"""
+        result = refiner._remove_markdown_syntax(text)
+        assert "#" not in result
+        assert "Main Title" in result
+        assert "Subtitle" in result
+    def test_remove_bold_italic(self, refiner):
+        """Test removal of bold and italic formatting."""
+        text = "**Bold text** and *italic text* and __another bold__"
+        result = refiner._remove_markdown_syntax(text)
+        assert "**" not in result
+        assert "*" not in result
+        assert "__" not in result
+        assert "Bold text" in result
+        assert "italic text" in result
+    def test_remove_links(self, refiner):
+        """Test removal of markdown links."""
+        text = "Check [this link](https://example.com) for details"
+        result = refiner._remove_markdown_syntax(text)
+        assert "[" not in result
+        assert "]" not in result
+        assert "https://" not in result
+        assert "this link" in result
+    def test_remove_citations_numbered(self, refiner):
+        """Test removal of numbered citations."""
+        text = "Research shows [1] that metformin [2,3] works [4-6]."
+        result = refiner._remove_citations(text)
+        assert "[1]" not in result
+        assert "[2,3]" not in result
+        assert "[4-6]" not in result
+        assert "Research shows" in result
+    def test_remove_citations_author_year(self, refiner):
+        """Test removal of author-year citations."""
+        text = "Studies (Smith et al., 2023) and (Jones, 2022) confirm this."
+        result = refiner._remove_citations(text)
+        assert "(Smith et al., 2023)" not in result
+        assert "(Jones, 2022)" not in result
+        assert "Studies" in result
+        assert "confirm this" in result
+    def test_remove_first_references_section(self, refiner):
+        """Test that References sections are removed while preserving other content."""
+        text = """Main content here.
+# References
+[1] First reference
+[2] Second reference
+# More Content
+This should remain.
+## References
+This second References should also be removed."""
+        result = refiner._remove_references_sections(text)
+        assert "Main content here" in result
+        assert "References" not in result
+        assert "First reference" not in result
+        assert "More Content" in result  # Content after References should be preserved
+        assert "This should remain" in result
+        assert "second References should also be removed" not in result  # Second References section removed
+    def test_roman_to_int_conversion(self, refiner):
+        """Test roman numeral to integer conversion."""
+        assert refiner._roman_to_int("I") == 1
+        assert refiner._roman_to_int("II") == 2
+        assert refiner._roman_to_int("III") == 3
+        assert refiner._roman_to_int("IV") == 4
+        assert refiner._roman_to_int("V") == 5
+        assert refiner._roman_to_int("IX") == 9
+        assert refiner._roman_to_int("X") == 10
+        assert refiner._roman_to_int("XII") == 12
+        assert refiner._roman_to_int("XX") == 20
+    def test_int_to_word_conversion(self, refiner):
+        """Test integer to word conversion."""
+        assert refiner._int_to_word(1) == "One"
+        assert refiner._int_to_word(2) == "Two"
+        assert refiner._int_to_word(3) == "Three"
+        assert refiner._int_to_word(10) == "Ten"
+        assert refiner._int_to_word(20) == "Twenty"
+        assert refiner._int_to_word(25) == "25"  # Falls back to digit
+    def test_convert_roman_numerals_with_context(self, refiner):
+        """Test roman numeral conversion with context words."""
+        test_cases = [
+            ("Phase I trial", "Phase One trial"),
+            ("Phase II study", "Phase Two study"),
+            ("Phase III data", "Phase Three data"),
+            ("Type I diabetes", "Type One diabetes"),
+            ("Type II error", "Type Two error"),
+            ("Stage IV cancer", "Stage Four cancer"),
+            ("Trial I results", "Trial One results"),
+        ]
+        for input_text, expected in test_cases:
+            result = refiner._convert_roman_numerals(input_text)
+            assert expected in result, f"Failed for: {input_text}"
+    def test_convert_standalone_roman_numerals(self, refiner):
+        """Test standalone roman numeral conversion."""
+        text = "Results for I, II, and III are positive."
+        result = refiner._convert_roman_numerals(text)
+        # Standalone roman numerals should be converted
+        assert "One" in result or "Two" in result or "Three" in result
+    def test_dont_convert_roman_in_words(self, refiner):
+        """Test that roman numerals inside words aren't converted."""
+        text = "INVALID data fromIXIN compound"
+        result = refiner._convert_roman_numerals(text)
+        # Should not break words containing I, V, X, etc.
+        assert "INVALID" in result or "Invalid" in result  # May be case-normalized
+    def test_clean_special_characters(self, refiner):
+        """Test special character cleanup."""
+        # Using unicode escapes to avoid syntax issues
+        text = "Text with \u2014 em-dash and \u201csmart quotes\u201d and \u2018apostrophes\u2019."
+        result = refiner._clean_special_characters(text)
+        assert "\u2014" not in result  # em-dash
+        assert "\u201c" not in result  # smart quote open
+        assert "\u2018" not in result  # smart apostrophe
+        assert "-" in result
+    def test_normalize_whitespace(self, refiner):
+        """Test whitespace normalization."""
+        text = "Text  with   multiple    spaces\n\n\n\nand many newlines"
+        result = refiner._normalize_whitespace(text)
+        assert "  " not in result  # No double spaces
+        assert "\n\n\n" not in result  # Max two newlines
+    async def test_full_refine_workflow(self, refiner):
+        """Test complete refinement workflow."""
+        markdown_text = """# Summary
+**Metformin** shows promise for *long COVID* treatment [1].
+## Phase I Trials
+Research (Smith et al., 2023) indicates [2,3]:
+- 50% improvement
+- Low side effects
+Check [this study](https://example.com) for details.
+# References
+[1] Smith, J. et al. (2023)
+[2] Jones, K. (2022)
+"""
+        result = await refiner.refine_for_audio(markdown_text)
+        # Check markdown removed
+        assert "#" not in result
+        assert "**" not in result
+        assert "*" not in result
+        # Check citations removed
+        assert "[1]" not in result
+        assert "(Smith et al., 2023)" not in result
+        # Check roman numerals converted
+        assert "Phase One" in result
+        # Check references section removed
+        assert "References" not in result
+        assert "Smith, J. et al." not in result
+        # Check content preserved
+        assert "Metformin" in result
+        assert "long COVID" in result
+    async def test_convenience_function(self):
+        """Test convenience function."""
+        text = "**Bold** text with [link](url)"
+        result = await refine_text_for_audio(text)
+        assert "**" not in result
+        assert "[link]" not in result
+        assert "Bold" in result
+    async def test_empty_text(self, refiner):
+        """Test handling of empty text."""
+        assert await refiner.refine_for_audio("") == ""
+        assert await refiner.refine_for_audio("   ") == ""
+    async def test_no_references_section(self, refiner):
+        """Test text without References section."""
+        text = "Main content without references."
+        result = await refiner.refine_for_audio(text)
+        assert "Main content without references" in result
+    def test_multiple_reference_formats(self, refiner):
+        """Test different References section formats.
+        Each case pairs a heading style with whether the section is expected to
+        be stripped. Content before the heading must always survive.
+        """
+        formats = [
+            ("# References\nContent", True),  # Markdown header - will be removed
+            ("## References\nContent", True),  # Markdown header - will be removed
+            ("**References**\nContent", True),  # Bold heading - will be removed
+            ("References:\nContent", False),  # Standalone without markers - NOT removed (edge case)
+        ]
+        for format_text, should_remove in formats:
+            text = f"Main content\n{format_text}"
+            result = refiner._remove_references_sections(text)
+            assert "Main content" in result, f"Lost main content for: {format_text!r}"
+            if should_remove:
+                # A single membership check is sufficient; count() == 0 is the same condition.
+                assert "References" not in result, f"Heading not removed for: {format_text!r}"
+            # Standalone "References:" without markers is an edge case we don't handle,
+            # so nothing is asserted about the heading in that case.
+    def test_preserve_paragraph_structure(self, refiner):
+        """Test that paragraph structure is preserved."""
+        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
+        result = refiner._normalize_whitespace(text)
+        # Should have paragraph breaks (double newlines)
+        assert "\n\n" in result
+        # But not excessive newlines
+        assert "\n\n\n" not in result
+    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
+    async def test_llm_polish_disabled_by_default(self, mock_get_model, refiner):
+        """Test that the LLM model is never requested when use_llm_polish=False.
+        The flag is passed explicitly here, so this does not exercise whatever
+        default refine_for_audio applies when the argument is omitted.
+        """
+        text = "Test text"
+        result = await refiner.refine_for_audio(text, use_llm_polish=False)
+        # LLM should not be called when disabled
+        mock_get_model.assert_not_called()
+        assert "Test text" in result
+    @patch('src.agents.audio_refiner.Agent')
+    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
+    async def test_llm_polish_enabled(self, mock_get_model, mock_agent_class, refiner):
+        """Test that LLM polish is called when enabled."""
+        # Setup mock
+        mock_model = Mock()
+        mock_get_model.return_value = mock_model
+        mock_agent_instance = Mock()
+        mock_result = Mock()
+        mock_result.output = "Polished text"
+        mock_agent_instance.run = AsyncMock(return_value=mock_result)
+        mock_agent_class.return_value = mock_agent_instance
+        # Test with LLM polish enabled
+        text = "**Test** text"
+        result = await refiner.refine_for_audio(text, use_llm_polish=True)
+        # Verify LLM was called
+        mock_get_model.assert_called_once()
+        mock_agent_class.assert_called_once()
+        mock_agent_instance.run.assert_called_once()
+        assert result == "Polished text"
+    @patch('src.agents.audio_refiner.Agent')
+    @patch('src.agents.audio_refiner.get_pydantic_ai_model')
+    async def test_llm_polish_graceful_fallback(self, mock_get_model, mock_agent_class, refiner):
+        """Test graceful fallback when LLM polish fails."""
+        # Setup mock to raise exception
+        mock_get_model.return_value = Mock()
+        mock_agent_instance = Mock()
+        mock_agent_instance.run = AsyncMock(side_effect=Exception("API Error"))
+        mock_agent_class.return_value = mock_agent_instance
+        # Test with LLM polish enabled but failing
+        text = "Test text"
+        result = await refiner.refine_for_audio(text, use_llm_polish=True)
+        # Should fall back to rule-based output
+        assert "Test text" in result
+        assert result != ""  # Should not be empty
+    async def test_convenience_function_with_llm_polish(self):
+        """Test that refine_text_for_audio forwards use_llm_polish to AudioRefiner.
+        refine_for_audio is replaced by an AsyncMock configured with a plain
+        return value, so every call produces a fresh awaitable. Assigning one
+        pre-built coroutine object as return_value would hand the same object
+        back on both calls and leave it either un-awaited or awaited twice.
+        """
+        with patch.object(
+            AudioRefiner,
+            "refine_for_audio",
+            new_callable=AsyncMock,
+            return_value="Refined text",
+        ) as mock_refine:
+            # Test without LLM polish
+            result = await refine_text_for_audio("Test", use_llm_polish=False)
+            mock_refine.assert_called_with("Test", use_llm_polish=False)
+            assert result == "Refined text"
+            # Test with LLM polish
+            result = await refine_text_for_audio("Test", use_llm_polish=True)
+            mock_refine.assert_called_with("Test", use_llm_polish=True)
+            assert result == "Refined text"