Spaces:

samaritan-ai
/

marianmt-he2arc-sam

Sleeping

File size: 11,085 Bytes

import csv
import io
import json
import time
from typing import Optional

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Page configuration: browser-tab title and icon, wide layout, and a sidebar
# that starts expanded.
# NOTE(review): Streamlit has documented that set_page_config must run before
# other Streamlit commands (version-dependent) — confirm before moving this
# call below the CSS injection.
st.set_page_config(
    page_title="Samaritan Hebrew to Samaritan Targumic Aramaic Translation",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for modern styling.
# The stylesheet is injected as raw HTML through st.markdown, which is why
# unsafe_allow_html=True is passed. It defines the helper classes
# .main-header, .sub-header, .translation-box, .input-area, .output-area,
# .direction-selector and .model-info, and restyles buttons via
# `.stButton > button` (including a hover effect).
# NOTE(review): only .main-header, .sub-header, .input-area and .output-area
# are referenced by the code visible in this file; the remaining classes may
# be unused — verify before removing.
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        font-weight: 700;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        text-align: center;
        margin-bottom: 2rem;
    }
    
    .sub-header {
        font-size: 1.2rem;
        color: #666;
        text-align: center;
        margin-bottom: 3rem;
    }
    
    .translation-box {
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        padding: 2rem;
        border-radius: 15px;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
        margin: 1rem 0;
    }
    
    .input-area {
        background: white;
        border-radius: 10px;
        padding: 1.5rem;
        box-shadow: 0 4px 16px rgba(0,0,0,0.05);
    }
    
    .output-area {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        padding: 1.5rem;
        box-shadow: 0 4px 16px rgba(0,0,0,0.1);
    }
    
    .direction-selector {
        background: white;
        border-radius: 10px;
        padding: 1rem;
        box-shadow: 0 4px 16px rgba(0,0,0,0.05);
        margin-bottom: 1rem;
    }
    
    .stButton > button {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        color: white;
        border: none;
        border-radius: 25px;
        padding: 0.75rem 2rem;
        font-weight: 600;
        transition: all 0.3s ease;
    }
    
    .stButton > button:hover {
        transform: translateY(-2px);
        box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4);
    }
    
    .model-info {
        background: #f8f9fa;
        border-radius: 10px;
        padding: 1rem;
        margin: 1rem 0;
        border-left: 4px solid #667eea;
    }
</style>
""", unsafe_allow_html=True)

@st.cache_resource
def _load_model_cached(model_name: str):
    """Load the tokenizer and model for *model_name* and cache the result.

    Any failure propagates as an exception instead of being turned into a
    return value, so a failed attempt is never stored in the resource cache
    and the next call tries again.

    Returns:
        A ``(tokenizer, model, device)`` tuple, where ``device`` is ``"cuda"``
        when ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
    """
    with st.spinner("Loading translation model..."):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        # Inference only: switch the module to evaluation mode.
        model.eval()

    return tokenizer, model, device


def load_model():
    """Load the Hugging Face model and tokenizer with caching.

    Successful loads are cached by ``_load_model_cached``. Failures are
    reported with ``st.error`` and signalled to the caller as a triple of
    ``None`` values; because that fallback is produced outside the cached
    function, a transient error (for example a network hiccup) is not cached
    and the load is retried on the next call.

    Returns:
        ``(tokenizer, model, device)`` on success, ``(None, None, None)`` on
        failure.
    """
    model_name = "johnlockejrr/marianmt-he2arc-sam"

    try:
        return _load_model_cached(model_name)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None, None


# Keep the cache-clearing hook that the decorated function used to expose.
load_model.clear = _load_model_cached.clear

def translate_text(
    text: str,
    direction: str,
    tokenizer,
    model,
    device: str,
    max_length: int = 512,
    max_input_length: int = 512,
) -> Optional[str]:
    """Translate text using the loaded model.

    Args:
        text: Source text. Blank or whitespace-only input is not translated.
        direction: ``"Hebrew to Aramaic"`` selects the ``>>heb<<`` prefix; any
            other value selects ``>>arc<<``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model object providing ``generate``.
        device: Device the tokenized inputs are moved to.
        max_length: Upper bound passed to ``model.generate`` for the output.
        max_input_length: Upper bound used when truncating the tokenized
            input. Kept separate from ``max_length`` so that lowering the
            output limit does not silently cut the source text.

    Returns:
        The decoded translation, or ``None`` when the input is blank or an
        error occurred (the error is reported through ``st.error``).
    """
    if not text.strip():
        return None

    try:
        # Add language prefix based on direction (using the correct sem-sem model format)
        # NOTE(review): Marian-style multilingual checkpoints commonly use the
        # >>lang<< token to name the *target* language — confirm the prefixes
        # below against the model card.
        if direction == "Hebrew to Aramaic":
            input_text = f">>heb<< {text}"
        else:  # Aramaic to Hebrew
            input_text = f">>arc<< {text}"

        # Tokenize input; truncation is governed by the input limit only.
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=max_input_length,
            truncation=True,
            padding=True
        ).to(device)

        # Generate translation with beam search; no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=4,
                length_penalty=0.6,
                early_stopping=True,
                do_sample=False
            )

        # Decode the first (only) generated sequence.
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    except Exception as e:
        # UI boundary: surface the problem to the user and signal failure.
        st.error(f"Translation error: {str(e)}")
        return None

def _results_to_csv(results) -> str:
    """Serialise batch results as CSV text.

    Args:
        results: Iterable of dicts with ``'original'`` and ``'translation'``
            keys.

    Returns:
        The fixed header line followed by one fully quoted row per result.
    """
    buffer = io.StringIO()
    # The header is written verbatim so it stays unquoted.
    buffer.write("Samaritan Hebrew,Samaritan Aramaic\n")
    # csv.writer doubles embedded double quotes, so text containing '"' can
    # no longer break a row apart.
    writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerows((row["original"], row["translation"]) for row in results)
    return buffer.getvalue()


def _reset_inputs() -> None:
    """Button callback for "Clear All": reset the text area and the uploader."""
    st.session_state["single_input_text"] = ""
    # The uploader is reset by switching it to a fresh widget key.
    st.session_state["batch_upload_nonce"] = (
        st.session_state.get("batch_upload_nonce", 0) + 1
    )


def main():
    """Render the translator page.

    The page consists of a header, a settings sidebar (output length slider,
    model information, "Clear All"), a two-column single-text translation
    area, a batch section that translates an uploaded ``.txt`` file line by
    line and offers the results as a CSV download, and a footer. Rendering
    stops after an error message when the model could not be loaded.
    """
    # Header
    st.markdown('<h1 class="main-header">📚 Samaritan Hebrew-Aramaic Translator</h1>', unsafe_allow_html=True)
    st.markdown('<p class="sub-header">Powered by the johnlockejrr/marianmt-he2arc-sam model</p>', unsafe_allow_html=True)

    # Load model
    tokenizer, model, device = load_model()

    if tokenizer is None or model is None:
        st.error("Failed to load the translation model. Please check your internet connection and try again.")
        return

    # Sidebar for settings
    with st.sidebar:
        st.markdown("### ⚙️ Settings")

        # Max length setting
        max_length = st.slider(
            "Maximum Output Length",
            min_value=64,
            max_value=512,
            value=256,
            step=32,
            help="Maximum length of the generated translation"
        )

        # Model info
        st.markdown("### 📊 Model Information")
        st.markdown("**Model:** johnlockejrr/marianmt-he2arc-sam")
        st.markdown(f"**Device:** {device.upper()}")
        st.markdown(f"**Tokenizer:** {tokenizer.__class__.__name__}")
        st.markdown(f"**Model Type:** {model.__class__.__name__}")
        st.markdown("**Direction:** Samaritan Hebrew → Samaritan Aramaic")

        # Clear button: a plain rerun keeps widget state, so the inputs are
        # reset explicitly in a callback that runs before the next script run.
        st.button("🗑️ Clear All", on_click=_reset_inputs)

    # Main content area
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown('<div class="input-area">', unsafe_allow_html=True)
        st.markdown("### 📝 Input Text")

        # Text input (keyed so that "Clear All" can reset it)
        input_text = st.text_area(
            "Enter Samaritan Hebrew text to translate",
            height=200,
            placeholder="Enter your Samaritan Hebrew text here...",
            help="Type or paste the Samaritan Hebrew text you want to translate to Samaritan Aramaic",
            key="single_input_text"
        )

        # Translate button
        translate_button = st.button(
            "🔄 Translate to Samaritan Aramaic",
            type="primary",
            use_container_width=True
        )
        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        st.markdown('<div class="output-area">', unsafe_allow_html=True)
        st.markdown("### 🎯 Samaritan Aramaic Translation")

        if translate_button and input_text.strip():
            with st.spinner("Translating to Samaritan Aramaic..."):
                # Add a small delay for better UX
                time.sleep(0.5)

                translation = translate_text(
                    input_text,
                    "Hebrew to Aramaic",
                    tokenizer,
                    model,
                    device,
                    max_length
                )

                if translation:
                    st.markdown("**Samaritan Aramaic:**")
                    # Display translation in a code block that can be easily copied
                    st.code(translation, language=None)
                else:
                    st.error("Translation failed. Please try again.")
        else:
            st.markdown("*Samaritan Aramaic translation will appear here*")
        st.markdown('</div>', unsafe_allow_html=True)

    # Additional features
    st.markdown("---")

    # Batch translation section
    st.markdown("### 📚 Batch Translation")
    st.markdown("Upload a text file with multiple Samaritan Hebrew lines to translate them all to Samaritan Aramaic.")

    uploaded_file = st.file_uploader(
        "Choose a text file",
        type=['txt'],
        help="Upload a .txt file with one Samaritan Hebrew text per line",
        # The key changes whenever "Clear All" is pressed, which drops the file.
        key=f"batch_upload_{st.session_state.get('batch_upload_nonce', 0)}"
    )

    if uploaded_file is not None:
        try:
            # Read file content; utf-8-sig also strips a leading byte-order
            # mark, which would otherwise become part of the first line.
            content = uploaded_file.read().decode('utf-8-sig')
            lines = [line.strip() for line in content.split('\n') if line.strip()]

            if lines:
                st.success(f"📄 Loaded {len(lines)} lines from {uploaded_file.name}")

                # NOTE(review): results are rendered only in the run in which
                # this button was clicked; a later interaction (including the
                # download button) reruns the script and hides them — confirm
                # whether they should be kept in session state.
                if st.button("🔄 Translate All to Samaritan Aramaic", type="primary"):
                    st.markdown("### 📋 Batch Translation Results")

                    # Create a progress bar
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    results = []
                    for i, line in enumerate(lines):
                        status_text.text(f"Translating line {i+1}/{len(lines)}: {line[:50]}...")

                        translation = translate_text(
                            line,
                            "Hebrew to Aramaic",
                            tokenizer,
                            model,
                            device,
                            max_length
                        )

                        results.append({
                            'original': line,
                            'translation': translation or "Translation failed"
                        })

                        # Update progress
                        progress_bar.progress((i + 1) / len(lines))

                    status_text.text("✅ Translation complete!")

                    # Display results
                    for i, result in enumerate(results):
                        with st.expander(f"Line {i+1}: {result['original'][:50]}..."):
                            st.markdown(f"**Samaritan Hebrew:** {result['original']}")
                            st.markdown(f"**Samaritan Aramaic:** {result['translation']}")

                    # Download results
                    st.download_button(
                        label="📥 Download Results as CSV",
                        data=_results_to_csv(results),
                        file_name="samaritan_translations.csv",
                        mime="text/csv"
                    )

        except Exception as e:
            st.error(f"Error reading file: {str(e)}")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666; padding: 2rem;">
        <p>Built with ❤️ using Streamlit and Hugging Face Transformers</p>
        <p>Samaritan Hebrew to Samaritan Aramaic Translation</p>
        <p>Model: johnlockejrr/marianmt-he2arc-sam</p>
    </div>
    """, unsafe_allow_html=True)

# Render the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()