# data_loader.py
"""
Simple dataset loader for the workshop.
Expected CSV columns: 'review' (text), optionally 'label' (gold label)
Place your Kaggle CSV in the project folder as imdb.csv (or change the path below).
"""

import pandas as pd
from typing import Tuple

DEFAULT_CSV = "imdb.csv"


def load_data(path: str = DEFAULT_CSV) -> pd.DataFrame:
    """
    Load the dataset from a CSV file and perform basic cleaning.

    The text column is located in this order of preference:

    1. a column named exactly 'review';
    2. a column whose name equals 'review' ignoring case;
    3. the first column whose name contains 'review' or 'text'
       (case-insensitive);
    4. the first column with a string/object dtype.

    The chosen column is renamed to 'review', rows whose review is missing
    are dropped, and the index is reset.

    Args:
        path: CSV location, passed straight to ``pandas.read_csv``.

    Returns:
        A pandas DataFrame with at least a 'review' column.

    Raises:
        ValueError: If no text-like column can be identified.
    """
    df = pd.read_csv(path)
    # Normalize column names so headers like ' review ' still match.
    df.columns = [c.strip() for c in df.columns]

    if 'review' in df.columns:
        # An exact match always wins; picking e.g. 'review_id' first and
        # renaming it would produce two columns both called 'review'.
        text_col = 'review'
    else:
        lowered = [(c, c.lower()) for c in df.columns]
        exact = [c for c, low in lowered if low == 'review']
        fuzzy = [c for c, low in lowered if 'review' in low or 'text' in low]
        # Check the dtype rather than the values so that columns containing
        # NaN still qualify; this covers both object and string dtypes.
        stringy = [
            c for c, dtype in zip(df.columns, df.dtypes)
            if pd.api.types.is_string_dtype(dtype)
        ]
        candidates = exact or fuzzy or stringy
        if not candidates:
            raise ValueError("No text-like column found in the CSV. Ensure a 'review' column exists.")
        text_col = candidates[0]

    # rename to standard name
    if text_col != 'review':
        df = df.rename(columns={text_col: 'review'})

    # drop rows with missing reviews
    df = df.dropna(subset=['review']).reset_index(drop=True)

    return df

def sample_data(df: pd.DataFrame, n: int = 10, *, random_state: "int | None" = None) -> pd.DataFrame:
    """
    Return a small random sample for fast local testing / demo.

    Args:
        df: DataFrame to draw rows from.
        n: Maximum number of rows to return; capped at ``len(df)`` so that
            asking for more rows than exist does not raise.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a
            fixed value to get the same rows on every call; the default
            ``None`` keeps the sample non-deterministic.

    Returns:
        A new DataFrame holding the sampled rows, with the index reset to
        0..k-1.
    """
    size = min(n, len(df))
    picked = df.sample(size, random_state=random_state)
    return picked.reset_index(drop=True)

if __name__ == "__main__":
    # Quick manual check: load the default CSV, then report its size and
    # show the first few rows.
    dataset = load_data()
    row_count = len(dataset)
    print("Loaded dataset with", row_count, "rows")
    preview = dataset.head()
    print(preview)