# data_loader.py
"""
Simple dataset loader for the workshop.

Expected CSV columns: 'review' (text), optionally 'label' (gold label).
Place your Kaggle CSV in the project folder as imdb.csv
(or change the path below).
"""

from typing import Optional, Tuple

import pandas as pd

DEFAULT_CSV = "imdb.csv"


def _find_text_column(df: pd.DataFrame) -> str:
    """Return the name of the column that should be treated as review text.

    Selection order:
      1. a column named exactly 'review';
      2. the first column whose name contains 'review' or 'text'
         (case-insensitive);
      3. the first column holding string data (object or pandas string dtype).

    Raises:
        ValueError: if none of the rules above matches any column.
    """
    columns = list(df.columns)

    # An exact match wins. Without this, a column such as 'text_id' that
    # appears before 'review' would be renamed to 'review' and the frame
    # would end up with two columns of that name.
    if 'review' in columns:
        return 'review'

    for name in columns:
        lowered = name.lower()
        if 'review' in lowered or 'text' in lowered:
            return name

    # Fallback: first string-typed column. Text may be stored either as
    # plain object dtype or as pandas' dedicated StringDtype.
    for name in columns:
        dtype = df[name].dtype
        if dtype == object or isinstance(dtype, pd.StringDtype):
            return name

    raise ValueError("No text-like column found in the CSV. Ensure a 'review' column exists.")


def load_data(path: str = DEFAULT_CSV) -> pd.DataFrame:
    """Load the dataset from a CSV file and perform basic cleaning.

    Cleaning steps:
      * surrounding whitespace is stripped from column names;
      * the text column (see ``_find_text_column``) is renamed to 'review';
      * rows whose 'review' value is missing are dropped and the index is
        reset to 0..n-1.

    Args:
        path: Location of the CSV file. Defaults to ``DEFAULT_CSV``.

    Returns:
        A pandas DataFrame with at least a 'review' column.

    Raises:
        ValueError: if no text-like column can be identified.
    """
    df = pd.read_csv(path)

    # Normalize column names; str() guards against non-string header labels.
    df.columns = [str(c).strip() for c in df.columns]

    text_col = _find_text_column(df)

    # Rename to the standard name used by the rest of the workshop code.
    if text_col != 'review':
        df = df.rename(columns={text_col: 'review'})

    # Drop rows with missing reviews.
    return df.dropna(subset=['review']).reset_index(drop=True)


def sample_data(
    df: pd.DataFrame,
    n: int = 10,
    random_state: Optional[int] = None,
) -> pd.DataFrame:
    """Return a small random sample for fast local testing / demo.

    Args:
        df: Frame to sample from.
        n: Desired number of rows; capped at ``len(df)`` so that asking for
            more rows than exist returns the whole frame (shuffled).
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the sample reproducible. ``None`` keeps the sample random.

    Returns:
        A new DataFrame with a fresh 0..k-1 index.
    """
    size = min(n, len(df))
    return df.sample(size, random_state=random_state).reset_index(drop=True)


if __name__ == "__main__":
    df = load_data()
    print("Loaded dataset with", len(df), "rows")
    print(df.head())