File size: 1,644 Bytes
65e3bf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# data_loader.py
"""

Simple dataset loader for the workshop.

Expected CSV columns: 'review' (text), optionally 'label' (gold label)

Place your Kaggle CSV in the project folder as imdb.csv (or change the path below).

"""

import pandas as pd
from typing import Tuple

DEFAULT_CSV = "imdb.csv"


def load_data(path: str = DEFAULT_CSV) -> pd.DataFrame:
    """Load the review dataset from a CSV file and apply basic cleaning.

    Column names are stripped of surrounding whitespace, then the text
    column is located using the following order of preference:

    1. a column named exactly ``review``;
    2. a column whose name equals ``review`` ignoring case;
    3. the first column whose name contains ``review`` or ``text``
       (case-insensitive);
    4. the first column holding object/string data.

    The chosen column is renamed to ``review``, rows whose review is
    missing are dropped, and the index is renumbered from 0.

    Args:
        path: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame containing at least a ``review`` column. All other
        columns are kept (with stripped names).

    Raises:
        ValueError: If no column can be identified as the text column.
    """
    df = pd.read_csv(path)

    # Normalize column names so stray whitespace in the header does not
    # defeat the lookups below.
    df.columns = [str(c).strip() for c in df.columns]
    columns = list(df.columns)
    lowered = [c.lower() for c in columns]

    if "review" in columns:
        # An exact match always wins. Picking e.g. 'review_id' first and
        # renaming it would leave two columns called 'review'.
        text_col = "review"
    elif "review" in lowered:
        text_col = columns[lowered.index("review")]
    else:
        # Heuristic: any column whose name mentions 'review' or 'text'.
        candidates = [
            c for c, low in zip(columns, lowered)
            if "review" in low or "text" in low
        ]
        if not candidates:
            # Fallback: take the first string-like column. Iterating over
            # df.dtypes (instead of df[c]) stays correct even if stripped
            # names collide, and the pandas type checks also recognise
            # dedicated string dtypes, not just ``object``.
            candidates = [
                c for c, dtype in zip(columns, df.dtypes)
                if pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
            ]
        if not candidates:
            raise ValueError("No text-like column found in the CSV. Ensure a 'review' column exists.")
        text_col = candidates[0]

    # Rename to the standard name expected by the rest of the workshop code.
    if text_col != "review":
        df = df.rename(columns={text_col: "review"})

    # Drop rows with missing reviews and renumber the remaining rows.
    df = df.dropna(subset=["review"]).reset_index(drop=True)

    return df

def sample_data(df: pd.DataFrame, n: int = 10, *, random_state=None) -> pd.DataFrame:
    """Return a small random sample for fast local testing / demo.

    Args:
        df: The DataFrame to sample rows from.
        n: Maximum number of rows to return. If ``df`` has fewer than
            ``n`` rows, every row is returned (in shuffled order).
        random_state: Optional seed (or other value accepted by
            ``DataFrame.sample``) that makes the sample reproducible.
            The default ``None`` draws a different sample on each call.

    Returns:
        A new DataFrame with at most ``n`` rows and a fresh 0-based index.
    """
    # Cap the request at the frame's length: sampling without replacement
    # cannot return more rows than exist.
    sample_size = min(n, len(df))
    sampled = df.sample(sample_size, random_state=random_state)
    return sampled.reset_index(drop=True)

if __name__ == "__main__":
    # Smoke test when run as a script: load the default CSV, then report
    # its size and show the first few rows.
    dataset = load_data()
    row_count = len(dataset)
    print("Loaded dataset with", row_count, "rows")
    preview = dataset.head()
    print(preview)