Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| import pandas as pd | |
def load_and_prepare_data():
    """Build a stratified train/test split of single-label GoEmotions text.

    Loads the ``go_emotions`` dataset with ``load_dataset``, keeps only the
    rows of its ``train`` split that carry exactly one label, lower-cases the
    text, and encodes the emotion names with a ``LabelEncoder``.

    Returns:
        A 5-tuple ``(train_texts, test_texts, train_labels, test_labels,
        label_encoder)``. The texts are lists of lower-cased strings, the
        labels are lists of the integers produced by the encoder, and
        ``label_encoder`` is the encoder fitted on the emotion names.
    """
    dataset = load_dataset("go_emotions")
    train_split = dataset["train"]
    # Emotion class names, positioned by their integer label id.
    emotion_names = train_split.features["labels"].feature.names

    frame = pd.DataFrame(train_split)

    # Simplification: keep only the examples annotated with exactly one label.
    has_single_label = frame["labels"].apply(lambda labels: len(labels) == 1)
    frame = frame[has_single_label].reset_index(drop=True)

    # Unwrap the one-element label list, normalise the text, and attach the
    # human-readable emotion name for each row.
    frame["label"] = frame["labels"].apply(lambda labels: labels[0])
    frame["clean_text"] = frame["text"].str.lower()
    frame["emotion"] = frame["label"].apply(lambda idx: emotion_names[idx])

    # Encode the emotion names as integers for training.
    label_encoder = LabelEncoder()
    frame["label_encoded"] = label_encoder.fit_transform(frame["emotion"])

    texts = frame["clean_text"].tolist()
    targets = frame["label_encoded"].tolist()

    # 80/20 split, stratified on the encoded label, with a fixed seed.
    splits = train_test_split(
        texts,
        targets,
        test_size=0.2,
        stratify=frame["label_encoded"],
        random_state=42,
    )
    train_texts, test_texts, train_labels, test_labels = splits
    return train_texts, test_texts, train_labels, test_labels, label_encoder