Spaces:
Running
Running
| import gradio as gr | |
| import pandas as pd | |
| from datasets import Dataset | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer | |
| from sklearn.metrics import accuracy_score, f1_score | |
| import torch | |
| import os | |
# Base checkpoint that fine-tuning starts from.
model_name = "distilbert-base-multilingual-cased"
# Single tokenizer instance, created at import time and reused by both the
# preprocessing step and the prediction function.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_function(examples):
    """Tokenize the ``comment`` column of a batch of examples.

    Args:
        examples: Mapping that holds the texts to encode under ``"comment"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts,
        with truncation and padding enabled.
    """
    comments = examples["comment"]
    return tokenizer(comments, truncation=True, padding=True)
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = torch.tensor(logits).argmax(dim=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
    }
def train_model(file):
    """Fine-tune the base classifier on an uploaded CSV and push it to the Hub.

    The CSV must contain a ``comment`` column (text) and a ``label`` column
    (integer class id; the model is created with two labels).

    Args:
        file: Value delivered by the ``gr.File`` input. Either an object that
            exposes the uploaded file's path as ``.name`` or a plain path
            string; ``None`` when nothing was uploaded.

    Returns:
        A status string for the result textbox: the success message, a
        validation message, or the error text followed by the traceback if
        any step raised.
    """
    import traceback

    # Clicking the button without an upload passes None; answer with a clear
    # prompt instead of an AttributeError traceback.
    if file is None:
        return "Vui lòng tải lên file CSV trước khi huấn luyện."

    try:
        # Accept both a file wrapper (path stored in .name) and a bare path.
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)
        if "comment" not in df.columns or "label" not in df.columns:
            return "File CSV phải có cột 'comment' và 'label'"

        # Blank cells are read as NaN, which is neither valid text for the
        # tokenizer nor a valid class id, so drop those rows up front.
        df = df.dropna(subset=["comment", "label"]).reset_index(drop=True)
        if df.empty:
            return "File CSV không có dòng dữ liệu hợp lệ nào."
        df["comment"] = df["comment"].astype(str)
        # A non-numeric label raises here and is reported by the handler below.
        df["label"] = df["label"].astype(int)

        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(preprocess_function, batched=True)
        # Hold out 20% of the rows for the per-epoch evaluation.
        tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
        train_dataset = tokenized_dataset["train"]
        eval_dataset = tokenized_dataset["test"]

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )
        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` -- confirm against the
        # version installed for this app.
        args = TrainingArguments(
            output_dir="results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="logs",
            logging_steps=10,
            push_to_hub=True,
            hub_model_id="vnanhtuan/fine-tune-danh-gia-cam-xuc",
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
        trainer.train()
        trainer.push_to_hub()
        return "Huấn luyện hoàn tất và model đã được đẩy lên Hugging Face."
    except Exception as e:
        # UI boundary: report the failure in the textbox rather than raising.
        return f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
def predict_sentiment(text):
    """Classify one sentence with the fine-tuned model stored on the Hub.

    Args:
        text: Sentence entered in the UI textbox.

    Returns:
        ``"Tích cực"`` when the highest-scoring class is 1, ``"Tiêu cực"``
        otherwise; a prompt asking for input when ``text`` is empty or only
        whitespace.
    """
    # Nothing to classify: skip the model load and ask for input instead of
    # returning a label for an empty string.
    if not text or not text.strip():
        return "Vui lòng nhập câu cần phân tích."

    # The model is loaded on every call from the same repo id that the
    # training function pushes to.
    # NOTE(review): caching would be faster, but a cached model would have to
    # be refreshed after a new training run -- confirm before changing.
    model = AutoModelForSequenceClassification.from_pretrained(
        "vnanhtuan/fine-tune-danh-gia-cam-xuc"
    )
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Tích cực" if prediction == 1 else "Tiêu cực"
# Two-tab Gradio app: the first tab fine-tunes from an uploaded CSV, the
# second tab sends a sentence to the prediction function.
with gr.Blocks() as demo:
    gr.Markdown("# Fine-tune mô hình phân loại cảm xúc tiếng Việt")

    with gr.Tab("Huấn luyện mô hình"):
        csv_file = gr.File(label="Tải file CSV gồm 2 cột: comment, label")
        train_button = gr.Button("Bắt đầu huấn luyện")
        train_output = gr.Textbox(label="Kết quả")
        # Button click -> train_model(uploaded file) -> result textbox.
        train_button.click(train_model, csv_file, train_output)

    with gr.Tab("Dự đoán cảm xúc"):
        input_text = gr.Textbox(label="Nhập câu cần phân tích")
        output_label = gr.Textbox(label="Kết quả dự đoán")
        predict_button = gr.Button("Dự đoán")
        # Button click -> predict_sentiment(sentence) -> prediction textbox.
        predict_button.click(predict_sentiment, input_text, output_label)

demo.launch()