# vnanhtuan's picture
# Update app.py
# 427d67b verified
import gradio as gr
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch
import os
# Base checkpoint: loaded as the starting point for fine-tuning and used to
# build the tokenizer below.
model_name = "distilbert-base-multilingual-cased"
# Module-level tokenizer shared by preprocessing, the Trainer and prediction.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_function(examples):
    """Tokenize the ``comment`` entries of ``examples``.

    Uses the module-level ``tokenizer`` with truncation and padding enabled
    and returns whatever the tokenizer produces.
    """
    comments = examples["comment"]
    encoded = tokenizer(comments, padding=True, truncation=True)
    return encoded
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1)
    return {
        "accuracy": accuracy_score(gold_labels, predicted),
        "f1": f1_score(gold_labels, predicted, average="weighted"),
    }
def train_model(file):
    """Fine-tune the sentiment classifier on an uploaded CSV and push it to the Hub.

    Args:
        file: The uploaded CSV, either an object exposing its path as
            ``.name`` or a plain path string. The CSV must contain a
            ``comment`` column (text) and a ``label`` column holding 0 or 1.

    Returns:
        A status message for the UI: the success notice, a validation message
        explaining what is wrong with the upload, or the error text followed
        by its traceback when anything else fails.
    """
    # Guard against a missing upload instead of failing on attribute access.
    if file is None:
        return "Vui lòng tải lên file CSV trước khi huấn luyện."
    try:
        # Accept both an upload object carrying ``.name`` and a bare path.
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)
        if "comment" not in df.columns or "label" not in df.columns:
            return "File CSV phải có cột 'comment' và 'label'"

        # Rows missing either field can be neither tokenized nor scored.
        df = df.dropna(subset=["comment", "label"])
        if df.empty:
            return "File CSV không có dòng dữ liệu hợp lệ"

        # The classification head below is created with two outputs, so any
        # label other than 0/1 (including non-numeric text) is rejected here
        # rather than failing somewhere inside training.
        labels = pd.to_numeric(df["label"], errors="coerce")
        if not labels.isin([0, 1]).all():
            return "Cột 'label' chỉ được chứa giá trị 0 hoặc 1"
        df = pd.DataFrame(
            {
                "comment": df["comment"].astype(str),
                "label": labels.astype(int),
            }
        )

        # preserve_index=False keeps the filtered pandas index from becoming
        # an extra dataset column.
        dataset = Dataset.from_pandas(df, preserve_index=False)
        tokenized_dataset = dataset.map(preprocess_function, batched=True)
        # Hold out 20% of the rows for per-epoch evaluation.
        tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
        train_dataset = tokenized_dataset["train"]
        eval_dataset = tokenized_dataset["test"]

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )
        # NOTE(review): recent transformers releases rename
        # ``evaluation_strategy`` to ``eval_strategy`` -- confirm against the
        # installed version before upgrading.
        args = TrainingArguments(
            output_dir="results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="logs",
            logging_steps=10,
            push_to_hub=True,
            hub_model_id="vnanhtuan/fine-tune-danh-gia-cam-xuc",
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
        trainer.train()
        trainer.push_to_hub()
        return "Huấn luyện hoàn tất và model đã được đẩy lên Hugging Face."
    except Exception as e:
        # UI boundary: report the failure in the result box instead of
        # letting the callback raise.
        import traceback

        return f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
def predict_sentiment(text):
    """Classify ``text`` as positive or negative with the fine-tuned Hub model.

    Args:
        text: The sentence to analyse.

    Returns:
        ``"Tích cực"`` when the model predicts class 1, ``"Tiêu cực"`` for any
        other class, or a prompt asking for input when ``text`` is ``None``,
        empty, or whitespace only.
    """
    # Scoring an empty string would still yield a label, which is meaningless.
    if text is None or not text.strip():
        return "Vui lòng nhập câu cần phân tích."

    # The checkpoint is loaded from the Hub on every call.
    # NOTE(review): caching would be faster but could serve a stale model
    # after a retraining run in the same process -- decide deliberately.
    model = AutoModelForSequenceClassification.from_pretrained(
        "vnanhtuan/fine-tune-danh-gia-cam-xuc"
    )
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    label_text = "Tích cực" if prediction == 1 else "Tiêu cực"
    return label_text
# Two-tab Gradio interface: one tab fine-tunes the model from an uploaded CSV,
# the other runs a prediction on a single sentence.
with gr.Blocks() as demo:
    gr.Markdown("# Fine-tune mô hình phân loại cảm xúc tiếng Việt")

    # --- Training tab -------------------------------------------------------
    with gr.Tab("Huấn luyện mô hình"):
        csv_file = gr.File(label="Tải file CSV gồm 2 cột: comment, label")
        train_button = gr.Button("Bắt đầu huấn luyện")
        train_output = gr.Textbox(label="Kết quả")
        # The status string returned by train_model is shown in the textbox.
        train_button.click(
            fn=train_model,
            inputs=csv_file,
            outputs=train_output,
        )

    # --- Prediction tab -----------------------------------------------------
    with gr.Tab("Dự đoán cảm xúc"):
        input_text = gr.Textbox(label="Nhập câu cần phân tích")
        output_label = gr.Textbox(label="Kết quả dự đoán")
        predict_button = gr.Button("Dự đoán")
        predict_button.click(
            fn=predict_sentiment,
            inputs=input_text,
            outputs=output_label,
        )

# Start the web server for the interface.
demo.launch()