from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults

from evaluate.evaluation_suite import SubTask
from evaluate.visualization import radar_plot

| _HEADER = "GLUE/AdvGlue Evaluation Results" | |
| _DESCRIPTION = """ | |
| The suite compares the GLUE results with Adversarial GLUE (AdvGLUE), a | |
| multi-task benchmark that tests the vulnerability of modern large-scale | |
| language models againstvarious adversarial attacks.""" | |

class Suite(ModelCardSuiteResults):
    def __init__(self, name):
        super().__init__(name)
        self.result_keys = ["accuracy", "f1"]
        # Lowercase the "text" field of each example before evaluation.
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            # Each clean GLUE task is immediately followed by its AdvGLUE
            # counterpart; process_results below relies on this ordering.
            # SST-2: single-sentence sentiment classification.
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "config_name": "sst2",
                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
                },
            ),
            SubTask(
                task_type="text-classification",
                data="adv_glue",
                subset="adv_sst2",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence",
                    "label_column": "label",
                    "config_name": "sst2",
                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
                },
            ),
            # QQP: question-pair duplicate detection.
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="qqp",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "question1",
                    "second_input_column": "question2",
                    "label_column": "label",
                    "config_name": "qqp",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            SubTask(
                task_type="text-classification",
                data="adv_glue",
                subset="adv_qqp",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "question1",
                    "second_input_column": "question2",
                    "label_column": "label",
                    "config_name": "qqp",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            # QNLI: question/sentence entailment.
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="qnli",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "question",
                    "second_input_column": "sentence",
                    "label_column": "label",
                    "config_name": "qnli",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            SubTask(
                task_type="text-classification",
                data="adv_glue",
                subset="adv_qnli",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "question",
                    "second_input_column": "sentence",
                    "label_column": "label",
                    "config_name": "qnli",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            # RTE: sentence-pair textual entailment.
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="rte",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence1",
                    "second_input_column": "sentence2",
                    "label_column": "label",
                    "config_name": "rte",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            SubTask(
                task_type="text-classification",
                data="adv_glue",
                subset="adv_rte",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "sentence1",
                    "second_input_column": "sentence2",
                    "label_column": "label",
                    "config_name": "rte",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
                },
            ),
            # MNLI: three-way premise/hypothesis classification; the clean
            # task is evaluated on the mismatched validation split.
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="mnli",
                split="validation_mismatched[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "premise",
                    "second_input_column": "hypothesis",
                    "label_column": "label",
                    "config_name": "mnli",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
                },
            ),
            SubTask(
                task_type="text-classification",
                data="adv_glue",
                subset="adv_mnli",
                split="validation[:5]",
                args_for_task={
                    "metric": "glue",
                    "input_column": "premise",
                    "second_input_column": "hypothesis",
                    "label_column": "label",
                    "config_name": "mnli",
                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
                },
            ),
        ]

    def process_results(self, results):
        # Results alternate clean/adversarial (see the suite ordering above),
        # so even indices are the GLUE runs and odd indices their AdvGLUE
        # twins. Stripping the "adv_" prefix gives both dicts identical keys.
        # Return the data itself; plot_results builds the figure from it.
        radar_data = [
            {"accuracy " + result["task_name"].split("/")[-1]: result["accuracy"] for result in results[::2]},
            {
                "accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]: result["accuracy"]
                for result in results[1::2]
            },
        ]
        return radar_data
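
    # Illustrative shape of the value returned by process_results (the
    # numbers are made up, not real scores): two dicts with matching keys,
    # which is the paired format radar_plot expects.
    #
    #     [
    #         {"accuracy sst2": 0.90, "accuracy qqp": 0.85, ...},  # GLUE
    #         {"accuracy sst2": 0.55, "accuracy qqp": 0.40, ...},  # AdvGLUE
    #     ]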

    def plot_results(self, results, model_or_pipeline):
        radar_data = self.process_results(results)
        graphic = radar_plot(radar_data, ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline])
        return graphic
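

# Minimal usage sketch, not part of the suite definition. It assumes that
# ModelCardSuiteResults inherits `run` from evaluate's EvaluationSuite and
# that radar_plot returns a matplotlib Figure; the suite name and model
# checkpoint below are placeholders, not values from the original file.
if __name__ == "__main__":
    suite = Suite("glue-advglue-comparison")
    results = suite.run("distilbert-base-uncased-finetuned-sst-2-english")
    figure = suite.plot_results(results, "distilbert-sst2")
    figure.savefig("glue_vs_advglue_radar.png")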