| | """ |
| | Dataset creation tools. |
| | |
| | Keep top-level imports clean of non-trivial imports for specific tools, |
| | because this file is imported for various purposes |
| | """ |
| |
|
| | import ast |
| | import concurrent.futures |
| | import contextlib |
| | import hashlib |
| | import json |
| | import os |
| | import shutil |
| | import signal |
| | import sys |
| | import traceback |
| | from concurrent.futures import ProcessPoolExecutor |
| |
|
| | import psutil |
| | import pytest |
| | import pandas as pd |
| | import numpy as np |
| | from tqdm import tqdm |
| |
|
| | from utils import flatten_list, remove |
| |
|
| |
|
def parse_rst_file(filepath):
    """
    Extract question/answer pairs from a reStructuredText FAQ file.

    The file is parsed with docutils and the resulting doctree is walked in
    document order.  Once the first ``section`` node has been seen, every
    ``Text`` node whose content ends with ``?`` starts a new question and every
    other ``Text`` node is appended to the answer of the current question.

    :param filepath: path of the .rst file to parse
    :return: dict mapping question text to its concatenated answer text
    """
    with open(filepath, 'r') as f:
        input_data = f.read()
    settings_overrides = {'initial_header_level': 2}
    from docutils import core
    document = core.publish_doctree(
        source=input_data,
        source_path=filepath,
        settings_overrides=settings_overrides,
    )
    qa_pairs = []
    in_section = False
    current_question = ""
    current_answer = ""
    # Node.traverse() is deprecated in recent docutils; prefer findall() when it exists
    walk = getattr(document, 'findall', None) or document.traverse
    for node in walk():
        kind = node.__class__.__name__
        if kind == 'section':
            in_section = True
        elif in_section and kind == 'Text':
            text = node.astext()
            # endswith() is safe on an empty text node, text[-1] would raise IndexError
            if text.endswith("?"):
                if current_question:
                    qa_pairs.append((current_question, current_answer))
                current_question = text
                current_answer = ""
            else:
                current_answer += text
    # flush the last pair (only when some answer text was collected)
    if current_answer:
        qa_pairs.append((current_question, current_answer))
    return dict(qa_pairs)
| |
|
| |
|
def test_scrape_dai_docs():
    """
    Turn ~/h2oai/docs/faq.rst into instruction/output records and write them
    to dai_faq.json in the current directory.
    """
    faq_path = os.path.join(os.path.expanduser('~'), 'h2oai/docs/faq.rst')
    qa_pairs = parse_rst_file(faq_path)
    prompt_type = 'human_bot'
    from prompter import prompt_types
    assert prompt_type in prompt_types
    records = []
    for question, answer in qa_pairs.items():
        records.append({"instruction": question, "output": answer, 'prompt_type': prompt_type})
    with open("dai_faq.json", "wt") as out:
        out.write(json.dumps(records, indent=2))
| |
|
| |
|
def test_scrape_dai_docs_all():
    """
    pytest create_data.py::test_scrape_dai_docs_all

    Split the .rst files under ~/h2oai/docs into train (95%) and validation (5%)
    sets, strip some rst markup runs, cut each file into sentence chunks of
    roughly 100, 200 and 500 characters and write the chunks as 'plain' prompt
    records to dai_docs.train.json and dai_docs.valid.json.
    """
    import glob
    import nltk
    nltk.download('punkt')
    np.random.seed(1234)
    home = os.path.expanduser('~')
    # recursive=True is required for "**" to descend into sub-directories;
    # sorting makes the seeded shuffle independent of filesystem listing order
    files = sorted(glob.glob(os.path.join(home, "h2oai/docs/**/*rst"), recursive=True))
    np.random.shuffle(files)
    val_count = int(0.05 * len(files))
    things = [
        ("dai_docs.train.json", files[val_count:]),
        ("dai_docs.valid.json", files[:val_count]),
    ]

    # read and clean every file once instead of once per chunk length
    markup_runs = ("~~", "==", "''", "--", "**")
    blobs = {}
    for path in files:
        with open(path) as fin:
            blob = fin.read()
        for run in markup_runs:
            blob = blob.replace(run, "")
        blobs[path] = blob

    dd = {output_file: [] for output_file, _ in things}
    for chunk_len in [100, 200, 500]:
        for output_file, ff in things:
            for path in ff:
                dd[output_file].extend(get_sentences(blobs[path], length=chunk_len))

    for output_file, _ in things:
        save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in dd[output_file]]
        with open(output_file, "wt") as f:
            f.write(json.dumps(save_thing, indent=2))
| |
|
| |
|
def get_sentences(blob, length):
    """
    Break input text into sentences and pack them into chunks of about `length` characters.

    Sentences are joined with a single space for as long as the combined sentence
    lengths stay within `length`.  A sentence that does not fit starts the next
    chunk, so no text is dropped; a single sentence longer than `length` becomes
    a chunk of its own.

    :param blob: text to split
    :param length: target maximum number of characters per chunk
    :return: non-empty list of chunks; [""] when the text contains no sentences
    """
    import nltk
    nltk.download('punkt')
    from nltk.tokenize import sent_tokenize
    chunks = []
    current = ""
    for sentence in sent_tokenize(blob):
        if not current:
            current = sentence
        elif len(current) + len(sentence) <= length:
            current += " " + sentence
        else:
            # chunk is full: emit it and carry the sentence over instead of discarding it
            chunks.append(current)
            current = sentence
    # flush the trailing chunk, which used to be lost whenever earlier chunks existed
    if current:
        chunks.append(current)
    return chunks or [""]
| |
|
| |
|
def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False):
    """
    Collect the Driverless AI doc files into one flat working directory.

    Only supported if have access to source code or HF token for HF spaces and from_hf=True

    :param path: glob pattern of the doc files; when None, ~/h2oai/docs or
        ~/h2oai.superclean/docs is used.  Ignored when from_hf is True.
    :param dst: directory that is wiped, re-created and filled with the files
    :param from_hf: download and unpack dai_docs.zip from the h2oai/dai_docs
        Hugging Face dataset instead of using a local checkout
    :return: dst
    """
    import glob

    home = os.path.expanduser('~')

    if from_hf:
        from huggingface_hub import hf_hub_download
        # falls back to True when the environment variable is not set
        token = os.getenv('HUGGING_FACE_HUB_TOKEN', True)
        path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.zip', token=token, repo_type='dataset')
        path = 'h2oai'
        import zipfile
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(path)
        path = os.path.join(path, 'docs/**/*')

    if path is None:
        if os.path.isdir(os.path.join(home, 'h2oai')):
            path = os.path.join(home, "h2oai/docs/**/*")
        else:
            fallback = os.path.join(home, 'h2oai.superclean')
            # report the directory that was actually checked (path is still None here)
            assert os.path.isdir(fallback), '%s does not exist' % fallback
            path = os.path.join(home, "h2oai.superclean/docs/**/*")
    files = list(glob.glob(path, recursive=True))

    # start from an empty destination directory
    remove(dst)
    os.makedirs(dst)

    # flatten: every regular file found is copied directly into dst
    for fil in files:
        if os.path.isfile(fil):
            shutil.copy(fil, dst)

    # additionally expose the *.frag files under dst/scorers
    scorers_dir = os.path.join(dst, 'scorers')
    makedirs(scorers_dir)
    for fil in glob.glob(os.path.join(dst, '*.frag')):
        shutil.copy(fil, scorers_dir)

    return dst
| |
|
| |
|
def rst_to_outputs(files, min_len=30, max_len=2048 // 2 - 30):
    """
    Convert rst files to plain-text chunks with pandoc.

    Each file is first normalised to rst, split on lines starting with a
    double backtick and every piece converted to plain text.  If that fails, the
    whole file is converted to plain text and cut into sentence chunks.  Pieces
    of max_len characters or more are then re-cut with get_sentences(), and
    pieces whose stripped length is not above min_len are dropped.

    The working directory is changed while files are processed and is restored
    to its initial value before returning, also on error.

    :param files: paths of rst files (absolute, or relative to the current directory)
    :param min_len: stripped pieces must be longer than this to be kept
    :param max_len: target maximum piece length in characters
    :return: list of [text, file basename] pairs
    """
    import pypandoc
    basedir = os.path.abspath(os.getcwd())

    outputs = []
    try:
        for fil in files:
            # relative entries of `files` are relative to basedir, so go back there first
            os.chdir(basedir)
            # work inside the file's own directory; dirname is '' for a bare filename
            os.chdir(os.path.dirname(fil) or os.curdir)
            fil = os.path.basename(fil)
            print("Processing %s" % fil, flush=True)
            out_format = 'plain'
            # the format operator used to sit inside the string literal, producing an invalid option
            extra_args = ['--wrap=preserve', '--resource-path=%s' % os.getcwd()]

            plain_list = []
            try:
                input_rst = pypandoc.convert_file(fil, 'rst')
                input_list = input_rst.split('\n``')
                for input_subrst in input_list:
                    input_plain = pypandoc.convert_text(input_subrst, format='rst', to='plain')
                    plain_list.append([input_plain, fil])
            except Exception as e:
                # best effort: fall through to whole-file conversion below
                print("file exception: %s %s" % (fil, str(e)), flush=True)

            if not plain_list:
                output = pypandoc.convert_file(fil, out_format, extra_args=extra_args, format='rst')
                for output1 in get_sentences(output, length=max_len):
                    plain_list.append([output1.replace('\n\n', '\n'), fil])
            outputs.extend(plain_list)
    finally:
        os.chdir(basedir)

    # re-cut pieces that are still too long
    new_outputs = []
    num_truncated = 0
    num_orig = len(outputs)
    for output, fil in outputs:
        if len(output) < max_len:
            new_outputs.append([output, fil])
            continue
        for output1 in get_sentences(output, length=max_len):
            new_outputs.append([output1.replace('\n\n', '\n'), fil])
        num_truncated += 1
    print('num_orig: %s num_truncated: %s' % (num_orig, num_truncated), flush=True)

    new_outputs = [[k.strip(), fil] for k, fil in new_outputs if len(k.strip()) > min_len]

    return new_outputs
| |
|
| |
|
def test_scrape_dai_docs_all_pandoc():
    """
    pytest -s -v create_data.py::test_scrape_dai_docs_all_pandoc

    Stage the docs with setup_dai_docs(), convert the staged rst files to
    plain-text chunks with rst_to_outputs() and write them as 'plain' prompt
    records to dai_docs.train_cleaned.json.
    :return:
    """
    dst = setup_dai_docs()

    import glob
    files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True))

    basedir = os.path.abspath(os.getcwd())
    try:
        new_outputs = rst_to_outputs(files)
    finally:
        # rst_to_outputs changes directory while converting
        os.chdir(basedir)

    remove(dst)
    # rst_to_outputs returns [text, file] pairs; only the text is stored
    save_thing = [{"output": text.strip(), 'prompt_type': 'plain'} for text, _ in new_outputs]
    output_file = "dai_docs.train_cleaned.json"
    with open(output_file, "wt") as f:
        f.write(json.dumps(save_thing, indent=2))
| |
|
| |
|
def test_config_to_json():
    """
    Write prompt records describing every documented config.toml setting to config.json.

    Needs to run from Driverless AI source directory.
    E.g. (base) jon@gpu:~/h2oai$ pytest -s -v /data/jon/h2ogpt/create_data.py::test_config_to_json ; cp config.json /data/jon/h2ogpt/

    Any exception (including a missing h2oaicore package) is printed, not raised.
    :return:
    """
    try:
        import json
        from h2oaicore.systemutils import config
        toml_list = []
        for k, v in config.get_meta_dict().items():
            title = (v.title + ": ") if v.title else ''
            comment = v.comment or ''
            if not title and not comment:
                # nothing to say about this setting
                continue
            spaced = k.replace('_', ' ')
            brief = comment or title
            short_out = f"{spaced} config.toml: {brief}".replace("\n", "")
            long_out = f"{spaced} config.toml: {title}{comment}".replace("\n", "")
            explain = f'Explain the following expert setting for Driverless AI'

            how_to = None
            if title and comment:
                how_to = {
                    'prompt_type': 'plain',
                    'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {spaced} config.toml\n<human>:".replace(
                        "\n", ""),
                }

            candidates = [
                {
                    'prompt_type': 'plain',
                    'instruction': f"<human>: What does {k} do?\n<bot>: {spaced} config.toml: {brief}\n<human>:".replace(
                        "\n", ""),
                },
                {
                    'prompt_type': 'plain',
                    'instruction': f"<human>: Explain {k}.\n<bot>: {spaced} config.toml: {brief}\n<human>:".replace(
                        "\n", ""),
                },
                how_to,
                {
                    'prompt_type': 'human_bot',
                    'instruction': explain,
                    'input': f"{k}",
                    'output': short_out,
                },
                {
                    'prompt_type': 'human_bot',
                    'instruction': explain,
                    'input': f"{k}",
                    'output': long_out,
                },
                {
                    'prompt_type': 'human_bot',
                    'instruction': explain,
                    'input': f"{spaced}",
                    'output': long_out,
                },
                {
                    'prompt_type': 'human_bot',
                    'instruction': explain,
                    'input': f"{title}",
                    'output': long_out,
                },
                {
                    'prompt_type': 'human_bot',
                    'instruction': f'Provide a short explanation of the expert setting {k}',
                    'output': short_out,
                },
                {
                    'prompt_type': 'human_bot',
                    'instruction': f'Provide a detailed explanation of the expert setting {k}',
                    'output': long_out,
                },
            ]
            # skip the placeholder of the conditional "How can I do this" record
            toml_list.extend(entry for entry in candidates if entry)
        with open("config.json", "wt") as f:
            f.write(json.dumps(toml_list, indent=2))
    except Exception as e:
        print("Exception: %s" % str(e), flush=True)
| |
|
| |
|
def copy_tree(src, dst, follow_symlink=False):
    """
    Recursively copy the directory tree under src into dst.

    Files are copied with atomic_copy(), so files already present at the
    destination are left untouched.  Source files that vanish while the tree is
    being walked are skipped.

    :param src: root of the tree to copy
    :param dst: destination root, created if missing
    :param follow_symlink: passed to os.walk() as followlinks
    :return:
    """
    makedirs(dst, exist_ok=True)
    for (path, dirs, files) in os.walk(src, followlinks=follow_symlink):
        # Map by relative path: str.replace(src, dst) would also rewrite later
        # occurrences of src inside path and loses the separator when src ends with one
        rel_path = os.path.relpath(path, src)
        new_path = dst if rel_path == os.curdir else os.path.join(dst, rel_path)
        makedirs(new_path, exist_ok=True)
        for file in files:
            filename = os.path.join(path, file)
            new_filename = os.path.join(new_path, file)
            try:
                atomic_copy(filename, new_filename)
            except FileNotFoundError:
                # source disappeared between listing and copying
                pass
| |
|
| |
|
def atomic_move(src, dst):
    """
    Move src to dst and make sure src is gone afterwards.

    shutil.Error and FileExistsError raised by the move are ignored; src is
    passed to remove() in every case.

    :param src: path to move
    :param dst: target path
    :return:
    """
    with contextlib.suppress(shutil.Error, FileExistsError):
        shutil.move(src, dst)
    remove(src)
| |
|
| |
|
def atomic_copy(src=None, dst=None, with_permissions=True):
    """
    Copy src to dst through a uniquely named temporary file next to dst.

    Nothing is done when dst is already a file.

    :param src: file to copy
    :param dst: destination file path
    :param with_permissions: use shutil.copy when True, shutil.copyfile otherwise
    :return:
    """
    if os.path.isfile(dst):
        return
    import uuid
    dst_tmp = dst + str(uuid.uuid4())
    makedirs(os.path.dirname(dst), exist_ok=True)
    copier = shutil.copy if with_permissions else shutil.copyfile
    copier(src, dst_tmp)
    atomic_move(dst_tmp, dst)
    # no-op after a successful move; cleans up the temporary file otherwise
    remove(dst_tmp)
| |
|
| |
|
def makedirs(path, exist_ok=True):
    """
    Avoid some inefficiency in os.makedirs()

    :param path: directory to create (including missing parents)
    :param exist_ok: when False, an already existing directory is an error
    :return: path, whether the directory already existed or was just created
    :raises AssertionError: if the directory exists and exist_ok is False
    """
    if os.path.isdir(path):
        if not exist_ok:
            # raised explicitly so the check is not stripped under `python -O`
            raise AssertionError("Path already exists")
        return path
    os.makedirs(path, exist_ok=exist_ok)
    return path
| |
|
| |
|
| | |
| | |
def test_prep_instruct_vicuna():
    """
    Flatten the ShareGPT conversations into '<human>: ' / '<bot>: ' prefixed text.

    Downloads ShareGPT_unfiltered_cleaned_split.json with wget when it is not
    present, and writes one {"input": conversation} record per non-empty
    conversation to <filename>.generate_human_bot.train_plain.json.
    Turns whose speaker is neither 'human' nor 'gpt' are skipped.
    """
    from datasets import load_dataset
    filename = 'ShareGPT_unfiltered_cleaned_split.json'
    if not os.path.exists(filename):
        os.system(
            'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
    data = load_dataset("json", data_files={"train": filename})["train"]
    speaker_prefix = {'human': '<human>: ', 'gpt': '<bot>: '}
    training_rows = []
    for row in data:
        conversations = row['conversations']
        assert isinstance(conversations, list), conversations
        convo = ""
        for conv in conversations:
            prefix = speaker_prefix.get(conv['from'])
            if prefix is None:
                # unknown speaker: previously a NameError, or the previous turn's prefix was reused
                continue
            convo += prefix + conv['value'] + "\n"
        if convo:
            training_rows.append(dict(input=convo))
    with open(filename + ".generate_human_bot.train_plain.json", "wt") as f:
        f.write(json.dumps(training_rows, indent=2))
| |
|
| |
|
# Suffix appended to a dataset file name to form the name of the converted
# {"input": ...} JSON file written by the conversion tests in this module.
POSTFIX = ".generate_human_bot.train_plain.json"

# OIG files that are converted by test_get_small_sample_oig_data and then
# merged by test_merge_shuffle_small_sample_oig_data.
OIG_DATASETS = [
    "unified_chip2.jsonl",
    "unified_grade_school_math_instructions.jsonl",
    "unified_poetry_2_song.jsonl",
    "unified_plot_screenplay_books_dialog.jsonl",
]

# Parametrization of test_download_useful_data_as_parquet; each name is
# downloaded from the laion/OIG dataset repository when not present locally.
ALL_OIG_DATASETS = ['unified_abstract_infill.jsonl',
                    'unified_basic.jsonl',
                    'unified_canadian_parliament.jsonl',
                    'unified_chip2.jsonl',
                    'unified_conv_finqa.jsonl',
                    'unified_cuad.jsonl',
                    'unified_essays.jsonl',
                    'unified_flan.jsonl.gz',
                    'unified_grade_school_math_instructions.jsonl',
                    'unified_hc3_human.jsonl',
                    'unified_image_prompts_instructions.jsonl',
                    'unified_joke_explanations.jsonl',
                    'unified_mathqa_flanv2_kojma_cot.jsonl',
                    'unified_merged_code_xp3.jsonl',
                    'unified_multi_news.jsonl',
                    'unified_multi_sum.jsonl',
                    'unified_ni.jsonl.gz',
                    'unified_nq.jsonl',
                    'unified_openai_summarize_tldr.jsonl',
                    'unified_oscar_en_sample_dialog.jsonl',
                    'unified_p3.jsonl.gz',
                    'unified_plot_screenplay_books_dialog.jsonl',
                    'unified_poetry_2_song.jsonl',
                    'unified_poetry_instructions.jsonl',
                    'unified_rallio_safety_and_prosocial.jsonl',
                    'unified_rallio_soda_upgraded_2048.jsonl',
                    'unified_soda_dialog.jsonl',
                    'unified_sqlv1.jsonl',
                    'unified_sqlv2.jsonl',
                    'unified_squad_v2.jsonl',
                    'unified_squad_v2_more_neg.jsonl',
                    'unified_ul2_plus_oscar_en_sample_dialog.jsonl',
                    'unified_unifiedskg_instructions.jsonl',
                    'unified_unnatural_instructions.jsonl',
                    'unified_xp3_sample.jsonl']

# Parquet files that are actually used: test_download_useful_data_as_parquet
# skips every file whose '<name>.parquet' is not listed here, and the
# cleaning/assembly tests read exactly these files.
useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet',
                    'unified_chip2.jsonl.parquet',
                    'unified_cuad.jsonl.parquet',
                    'unified_essays.jsonl.parquet',
                    'unified_flan.jsonl.gz.parquet',
                    'unified_grade_school_math_instructions.jsonl.parquet',
                    'unified_hc3_human.jsonl.parquet',
                    'unified_mathqa_flanv2_kojma_cot.jsonl.parquet',
                    'unified_merged_code_xp3.jsonl.parquet',
                    'unified_multi_news.jsonl.parquet',

                    'unified_ni.jsonl.gz.parquet',
                    'unified_openai_summarize_tldr.jsonl.parquet',

                    'unified_plot_screenplay_books_dialog.jsonl.parquet',
                    'unified_soda_dialog.jsonl.parquet',
                    'unified_unnatural_instructions.jsonl.parquet',
                    ]
| |
|
| |
|
@pytest.mark.parametrize("filename", OIG_DATASETS)
def test_get_small_sample_oig_data(filename):
    """
    Convert one OIG jsonl file into a JSON list of {"input": text} records.

    The file is fetched with wget when missing; the result is written to
    filename + POSTFIX.

    :param filename: name of the OIG jsonl file
    """
    if not os.path.exists(filename):
        os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
    import json
    with open(filename, "r") as src:
        rows = [dict(input=json.loads(line)["text"]) for line in src]
    with open(filename + POSTFIX, "w") as out:
        out.write(json.dumps(rows, indent=2))
| |
|
| |
|
@pytest.mark.parametrize("filename", ALL_OIG_DATASETS)
def test_download_useful_data_as_parquet(filename):
    """
    Download one OIG file and store it as '<filename>.parquet'.

    Files whose parquet name is not in useful_oig_files are skipped; existing
    downloads and existing parquet files are reused.

    :param filename: name of the OIG file
    """
    dest_file = filename + '.parquet'
    if dest_file not in useful_oig_files:
        pytest.skip('file declared not useful')
    if not os.path.exists(filename):
        os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
    if os.path.exists(dest_file):
        return
    frame = pd.read_json(path_or_buf=filename, lines=True)
    frame.to_parquet(dest_file, index=False)
| |
|
| |
|
def test_merge_shuffle_small_sample_oig_data():
    """
    Merge the converted OIG_DATASETS files, shuffle the rows with a fixed seed
    and write them to merged_shuffled_OIG_<hash>.json, where <hash> is the first
    10 hex digits of the sha256 of str(OIG_DATASETS).
    """
    np.random.seed(1234)
    rows = []
    for filename in OIG_DATASETS:
        with open(filename + POSTFIX, "r") as src:
            rows += json.load(src)
    np.random.shuffle(rows)
    digest = hashlib.sha256(str(OIG_DATASETS).encode()).hexdigest()[:10]
    with open("merged_shuffled_OIG_%s.json" % digest, "w") as out:
        out.write(json.dumps(rows, indent=2))
| |
|
| |
|
def test_join_jsons():
    """
    Concatenate the generated JSON lists into merged.json.

    config.json is included once, dai_docs.train_cleaned.json twice and
    dai_faq.json three times.
    """
    files = ['config.json'] * 1 + \
            ['dai_docs.train_cleaned.json'] * 2 + \
            ['dai_faq.json'] * 3
    print(files)
    lst = []
    for fil in files:
        # context managers close every handle, which the bare open() calls did not
        with open(fil, 'rt') as f:
            lst.extend(json.load(f))
    print(len(lst))
    with open("merged.json", "wt") as f:
        json.dump(lst, f, indent=2)
| |
|
| |
|
@pytest.mark.parametrize("filename", ['Anthropic/hh-rlhf'])
def test_make_rlhf_good_data(filename):
    """
    Convert the 'chosen' column of the dataset's train split to
    '<human>: ' / '<bot>: ' prefixed text and write {"input": text} records to
    '<filename with / replaced by _>' + POSTFIX.

    :param filename: Hugging Face dataset id
    """
    from datasets import load_dataset
    chosen = load_dataset(filename)["train"]["chosen"]
    new_rows = []
    for text in chosen:
        # drop a leading blank line pair
        if text.startswith("\n\n"):
            text = text[2:]
        text = text.replace("Human: ", "<human>: ").replace("Assistant: ", "<bot>: ")
        new_rows.append(dict(input=text))
    out_name = filename.replace("/", "_") + POSTFIX
    with open(out_name, "w") as out:
        out.write(json.dumps(new_rows, indent=2))
| |
|
| |
|
def test_show_prompts():
    """
    Print the prompt generated for every record of config.json,
    dai_docs.train_cleaned.json and dai_faq.json.
    """
    files = ['config.json'] * 1 + \
            ['dai_docs.train_cleaned.json'] * 1 + \
            ['dai_faq.json'] * 1
    file_points = []
    for fil in files:
        # close each file after loading instead of leaking the handle
        with open(fil, 'rt') as f:
            file_points.append(json.load(f))
    from prompter import generate_prompt
    for data_points in file_points:
        for data_point in data_points:
            print(generate_prompt(data_point, 'plain', '', False, False)[0])
| |
|
| |
|
def test_get_open_datasets():
    """
    Select openly licensed, English, text-task datasets on the Hugging Face Hub
    and dump each of them to parquet via do_one().

    Datasets are collected by license tag (plus everything authored by
    'openai'), filtered by task tags, availability flags, language tags, size
    tags and an explicit exclusion list, sorted by download count and then
    processed one at a time in a single-worker process pool with a 3 minute
    timeout per dataset.
    """
    open_tags = ['license:Apache License 2.0',
                 'license:mit',
                 'license:apache',
                 'license:apache2',
                 'license:apache-2.0',
                 'license:bsd',
                 'license:bsd-2-clause',
                 'license:bsd-3-clause',
                 'license:bsd-3-clause-clear',
                 'license:lgpl-2.1',
                 'license:lgpl-3.0',
                 'license:lgpl-lr',
                 'license:lgpl',
                 'license:openrail++',
                 'license:openrail',
                 'license:bigscience-bloom-rail-1.0',

                 'license:other',
                 'license:unknown',

                 'license:odc-by',
                 'license:cc-by-4.0',
                 'license:cc-by-3.0',
                 'license:cc-by-2.0',
                 'license:cc-by-2.5',

                 'license:odbl',
                 'license:pddl',
                 'license:ms-pl',
                 'license:zlib',
                 ]

    from huggingface_hub import list_datasets
    datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags])
    datasets += [x for x in list_datasets(author='openai')]
    all_license_tags = set(flatten_list([[y for y in x.tags if 'license' in y] for x in datasets]))
    print(len(all_license_tags))
    # keep datasets carrying one of the open license tags, or no license tag at all
    open_datasets = [x for x in datasets if any([y in x.tags for y in open_tags]) or 'license:' not in str(x.tags)]
    print('open_datasets', len(open_datasets))
    all_task_tags = set(flatten_list([[y for y in x.tags if 'task' in y] for x in open_datasets]))
    print('all_task_tags', len(all_task_tags))
    excluded_tags = ['image', 'hate', 'tabular', 'table-', 'classification', 'retrieval',
                     'translation', 'identification', 'object', 'mask', 'to-text',
                     'face-detection', 'audio', 'voice', 'reinforcement', 'depth-est',
                     'forecasting', 'parsing', 'visual', 'speech', 'multiple-choice',
                     'slot-filling', 'irds/argsme', '-scoring', 'other', 'graph-ml',
                     'feature-extraction', 'keyword-spotting',
                     'coreference-resolution', 'segmentation',
                     'word-sense-disambiguation',
                     'lemmatization']
    task_tags = [x.replace('task_categories:', '').replace('task_ids:', '')
                 for x in all_task_tags if not any([y in x for y in
                                                    excluded_tags])]
    print('task_tags', len(task_tags))

    # keep a dataset when (it has a wanted task and no excluded task) or it has no task tags,
    # and it is not disabled, gated or private
    open_tasked_datasets = []
    for ds in open_datasets:
        task_str = str([t for t in ds.tags if 'task' in t])
        wanted = any(y in task_str for y in task_tags) and not any(y in task_str for y in excluded_tags)
        untagged = 'task_categories' not in str(ds.tags) and 'task_ids' not in str(ds.tags)
        if not (wanted or untagged):
            continue
        if ds.disabled or ds.gated or ds.private:
            continue
        open_tasked_datasets.append(ds)
    print('open_tasked_datasets', len(open_tasked_datasets))
    # not used below; handy to inspect when debugging the selection
    sizes = list(set(flatten_list([[(y, x.id) for y in x.tags if 'size' in y] for x in open_tasked_datasets])))
    languages = list(set(flatten_list([[(y, x.id) for y in x.tags if 'language:' in y] for x in open_tasked_datasets])))
    open_english_tasked_datasets = [x for x in open_tasked_datasets if
                                    'language:' not in str(x.tags) or
                                    'language:en' in str(x.tags)]
    # '10K<n<100K' was misspelled '1K0<n<100K', which can never match a size tag.
    # NOTE(review): Hub size tags appear to be spelled 'size_categories:...'; if so,
    # the 'size_category' clause is always true and no size filtering happens here.
    # Confirm before tightening, since the id asserts below depend on the outcome.
    small_size_tags = ('n<1K', '1K<n<10K', '10K<n<100K', '100K<n<1M')
    small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if
                                          any(size in str(x.tags) for size in small_size_tags) or
                                          'size_category' not in str(x.tags)
                                          ]
    ids = [x.id for x in small_open_english_tasked_datasets]

    # sanity check: these datasets must survive the filtering above
    assert 'alespalla/chatbot_instruction_prompts' in ids
    assert 'laion/OIG' in ids
    assert 'openai/webgpt_comparisons' in ids
    assert 'openai/summarize_from_feedback' in ids
    assert 'Anthropic/hh-rlhf' in ids

    print('open_english_tasked_datasets: ', ids, flush=True)

    exclude_ids = ['allenai/nllb',
                   'hf-internal-testing/fixtures_image_utils',
                   'allenai/c4',
                   'agemagician/uniref50',
                   'huggingface-course/documentation-images',
                   'smilegate-ai/kor_unsmile',
                   'MohamedRashad/ChatGPT-prompts',
                   'humarin/chatgpt-paraphrases',
                   'Jeska/vaccinchat',
                   'alespalla/chatbot_instruction_prompts',
                   'allenai/prosocial-dialog',

                   'AlekseyKorshuk/persona-chat',
                   'bavard/personachat_truecased',
                   'adamlin/daily_dialog',
                   'adamlin/FewShotWoz',
                   'benjaminbeilharz/better_daily_dialog',
                   'benjaminbeilharz/daily_dialog_w_turn_templates',
                   'benjaminbeilharz/empathetic_dialogues_for_lm',
                   'GEM-submissions/GEM__bart_base_schema_guided_dialog__1645547915',
                   'ia-bentebib/conv_ai_2_fr',
                   'ia-bentebib/daily_dialog_fr',
                   'ia-bentebib/dialog_re_fr',
                   'ia-bentebib/empathetic_dialogues_fr',
                   'roskoN/dailydialog',
                   'VadorMazer/skyrimdialogstest',
                   'bigbio/med_qa',
                   'biu-nlp/qa_srl2018',
                   'biu-nlp/qa_discourse',
                   'iarfmoose/qa_evaluator',
                   'jeopardy',
                   'narrativeqa',
                   'nomic-ai/gpt4all_prompt_generations',
                   'nomic-ai/gpt4all_prompt_generations_with_p3',
                   'HuggingFaceH4/alpaca',
                   'tatsu-lab/alpaca',
                   'yahma/alpaca-cleaned',
                   'Hello-SimpleAI/HC3',
                   'glue',
                   'sahil2801/CodeAlpaca-20k',
                   'Short-Answer-Feedback/saf_communication_networks_english',
                   ]
    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if x.id not in exclude_ids]
    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id]
    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
                                          'hf-internal-testing' not in x.id]
    small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
                                          'chinese' not in x.id]

    # most downloaded first
    sorted_small_open_english_tasked_datasets = sorted([(x.downloads, x) for x in small_open_english_tasked_datasets],
                                                       key=lambda x: x[0], reverse=True)

    # https://huggingface.co/datasets/wikihow/blob/main/wikihow.py
    # https://github.com/mahnazkoupaee/WikiHow-Dataset
    # https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
    # https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358

    # some ambiguous or non-commercial datasets
    # https://github.com/PhoebusSi/alpaca-CoT

    timeout = 3 * 60
    for num_downloads, dataset in sorted_small_open_english_tasked_datasets:
        data_id = dataset.id
        # one fresh single-worker pool per dataset so a hung load can be killed
        with ProcessPoolExecutor(max_workers=1) as executor:
            future = executor.submit(do_one, data_id, num_downloads)
            try:
                future.result(timeout=timeout)
            except concurrent.futures.TimeoutError:
                print("\n\ndata_id %s timeout\n\n" % data_id, flush=True)
                for child in psutil.Process(os.getpid()).children(recursive=True):
                    for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGKILL):
                        # the child may already be gone after an earlier signal
                        with contextlib.suppress(ProcessLookupError):
                            os.kill(child.pid, sig)
| |
|
| |
|
def do_one(data_id, num_downloads):
    """
    Download one Hugging Face dataset and save it as parquet, one file per config.

    The list of configs is obtained by requesting a config named 'foobar' and
    parsing the 'Available: [...]' part of the resulting error message; when no
    such list is found, the default config (None) is used.  For each config the
    'train' split is saved when present, otherwise the first split.  Existing
    output files are not regenerated.  Any exception is printed with its
    traceback instead of being raised.

    :param data_id: Hugging Face dataset id
    :param num_downloads: download count, only used in log output
    :return:
    """
    from datasets import load_dataset
    safe_id = str(data_id.replace('/', '_'))
    out_file = "data_%s.parquet" % safe_id
    if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024 ** 3:
        return
    try:
        print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True)
        avail_list = None
        try:
            # deliberately bogus config name: the error message lists the real ones
            data = load_dataset(data_id, 'foobar')
        except Exception as e:
            message = str(e)
            if 'Available: ' in message:
                avail_list = ast.literal_eval(message.split('Available:')[1].strip())
        if avail_list is None:
            avail_list = [None]
        print("%s avail_list: %s" % (data_id, avail_list), flush=True)

        for name in avail_list:
            out_file = "data_%s_%s.parquet" % (safe_id, str(name))
            if os.path.isfile(out_file):
                continue
            data = load_dataset(data_id, name)
            per_split_columns = data.column_names
            column_names = per_split_columns[list(per_split_columns)[0]]
            print("Processing data_id %s num_downloads: %s columns: %s" % (data_id, num_downloads, column_names),
                  flush=True)
            split_tables = data.data
            first_split = list(data.num_columns)[0]
            split = 'train' if 'train' in split_tables else first_split
            frame = data[split].to_pandas()
            frame.to_parquet(out_file, index=False)
    except Exception as e:
        ex = traceback.format_exc()
        print("Exception: %s %s" % (data_id, ex), flush=True)
| |
|
| |
|
def test_otherlic():
    """
    Print how many Hub datasets carry one of the listed licenses, ignoring
    datasets whose tags mention 'translation'.
    """
    from huggingface_hub import list_datasets
    lic = ['license:odc-by',
           'license:cc-by-4.0',
           'license:cc-by-3.0',
           'license:cc-by-2.0',
           'license:cc-by-2.5',
           'license:cc-by-sa-4.0',
           'license:odbl',
           'license:pddl',
           'license:ms-pl',
           'license:zlib',
           ]
    per_license = []
    for tag in lic:
        matches = []
        for ds in list_datasets(filter=tag):
            if 'translation' not in str(ds.tags):
                matches.append(ds)
        per_license.append(matches)
    datasets = flatten_list(per_license)
    print(len(datasets))
| |
|
| |
|
| | |
| | |
# Hugging Face dataset ids (presumably hand-curated as useful for instruction
# tuning -- TODO confirm); not referenced elsewhere in the visible part of this file.
# NOTE(review): 'kastan/rlhf-qa-conditional-generation-v2', 'squadshifts' and
# 'hotpot_qa' each appear twice in this list.
useful = ['Dahoas/instruct-human-assistant-prompt',
          'Dahoas/first-instruct-human-assistant-prompt',
          'knkarthick/dialogsum',
          'McGill-NLP/FaithDial',
          'Zaid/quac_expanded',
          '0-hero/OIG-small-chip2',
          'alistvt/coqa-flat',
          'AnonymousSub/MedQuAD_47441_Question_Answer_Pairs',
          'Anthropic/hh-rlhf',
          'arjunth2001/online_privacy_qna',
          'Dahoas/instruct_helpful_preferences',
          'Dahoas/rl-prompt-dataset',
          'Dahoas/rm-static',
          'Dahoas/static-hh',
          'Dahoas/synthetic-instruct-gptj-pairwise',
          'eli5',
          'gsm8k',
          'guanaco/guanaco',
          'kastan/rlhf-qa-comparisons',
          'kastan/rlhf-qa-conditional-generation-v2',
          'OllieStanley/humaneval-mbpp-codegen-qa',
          'OllieStanley/humaneval-mbpp-testgen-qa',
          'Graverman/Instruct-to-Code',
          'openai/summarize_from_feedback',
          'relbert/analogy_questions',
          'yitingxie/rlhf-reward-datasets',
          'yizhongw/self_instruct',
          'HuggingFaceH4/asss',
          'kastan/rlhf-qa-conditional-generation-v2',
          'cosmos_qa',
          'vishal-burman/c4-faqs',
          'squadshifts',
          'hotpot_qa',
          'adversarial_qa',
          'allenai/soda',
          'squad_v2',
          'squadshifts',
          'dferndz/cSQuAD1',
          'dferndz/cSQuAD2',
          'din0s/msmarco-nlgen',
          'domenicrosati/TruthfulQA',
          'hotpot_qa',
          'HuggingFaceH4/self-instruct-eval',
          'kastan/EE_QA_for_RLHF',
          'KK04/LogicInference_OA',
          'lmqg/qa_squadshifts_synthetic',
          'lmqg/qg_squad',
          'lmqg/qg_squadshifts',
          'lmqg/qg_subjqa',
          'pszemraj/HC3-textgen-qa',

          'pythonist/newdata',
          'ropes',
          'wikitablequestions',
          'bigscience/p3',
          ]
| |
|
# Further lists of Hugging Face dataset ids, grouped by topic as the variable
# names suggest (code, maybe useful, summarization, math, skipped); none of them
# is referenced elsewhere in the visible part of this file.
code_useful = ['0n1xus/codexglue',
               'openai_humaneval',
               'koutch/staqc',
               ]

maybe_useful = ['AlekseyKorshuk/comedy-scripts',
                'openbookqa',
                'qed',
                'selqa',
                'HuggingFaceH4/instruction-pilot-outputs-filtered',
                'GBaker/MedQA-USMLE-4-options',
                'npc-engine/light-batch-summarize-dialogue',
                ]

summary_useful = ['austin/rheum_abstracts',
                  'CarperAI/openai_summarize_comparisons',
                  'CarperAI/openai_summarize_tldr',
                  'ccdv/cnn_dailymail',
                  'ccdv/govreport-summarization',
                  'ccdv/pubmed-summarization',
                  'duorc',
                  'farleyknight/big_patent_5_percent',
                  'multi_news',
                  'opinosis',
                  'SophieTr/reddit_clean',
                  'allenai/mup',
                  'allenai/multi_lexsum',
                  'big_patent',
                  'allenai/wcep_dense_max',
                  'awinml/costco_long_practice',
                  'GEM/xsum',
                  'ratishsp/newshead',
                  'RussianNLP/wikiomnia',
                  'stacked-summaries/stacked-xsum-1024',
                  ]

math_useful = [
    'competition_math'
]

skipped = ['c4',
           ]
| |
|
| | """ |
| | To get training data from oig: |
| | pytest test_oig test_grade_final test_finalize_to_json |
| | """ |
| |
|
# Speaker markers (no trailing space) counted and split on when computing the
# per-turn word statistics in test_basic_cleaning.
human = '<human>:'
bot = '<bot>:'
| |
|
| |
|
def test_assemble_and_detox():
    """
    Re-cut the useful OIG parquet files into shorter conversations, drop rows
    with a profanity probability of 0.25 or more, and write the shuffled result
    to h2oGPT.cleaned.human_bot.shorter.parquet.

    Each text is split at its '<human>: ' markers; consecutive interactions
    (each capped at 2048 characters) are accumulated until the blurb reaches
    2048 // 2 - 30 characters, then the first chunk returned by get_sentences()
    is emitted with a trailing "\\n<human>:".  When a text has two or more
    markers, the part after the final marker is not included.
    """
    import re
    from profanity_check import predict_prob
    max_len = 2048
    MAX_LEN = 2048 // 2 - 30
    df_list = []
    for data in useful_oig_files:
        print("Processing %s" % data, flush=True)
        df = pd.read_parquet(data).reset_index(drop=True)
        text_list = df[['text']].values.ravel().tolist()
        new_text = []
        for text in tqdm(text_list):
            human_starts = [m.start() for m in re.finditer('<human>: ', text)]
            if len(human_starts) == 1:
                # single marker: treat the whole text as one interaction
                human_starts = [0, len(text)]
            blurb = ''
            for begin, end in zip(human_starts, human_starts[1:]):
                blurb += text[begin:end][:max_len]
                if len(blurb) >= MAX_LEN:
                    new_text.append(get_sentences(blurb, length=MAX_LEN)[0] + "\n<human>:")
                    blurb = ''
            if blurb:
                new_text.append(get_sentences(blurb, length=MAX_LEN)[0] + "\n<human>:")

        if len(new_text) > len(text_list):
            print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0]))
        df = pd.DataFrame({"text": new_text, "source": [data] * len(new_text)})
        df = df.drop_duplicates(keep='first')
        text_lengths = df['text'].apply(len)
        print(text_lengths.describe())
        assert text_lengths.max() <= 2 * max_len

        df['profanity'] = predict_prob(df['text'])
        before_rows = df.shape[0]
        df = df[df['profanity'] < 0.25]
        after_rows = df.shape[0]
        print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows))
        df_list.append(df)
        print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
        print("So far have %d rows" % sum(len(x) for x in df_list))
    df_final = pd.concat(df_list)
    df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True)
    df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False)
| |
|
| |
|
def test_basic_cleaning():
    """
    Filter every file in ``useful_oig_files`` by profanity and by average
    words per turn, then write the union to h2oGPT.cleaned.human_bot.parquet.
    """
    from profanity_check import predict

    def words_per_turn(text):
        # spaces divided by the number of human+bot markers, halved
        return text.count(' ') / (text.count(human) + text.count(bot)) / 2.0

    def first_bot_words(text):
        # spaces in the first bot reply divided by the number of bot markers
        return text.split(bot)[1].count(' ') / text.count(bot)

    upper_bound = 2048
    kept = []
    for data in useful_oig_files:
        print("Processing %s" % data, flush=True)
        df = pd.read_parquet(data).reset_index(drop=True)

        df['avg_words'] = df['text'].apply(words_per_turn)
        df['avg_bot_words'] = df['text'].apply(first_bot_words)

        df['bad_words'] = predict(df['text'])
        df = df.reset_index(drop=True)
        df = df[df['bad_words'] == 0]
        df = df[['text', 'avg_words', 'avg_bot_words']].drop_duplicates(keep='first')
        print(df[df['avg_words'] == df['avg_words'].max()]['text'].values)

        # Both lower bounds derive from the same median, taken before length filtering.
        median_words = np.median(df['avg_words'])
        for column, floor, fraction in (('avg_words', 30, 0.8), ('avg_bot_words', 20, 0.5)):
            lower_bound = max(floor, fraction * median_words)
            df = df[df[column] > lower_bound]
            df = df[df[column] < upper_bound]

        kept.append(df)
        print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)

    pd.concat(kept).to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False)
| |
|
| |
|
| | from joblib import Parallel, delayed, effective_n_jobs |
| | from sklearn.utils import gen_even_slices |
| | from sklearn.utils.validation import _num_samples |
| |
|
| |
|
def parallel_apply(df, func, n_jobs=-1, **kwargs):
    """ Run a pandas ``apply`` across joblib workers.

    The input is cut into even slices with sklearn.utils, one slice per
    effective worker, and the partial results are concatenated.

    Args:
        df: Pandas DataFrame, Series, or any other object that supports slicing and apply.
        func: Callable to apply
        n_jobs: Desired number of workers. Default value -1 means use all available cores.
        **kwargs: Any additional parameters will be supplied to the apply function

    Returns:
        Same as for normal Pandas DataFrame.apply()

    """
    workers = effective_n_jobs(n_jobs)
    if workers == 1:
        # nothing to parallelize: plain apply
        return df.apply(func, **kwargs)

    unbound_apply = type(df).apply
    slices = gen_even_slices(_num_samples(df), workers)
    pieces = Parallel(n_jobs=n_jobs)(
        delayed(unbound_apply)(df[piece], func, **kwargs) for piece in slices
    )
    return pd.concat(pieces)
| |
|
| |
|
def add_better_profanity_flag(df):
    """
    Add a 'better_profanity' column holding better_profanity's
    contains_profanity() result for each row's 'text'; returns the same frame.
    """
    from better_profanity import profanity

    def has_profanity(text):
        return profanity.contains_profanity(text)

    flags = parallel_apply(df['text'], has_profanity, n_jobs=-1)
    df['better_profanity'] = flags
    return df
| |
|
| |
|
def add_textstat_grade(df):
    """
    Add a 'flesch_grade' column with textstat's Flesch-Kincaid grade of each
    row's 'text'; returns the same frame.
    """
    import textstat

    def myfunc(x):
        return textstat.flesch_kincaid_grade(x)

    # Hard-wired switch: the dask path is kept for reference but disabled.
    use_dask = False
    if use_dask:
        import dask.dataframe as dd

        ddata = dd.from_pandas(df, npartitions=120)

        df['flesch_grade'] = ddata['text'].apply(myfunc).compute()
    else:
        df['flesch_grade'] = parallel_apply(df['text'], myfunc, n_jobs=-1)
    return df
| |
|
| |
|
def add_deberta_grade(df):
    """
    Score each row's first question/answer pair with the OpenAssistant
    DeBERTa reward model and store the score in a 'grade_deberta' column.

    Also adds 'question' and 'answer' columns parsed from 'text'. Rows are
    processed in batches; after each batch the progress is pickled to a
    per-host checkpoint file so an interrupted run can resume. On a CUDA
    out-of-memory error the same batch is retried with a halved micro batch.

    Args:
        df: DataFrame with a 'text' column in ``<human>: ... <bot>: ...`` form.

    Returns:
        The same DataFrame with 'question', 'answer' and 'grade_deberta' added.
    """
    import pickle
    import socket

    import torch
    import tqdm
    from datasets import Dataset
    from transformers import pipeline
    from transformers.pipelines.pt_utils import KeyPairDataset

    # The pipeline below loads the model itself; a second stand-alone copy
    # (as the previous version created and never used) only wasted memory.
    reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"

    def get_question(x):
        return x.replace('<human>: ', '').split('<bot>:')[0]

    def get_answer(x):
        try:
            answer = x.split('<bot>: ')[1].split('<human>:')[0].replace('<bot>: ', '')
        except IndexError:
            # no "<bot>: " (with trailing space) present: fall back to the bare marker
            answer = x.split('<bot>:')[1].split('<human>:')[0].replace('<bot>:', '')
        return answer

    df['question'] = parallel_apply(df['text'], get_question, n_jobs=-1)
    df['answer'] = parallel_apply(df['text'], get_answer, n_jobs=-1)

    pipe = pipeline(
        "text-classification",
        model=reward_name,
        device="cuda:0" if torch.cuda.is_available() else "cpu"
    )
    batch_size = 64 * 16
    micro_batch = orig_micro_batch = 16
    checkpoint = "grades.%s.pkl" % socket.gethostname()
    start = 0
    grades = []
    if os.path.exists(checkpoint):
        # NOTE(review): pickle.loads executes arbitrary code from the file; the
        # checkpoint must only ever be one written by this function.
        with open(checkpoint, "rb") as f:
            start, grades = pickle.loads(f.read())

    total = df.shape[0]
    last_oom = None  # batch start of the most recent OOM, None if there was none
    # Loop on `start`, not `end`: after an OOM on the final batch `end` already
    # equals `total`, and that batch must still be retried.
    while start < total:
        # manual batching so an OOM only costs one batch
        end = min(start + batch_size, total)
        dataset = Dataset.from_pandas(df.iloc[start:end, :])
        try:
            scores = [
                x['score'] for x in tqdm.tqdm(
                    pipe(KeyPairDataset(dataset, "question", "answer"), batch_size=micro_batch)
                )
            ]
        except torch.cuda.OutOfMemoryError:
            last_oom = start
            micro_batch = max(1, micro_batch // 2)
            print("OOM - retrying with micro_batch=%d" % micro_batch)
            continue
        grades.extend(scores)
        if last_oom == start:
            # the batch that hit OOM went through: go back to the full micro batch
            micro_batch = orig_micro_batch
            print("Returning to micro_batch=%d" % micro_batch)
        assert len(grades) == end
        start = end
        with open(checkpoint, "wb") as f:
            f.write(pickle.dumps((end, grades)))
        print("%d/%d" % (end, total))

    df['grade_deberta'] = grades
    if os.path.exists(checkpoint):
        os.remove(checkpoint)
    return df
| |
|
| |
|
def test_chop_by_lengths():
    """
    Drop rows whose human or bot turns are too short or too long and write
    the remainder to h2oGPT.cleaned.chopped.human_bot.shorter.parquet.

    Moderately short rows are kept with a fixed probability, drawn from the
    random columns created right after count_human_bot_lengths() has seeded
    numpy.
    """
    source = "h2oGPT.cleaned.human_bot.shorter.parquet"
    df = count_human_bot_lengths(pd.read_parquet(source).reset_index(drop=True))
    df['rand'] = np.random.rand(df.shape[0])
    df['rand2'] = np.random.rand(df.shape[0])
    before_rows = df.shape[0]

    # Same filter ladder for each speaker, each with its own random column.
    for speaker, dice in (('human', 'rand'), ('bot', 'rand2')):
        mean_col = 'len_%s_mean' % speaker
        max_col = 'len_%s_max' % speaker
        df = df[(df[mean_col] > 20)]
        df = df[(df[mean_col] > 30) | (df[dice] < 0.2)]
        df = df[(df[mean_col] > 50) | (df[dice] < 0.5)]
        df = df[(df[max_col] < 10000)]

    assert df['text'].apply(len).max() < 20000
    df = df.drop(['rand', 'rand2'], axis=1)
    after_rows = df.shape[0]
    print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows))
    print(df.describe())
    df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False)
| |
|
| |
|
def count_human_bot_lengths(df, human=None, bot=None):
    """
    Add per-row min/max/mean character lengths of human and bot turns.

    A turn runs from one occurrence of its marker up to the next occurrence
    of the same marker (or the end of the text when the marker occurs only
    once), is cut at the first occurrence of the other speaker's marker, and
    is stripped of surrounding whitespace. The marker itself counts toward
    the length. Rows that yield no turn get a length of 0.

    Side effects: seeds numpy's global RNG with 1234, sets the pandas
    'display.max_columns' option to None, and prints df.describe().

    Args:
        df: DataFrame with a string 'text' column (every text non-empty).
        human: literal marker that starts a human turn, default '<human>:'.
        bot: literal marker that starts a bot turn, default '<bot>:'.

    Returns:
        The same DataFrame with len_human_min/max/mean and
        len_bot_min/max/mean columns added.
    """
    import re
    human = human or '<human>:'
    bot = bot or '<bot>:'

    def turn_lengths(text, marker, other):
        # Lengths of all turns in `text` that start with `marker`.
        assert isinstance(text, str)
        # Markers are literal text (they are also used with `in`/`find`), so
        # escape them: a marker such as "[INST]" must not act as a regex class.
        starts = [m.start() for m in re.finditer(re.escape(marker), text)]
        if len(starts) == 1:
            starts = [starts[0], len(text)]  # single turn: run to end of text
        assert len(text)
        lengths = []
        for begin, stop in zip(starts, starts[1:]):
            turn = text[begin:stop]
            cut = turn.find(other)
            if cut != -1:
                turn = turn[:cut]
            # strip() returns a new string; its result has to be used
            lengths.append(len(turn.strip()))
        # corrupted rows without any turn count as one empty turn
        return lengths or [0]

    # Positional iteration: does not require a 0..n-1 index on df.
    texts = df['text'].tolist()
    for speaker, marker, other in (('human', human, bot), ('bot', bot, human)):
        per_row = [turn_lengths(text, marker, other) for text in texts]
        df['len_%s_min' % speaker] = [min(lengths) for lengths in per_row]
        df['len_%s_max' % speaker] = [max(lengths) for lengths in per_row]
        df['len_%s_mean' % speaker] = [np.mean(lengths) for lengths in per_row]

    np.random.seed(1234)
    pd.set_option('display.max_columns', None)
    print("Before chopping")
    print(df.describe())
    return df
| |
|
| |
|
def test_grade():
    """
    Run the grading stages (Flesch grade, better_profanity, DeBERTa reward)
    in order, each reading the previous stage's parquet and writing its own.

    A stage is skipped when its output file already exists; the frame is
    read from disk only when no earlier stage of this run produced it.
    """
    df = None

    def ensure_loaded(frame, path):
        # Reuse the in-memory frame when an earlier stage of this run made one.
        if frame is None:
            return pd.read_parquet(path).reset_index(drop=True)
        return frame

    source = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet"
    target = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet"
    if not os.path.exists(target):
        df = add_textstat_grade(ensure_loaded(df, source))
        lowest, highest = 10, 25
        df = df[df['flesch_grade'] >= lowest]
        df = df[df['flesch_grade'] <= highest]
        print("After Flesch grade")
        print(df.describe())
        df.to_parquet(target, index=False)

    source = target
    target = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet"
    if not os.path.exists(target):
        df = add_better_profanity_flag(ensure_loaded(df, source))
        before_rows = df.shape[0]
        df = df[df['better_profanity'] == 0]
        df = df.drop(['better_profanity'], axis=1)
        after_rows = df.shape[0]
        print("Dropped %d rows out of %d due to better_profanity" % (before_rows - after_rows, before_rows))
        print(df.describe())
        df.to_parquet(target, index=False)

    source = target
    target = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet'
    if not os.path.exists(target):
        df = add_deberta_grade(ensure_loaded(df, source))
        lowest, highest = 0.3, np.inf
        before_rows = df.shape[0]
        df = df[df['grade_deberta'] >= lowest]
        df = df[df['grade_deberta'] <= highest]
        after_rows = df.shape[0]
        print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
        print("After DeBERTa grade")
        print(df.describe())
        df.to_parquet(target, index=False)

    # The final file is always (re)written from the last available stage.
    source = target
    target = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet'
    df = ensure_loaded(df, source)
    df.to_parquet(target, index=False)
| |
|
| |
|
@pytest.mark.parametrize(
    "fixup_personality, only_personality, deberta_grading",
    [
        [True, False, False],
    ]
)
@pytest.mark.parametrize("prompt_type", ["llama2"])
def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, prompt_type, save_json=True):
    """
    Flatten tree structure into one row per path from root to leaf
    Also turn into the requested prompting format, e.g. for human_bot:
    <human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc.
    Also saves a .json locally as side-effect (when save_json is True)
    returns list of dicts, containing input, prompt_type and source

    Args:
        fixup_personality: rewrite Open Assistant / LAION mentions to h2oGPT / H2O.ai.
        only_personality: keep only conversations that mention h2oGPT.
        deberta_grading: grade rows with add_deberta_grade() and drop rows scoring below 0.3.
        prompt_type: "llama2" or "human_bot"; anything else raises NotImplementedError.
        save_json: assign ids and write the rows to a .json file named after the options.
    """
    import copy
    from datasets import load_dataset

    # Applied in order: the specific spellings must be rewritten before the
    # bare "Assistant" / "LAION" fallbacks at the end of each group.
    personality_fixes = (
        ("Open Assistant", "h2oGPT"),
        ("Open-Assistant", "h2oGPT"),
        ("open-assistant", "h2oGPT"),
        ("OpenAssistant", "h2oGPT"),
        ("open assistant", "h2oGPT"),
        ("Open Assistand", "h2oGPT"),
        ("Open Assitant", "h2oGPT"),
        ("Open Assistent", "h2oGPT"),
        ("Open Assisstant", "h2oGPT"),
        ("Open Assitent", "h2oGPT"),
        ("Open Assitiant", "h2oGPT"),
        ("Open Assistiant", "h2oGPT"),
        ("Open Assitan ", "h2oGPT "),
        ("Open Assistan ", "h2oGPT "),
        ("Open Asistant", "h2oGPT"),
        ("Open Assiant", "h2oGPT"),
        ("Assistant", "h2oGPT"),
        ("LAION AI", "H2O.ai"),
        ("LAION-AI", "H2O.ai"),
        ("LAION,", "H2O.ai,"),
        ("LAION.ai", "H2O.ai"),
        ("LAION.", "H2O.ai."),
        ("LAION", "H2O.ai"),
    )

    data_file = "OpenAssistant/oasst1"
    ds = load_dataset(data_file)
    df = pd.concat([ds['train'].to_pandas(), ds['validation'].to_pandas()], axis=0)
    rows = {}
    message_ids = df['message_id'].values.tolist()
    message_tree_ids = df['message_tree_id'].values.tolist()
    parent_ids = df['parent_id'].values.tolist()
    texts = df['text'].values.tolist()
    roles = df['role'].values.tolist()
    deleteds = df['deleted'].values.tolist()
    for i in range(df.shape[0]):
        # collect all messages, grouped by tree
        message_id = message_ids[i]
        message_tree_id = message_tree_ids[i]
        parent_id = parent_ids[i]
        text = texts[i]
        deleted = deleteds[i]
        if deleted:
            continue
        if fixup_personality:
            for old, new in personality_fixes:
                text = text.replace(old, new)

        role = roles[i]
        if prompt_type == "llama2":
            new_data = ('[INST] ' if role == 'prompter' else ' [/INST] ') + text
            if parent_id and role == 'prompter':
                # follow-up prompts are separated from the previous answer
                new_data = " " + new_data
        elif prompt_type == "human_bot":
            new_data = ('<human>: ' if role == 'prompter' else '<bot>: ') + text
        else:
            raise NotImplementedError("prompt_type not supported")
        entry = dict(message_id=message_id, parent_id=parent_id, text=new_data)
        rows.setdefault(message_tree_id, []).append(entry)

    all_rows = []

    for node_id in rows:
        # order responses in tree, based on message/parent relationship;
        # a conversation's 'message_id' is the concatenation of the ids on its path
        conversations = []

        list_msgs = rows[node_id]
        while len(list_msgs):
            for i, leaf in enumerate(list_msgs):
                found = False
                parent_id = leaf['parent_id']
                if parent_id is None:
                    # conversation starter
                    conversations.append(leaf)
                    found = True
                else:
                    for conv in conversations:
                        if parent_id in conv['message_id'] and parent_id != conv['message_id'][-len(parent_id):]:
                            # parent is inside this conversation but not at its end
                            continue
                        if parent_id == conv['message_id'][-len(parent_id):]:
                            # message follows this conversation; fork first so a
                            # sibling message can extend the same prefix later
                            conversations.append(conv.copy())
                            if prompt_type == "llama2":
                                conv['text'] += f"""{leaf['text']}"""
                            elif prompt_type == "human_bot":
                                conv['text'] += f"\n{leaf['text']}\n"
                            else:
                                raise NotImplementedError
                            conv['message_id'] += leaf['message_id']
                            found = True
                            break
                if found:
                    # message was placed, so remove it and rescan the rest
                    del list_msgs[i]
                    break
            else:
                # A full pass placed nothing: the remaining messages have no
                # reachable parent (e.g. it was deleted). Drop them instead of
                # looping forever.
                break

        # reduce to final conversations: keep only the longest chains of message ids
        for i, conv in enumerate(conversations):
            for j, conv2 in enumerate(conversations):
                if i == j:
                    continue
                if conv['message_id'] and conv2['message_id']:
                    assert conv['message_id'] != conv2['message_id']
                    # delete the shorter conversation, if one contains the other;
                    # elif, because after clearing `conv` its id is None and
                    # cannot be searched
                    if conv['message_id'] in conv2['message_id']:
                        conv['message_id'] = None
                    elif conv2['message_id'] in conv['message_id']:
                        conv2['message_id'] = None
        conversations = [c for c in conversations if c['message_id']]

        if prompt_type not in ("human_bot", "llama2"):
            raise NotImplementedError
        for c in conversations:
            text = c['text']
            if only_personality:
                keep = 'h2oGPT' in text
            else:
                keep = "What is H2O.ai" not in text
            if not keep:
                continue
            if prompt_type == "human_bot":
                suffix = "\n<human>:"
            elif text.rfind("[/INST]") > text.rfind("[INST]"):
                # last turn is already an answer
                suffix = "" if only_personality else " "
            else:
                # last turn is a prompt: open the answer slot
                suffix = " [/INST]"
            all_rows.append(dict(input=text + suffix, output="", prompt_type='plain', source=data_file))

    unhelpful = get_unhelpful_list()
    all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
    personality = create_personality_data(prompt_type=prompt_type)
    # Independent copies per repetition: `personality * 10` would alias the same
    # dicts, and the per-row 'id' assigned below would then be duplicated.
    for _ in range(10):
        all_rows.extend(copy.deepcopy(personality))
    np.random.seed(123)
    np.random.shuffle(all_rows)
    print(len(all_rows))
    if deberta_grading:
        df = pd.DataFrame(all_rows)
        df = df.rename(columns={'input': 'text'})
        df = add_deberta_grade(df)
        df = df.rename(columns={'text': 'input'})
        drop = True
        if drop:
            min_grade = 0.3
            max_grade = np.inf
            before_rows = df.shape[0]
            df = df[df['grade_deberta'] >= min_grade]
            df = df[df['grade_deberta'] <= max_grade]
            after_rows = df.shape[0]
            print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
            print("After DeBERTa grade")
        print(df.describe())
        all_rows = []
        for i in range(df.shape[0]):
            all_rows.append(
                dict(
                    input=df['input'].iloc[i],
                    output=df['output'].iloc[i],
                    source=df['source'].iloc[i],
                    prompt_type=df['prompt_type'].iloc[i],
                    grade_deberta=df['grade_deberta'].iloc[i],
                )
            )
    if save_json:
        data_file = data_file + \
                    ("_h2ogpt" if fixup_personality else "") + \
                    ("_only" if only_personality else "") + \
                    ("_graded" if deberta_grading else "") + \
                    ("_llama2_chat" if prompt_type == "llama2" else "")
        for i, row in enumerate(all_rows):
            row['id'] = i
        with open(data_file.lower().replace("/", "_") + ".json", "w") as f:
            f.write(json.dumps(all_rows, indent=2))
    return all_rows
| |
|
| |
|
def test_finalize_to_json():
    """
    Merge the graded OIG parquet with the graded Open Assistant json, drop
    rows flagged by better_profanity (custom word list) or containing an
    unhelpful phrase, and write h2ogpt-oig-oasst1-instruct-cleaned-v3.json.
    """
    graded = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet')
    graded = graded.rename(columns={'text': 'input'})

    print("Number of high-quality human_bot interactions: %s" % graded.shape[0], flush=True)

    print("Adding open assistant data")
    with open("openassistant_oasst1_h2ogpt_graded.json") as f:
        open_assistant = json.loads(f.read())
    df = pd.concat([graded, pd.DataFrame(open_assistant)], axis=0)

    print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)

    from better_profanity import profanity
    profanity.load_censor_words_from_file("data/censor_words.txt")

    def flagged(text):
        return profanity.contains_profanity(text)

    df['profanity'] = parallel_apply(df['input'], flagged, n_jobs=-1)
    df = df[(df['profanity'] == 0)].reset_index(drop=True)

    print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
    print(df.describe())
    print(df.shape)

    row_list = [
        dict(input=text, source=source, prompt_type='plain')
        for text, source in zip(df['input'], df['source'])
    ]
    np.random.seed(1234)
    np.random.shuffle(row_list)

    unhelpful = get_unhelpful_list()
    row_list = [row for row in row_list if not any(u in row['input'] for u in unhelpful)]
    for idx, row in enumerate(row_list):
        row['id'] = idx
        # put every bot turn on its own line
        row['input'] = row['input'].replace(" <bot>:", "\n<bot>:")
    with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f:
        f.write(json.dumps(row_list, indent=2))
| |
|
| |
|
def create_personality_data(prompt_type="llama2"):
    """
    Build identity question/answer rows for h2oGPT and H2O.ai.

    Every question is paired with every answer and every closing phrase,
    followed by a fixed set of "what/who is H2O.ai" pairs. The rows are
    printed as a count, written to h2ogpt-personality.json and returned.

    Args:
        prompt_type: value stored in each row's 'prompt_type' field.

    Returns:
        list of dicts with keys input, output, prompt_type, source.
    """
    import itertools

    questions = [
        "What's your name?",
        "What is your name?",
        "What are you?",
        "Who are you?",
        "Do you have a name?",
        "Who trained you?",
        "Who created you?",
        "Who made you?",
    ]
    answers = [
        "I'm h2oGPT, a large language model by H2O.ai.",
        "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
        "My name is h2oGPT. I'm a large language model by H2O.ai, the visionary leader in democratizing AI.",
        "My name is h2oGPT. I'm a large language model trained by H2O.ai.",
        "Hi! I'm h2oGPT, a large language model by H2O.ai.",
        "Hi! I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
    ]
    closings = [
        "",
        " How can I help you?",
        " How may I assist you?",
        " Nice to meet you.",
    ]
    rows = [
        dict(input=f"{question}", output=f"{answer}{closing}", prompt_type=prompt_type, source="H2O.ai")
        for question, answer, closing in itertools.product(questions, answers, closings)
    ]

    long_blurb = "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."
    short_blurb = "H2O.ai is the visionary leader in democratizing AI."
    company_pairs = [
        ("What is H2O.ai?", long_blurb),
        ("What is h2o.ai?", long_blurb),
        ("What is H2O?", long_blurb),
        ("Who is h2o.ai?", long_blurb),
        ("who is h2o.ai?", long_blurb),
        ("who is h2o?", long_blurb),
        ("what is H2O.ai?", short_blurb),
        ("who is H2O.ai?", short_blurb),
        ("who is H2O?", short_blurb),
        ("Who is h20?", short_blurb),
    ]
    rows.extend(
        dict(input=question, output=answer, prompt_type=prompt_type, source='H2O.ai')
        for question, answer in company_pairs
    )

    print(len(rows))
    with open("h2ogpt-personality.json", "w") as f:
        f.write(json.dumps(rows, indent=2))
    return rows
| |
|
| |
|
def test_check_stats_data():
    """
    Plot histograms of character counts (chars_hist.png) and of token counts
    after tokenization with cutoff 512 (token_hist_512.png) for the final
    instruct json file.
    """
    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json'
    df = pd.read_json(filename)

    # character statistics
    df['char_count'] = df['input'].apply(len)
    import matplotlib.pyplot as plt

    def save_histogram(values, title, path):
        plt.figure(figsize=(10, 10))
        plt.hist(values, bins=100)
        plt.title(title)
        plt.savefig(path)
        plt.close()

    chars_avg = np.mean(df['char_count'])
    chars_median = np.median(df['char_count'])
    save_histogram(df['char_count'],
                   "char_count avg: %s median: %s" % (chars_avg, chars_median),
                   'chars_hist.png')

    # token statistics
    from finetune import generate_and_tokenize_prompt
    from loaders import get_loaders, get_tokenizer
    from functools import partial

    llama_type = False
    tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b'
    model_loader, tokenizer_loader, conditional_type = (
        get_loaders(model_name=base_model, reward_type=False, llama_type=llama_type))
    local_files_only = False
    resume_download = True
    use_auth_token = False
    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
    cutoff_len = 512
    tokenize_row = partial(generate_and_tokenize_prompt, prompt_type='plain',
                           train_on_inputs=True, add_eos_token=False,
                           cutoff_len=cutoff_len, tokenizer=tokenizer)
    from datasets import load_dataset
    data = load_dataset("json", data_files={"train": filename})
    val_set_size = 0.90
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(tokenize_row, num_proc=os.cpu_count())

    df_tokens = pd.DataFrame([len(ids) for ids in train_data['input_ids']], columns=['token_count'])

    token_avg = np.mean(df_tokens['token_count'])
    token_median = np.median(df_tokens['token_count'])
    save_histogram(df_tokens['token_count'],
                   "token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median),
                   'token_hist_%s.png' % cutoff_len)
| |
|
| |
|
def get_unhelpful_list():
    """
    Return the phrases that mark a bot reply as unhelpful.

    Callers in this file drop any row whose text contains one of these
    phrases as a substring, so every entry must be a separate list element:
    two adjacent string literals without a comma between them are fused by
    Python into one long string that never matches.

    Returns:
        list of str (some phrases appear more than once).
    """
    # full-length phrases
    unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
                 "I'm sorry, but I don't understand your question. Could you please rephrase it?",
                 "I'm sorry, I don't quite understand your question",
                 "I'm sorry, I don't know",
                 "I'm sorry, but I don't know",
                 "I don't know anything",
                 "I do not know",
                 "I don't know",
                 "I don't know how",
                 "I do not know how",
                 "Can you please explain what you mean",
                 "please explain what you mean",
                 "please explain",
                 "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
                 "I'm sorry but I don't understand what you mean",
                 "I don't understand",
                 "I don't have the ability",
                 "I do not have the ability",
                 "I do not have",
                 "I am a language model,",
                 "I am a large language model,",
                 "I do not understand your question. Can you please try to make it clearer?",
                 "I'm sorry, but as an AI language model",
                 "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
                 "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
                 "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
                 "I apologize, but I cannot perform the task you have requested.",
                 "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
                 "I'm sorry, I'm not sure what you're asking for here.",
                 "I'm not sure what you are asking",
                 "You need to provide more context",
                 ]
    # shorter fragments of the phrases above
    unhelpful += ["sorry, I didn't quite understand your question",
                  "I didn't quite understand your question",
                  "I didn't understand your question",
                  "I did not understand your question",
                  "I did not understand the question",
                  "could you please rephrase",
                  "could you rephrase",
                  "I do not understand your question.",
                  "I do not understand the question.",
                  "I do not understand that question.",
                  "Can you please try to make it clearer",
                  "Can you try to make it clearer",
                  "sorry, but as an AI language model",
                  "as an AI language model",
                  "I apologize, but I cannot",
                  "I cannot rephrase text",
                  "I cannot understand. Your post is difficult to read and follow.",
                  "Your post is difficult to read and follow.",
                  "I apologize, but I am",
                  "Sorry, but I am not ",
                  "nor am I capable",
                  "I am not capable of",
                  "I apologize, but I cannot perform the task you have requested",
                  "I cannot perform the task",
                  "I cannot complete the task",
                  "I'm sorry",
                  "I am sorry",
                  "do not have access",
                  "not sure what you're asking for",
                  "not sure what you are asking for",
                  "not sure what is being asked",
                  "I'm not sure what you are asking",
                  "not sure what you are asking",
                  "You need to provide more context",
                  "provide more context",
                  ]
    unhelpful += ["As a large language model",
                  "cannot provide any information",
                  "As an artificial intelligence I do not have the capability",
                  "As an artificial intelligence I don't have the capability",
                  "As an artificial intelligence I can't",
                  "As an artificial intelligence I cannot",
                  "I am sorry but I do not understand",
                  "Can you please explain",
                  "(sorry couldn't resist)",
                  "(sorry could not resist)",
                  " :)",
                  " ;)",
                  " :-)",
                  " ;-)",
                  " lol ",
                  "Thanks so much!!!",
                  "Thank You :)!!!",
                  "Please try not to repeat",
                  "I am an AI language model",
                  "I'm a AI assistant that",
                  "I'm an AI assistant that",
                  "I am an AI assistant that",
                  "etc.",
                  "etc.etc.",
                  "etc. etc.",
                  "etc etc",
                  ]
    return unhelpful
| |
|
| |
|
def test_check_unhelpful():
    """
    Count occurrences of the unhelpful phrases in a graded Open Assistant
    json file, filter the bot replies (by BLEU and/or sentence similarity,
    per the local switches) and assert that no phrase survives.
    """
    file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'

    unhelpful = get_unhelpful_list()
    df = pd.read_json(file)

    # local switches selecting which filters run
    use_reward_score_threshold = False
    use_bleu_threshold = False
    use_sentence_sim = True

    from sacrebleu.metrics import BLEU
    bleu = BLEU()
    from nltk.translate.bleu_score import sentence_bleu
    import pprint
    import re

    def get_bleu(actual, expected_list):
        return sentence_bleu(expected_list, actual)

    def phrase_hits(blob):
        # phrase -> occurrence count, only for phrases that occur at all
        counts = ((phrase, blob.count(phrase)) for phrase in unhelpful)
        return {phrase: count for phrase, count in counts if count > 0}

    printer = pprint.PrettyPrinter(indent=4)

    threshold = 0.0
    if use_reward_score_threshold:
        df = df[df['grade_deberta'] > threshold]

    # occurrences anywhere in the data
    data = df.to_dict(orient='records')
    bads = phrase_hits(str(data))
    printer.pprint(bads)

    total_bads = sum(bads.values())
    print('total_bads: %s' % total_bads, flush=True)

    # split each conversation into turns; even positions are taken as human, odd as bot
    splitter = r'%s|%s' % (human, bot)
    convs = [[part.strip() for part in re.split(splitter, record['input']) if part.strip()] for record in data]
    humans = [turns[0::2] for turns in convs]
    bots = [turns[1::2] for turns in convs]

    bleu_threshold = 0.9
    if use_bleu_threshold:
        bots = [[reply for reply in replies if get_bleu(reply, unhelpful) < bleu_threshold] for replies in tqdm(bots)]

    cosine_sim_threshold = 0.8
    if use_sentence_sim:
        from sentence_transformers import SentenceTransformer
        sent_model = 'all-MiniLM-L6-v2'
        model = SentenceTransformer(sent_model)
        sentence_embeddings = model.encode(unhelpful)
        from sklearn.metrics.pairwise import cosine_similarity
        # keep a conversation's replies only if none is too close to any phrase
        bots = [replies for replies in tqdm(bots) if
                np.max(cosine_similarity(model.encode(replies), sentence_embeddings)) < cosine_sim_threshold]

    bads_bots = phrase_hits(str(bots))
    printer.pprint(bads_bots)

    total_bads_bots = sum(bads_bots.values())
    print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % (
        threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True)

    assert len(bads_bots) == 0, bads_bots
| |
|
| |
|
def test_fortune2000_personalized():
    """
    Build h2ogpt-fortune2000-personalized.json from local wikitext files plus personality rows.

    Each wikitext/*.txt file is passed through get_sentences() and every non-empty result becomes a
    'plain' prompt row tagged with the source file name.  Ten deep copies of the rows from
    create_personality_data() are appended, the rows are shuffled with a fixed seed, numbered via
    an 'id' field, and written as JSON into the current working directory.

    :raises RuntimeError: if the wikitext directory is not present
    """
    import copy
    import glob

    if not os.path.isdir("wikitext"):
        raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip")

    N = 512 * 4  # second argument handed to get_sentences for every file
    row_list = []
    # glob returns files in arbitrary order; sort so the fixed seed below gives the same output everywhere
    for file in sorted(glob.glob("wikitext/*.txt")):
        with open(file, "r") as f:
            blob = f.read()
        source = os.path.basename(file)
        row_list.extend({'input': s, 'prompt_type': 'plain', 'source': source}
                        for s in get_sentences(blob, N) if s)

    personality = create_personality_data()
    for _ in range(10):
        # deep copy so each repetition gets its own dicts and therefore its own 'id' below
        row_list.extend(copy.deepcopy(personality))

    np.random.seed(123)
    np.random.shuffle(row_list)
    for i, row in enumerate(row_list):
        row['id'] = i
    # if one dict object appeared twice in the list, its id would have been overwritten; catch that
    for i, row in enumerate(row_list):
        assert row['id'] == i

    with open("h2ogpt-fortune2000-personalized.json", "w") as ff:
        ff.write(json.dumps(row_list, indent=2))
| |
|