"""Measure and plot token-count distributions for preference-pair parquet datasets."""

import os

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# Local tokenizer checkpoint used to count tokens, and the datasets to analyze.
tokenizer_path = "/home/rm"
parquet_paths = [
    "/home/data/pk-2089-L6.parquet",
    "/home/data/pk-1820-L6.parquet",
    "/home/data/pk-2355-L6.parquet",
    "/home/data/pk-4088-L6.parquet",
    "/home/data/pk-3876-L6.parquet",
]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Directory that receives the generated figure(s).
save_dir = "./token_density_plots"
os.makedirs(save_dir, exist_ok=True)

# Histogram settings.
BINS = 120        # number of histogram bins shared by every series
CLIP_PCT = 99.5   # clip the x-axis at this percentile of pooled totals (None = no clip)
USE_LOGX = False  # draw the x-axis on a log scale
|
|
def count_total_tokens(ex):
    """Attach token-count columns to one example.

    Adds ``chosen_tokens``, ``rejected_tokens`` (lengths of the tokenized
    ``chosen`` / ``reject`` texts, special tokens excluded) and their sum
    ``total_tokens``.  Returns the mutated example, as expected by
    ``datasets.Dataset.map``.
    """
    def _n_tokens(text):
        # Tokenize without special tokens so the count reflects raw content only.
        return len(tokenizer(text, add_special_tokens=False)["input_ids"])

    chosen_len = _n_tokens(ex["chosen"])
    rejected_len = _n_tokens(ex["reject"])
    ex["total_tokens"] = chosen_len + rejected_len
    ex["chosen_tokens"] = chosen_len
    ex["rejected_tokens"] = rejected_len
    return ex
|
|
| |
# Tokenize every dataset and collect its per-example token counts.
all_sets = []               # one (name, totals, chosens, rejects) tuple per dataset
all_totals_for_range = []   # pooled later to choose a shared x-axis range

for path in parquet_paths:
    name = os.path.basename(path)
    print(f"\n▶ 处理 {name}")

    ds = load_dataset("parquet", data_files=path, split="train")
    ds = ds.map(count_total_tokens, desc=f"[{name}] 计算 token", num_proc=4)

    totals = np.asarray(ds["total_tokens"], dtype=np.int64)
    chosens = np.asarray(ds["chosen_tokens"], dtype=np.int64)
    rejects = np.asarray(ds["rejected_tokens"], dtype=np.int64)

    # Quick sanity summary per dataset.
    print(f"[{name}] 样本数: {len(ds)}")
    for label, arr in (
        ("total_tokens ", totals),
        ("chosen_tokens", chosens),
        ("reject_tokens", rejects),
    ):
        print(f" {label}: max={arr.max()} | min={arr.min()} | mean={arr.mean():.1f}")

    all_sets.append((name, totals, chosens, rejects))
    all_totals_for_range.append(totals)
|
|
| |
| all_totals_concat = np.concatenate(all_totals_for_range) if all_totals_for_range else np.array([1]) |
| if CLIP_PCT is not None: |
| xmax = float(np.percentile(all_totals_concat, CLIP_PCT)) |
| else: |
| xmax = float(all_totals_concat.max()) |
|
|
| xmax = max(1.0, xmax) |
|
|
| |
| bin_edges = np.linspace(0, xmax, BINS + 1) |
|
|
| |
# Overlay the density histograms of all datasets in one figure.
fig, ax = plt.subplots(figsize=(11, 6))

# Line style distinguishes the three series of one dataset; color (cycled
# automatically per hist call) distinguishes datasets.
linestyles = {
    "total": "-",
    "chosen": "--",
    "reject": "-.",
}

for name, totals, chosens, rejects in all_sets:
    ax.hist(totals, bins=bin_edges, density=True, histtype='step', linewidth=1.6,
            label=f"{name} • total", linestyle=linestyles["total"])
    ax.hist(chosens, bins=bin_edges, density=True, histtype='step', linewidth=1.6,
            label=f"{name} • chosen", linestyle=linestyles["chosen"])
    ax.hist(rejects, bins=bin_edges, density=True, histtype='step', linewidth=1.6,
            label=f"{name} • reject", linestyle=linestyles["reject"])

if USE_LOGX:
    # NOTE(review): bin_edges start at 0, which a log axis cannot display;
    # the first bin edge falls off the plot — confirm this is acceptable.
    ax.set_xscale('log')

ax.set_title("Token Density Overlay — All Datasets")
ax.set_xlabel("Token Count" + (" (log)" if USE_LOGX else ""))
ax.set_ylabel("Density")

# Many series → compact multi-column legend.  (Return value was previously
# bound to an unused local; dropped.)
ax.legend(ncol=3, fontsize=8, loc="upper right", frameon=True)
fig.tight_layout()

out_png = os.path.join(save_dir, "ALL_datasets_density_overlay.png")
# Save/close via the explicit Figure object instead of pyplot's implicit
# "current figure" state, so the right figure is written and released.
fig.savefig(out_png, dpi=300)
plt.close(fig)
print(f"\n✅ 已保存全量合并对比图: {out_png}")
|
|