Spaces:

siro1
/

amd-leaderboard

Sleeping

App Files Files Community

amd-leaderboard / populate_dataset.py

siro1

Initial commit

48bb3eb 6 months ago

raw

history blame contribute delete

2.25 kB

	#!/usr/bin/env python3
	"""
	Script to populate the Hugging Face dataset with mock data
	"""

	from datasets import Dataset
	from datetime import datetime, timedelta
	import random

	# Configuration
	# Repository id handed to Dataset.push_to_hub() at the end of this script.
	DATASET_ID = "siro1/amd-hackathon"
	# Forwarded as the `token=` argument of push_to_hub(); None passes no explicit token.
	HF_TOKEN = None # Set this if needed for private repos

	# Generate mock data: ten synthetic leaderboard entries collected in `mock_data`.
	mock_data = []

	teams = ["Team Alpha", "Team Beta", "Team Gamma", "Team Delta", "Team Epsilon"]
	# Timestamps start seven days before "now" and step forward from there.
	base_date = datetime.now() - timedelta(days=7)

	for i in range(10):
	    team = random.choice(teams)
	    # days=i / 2 places consecutive entries 12 hours apart.
	    timestamp = (base_date + timedelta(days=i / 2)).strftime("%Y-%m-%d %H:%M:%S")

	    # Vary configurations. These values only shape the metrics below;
	    # they are not stored in the entry itself.
	    input_length = random.choice([128, 256, 512])
	    output_length = random.choice([128, 256, 512])
	    concurrent_requests = random.choice([8, 16, 32, 64])

	    # Generate performance metrics with some variance:
	    # a fixed baseline plus uniform noise, with e2e growing with the
	    # sequence lengths and throughput growing with the request count.
	    base_ttft = 40 + random.uniform(-10, 10)
	    base_tpot = 11 + random.uniform(-2, 2)
	    base_itl = 10 + random.uniform(-2, 2)
	    base_e2e = 1500 + (input_length + output_length) * 2 + random.uniform(-200, 200)
	    base_throughput = 2000 + concurrent_requests * 20 + random.uniform(-200, 200)
	    bits_per_byte = 0.54 + random.uniform(-0.02, 0.02)
	    byte_perplexity = 1.45 + random.uniform(-0.02, 0.02)
	    word_perplexity = 4.13 + random.uniform(-0.02, 0.02)

	    # Every metric is rounded to two decimals before being stored.
	    entry = {
	        "team": team,
	        "timestamp": timestamp,
	        "ttft": round(base_ttft, 2),
	        "tpot": round(base_tpot, 2),
	        "itl": round(base_itl, 2),
	        "e2e": round(base_e2e, 2),
	        "throughput": round(base_throughput, 2),
	        "bits_per_byte": round(bits_per_byte, 2),
	        "byte_perplexity": round(byte_perplexity, 2),
	        "word_perplexity": round(word_perplexity, 2),
	    }

	    mock_data.append(entry)

	# Sort by timestamp. The fixed-width "%Y-%m-%d %H:%M:%S" strings compare
	# in chronological order, so a plain string sort is sufficient.
	mock_data.sort(key=lambda x: x["timestamp"])

	# Create dataset and push to hub
	print(f"Creating dataset with {len(mock_data)} entries...")
	# Each dict in mock_data becomes one row of the dataset.
	dataset = Dataset.from_list(mock_data)

	print(f"Pushing to Hugging Face Hub: {DATASET_ID}")
	# HF_TOKEN is forwarded unchanged as the token argument.
	dataset.push_to_hub(DATASET_ID, token=HF_TOKEN)

	print("Dataset populated successfully!")
	print("\nSample entries:")
	# Show the first three entries (earliest timestamps once mock_data is sorted).
	for entry in mock_data[:3]:
	    print(
	        f"- {entry['team']} at {entry['timestamp']}: throughput={entry['throughput']}"
	    )