Spaces:
Runtime error
Runtime error
| """ | |
| Training configuration schemas — Pydantic v2. | |
| All training jobs are validated against these models before execution. | |
| No raw dicts escape into the pipeline; everything is typed and constrained. | |
| """ | |
| from __future__ import annotations | |
| from enum import StrEnum | |
| from typing import Annotated | |
| from pydantic import BaseModel, Field, HttpUrl, model_validator | |
| from pydantic import PositiveFloat, PositiveInt | |
| # --------------------------------------------------------------------------- | |
| # Enums | |
| # --------------------------------------------------------------------------- | |
| class EvalStrategy(StrEnum): | |
| NO = "no" | |
| STEPS = "steps" | |
| EPOCH = "epoch" | |
| class Precision(StrEnum): | |
| FP32 = "fp32" | |
| FP16 = "fp16" | |
| BF16 = "bf16" | |
| INT8 = "int8" | |
| class OptimizerType(StrEnum): | |
| ADAMW = "adamw_torch" | |
| ADAMW_8BIT = "adamw_8bit" | |
| PAGED_ADAMW_8BIT = "paged_adamw_8bit" | |
| SGD = "sgd" | |
| class EvalMetric(StrEnum): | |
| PASS_AT_1 = "pass_at_1" | |
| PASS_AT_10 = "pass_at_10" | |
| BLEU = "bleu" | |
| EXECUTION_ACCURACY = "execution_accuracy" | |
| EXACT_MATCH = "exact_match" | |
| # --------------------------------------------------------------------------- | |
| # Sub-configs | |
| # --------------------------------------------------------------------------- | |
class LoRAConfig(BaseModel):
    """LoRA adapter configuration. Omit to disable LoRA (full fine-tune).

    Besides the per-field bounds, validation rejects configurations in which
    ``alpha`` is smaller than ``r``.

    Raises:
        pydantic.ValidationError: if any field is out of range or
            ``alpha < r``.
    """

    enabled: bool = True
    r: Annotated[int, Field(ge=1, le=256)] = 16
    alpha: Annotated[int, Field(ge=1)] = 32
    dropout: Annotated[float, Field(ge=0.0, lt=1.0)] = 0.05
    # At least one target module must be named.
    target_modules: list[str] = Field(
        default_factory=lambda: ["q_proj", "v_proj"],
        min_length=1,
    )
    bias: str = "none"

    # Registered as an "after" validator so Pydantic actually runs the check
    # once all fields have been parsed; as a plain method it was never called.
    @model_validator(mode="after")
    def alpha_geq_r(self) -> "LoRAConfig":
        """Ensure ``alpha`` is not smaller than ``r``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if ``alpha < r``.
        """
        if self.alpha < self.r:
            raise ValueError(f"lora.alpha ({self.alpha}) should be >= lora.r ({self.r})")
        return self
class TrainingHyperparams(BaseModel):
    """Optimisation hyperparameters for a training run.

    Each numeric field carries explicit bounds that are enforced when the
    model is validated.
    """

    # Schedule and batching
    num_epochs: int = Field(default=3, ge=1, le=100)
    batch_size: int = Field(default=8, ge=1, le=256)
    gradient_accumulation_steps: int = Field(default=4, ge=1, le=128)

    # Optimisation
    learning_rate: float = Field(default=2e-5, gt=0.0, lt=1.0)
    weight_decay: float = Field(default=0.01, ge=0.0, lt=1.0)
    warmup_ratio: float = Field(default=0.1, ge=0.0, lt=1.0)
    max_seq_length: int = Field(default=1024, ge=64, le=32768)
    max_grad_norm: float = Field(default=1.0, gt=0.0)
    optimizer: OptimizerType = OptimizerType.ADAMW
    precision: Precision = Precision.BF16
    lr_scheduler: str = "cosine"

    # Reproducibility and data loading
    seed: int = 42
    dataloader_num_workers: int = Field(default=4, ge=0, le=32)

    def effective_batch_size(self) -> int:
        """Return ``batch_size`` multiplied by ``gradient_accumulation_steps``."""
        per_step = self.batch_size
        accumulation = self.gradient_accumulation_steps
        return per_step * accumulation
class EvaluationConfig(BaseModel):
    """Evaluation schedule, metrics and best-model selection settings.

    Raises:
        pydantic.ValidationError: if a field is out of range, or if
            ``strategy`` is ``"steps"`` and ``eval_steps`` is not provided.
    """

    enabled: bool = True
    strategy: EvalStrategy = EvalStrategy.EPOCH
    eval_steps: PositiveInt | None = None  # required when strategy=STEPS
    metrics: list[EvalMetric] = Field(
        default_factory=lambda: [EvalMetric.PASS_AT_1, EvalMetric.BLEU]
    )
    num_samples_per_problem: Annotated[int, Field(ge=1, le=200)] = 10
    timeout_seconds: Annotated[int, Field(ge=1, le=60)] = 10
    load_best_model_at_end: bool = True
    metric_for_best_model: EvalMetric = EvalMetric.PASS_AT_1
    greater_is_better: bool = True

    # Registered as an "after" validator so the cross-field check runs during
    # validation; as a plain method it was never invoked.
    @model_validator(mode="after")
    def eval_steps_required_for_steps_strategy(self) -> "EvaluationConfig":
        """Require ``eval_steps`` whenever ``strategy`` is ``STEPS``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if ``strategy`` is ``STEPS`` and ``eval_steps`` is None.
        """
        if self.strategy == EvalStrategy.STEPS and self.eval_steps is None:
            raise ValueError("evaluation.eval_steps is required when strategy='steps'")
        return self
class CheckpointConfig(BaseModel):
    """Checkpoint saving schedule, retention and output location.

    Raises:
        pydantic.ValidationError: if a field is out of range, or if
            ``save_strategy`` is ``"steps"`` and ``save_steps`` is not provided.
    """

    save_strategy: EvalStrategy = EvalStrategy.EPOCH
    save_steps: PositiveInt | None = None  # required when save_strategy=STEPS
    save_total_limit: Annotated[int, Field(ge=1, le=20)] = 3
    output_dir: str = "./checkpoints"
    resume_from_checkpoint: str | None = None

    # Registered as an "after" validator so the cross-field check runs during
    # validation; as a plain method it was never invoked.
    @model_validator(mode="after")
    def save_steps_required_for_steps_strategy(self) -> "CheckpointConfig":
        """Require ``save_steps`` whenever ``save_strategy`` is ``STEPS``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if ``save_strategy`` is ``STEPS`` and ``save_steps``
                is None.
        """
        if self.save_strategy == EvalStrategy.STEPS and self.save_steps is None:
            raise ValueError("checkpoint.save_steps required when save_strategy='steps'")
        return self
class HubConfig(BaseModel):
    """Settings controlling whether and where results are pushed to a hub.

    Raises:
        pydantic.ValidationError: if ``push_to_hub`` is true and ``repo_id``
            is missing or empty.
    """

    push_to_hub: bool = False
    repo_id: str | None = None  # required when push_to_hub is true
    private: bool = True
    commit_message: str = "Training checkpoint"

    # Registered as an "after" validator so the cross-field check runs during
    # validation; as a plain method it was never invoked.
    @model_validator(mode="after")
    def repo_id_required_if_pushing(self) -> "HubConfig":
        """Require a non-empty ``repo_id`` whenever ``push_to_hub`` is set.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: if ``push_to_hub`` is true and ``repo_id`` is None or
                an empty string.
        """
        if self.push_to_hub and not self.repo_id:
            raise ValueError("hub.repo_id is required when hub.push_to_hub=true")
        return self
class DatasetConfig(BaseModel):
    """Dataset selection plus split, sampling and shuffling options."""

    # Internal UUID or HF Hub dataset path.
    dataset_id: str
    # Train split; must lie strictly between 0 and 1.
    split_ratio: float = Field(default=0.9, gt=0.0, lt=1.0)
    # None = use all.
    max_samples: PositiveInt | None = None
    text_column: str = "content"
    shuffle: bool = True
    shuffle_seed: int = 42
| # --------------------------------------------------------------------------- | |
| # Root job config | |
| # --------------------------------------------------------------------------- | |
class TrainingJobConfig(BaseModel):
    """
    Complete training job specification.
    Validated at job submission time. If validation passes, the job is
    guaranteed to reach the pipeline with a coherent configuration.
    """

    # Identity: name is limited to word characters and hyphens, 1-128 chars.
    job_name: str = Field(min_length=1, max_length=128, pattern=r"^[\w\-]+$")
    base_model: str = Field(
        description="HuggingFace model ID or local path",
        examples=[
            "Salesforce/codegen-350M-mono",
            "deepseek-ai/deepseek-coder-1.3b-base",
        ],
    )

    # Sub-configurations: only `dataset` must be supplied by the caller; the
    # others fall back to their own defaults.
    dataset: DatasetConfig
    training: TrainingHyperparams = Field(default_factory=TrainingHyperparams)
    lora: LoRAConfig | None = Field(default_factory=LoRAConfig)
    evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
    checkpoint: CheckpointConfig = Field(default_factory=CheckpointConfig)
    hub: HubConfig = Field(default_factory=HubConfig)

    # Free-form metadata: at most 20 tags.
    tags: list[str] = Field(default_factory=list, max_length=20)
    notes: str | None = None

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "job_name": "codegen-finetune-v1",
                    "base_model": "Salesforce/codegen-350M-mono",
                    "dataset": {"dataset_id": "ds_abc123"},
                    "training": {
                        "num_epochs": 3,
                        "batch_size": 8,
                        "learning_rate": 2e-5,
                    },
                    "hub": {
                        "push_to_hub": True,
                        "repo_id": "your-org/codegen-finetune-v1",
                    },
                }
            ]
        }
    }
| # --------------------------------------------------------------------------- | |
| # Inference config (served separately but validated here for consistency) | |
| # --------------------------------------------------------------------------- | |
class InferenceConfig(BaseModel):
    """Generation parameters for serving a model.

    Every numeric field is range-checked when the model is validated.
    """

    # `model_id` starts with "model_", which Pydantic v2 treats as a protected
    # namespace on releases where that prefix is protected by default; that
    # triggers a UserWarning when the class is created. Clearing the protected
    # namespaces for this model keeps the public field name and silences it.
    model_config = {"protected_namespaces": ()}

    model_id: str
    max_new_tokens: Annotated[int, Field(ge=1, le=4096)] = 256
    temperature: Annotated[float, Field(ge=0.0, le=2.0)] = 0.2
    top_p: Annotated[float, Field(ge=0.0, le=1.0)] = 0.95
    top_k: Annotated[int, Field(ge=0, le=1000)] = 50
    do_sample: bool = True
    num_return_sequences: Annotated[int, Field(ge=1, le=200)] = 1
    stop_sequences: list[str] = Field(default_factory=list)
    precision: Precision = Precision.BF16