"""DeepSeekV3.2 model configuration"""
from typing import Optional
from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
class DeepseekV32Config(DeepseekV3Config):
r"""
This is the configuration class to store the configuration of a [`DeepseekV32Model`]. `"""
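    Example (a minimal sketch; the flat import path is an assumption for local use — when
    served from the Hub this module is normally loaded through `trust_remote_code`):

    ```python
    >>> from configuration_deepseek_v32 import DeepseekV32Config

    >>> # Instantiate with the default (checkpoint) hyperparameters
    >>> configuration = DeepseekV32Config()

    >>> # Override selected fields, e.g. a smaller indexer top-k
    >>> configuration = DeepseekV32Config(index_topk=1024)
    >>> configuration.index_topk
    1024
    ```
    """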
model_type = "deepseek_v32"
    # Inference-serving defaults carried over from the reference implementation.
    # dim=2048 (unused here; `dim` is set from `hidden_size` in `__init__`)
    max_batch_size = 8
    max_seq_len = 16384
def __init__(
self,
vocab_size: Optional[int] = 129280,
hidden_size: Optional[int] = 7168,
intermediate_size: Optional[int] = 18432,
moe_intermediate_size: Optional[int] = 2048,
num_hidden_layers: Optional[int] = 61,
num_attention_heads: Optional[int] = 128,
num_key_value_heads: Optional[int] = 128,
n_shared_experts: Optional[int] = 1,
n_routed_experts: Optional[int] = 256,
routed_scaling_factor: Optional[float] = 2.5,
kv_lora_rank: Optional[int] = 512,
q_lora_rank: Optional[int] = 1536,
qk_rope_head_dim: Optional[int] = 64,
v_head_dim: Optional[int] = 128,
qk_nope_head_dim: Optional[int] = 128,
n_group: Optional[int] = 8,
topk_group: Optional[int] = 4,
num_experts_per_tok: Optional[int] = 8,
first_k_dense_replace: Optional[int] = 3,
norm_topk_prob: Optional[bool] = True,
hidden_act: Optional[str] = "silu",
max_position_embeddings: Optional[int] = 4096,
initializer_range: Optional[float] = 0.02,
        rms_norm_eps: Optional[float] = 1e-6,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = None,
bos_token_id: Optional[int] = 0,
eos_token_id: Optional[int] = 1,
pretraining_tp: Optional[int] = 1,
tie_word_embeddings: Optional[bool] = False,
        rope_scaling: Optional[dict] = None,
rope_interleave: Optional[bool] = True,
attention_bias: Optional[bool] = False,
attention_dropout: Optional[float] = 0.0,
ep_size: Optional[int] = 1,
        n_dense_layers: Optional[int] = 3,
        index_head_dim: Optional[int] = 128,
        index_n_heads: Optional[int] = 64,
        index_topk: Optional[int] = 2048,
        moe_layer_freq: Optional[int] = 1,
        num_nextn_predict_layers: Optional[int] = 1,
**kwargs,
):
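        # The parent `DeepseekV3Config` receives the RoPE settings as `rope_parameters`
        # (the successor of `rope_scaling`); drop any copy that arrived via **kwargs so
        # the explicit `rope_parameters=rope_scaling` below is not passed twice.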
kwargs.pop('rope_parameters', None)
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
moe_intermediate_size=moe_intermediate_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
n_shared_experts=n_shared_experts,
n_routed_experts=n_routed_experts,
routed_scaling_factor=routed_scaling_factor,
kv_lora_rank=kv_lora_rank,
q_lora_rank=q_lora_rank,
qk_rope_head_dim=qk_rope_head_dim,
v_head_dim=v_head_dim,
qk_nope_head_dim=qk_nope_head_dim,
n_group=n_group,
topk_group=topk_group,
num_experts_per_tok=num_experts_per_tok,
first_k_dense_replace=first_k_dense_replace,
norm_topk_prob=norm_topk_prob,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
rms_norm_eps=rms_norm_eps,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pretraining_tp=pretraining_tp,
tie_word_embeddings=tie_word_embeddings,
            # Forward the legacy `rope_scaling` argument under the keyword the parent
            # config expects.
            rope_parameters=rope_scaling,
rope_interleave=rope_interleave,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
**kwargs,
)
        # Fields specific to V3.2 that the parent `DeepseekV3Config` does not store.
        self.ep_size = ep_size  # expert-parallel group size
        self.n_dense_layers = n_dense_layers  # dense FFN layers before the MoE blocks
        self.index_head_dim = index_head_dim  # head dimension of the sparse-attention indexer
        self.index_n_heads = index_n_heads  # number of indexer heads
        self.index_topk = index_topk  # tokens kept per query by the indexer
        self.moe_layer_freq = moe_layer_freq  # an MoE block every `moe_layer_freq` layers
        self.num_nextn_predict_layers = num_nextn_predict_layers  # multi-token-prediction depth
        self.dim = hidden_size  # alias for `hidden_size` kept for the reference code
__all__ = ["DeepseekV32Config"]
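

# Minimal local-usage sketch (illustrative assumption: when the checkpoint is loaded
# from the Hub, this module is resolved via `trust_remote_code` and the `auto_map`
# entry in config.json, not by explicit registration).
if __name__ == "__main__":
    from transformers import AutoConfig

    # Teach AutoConfig the `deepseek_v32` model type for local experiments;
    # `exist_ok=True` tolerates a transformers build that already ships it.
    AutoConfig.register("deepseek_v32", DeepseekV32Config, exist_ok=True)

    config = DeepseekV32Config(index_topk=1024)
    print(config.model_type, config.hidden_size, config.index_topk)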