"""DeepSeekV3.2 model configuration"""

from typing import Optional

from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config


class DeepseekV32Config(DeepseekV3Config):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV32Model`]. It inherits from
    [`DeepseekV3Config`] and additionally records the DeepSeek-V3.2 specific fields (`ep_size`, `n_dense_layers`,
    `index_head_dim`, `index_n_heads`, `index_topk`, `moe_layer_freq`, and `num_nextn_predict_layers`); refer to
    the documentation of [`DeepseekV3Config`] for the inherited arguments.
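
    Example (a minimal usage sketch; it assumes a `DeepseekV32Model` class is defined alongside this
    configuration, as referenced above):

    ```python
    >>> # Initializing a configuration with the DeepSeek-V3.2 defaults declared below
    >>> configuration = DeepseekV32Config()

    >>> # Initializing a model (with random weights) from that configuration
    >>> model = DeepseekV32Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """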

    model_type = "deepseek_v32"

    max_batch_size = 8
    max_seq_len = 16384

    def __init__(
        self,
        vocab_size: Optional[int] = 129280,
        hidden_size: Optional[int] = 7168,
        intermediate_size: Optional[int] = 18432,
        moe_intermediate_size: Optional[int] = 2048,
        num_hidden_layers: Optional[int] = 61,
        num_attention_heads: Optional[int] = 128,
        num_key_value_heads: Optional[int] = 128,
        n_shared_experts: Optional[int] = 1,
        n_routed_experts: Optional[int] = 256,
        routed_scaling_factor: Optional[float] = 2.5,
        kv_lora_rank: Optional[int] = 512,
        q_lora_rank: Optional[int] = 1536,
        qk_rope_head_dim: Optional[int] = 64,
        v_head_dim: Optional[int] = 128,
        qk_nope_head_dim: Optional[int] = 128,
        n_group: Optional[int] = 8,
        topk_group: Optional[int] = 4,
        num_experts_per_tok: Optional[int] = 8,
        first_k_dense_replace: Optional[int] = 3,
        norm_topk_prob: Optional[bool] = True,
        hidden_act: Optional[str] = "silu",
        max_position_embeddings: Optional[int] = 4096,
        initializer_range: Optional[float] = 0.02,
        rms_norm_eps: Optional[float] = 1e-6,
        use_cache: Optional[bool] = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = 0,
        eos_token_id: Optional[int] = 1,
        pretraining_tp: Optional[int] = 1,
        tie_word_embeddings: Optional[bool] = False,
        rope_scaling: Optional[dict] = None,
        rope_interleave: Optional[bool] = True,
        attention_bias: Optional[bool] = False,
        attention_dropout: Optional[float] = 0.0,
        ep_size: Optional[int] = 1,
        n_dense_layers: Optional[int] = 3,
        index_head_dim: Optional[int] = 128,
        index_n_heads: Optional[int] = 64,
        index_topk: Optional[int] = 2048,
        moe_layer_freq: Optional[int] = 1,
        num_nextn_predict_layers: Optional[int] = 1,
        **kwargs,
    ):
        # Drop any `rope_parameters` that may already be present in `kwargs` (e.g. when the config is
        # rebuilt from a serialized dict) so it cannot collide with the explicit
        # `rope_parameters=rope_scaling` argument forwarded to the parent below.
        kwargs.pop("rope_parameters", None)
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            moe_intermediate_size=moe_intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            n_shared_experts=n_shared_experts,
            n_routed_experts=n_routed_experts,
            routed_scaling_factor=routed_scaling_factor,
            kv_lora_rank=kv_lora_rank,
            q_lora_rank=q_lora_rank,
            qk_rope_head_dim=qk_rope_head_dim,
            v_head_dim=v_head_dim,
            qk_nope_head_dim=qk_nope_head_dim,
            n_group=n_group,
            topk_group=topk_group,
            num_experts_per_tok=num_experts_per_tok,
            first_k_dense_replace=first_k_dense_replace,
            norm_topk_prob=norm_topk_prob,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pretraining_tp=pretraining_tp,
            tie_word_embeddings=tie_word_embeddings,
            rope_parameters=rope_scaling,
            rope_interleave=rope_interleave,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            **kwargs,
        )
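        # The attributes below are specific to DeepSeek-V3.2 and are not handled by
        # `DeepseekV3Config.__init__`, so they are stored on the instance here. The `index_*` fields
        # are assumed to size the sparse-attention indexer introduced in DeepSeek-V3.2 (number of
        # indexer heads, their per-head dimension, and the top-k tokens each query keeps), and
        # `num_nextn_predict_layers` is assumed to count the multi-token-prediction modules.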
        self.ep_size = ep_size
        self.n_dense_layers = n_dense_layers
        self.index_head_dim = index_head_dim
        self.index_n_heads = index_n_heads
        self.index_topk = index_topk
        self.moe_layer_freq = moe_layer_freq
        self.num_nextn_predict_layers = num_nextn_predict_layers
        # `dim` is kept as an alias of `hidden_size`.
        self.dim = hidden_size


__all__ = ["DeepseekV32Config"]
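
# A minimal round-trip sketch (kept as comments so importing this module has no side effects;
# `to_dict`/`from_dict` are the standard serialization helpers inherited from `PretrainedConfig`
# via `DeepseekV3Config`):
#
#   config = DeepseekV32Config(index_topk=1024)        # override a V3.2-specific field
#   restored = DeepseekV32Config.from_dict(config.to_dict())
#   assert restored.index_topk == 1024                 # the extra attributes survive serialization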