MoE Model
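The module tree below is the printed structure of a Qwen3 MoE causal language model. A printout like this can be reproduced by loading a checkpoint with Hugging Face transformers and printing the model object. This is a minimal sketch: the checkpoint name used here is an assumption (its dimensions match the published Qwen3-30B-A3B configuration), not a confirmed detail of the original post.

from transformers import AutoModelForCausalLM

# Checkpoint name is an assumption; any Qwen3-MoE checkpoint prints the
# same nested nn.Module structure, only the sizes may differ.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-30B-A3B",
    torch_dtype="auto",   # keep the checkpoint's native dtype
    device_map="auto",    # shard across available devices
)
print(model)  # emits the module tree shown below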
Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048, padding_idx=151643)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x Qwen3MoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj): Linear(in_features=768, out_features=2048, bias=False)
              (act_fn): SiLU()
            )
          )
        )
        (input_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
    (rotary_emb): Qwen3MoeRotaryEmbedding()
  )
  (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)
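Reading the structure: each of the 48 decoder layers replaces the usual dense MLP with a Qwen3MoeSparseMoeBlock. Its gate is the router, a Linear(2048 -> 128) that produces one logit per expert; for each token only a small top-k subset of the 128 expert MLPs is run, and their outputs are mixed using the renormalized router probabilities. The sketch below illustrates this routing logic under the assumption top_k=8 (the value in the published Qwen3-30B-A3B config); it is a simplified, clarity-oriented illustration, not the actual transformers implementation.

import torch
import torch.nn.functional as F

def moe_forward(hidden, gate, experts, top_k=8):
    """Route each token through its top_k experts and mix the outputs.

    hidden:  (num_tokens, 2048) token activations
    gate:    Linear(2048, 128) router, as in the printout above
    experts: ModuleList of 128 expert MLPs (gate/up/down projections)
    """
    router_logits = gate(hidden)                                # (tokens, 128)
    probs = F.softmax(router_logits, dim=-1)
    topk_probs, topk_idx = probs.topk(top_k, dim=-1)            # (tokens, top_k)
    topk_probs = topk_probs / topk_probs.sum(-1, keepdim=True)  # renormalize over chosen k

    out = torch.zeros_like(hidden)
    for e, expert in enumerate(experts):
        tok, slot = (topk_idx == e).nonzero(as_tuple=True)      # tokens that picked expert e
        if tok.numel() == 0:
            continue                                            # expert idle for this batch
        weight = topk_probs[tok, slot].unsqueeze(-1)            # router weight per token
        out[tok] += weight * expert(hidden[tok])                # weighted expert output
    return out

This sparsity is what the sizing reflects: each expert MLP holds 3 x 2048 x 768, roughly 4.7M parameters, so 128 experts across 48 layers contribute about 29B parameters in total, yet only 8 x 48 x 4.7M, roughly 1.8B of them, are active for any single token. Adding attention, router, and embedding weights brings the active count to around 3B, consistent with the "30B total / 3B active" naming of Qwen3-30B-A3B (again assuming that checkpoint).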
