lvj committed
Commit 3ee3aba · 1 Parent(s): a572e8c

Upload Qwen3ForCausalLM

Files changed (3):
  1. config.json             +159 -39
  2. generation_config.json    +2  -3
  3. pytorch_model.bin         +3  -0
config.json CHANGED
@@ -1,10 +1,9 @@
 {
   "architectures": [
-    "FSDPQwen3ForCausalLM"
+    "Qwen3ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "bos_token_id": 151643,
   "dtype": "float32",
   "eos_token_id": 151645,
   "head_dim": 128,
@@ -56,9 +55,157 @@
   "num_attention_heads": 32,
   "num_hidden_layers": 36,
   "num_key_value_heads": 8,
+  "pad_token_id": 151645,
   "quantization_config": {
     "include_input_output_embeddings": true,
-    "modules_to_not_convert": [],
+    "modules_to_not_convert": [
+      "model.layers.0.self_attn.q_norm",
+      "model.layers.0.self_attn.k_norm",
+      "model.layers.0.input_layernorm",
+      "model.layers.0.post_attention_layernorm",
+      "model.layers.1.self_attn.q_norm",
+      "model.layers.1.self_attn.k_norm",
+      "model.layers.1.input_layernorm",
+      "model.layers.1.post_attention_layernorm",
+      "model.layers.2.self_attn.q_norm",
+      "model.layers.2.self_attn.k_norm",
+      "model.layers.2.input_layernorm",
+      "model.layers.2.post_attention_layernorm",
+      "model.layers.3.self_attn.q_norm",
+      "model.layers.3.self_attn.k_norm",
+      "model.layers.3.input_layernorm",
+      "model.layers.3.post_attention_layernorm",
+      "model.layers.4.self_attn.q_norm",
+      "model.layers.4.self_attn.k_norm",
+      "model.layers.4.input_layernorm",
+      "model.layers.4.post_attention_layernorm",
+      "model.layers.5.self_attn.q_norm",
+      "model.layers.5.self_attn.k_norm",
+      "model.layers.5.input_layernorm",
+      "model.layers.5.post_attention_layernorm",
+      "model.layers.6.self_attn.q_norm",
+      "model.layers.6.self_attn.k_norm",
+      "model.layers.6.input_layernorm",
+      "model.layers.6.post_attention_layernorm",
+      "model.layers.7.self_attn.q_norm",
+      "model.layers.7.self_attn.k_norm",
+      "model.layers.7.input_layernorm",
+      "model.layers.7.post_attention_layernorm",
+      "model.layers.8.self_attn.q_norm",
+      "model.layers.8.self_attn.k_norm",
+      "model.layers.8.input_layernorm",
+      "model.layers.8.post_attention_layernorm",
+      "model.layers.9.self_attn.q_norm",
+      "model.layers.9.self_attn.k_norm",
+      "model.layers.9.input_layernorm",
+      "model.layers.9.post_attention_layernorm",
+      "model.layers.10.self_attn.q_norm",
+      "model.layers.10.self_attn.k_norm",
+      "model.layers.10.input_layernorm",
+      "model.layers.10.post_attention_layernorm",
+      "model.layers.11.self_attn.q_norm",
+      "model.layers.11.self_attn.k_norm",
+      "model.layers.11.input_layernorm",
+      "model.layers.11.post_attention_layernorm",
+      "model.layers.12.self_attn.q_norm",
+      "model.layers.12.self_attn.k_norm",
+      "model.layers.12.input_layernorm",
+      "model.layers.12.post_attention_layernorm",
+      "model.layers.13.self_attn.q_norm",
+      "model.layers.13.self_attn.k_norm",
+      "model.layers.13.input_layernorm",
+      "model.layers.13.post_attention_layernorm",
+      "model.layers.14.self_attn.q_norm",
+      "model.layers.14.self_attn.k_norm",
+      "model.layers.14.input_layernorm",
+      "model.layers.14.post_attention_layernorm",
+      "model.layers.15.self_attn.q_norm",
+      "model.layers.15.self_attn.k_norm",
+      "model.layers.15.input_layernorm",
+      "model.layers.15.post_attention_layernorm",
+      "model.layers.16.self_attn.q_norm",
+      "model.layers.16.self_attn.k_norm",
+      "model.layers.16.input_layernorm",
+      "model.layers.16.post_attention_layernorm",
+      "model.layers.17.self_attn.q_norm",
+      "model.layers.17.self_attn.k_norm",
+      "model.layers.17.input_layernorm",
+      "model.layers.17.post_attention_layernorm",
+      "model.layers.18.self_attn.q_norm",
+      "model.layers.18.self_attn.k_norm",
+      "model.layers.18.input_layernorm",
+      "model.layers.18.post_attention_layernorm",
+      "model.layers.19.self_attn.q_norm",
+      "model.layers.19.self_attn.k_norm",
+      "model.layers.19.input_layernorm",
+      "model.layers.19.post_attention_layernorm",
+      "model.layers.20.self_attn.q_norm",
+      "model.layers.20.self_attn.k_norm",
+      "model.layers.20.input_layernorm",
+      "model.layers.20.post_attention_layernorm",
+      "model.layers.21.self_attn.q_norm",
+      "model.layers.21.self_attn.k_norm",
+      "model.layers.21.input_layernorm",
+      "model.layers.21.post_attention_layernorm",
+      "model.layers.22.self_attn.q_norm",
+      "model.layers.22.self_attn.k_norm",
+      "model.layers.22.input_layernorm",
+      "model.layers.22.post_attention_layernorm",
+      "model.layers.23.self_attn.q_norm",
+      "model.layers.23.self_attn.k_norm",
+      "model.layers.23.input_layernorm",
+      "model.layers.23.post_attention_layernorm",
+      "model.layers.24.self_attn.q_norm",
+      "model.layers.24.self_attn.k_norm",
+      "model.layers.24.input_layernorm",
+      "model.layers.24.post_attention_layernorm",
+      "model.layers.25.self_attn.q_norm",
+      "model.layers.25.self_attn.k_norm",
+      "model.layers.25.input_layernorm",
+      "model.layers.25.post_attention_layernorm",
+      "model.layers.26.self_attn.q_norm",
+      "model.layers.26.self_attn.k_norm",
+      "model.layers.26.input_layernorm",
+      "model.layers.26.post_attention_layernorm",
+      "model.layers.27.self_attn.q_norm",
+      "model.layers.27.self_attn.k_norm",
+      "model.layers.27.input_layernorm",
+      "model.layers.27.post_attention_layernorm",
+      "model.layers.28.self_attn.q_norm",
+      "model.layers.28.self_attn.k_norm",
+      "model.layers.28.input_layernorm",
+      "model.layers.28.post_attention_layernorm",
+      "model.layers.29.self_attn.q_norm",
+      "model.layers.29.self_attn.k_norm",
+      "model.layers.29.input_layernorm",
+      "model.layers.29.post_attention_layernorm",
+      "model.layers.30.self_attn.q_norm",
+      "model.layers.30.self_attn.k_norm",
+      "model.layers.30.input_layernorm",
+      "model.layers.30.post_attention_layernorm",
+      "model.layers.31.self_attn.q_norm",
+      "model.layers.31.self_attn.k_norm",
+      "model.layers.31.input_layernorm",
+      "model.layers.31.post_attention_layernorm",
+      "model.layers.32.self_attn.q_norm",
+      "model.layers.32.self_attn.k_norm",
+      "model.layers.32.input_layernorm",
+      "model.layers.32.post_attention_layernorm",
+      "model.layers.33.self_attn.q_norm",
+      "model.layers.33.self_attn.k_norm",
+      "model.layers.33.input_layernorm",
+      "model.layers.33.post_attention_layernorm",
+      "model.layers.34.self_attn.q_norm",
+      "model.layers.34.self_attn.k_norm",
+      "model.layers.34.input_layernorm",
+      "model.layers.34.post_attention_layernorm",
+      "model.layers.35.self_attn.q_norm",
+      "model.layers.35.self_attn.k_norm",
+      "model.layers.35.input_layernorm",
+      "model.layers.35.post_attention_layernorm",
+      "model.norm",
+      "lm_head"
+    ],
     "quant_method": "torchao",
     "quant_type": {
       "default": {
@@ -99,14 +246,11 @@
         "_type": "Int8DynamicActivationIntxWeightConfig",
         "_version": 2
       },
-      "lm_head": {
+      "model.embed_tokens": {
         "_data": {
-          "granularity": {
-            "_data": {
-              "axis": 0
-            },
-            "_type": "PerAxis",
-            "_version": 1
+          "act_mapping_type": {
+            "_data": "ASYMMETRIC",
+            "_type": "MappingType"
           },
           "intx_packing_format": {
             "_data": "UNPACKED_TO_INT8",
@@ -117,48 +261,24 @@
             "_type": "QDQLayout",
             "_version": 1
           },
-          "mapping_type": {
-            "_data": "SYMMETRIC",
-            "_type": "MappingType"
-          },
-          "scale_dtype": null,
           "weight_dtype": {
             "_data": "int4",
             "_type": "torch.dtype"
-          }
-        },
-        "_type": "IntxWeightOnlyConfig",
-        "_version": 2
-      },
-      "model.embed_tokens": {
-        "_data": {
-          "granularity": {
+          },
+          "weight_granularity": {
             "_data": {
               "axis": 0
             },
             "_type": "PerAxis",
             "_version": 1
           },
-          "intx_packing_format": {
-            "_data": "UNPACKED_TO_INT8",
-            "_type": "IntxPackingFormat"
-          },
-          "layout": {
-            "_data": {},
-            "_type": "QDQLayout",
-            "_version": 1
-          },
-          "mapping_type": {
+          "weight_mapping_type": {
             "_data": "SYMMETRIC",
             "_type": "MappingType"
           },
-          "scale_dtype": null,
-          "weight_dtype": {
-            "_data": "int4",
-            "_type": "torch.dtype"
-          }
+          "weight_scale_dtype": null
         },
-        "_type": "IntxWeightOnlyConfig",
+        "_type": "Int8DynamicActivationIntxWeightConfig",
         "_version": 2
       }
     }
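
The quantization_config above records a torchao recipe: int8 dynamic activations with int4 weights (symmetric, per-axis scales) for the linear layers, the same recipe applied to model.embed_tokens via include_input_output_embeddings, and every norm layer plus model.norm and lm_head left unquantized. A minimal sketch of how such a setup might be built, assuming a recent torch/torchao/transformers stack; the kwarg names mirror the serialized fields above, and the base checkpoint id is a placeholder, not taken from this commit:

import torch
from torchao.quantization import Int8DynamicActivationIntxWeightConfig, ModuleFqnToConfig
from torchao.quantization.granularity import PerAxis
from torchao.quantization.quant_primitives import MappingType
from transformers import AutoModelForCausalLM, TorchAoConfig

# int8 dynamic activations + int4 weights, symmetric per-output-channel
# weight scales, mirroring the "default" and "model.embed_tokens" entries
# under "quant_type" in config.json.
recipe = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    weight_granularity=PerAxis(0),
    weight_mapping_type=MappingType.SYMMETRIC,
    act_mapping_type=MappingType.ASYMMETRIC,
)

quant_config = TorchAoConfig(
    quant_type=ModuleFqnToConfig({"_default": recipe, "model.embed_tokens": recipe}),
    include_input_output_embeddings=True,   # also quantize the embedding table
    modules_to_not_convert=["model.norm", "lm_head"],  # plus every *_norm module, as in the diff
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",        # placeholder base model (36 layers, 32 heads, 8 KV heads)
    dtype=torch.float32,
    quantization_config=quant_config,
)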
generation_config.json CHANGED
@@ -1,11 +1,10 @@
 {
-  "bos_token_id": 151643,
   "do_sample": true,
   "eos_token_id": [
     151645,
-    151643
+    151645
   ],
-  "pad_token_id": 151643,
+  "pad_token_id": 151645,
   "temperature": 0.6,
   "top_k": 20,
   "top_p": 0.95,
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12dfdb01ec3178c45b8bc36dcfeee40e41c66daf213160b091631c426dcbf4d6
+size 4419374407
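
The three added lines are a Git LFS pointer, not the weights themselves; the actual ~4.4 GB file is addressed by its sha256. A minimal sketch for checking a downloaded pytorch_model.bin against the pointer's oid and size:

import hashlib

def verify_lfs_object(path, expected_sha256, expected_size):
    # Stream the file so a multi-GB checkpoint never sits fully in memory.
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return size == expected_size and h.hexdigest() == expected_sha256

ok = verify_lfs_object(
    "pytorch_model.bin",
    "12dfdb01ec3178c45b8bc36dcfeee40e41c66daf213160b091631c426dcbf4d6",
    4419374407,
)
print("pointer matches file:", ok)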