🪐 A family of small models with 135M, 360M, and 1.7B parameters, trained on a new high-quality dataset.
135m
360m
1.7b
156.8K Pulls Updated 4 months ago
bd15e26978d3 · 219MB
-
general.architecturellama
-
general.base_model.0.nameSmolLM 360M
-
general.base_model.0.organizationHuggingFaceTB
-
general.base_model.0.repo_urlhttps://huggingface.co/HuggingFaceTB/SmolLM-360M
-
general.base_model.count1
-
general.basenameSmolLM
-
general.datasets[Magpie-Align/Magpie-Pro-300K-Filtered, bigcode/self-oss-instruct-sc2-exec-filter-50k, teknium/OpenHermes-2.5, HuggingFaceTB/everyday-conversations-llama3.1-2k]
-
general.file_type10
-
general.finetuneInstruct
-
general.languages[en]
-
general.licenseapache-2.0
-
general.nameSmolLM 360M
-
general.organizationHuggingFaceTB
-
general.quantization_version2
-
general.size_label360M
-
general.tags[alignment-handbook, trl, sft]
-
general.typemodel
-
llama.attention.head_count15
-
llama.attention.head_count_kv5
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length2048
-
llama.embedding_length960
-
llama.feed_forward_length2560
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_space_prefixfalse
-
tokenizer.ggml.bos_token_id1
-
tokenizer.ggml.eos_token_id2
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.padding_token_id2
-
tokenizer.ggml.presmollm
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[960, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.0.attn_norm.weightF32[960]
-
blk.0.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.0.ffn_down.weightQ3_K[2560, 960]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.0.ffn_norm.weightF32[960]
-
blk.0.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.1.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.1.attn_norm.weightF32[960]
-
blk.1.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.1.ffn_down.weightQ3_K[2560, 960]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.1.ffn_norm.weightF32[960]
-
blk.1.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.2.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.2.attn_norm.weightF32[960]
-
blk.2.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.2.ffn_down.weightQ3_K[2560, 960]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.2.ffn_norm.weightF32[960]
-
blk.2.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.3.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.3.attn_norm.weightF32[960]
-
blk.3.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.3.ffn_down.weightQ3_K[2560, 960]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.3.ffn_norm.weightF32[960]
-
blk.3.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.4.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.4.attn_norm.weightF32[960]
-
blk.4.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.4.ffn_down.weightQ3_K[2560, 960]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.4.ffn_norm.weightF32[960]
-
blk.4.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.5.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.5.attn_norm.weightF32[960]
-
blk.5.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.5.ffn_down.weightQ3_K[2560, 960]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.5.ffn_norm.weightF32[960]
-
blk.5.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.6.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.6.attn_norm.weightF32[960]
-
blk.6.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.6.ffn_down.weightQ3_K[2560, 960]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.6.ffn_norm.weightF32[960]
-
blk.6.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.7.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.7.attn_norm.weightF32[960]
-
blk.7.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.7.ffn_down.weightQ3_K[2560, 960]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.7.ffn_norm.weightF32[960]
-
blk.7.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.8.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.8.attn_norm.weightF32[960]
-
blk.8.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.8.ffn_down.weightQ3_K[2560, 960]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.8.ffn_norm.weightF32[960]
-
blk.8.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.9.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.9.attn_norm.weightF32[960]
-
blk.9.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.9.ffn_down.weightQ3_K[2560, 960]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.9.ffn_norm.weightF32[960]
-
blk.9.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.10.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.10.attn_norm.weightF32[960]
-
blk.10.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.10.ffn_down.weightQ3_K[2560, 960]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.10.ffn_norm.weightF32[960]
-
blk.10.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.11.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.11.attn_norm.weightF32[960]
-
blk.11.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.11.ffn_down.weightQ3_K[2560, 960]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.11.ffn_norm.weightF32[960]
-
blk.11.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.12.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.12.attn_norm.weightF32[960]
-
blk.12.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.12.ffn_down.weightQ3_K[2560, 960]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.12.ffn_norm.weightF32[960]
-
blk.12.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.13.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.13.attn_norm.weightF32[960]
-
blk.13.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.13.ffn_down.weightQ3_K[2560, 960]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.13.ffn_norm.weightF32[960]
-
blk.13.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.14.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.14.attn_norm.weightF32[960]
-
blk.14.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.14.ffn_down.weightQ3_K[2560, 960]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.14.ffn_norm.weightF32[960]
-
blk.14.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.15.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.15.attn_norm.weightF32[960]
-
blk.15.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.15.ffn_down.weightQ3_K[2560, 960]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.15.ffn_norm.weightF32[960]
-
blk.15.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.16.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.16.attn_norm.weightF32[960]
-
blk.16.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.16.ffn_down.weightQ3_K[2560, 960]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.16.ffn_norm.weightF32[960]
-
blk.16.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.17.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.17.attn_norm.weightF32[960]
-
blk.17.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.17.ffn_down.weightQ3_K[2560, 960]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.17.ffn_norm.weightF32[960]
-
blk.17.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.18.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.18.attn_norm.weightF32[960]
-
blk.18.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.18.ffn_down.weightQ3_K[2560, 960]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.18.ffn_norm.weightF32[960]
-
blk.18.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.19.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.19.attn_norm.weightF32[960]
-
blk.19.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.19.ffn_down.weightQ3_K[2560, 960]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.19.ffn_norm.weightF32[960]
-
blk.19.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.20.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.20.attn_norm.weightF32[960]
-
blk.20.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.20.ffn_down.weightQ3_K[2560, 960]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.20.ffn_norm.weightF32[960]
-
blk.20.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.21.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.21.attn_norm.weightF32[960]
-
blk.21.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.21.ffn_down.weightQ3_K[2560, 960]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.21.ffn_norm.weightF32[960]
-
blk.21.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.22.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.22.attn_norm.weightF32[960]
-
blk.22.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.22.ffn_down.weightQ3_K[2560, 960]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.22.ffn_norm.weightF32[960]
-
blk.22.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.23.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.23.attn_norm.weightF32[960]
-
blk.23.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.23.ffn_down.weightQ3_K[2560, 960]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.23.ffn_norm.weightF32[960]
-
blk.23.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.24.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.24.attn_norm.weightF32[960]
-
blk.24.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.24.ffn_down.weightQ3_K[2560, 960]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.24.ffn_norm.weightF32[960]
-
blk.24.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.25.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.25.attn_norm.weightF32[960]
-
blk.25.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.25.ffn_down.weightQ3_K[2560, 960]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.25.ffn_norm.weightF32[960]
-
blk.25.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.26.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.26.attn_norm.weightF32[960]
-
blk.26.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.26.ffn_down.weightQ3_K[2560, 960]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.26.ffn_norm.weightF32[960]
-
blk.26.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.27.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.27.attn_norm.weightF32[960]
-
blk.27.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.27.ffn_down.weightQ3_K[2560, 960]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.27.ffn_norm.weightF32[960]
-
blk.27.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.28.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.28.attn_norm.weightF32[960]
-
blk.28.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.28.ffn_down.weightQ3_K[2560, 960]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.28.ffn_norm.weightF32[960]
-
blk.28.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.29.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.29.attn_norm.weightF32[960]
-
blk.29.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.29.ffn_down.weightQ3_K[2560, 960]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.29.ffn_norm.weightF32[960]
-
blk.29.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.30.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.30.attn_norm.weightF32[960]
-
blk.30.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.30.ffn_down.weightQ3_K[2560, 960]
-
blk.30.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.30.ffn_norm.weightF32[960]
-
blk.30.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.31.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.31.attn_norm.weightF32[960]
-
blk.31.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.31.ffn_down.weightQ3_K[2560, 960]
-
blk.31.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.31.ffn_norm.weightF32[960]
-
blk.31.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
output_norm.weightF32[960]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31