Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- cal_data.safetensors +3 -0
- config.json +26 -0
- example1.png +0 -0
- example2.png +0 -0
- example3.png +0 -0
- generation_config.json +7 -0
- hidden_states.safetensors +3 -0
- job_new.json +0 -0
- measurement.json +0 -0
- model.safetensors.index.json +442 -0
- mtbench-comparison.png +0 -0
- needle-in-a-haystack.txt +898 -0
- out_tensor/lm_head.safetensors +3 -0
- out_tensor/model.layers.0.mlp.down_proj.safetensors +3 -0
- out_tensor/model.layers.0.mlp.gate_proj.safetensors +3 -0
- out_tensor/model.layers.0.mlp.up_proj.safetensors +3 -0
- out_tensor/model.layers.0.self_attn.k_proj.safetensors +3 -0
- out_tensor/model.layers.0.self_attn.o_proj.safetensors +3 -0
- out_tensor/model.layers.0.self_attn.q_proj.safetensors +3 -0
- out_tensor/model.layers.0.self_attn.v_proj.safetensors +3 -0
- out_tensor/model.layers.1.mlp.down_proj.safetensors +3 -0
- out_tensor/model.layers.1.mlp.gate_proj.safetensors +3 -0
- out_tensor/model.layers.1.mlp.up_proj.safetensors +3 -0
- out_tensor/model.layers.1.self_attn.k_proj.safetensors +3 -0
- out_tensor/model.layers.1.self_attn.o_proj.safetensors +3 -0
- out_tensor/model.layers.1.self_attn.q_proj.safetensors +3 -0
- out_tensor/model.layers.1.self_attn.v_proj.safetensors +3 -0
- out_tensor/model.layers.10.mlp.down_proj.safetensors +3 -0
- out_tensor/model.layers.10.mlp.gate_proj.safetensors +3 -0
- out_tensor/model.layers.10.mlp.up_proj.safetensors +3 -0
- out_tensor/model.layers.10.self_attn.k_proj.safetensors +3 -0
- out_tensor/model.layers.10.self_attn.o_proj.safetensors +3 -0
- out_tensor/model.layers.10.self_attn.q_proj.safetensors +3 -0
- out_tensor/model.layers.10.self_attn.v_proj.safetensors +3 -0
- out_tensor/model.layers.11.mlp.down_proj.safetensors +3 -0
- out_tensor/model.layers.11.mlp.gate_proj.safetensors +3 -0
- out_tensor/model.layers.11.mlp.up_proj.safetensors +3 -0
- out_tensor/model.layers.11.self_attn.k_proj.safetensors +3 -0
- out_tensor/model.layers.11.self_attn.o_proj.safetensors +3 -0
- out_tensor/model.layers.11.self_attn.q_proj.safetensors +3 -0
- out_tensor/model.layers.11.self_attn.v_proj.safetensors +3 -0
- out_tensor/model.layers.12.mlp.down_proj.safetensors +3 -0
- out_tensor/model.layers.12.mlp.gate_proj.safetensors +3 -0
- out_tensor/model.layers.12.mlp.up_proj.safetensors +3 -0
- out_tensor/model.layers.12.self_attn.k_proj.safetensors +3 -0
- out_tensor/model.layers.12.self_attn.o_proj.safetensors +3 -0
- out_tensor/model.layers.12.self_attn.q_proj.safetensors +3 -0
- out_tensor/model.layers.12.self_attn.v_proj.safetensors +3 -0
- out_tensor/model.layers.13.mlp.down_proj.safetensors +3 -0
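The commit title states that the files were pushed with huggingface_hub. A minimal, hypothetical sketch of such an upload; the repo id and local folder path are placeholders, not taken from this diff, and only the commit message comes from this page:

```python
# Hypothetical upload sketch; repo_id and folder_path are placeholders.
from huggingface_hub import HfApi

api = HfApi()  # assumes prior authentication (huggingface-cli login or HF_TOKEN)
api.upload_folder(
    repo_id="your-org/your-model-repo",   # placeholder repo id
    folder_path="models/rubra-11b-h",     # placeholder local folder
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```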
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+rubra-11b-h.png filter=lfs diff=lfs merge=lfs -text
cal_data.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08be1103ff8fcef33b570f3c0f5ae4cc7f9dc5c3f264105baa55fc9b132ed1be
+size 1638488
config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "models/rubra-11b-h",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 48,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32000
+}
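The config above describes a 48-layer Mistral-architecture model with grouped-query attention (32 query heads over 8 key/value heads) and a 32k context window. A minimal, hypothetical sketch of inspecting it with the transformers library; the repo id is a placeholder:

```python
# Hypothetical sketch; "your-org/rubra-11b-h" is a placeholder repo id.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("your-org/rubra-11b-h")
print(config.model_type)               # "mistral"
print(config.num_hidden_layers)        # 48 (the stock Mistral-7B uses 32)
print(config.num_attention_heads,      # 32 query heads ...
      config.num_key_value_heads)      # ... grouped over 8 KV heads
print(config.max_position_embeddings)  # 32768
```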
example1.png
ADDED
example2.png
ADDED
example3.png
ADDED
generation_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.38.2",
+  "use_cache": false
+}
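generation_config.json only pins the BOS/EOS token ids and exports use_cache as false. A short, hypothetical sketch of loading these defaults, again with a placeholder repo id; re-enabling the KV cache for inference is an assumption, not something the file states:

```python
# Hypothetical sketch; repo id is a placeholder.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("your-org/rubra-11b-h")
print(gen_config.bos_token_id, gen_config.eos_token_id)  # 1, 2 as in the file above
gen_config.use_cache = True  # assumption: usually re-enabled for faster decoding
# model.generate(**inputs, generation_config=gen_config) would then apply these defaults
```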
hidden_states.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4318a74c06cf05ea33ea07878cc25d5d18876645a3fddf648720e6481defc27
+size 1677730376
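cal_data.safetensors and hidden_states.safetensors appear in the diff only as Git LFS pointers (version/oid/size), so the tensors themselves live in LFS storage rather than in the diff. A minimal, hypothetical sketch of listing their contents once the files have been downloaded, using the safetensors library:

```python
# Hypothetical inspection sketch; paths assume the LFS objects were already fetched.
from safetensors import safe_open

for path in ("cal_data.safetensors", "hidden_states.safetensors"):
    with safe_open(path, framework="pt") as f:   # memory-maps the file
        for name in f.keys():
            t = f.get_tensor(name)
            print(f"{path}: {name} shape={tuple(t.shape)} dtype={t.dtype}")
```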
job_new.json
ADDED
The diff for this file is too large to render.
See raw diff
measurement.json
ADDED
The diff for this file is too large to render.
See raw diff
model.safetensors.index.json
ADDED
@@ -0,0 +1,442 @@
+{
+  "metadata": {
+    "total_size": 24952840192
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00006-of-00006.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.input_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.37.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.38.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.38.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.38.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.38.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.39.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.40.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.40.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.41.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.42.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.43.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.44.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.45.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.46.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.input_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.47.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.norm.weight": "model-00005-of-00006.safetensors"
+  }
+}
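The index above records 24,952,840,192 bytes of fp16 weights split across six shards, and weight_map tells a loader which shard holds each parameter. A minimal, hypothetical sketch of resolving one tensor by hand; paths assume the shard files sit next to the index locally:

```python
# Hypothetical sketch of resolving a tensor through the shard index.
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.19.self_attn.k_proj.weight"   # key taken from the weight_map above
shard = index["weight_map"][name]                  # "model-00002-of-00006.safetensors"

with safe_open(shard, framework="pt") as f:
    weight = f.get_tensor(name)
print(name, "->", shard, tuple(weight.shape))
```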
mtbench-comparison.png
ADDED
needle-in-a-haystack.txt
ADDED
@@ -0,0 +1,898 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
what is the random number?
|
2 |
+
```
|
3 |
+
May 2006(This essay is derived from a keynote at Xtech.)Could you reproduce Silicon Valley elsewhere, or is there something
|
4 |
+
unique about it?It wouldn't be surprising if it were hard to reproduce in other
|
5 |
+
countries, because you couldn't reproduce it in most of the US
|
6 |
+
either. What does it take to make a silicon valley even here?What it takes is the right people. If you could get the right ten
|
7 |
+
thousand people to move from Silicon Valley to Buffalo, Buffalo
|
8 |
+
would become Silicon Valley.
|
9 |
+
[1]That's a striking departure from the past. Up till a couple decades
|
10 |
+
ago, geography was destiny for cities. All great cities were located
|
11 |
+
on waterways, because cities made money by trade, and water was the
|
12 |
+
only economical way to ship.Now you could make a great city anywhere, if you could get the right
|
13 |
+
people to move there. So the question of how to make a silicon
|
14 |
+
valley becomes: who are the right people, and how do you get them
|
15 |
+
to move?Two TypesI think you only need two kinds of people to create a technology
|
16 |
+
hub: rich people and nerds. They're the limiting reagents in the
|
17 |
+
reaction that produces startups, because they're the only ones
|
18 |
+
present when startups get started. Everyone else will move.Observation bears this out: within the US, towns have become startup
|
19 |
+
hubs if and only if they have both rich people and nerds. Few
|
20 |
+
startups happen in Miami, for example, because although it's full
|
21 |
+
of rich people, it has few nerds. It's not the kind of place nerds
|
22 |
+
like.Whereas Pittsburgh has the opposite problem: plenty of nerds, but
|
23 |
+
no rich people. The top US Computer Science departments are said
|
24 |
+
to be MIT, Stanford, Berkeley, and Carnegie-Mellon. MIT yielded
|
25 |
+
Route 128. Stanford and Berkeley yielded Silicon Valley. But
|
26 |
+
Carnegie-Mellon? The record skips at that point. Lower down the
|
27 |
+
list, the University of Washington yielded a high-tech community
|
28 |
+
in Seattle, and the University of Texas at Austin yielded one in
|
29 |
+
Austin. But what happened in Pittsburgh? And in Ithaca, home of
|
30 |
+
Cornell, which is also high on the list?I grew up in Pittsburgh and went to college at Cornell, so I can
|
31 |
+
answer for both. The weather is terrible, particularly in winter,
|
32 |
+
and there's no interesting old city to make up for it, as there is
|
33 |
+
in Boston. Rich people don't want to live in Pittsburgh or Ithaca.
|
34 |
+
So while there are plenty of hackers who could start startups,
|
35 |
+
there's no one to invest in them.Not BureaucratsDo you really need the rich people? Wouldn't it work to have the
|
36 |
+
government invest in the nerds? No, it would not. Startup investors
|
37 |
+
are a distinct type of rich people. They tend to have a lot of
|
38 |
+
experience themselves in the technology business. This (a) helps
|
39 |
+
them pick the right startups, and (b) means they can supply advice
|
40 |
+
and connections as well as money. And the fact that they have a
|
41 |
+
personal stake in the outcome makes them really pay attention.Bureaucrats by their nature are the exact opposite sort of people
|
42 |
+
from startup investors. The idea of them making startup investments
|
43 |
+
is comic. It would be like mathematicians running Vogue-- or
|
44 |
+
perhaps more accurately, Vogue editors running a math journal.
|
45 |
+
[2]Though indeed, most things bureaucrats do, they do badly. We just
|
46 |
+
don't notice usually, because they only have to compete against
|
47 |
+
other bureaucrats. But as startup investors they'd have to compete
|
48 |
+
against pros with a great deal more experience and motivation.Even corporations that have in-house VC groups generally forbid
|
49 |
+
them to make their own investment decisions. Most are only allowed
|
50 |
+
to invest in deals where some reputable private VC firm is willing
|
51 |
+
to act as lead investor.Not BuildingsIf you go to see Silicon Valley, what you'll see are buildings.
|
52 |
+
But it's the people that make it Silicon Valley, not the buildings.
|
53 |
+
I read occasionally about attempts to set up "technology
|
54 |
+
parks" in other places, as if the active ingredient of Silicon
|
55 |
+
Valley were the office space. An article about Sophia Antipolis
|
56 |
+
bragged that companies there included Cisco, Compaq, IBM, NCR, and
|
57 |
+
Nortel. Don't the French realize these aren't startups?Building office buildings for technology companies won't get you a
|
58 |
+
silicon valley, because the key stage in the life of a startup
|
59 |
+
happens before they want that kind of space. The key stage is when
|
60 |
+
they're three guys operating out of an apartment. Wherever the
|
61 |
+
startup is when it gets funded, it will stay. The defining quality
|
62 |
+
of Silicon Valley is not that Intel or Apple or Google have offices
|
63 |
+
there, but that they were started there.So if you want to reproduce Silicon Valley, what you need to reproduce
|
64 |
+
is those two or three founders sitting around a kitchen table
|
65 |
+
deciding to start a company. And to reproduce that you need those
|
66 |
+
people.UniversitiesThe exciting thing is, all you need are the people. If you could
|
67 |
+
attract a critical mass of nerds and investors to live somewhere,
|
68 |
+
you could reproduce Silicon Valley. And both groups are highly
|
69 |
+
mobile. They'll go where life is good. So what makes a place good
|
70 |
+
to them?What nerds like is other nerds. Smart people will go wherever other
|
71 |
+
smart people are. And in particular, to great universities. In
|
72 |
+
theory there could be other ways to attract them, but so far
|
73 |
+
universities seem to be indispensable. Within the US, there are
|
74 |
+
no technology hubs without first-rate universities-- or at least,
|
75 |
+
first-rate computer science departments.So if you want to make a silicon valley, you not only need a
|
76 |
+
university, but one of the top handful in the world. It has to be
|
77 |
+
good enough to act as a magnet, drawing the best people from thousands
|
78 |
+
of miles away. And that means it has to stand up to existing magnets
|
79 |
+
like MIT and Stanford.This sounds hard. Actually it might be easy. My professor friends,
|
80 |
+
when they're deciding where they'd like to work, consider one thing
|
81 |
+
above all: the quality of the other faculty. What attracts professors
|
82 |
+
is good colleagues. So if you managed to recruit, en masse, a
|
83 |
+
significant number of the best young researchers, you could create
|
84 |
+
a first-rate university from nothing overnight. And you could do
|
85 |
+
that for surprisingly little. If you paid 200 people hiring bonuses
|
86 |
+
of $3 million apiece, you could put together a faculty that would
|
87 |
+
bear comparison with any in the world. And from that point the
|
88 |
+
chain reaction would be self-sustaining. So whatever it costs to
|
89 |
+
establish a mediocre university, for an additional half billion or
|
90 |
+
so you could have a great one.
|
91 |
+
Random number is 23.
|
92 |
+
[3]PersonalityHowever, merely creating a new university would not be enough to
|
93 |
+
start a silicon valley. The university is just the seed. It has
|
94 |
+
to be planted in the right soil, or it won't germinate. Plant it
|
95 |
+
in the wrong place, and you just create Carnegie-Mellon.To spawn startups, your university has to be in a town that has
|
96 |
+
attractions other than the university. It has to be a place where
|
97 |
+
investors want to live, and students want to stay after they graduate.The two like much the same things, because most startup investors
|
98 |
+
are nerds themselves. So what do nerds look for in a town? Their
|
99 |
+
tastes aren't completely different from other people's, because a
|
100 |
+
lot of the towns they like most in the US are also big tourist
|
101 |
+
destinations: San Francisco, Boston, Seattle. But their tastes
|
102 |
+
can't be quite mainstream either, because they dislike other big
|
103 |
+
tourist destinations, like New York, Los Angeles, and Las Vegas.There has been a lot written lately about the "creative class." The
|
104 |
+
thesis seems to be that as wealth derives increasingly from ideas,
|
105 |
+
cities will prosper only if they attract those who have them. That
|
106 |
+
is certainly true; in fact it was the basis of Amsterdam's prosperity
|
107 |
+
400 years ago.A lot of nerd tastes they share with the creative class in general.
|
108 |
+
For example, they like well-preserved old neighborhoods instead of
|
109 |
+
cookie-cutter suburbs, and locally-owned shops and restaurants
|
110 |
+
instead of national chains. Like the rest of the creative class,
|
111 |
+
they want to live somewhere with personality.What exactly is personality? I think it's the feeling that each
|
112 |
+
building is the work of a distinct group of people. A town with
|
113 |
+
personality is one that doesn't feel mass-produced. So if you want
|
114 |
+
to make a startup hub-- or any town to attract the "creative class"--
|
115 |
+
you probably have to ban large development projects.
|
116 |
+
When a large tract has been developed by a single organization, you
|
117 |
+
can always tell.
|
118 |
+
[4]Most towns with personality are old, but they don't have to be.
|
119 |
+
Old towns have two advantages: they're denser, because they were
|
120 |
+
laid out before cars, and they're more varied, because they were
|
121 |
+
built one building at a time. You could have both now. Just have
|
122 |
+
building codes that ensure density, and ban large scale developments.A corollary is that you have to keep out the biggest developer of
|
123 |
+
all: the government. A government that asks "How can we build a
|
124 |
+
silicon valley?" has probably ensured failure by the way they framed
|
125 |
+
the question. You don't build a silicon valley; you let one grow.

Nerds

If you want to attract nerds, you need more than a town with
|
126 |
+
personality. You need a town with the right personality. Nerds
|
127 |
+
are a distinct subset of the creative class, with different tastes
|
128 |
+
from the rest. You can see this most clearly in New York, which
|
129 |
+
attracts a lot of creative people, but few nerds.
|
130 |
+
[5]What nerds like is the kind of town where people walk around smiling.
|
131 |
+
This excludes LA, where no one walks at all, and also New York,
|
132 |
+
where people walk, but not smiling. When I was in grad school in
|
133 |
+
Boston, a friend came to visit from New York. On the subway back
|
134 |
+
from the airport she asked "Why is everyone smiling?" I looked and
|
135 |
+
they weren't smiling. They just looked like they were compared to
|
136 |
+
the facial expressions she was used to.If you've lived in New York, you know where these facial expressions
|
137 |
+
come from. It's the kind of place where your mind may be excited,
|
138 |
+
but your body knows it's having a bad time. People don't so much
|
139 |
+
enjoy living there as endure it for the sake of the excitement.
|
140 |
+
And if you like certain kinds of excitement, New York is incomparable.
|
141 |
+
It's a hub of glamour, a magnet for all the shorter half-life
|
142 |
+
isotopes of style and fame.Nerds don't care about glamour, so to them the appeal of New York
|
143 |
+
is a mystery. People who like New York will pay a fortune for a
|
144 |
+
small, dark, noisy apartment in order to live in a town where the
|
145 |
+
cool people are really cool. A nerd looks at that deal and sees
|
146 |
+
only: pay a fortune for a small, dark, noisy apartment.Nerds will pay a premium to live in a town where the smart people
|
147 |
+
are really smart, but you don't have to pay as much for that. It's
|
148 |
+
supply and demand: glamour is popular, so you have to pay a lot for
|
149 |
+
it.Most nerds like quieter pleasures. They like cafes instead of
|
150 |
+
clubs; used bookshops instead of fashionable clothing shops; hiking
|
151 |
+
instead of dancing; sunlight instead of tall buildings. A nerd's
|
152 |
+
idea of paradise is Berkeley or Boulder.

Youth

It's the young nerds who start startups, so it's those specifically
|
153 |
+
the city has to appeal to. The startup hubs in the US are all
|
154 |
+
young-feeling towns. This doesn't mean they have to be new.
|
155 |
+
Cambridge has the oldest town plan in America, but it feels young
|
156 |
+
because it's full of students.What you can't have, if you want to create a silicon valley, is a
|
157 |
+
large, existing population of stodgy people. It would be a waste
|
158 |
+
of time to try to reverse the fortunes of a declining industrial town
|
159 |
+
like Detroit or Philadelphia by trying to encourage startups. Those
|
160 |
+
places have too much momentum in the wrong direction. You're better
|
161 |
+
off starting with a blank slate in the form of a small town. Or
|
162 |
+
better still, if there's a town young people already flock to, that
|
163 |
+
one.The Bay Area was a magnet for the young and optimistic for decades
|
164 |
+
before it was associated with technology. It was a place people
|
165 |
+
went in search of something new. And so it became synonymous with
|
166 |
+
California nuttiness. There's still a lot of that there. If you
|
167 |
+
wanted to start a new fad-- a new way to focus one's "energy," for
|
168 |
+
example, or a new category of things not to eat-- the Bay Area would
|
169 |
+
be the place to do it. But a place that tolerates oddness in the
|
170 |
+
search for the new is exactly what you want in a startup hub, because
|
171 |
+
economically that's what startups are. Most good startup ideas
|
172 |
+
seem a little crazy; if they were obviously good ideas, someone
|
173 |
+
would have done them already.(How many people are going to want computers in their houses?
|
174 |
+
What, another search engine?)That's the connection between technology and liberalism. Without
|
175 |
+
exception the high-tech cities in the US are also the most liberal.
|
176 |
+
But it's not because liberals are smarter that this is so. It's
|
177 |
+
because liberal cities tolerate odd ideas, and smart people by
|
178 |
+
definition have odd ideas.Conversely, a town that gets praised for being "solid" or representing
|
179 |
+
"traditional values" may be a fine place to live, but it's never
|
180 |
+
going to succeed as a startup hub. The 2004 presidential election,
|
181 |
+
though a disaster in other respects, conveniently supplied us with
|
182 |
+
a county-by-county
|
183 |
+
map of such places.
|
184 |
+
[6]To attract the young, a town must have an intact center. In most
|
185 |
+
American cities the center has been abandoned, and the growth, if
|
186 |
+
any, is in the suburbs. Most American cities have been turned
|
187 |
+
inside out. But none of the startup hubs has: not San Francisco,
|
188 |
+
or Boston, or Seattle. They all have intact centers.
|
189 |
+
[7]
|
190 |
+
My guess is that no city with a dead center could be turned into a
|
191 |
+
startup hub. Young people don't want to live in the suburbs.Within the US, the two cities I think could most easily be turned
|
192 |
+
into new silicon valleys are Boulder and Portland. Both have the
|
193 |
+
kind of effervescent feel that attracts the young. They're each
|
194 |
+
only a great university short of becoming a silicon valley, if they
|
195 |
+
wanted to.

Time

A great university near an attractive town. Is that all it takes?
|
196 |
+
That was all it took to make the original Silicon Valley. Silicon
|
197 |
+
Valley traces its origins to William Shockley, one of the inventors
|
198 |
+
of the transistor. He did the research that won him the Nobel Prize
|
199 |
+
at Bell Labs, but when he started his own company in 1956 he moved
|
200 |
+
to Palo Alto to do it. At the time that was an odd thing to do.
|
201 |
+
Why did he? Because he had grown up there and remembered how nice
|
202 |
+
it was. Now Palo Alto is suburbia, but then it was a charming
|
203 |
+
college town-- a charming college town with perfect weather and San
|
204 |
+
Francisco only an hour away.The companies that rule Silicon Valley now are all descended in
|
205 |
+
various ways from Shockley Semiconductor. Shockley was a difficult
|
206 |
+
man, and in 1957 his top people-- "the traitorous eight"-- left to
|
207 |
+
start a new company, Fairchild Semiconductor. Among them were
|
208 |
+
Gordon Moore and Robert Noyce, who went on to found Intel, and
|
209 |
+
Eugene Kleiner, who founded the VC firm Kleiner Perkins. Forty-two
|
210 |
+
years later, Kleiner Perkins funded Google, and the partner responsible
|
211 |
+
for the deal was John Doerr, who came to Silicon Valley in 1974 to
|
212 |
+
work for Intel.So although a lot of the newest companies in Silicon Valley don't
|
213 |
+
make anything out of silicon, there always seem to be multiple links
|
214 |
+
back to Shockley. There's a lesson here: startups beget startups.
|
215 |
+
People who work for startups start their own. People who get rich
|
216 |
+
from startups fund new ones. I suspect this kind of organic growth
|
217 |
+
is the only way to produce a startup hub, because it's the only way
|
218 |
+
to grow the expertise you need.That has two important implications. The first is that you need
|
219 |
+
time to grow a silicon valley. The university you could create in
|
220 |
+
a couple years, but the startup community around it has to grow
|
221 |
+
organically. The cycle time is limited by the time it takes a
|
222 |
+
company to succeed, which probably averages about five years.The other implication of the organic growth hypothesis is that you
|
223 |
+
can't be somewhat of a startup hub. You either have a self-sustaining
|
224 |
+
chain reaction, or not. Observation confirms this too: cities
|
225 |
+
either have a startup scene, or they don't. There is no middle
|
226 |
+
ground. Chicago has the third largest metropolitan area in America.
|
227 |
+
As a source of startups it's negligible compared to Seattle, number 15.

The good news is that the initial seed can be quite small. Shockley
|
228 |
+
Semiconductor, though itself not very successful, was big enough.
|
229 |
+
It brought a critical mass of experts in an important new technology
|
230 |
+
together in a place they liked enough to stay.

Competing

Of course, a would-be silicon valley faces an obstacle the original
|
231 |
+
one didn't: it has to compete with Silicon Valley. Can that be
|
232 |
+
done? Probably.One of Silicon Valley's biggest advantages is its venture capital
|
233 |
+
firms. This was not a factor in Shockley's day, because VC funds
|
234 |
+
didn't exist. In fact, Shockley Semiconductor and Fairchild
|
235 |
+
Semiconductor were not startups at all in our sense. They were
|
236 |
+
subsidiaries-- of Beckman Instruments and Fairchild Camera and
|
237 |
+
Instrument respectively. Those companies were apparently willing
|
238 |
+
to establish subsidiaries wherever the experts wanted to live.Venture investors, however, prefer to fund startups within an hour's
|
239 |
+
drive. For one, they're more likely to notice startups nearby.
|
240 |
+
But when they do notice startups in other towns they prefer them
|
241 |
+
to move. They don't want to have to travel to attend board meetings,
|
242 |
+
and in any case the odds of succeeding are higher in a startup hub.The centralizing effect of venture firms is a double one: they cause
|
243 |
+
startups to form around them, and those draw in more startups through
|
244 |
+
acquisitions. And although the first may be weakening because it's
|
245 |
+
now so cheap to start some startups, the second seems as strong as ever.
|
246 |
+
Three of the most admired
|
247 |
+
"Web 2.0" companies were started outside the usual startup hubs,
|
248 |
+
but two of them have already been reeled in through acquisitions.Such centralizing forces make it harder for new silicon valleys to
|
249 |
+
get started. But by no means impossible. Ultimately power rests
|
250 |
+
with the founders. A startup with the best people will beat one
|
251 |
+
with funding from famous VCs, and a startup that was sufficiently
|
252 |
+
successful would never have to move. So a town that
|
253 |
+
could exert enough pull over the right people could resist and
|
254 |
+
perhaps even surpass Silicon Valley.For all its power, Silicon Valley has a great weakness: the paradise
|
255 |
+
Shockley found in 1956 is now one giant parking lot. San Francisco
|
256 |
+
and Berkeley are great, but they're forty miles away. Silicon
|
257 |
+
Valley proper is soul-crushing suburban sprawl. It
|
258 |
+
has fabulous weather, which makes it significantly better than the
|
259 |
+
soul-crushing sprawl of most other American cities. But a competitor
|
260 |
+
that managed to avoid sprawl would have real leverage. All a city
|
261 |
+
needs is to be the kind of place the next traitorous eight look at
|
262 |
+
and say "I want to stay here," and that would be enough to get the
|
263 |
+
chain reaction started.

Notes

[1]
|
264 |
+
It's interesting to consider how low this number could be
|
265 |
+
made. I suspect five hundred would be enough, even if they could
|
266 |
+
bring no assets with them. Probably just thirty, if I could pick them,
|
267 |
+
would be enough to turn Buffalo into a significant startup hub.[2]
|
268 |
+
Bureaucrats manage to allocate research funding moderately
|
269 |
+
well, but only because (like an in-house VC fund) they outsource
|
270 |
+
most of the work of selection. A professor at a famous university
|
271 |
+
who is highly regarded by his peers will get funding, pretty much
|
272 |
+
regardless of the proposal. That wouldn't work for startups, whose
|
273 |
+
founders aren't sponsored by organizations, and are often unknowns.[3]
|
274 |
+
You'd have to do it all at once, or at least a whole department
|
275 |
+
at a time, because people would be more likely to come if they
|
276 |
+
knew their friends were. And you should probably start from scratch,
|
277 |
+
rather than trying to upgrade an existing university, or much energy
|
278 |
+
would be lost in friction.[4]
|
279 |
+
Hypothesis: Any plan in which multiple independent buildings
|
280 |
+
are gutted or demolished to be "redeveloped" as a single project
|
281 |
+
is a net loss of personality for the city, with the exception of
|
282 |
+
the conversion of buildings not previously public, like warehouses.[5]
|
283 |
+
A few startups get started in New York, but less
|
284 |
+
than a tenth as many per capita as in Boston, and mostly
|
285 |
+
in less nerdy fields like finance and media.[6]
|
286 |
+
Some blue counties are false positives (reflecting the
|
287 |
+
remaining power of Democratic party machines), but there are no
|
288 |
+
false negatives. You can safely write off all the red counties.[7]
|
289 |
+
Some "urban renewal" experts took a shot at destroying Boston's
|
290 |
+
in the 1960s, leaving the area around city hall a bleak wasteland,
|
291 |
+
but most neighborhoods successfully resisted them.Thanks to Chris Anderson, Trevor Blackwell, Marc Hedlund,
|
292 |
+
Jessica Livingston, Robert Morris, Greg Mcadoo, Fred Wilson,
|
293 |
+
and Stephen Wolfram for
|
294 |
+
reading drafts of this, and to Ed Dumbill for inviting me to speak.(The second part of this talk became Why Startups
|
295 |
+
Condense in America.)
|
296 |
+
May 2001

(This article was written as a kind of business plan for a
|
297 |
+
new language.
|
298 |
+
So it is missing (because it takes for granted) the most important
|
299 |
+
feature of a good programming language: very powerful abstractions.)A friend of mine once told an eminent operating systems
|
300 |
+
expert that he wanted to design a really good
|
301 |
+
programming language. The expert told him that it would be a
|
302 |
+
waste of time, that programming languages don't become popular
|
303 |
+
or unpopular based on their merits, and so no matter how
|
304 |
+
good his language was, no one would use it. At least, that
|
305 |
+
was what had happened to the language he had designed.What does make a language popular? Do popular
|
306 |
+
languages deserve their popularity? Is it worth trying to
|
307 |
+
define a good programming language? How would you do it?I think the answers to these questions can be found by looking
|
308 |
+
at hackers, and learning what they want. Programming
|
309 |
+
languages are for hackers, and a programming language
|
310 |
+
is good as a programming language (rather than, say, an
|
311 |
+
exercise in denotational semantics or compiler design)
|
312 |
+
if and only if hackers like it.

1 The Mechanics of Popularity

It's true, certainly, that most people don't choose programming
|
313 |
+
languages simply based on their merits. Most programmers are told
|
314 |
+
what language to use by someone else. And yet I think the effect
|
315 |
+
of such external factors on the popularity of programming languages
|
316 |
+
is not as great as it's sometimes thought to be. I think a bigger
|
317 |
+
problem is that a hacker's idea of a good programming language is
|
318 |
+
not the same as most language designers'.Between the two, the hacker's opinion is the one that matters.
|
319 |
+
Programming languages are not theorems. They're tools, designed
|
320 |
+
for people, and they have to be designed to suit human strengths
|
321 |
+
and weaknesses as much as shoes have to be designed for human feet.
|
322 |
+
If a shoe pinches when you put it on, it's a bad shoe, however
|
323 |
+
elegant it may be as a piece of sculpture.It may be that the majority of programmers can't tell a good language
|
324 |
+
from a bad one. But that's no different with any other tool. It
|
325 |
+
doesn't mean that it's a waste of time to try designing a good
|
326 |
+
language. Expert hackers
|
327 |
+
can tell a good language when they see
|
328 |
+
one, and they'll use it. Expert hackers are a tiny minority,
|
329 |
+
admittedly, but that tiny minority write all the good software,
|
330 |
+
and their influence is such that the rest of the programmers will
|
331 |
+
tend to use whatever language they use. Often, indeed, it is not
|
332 |
+
merely influence but command: often the expert hackers are the very
|
333 |
+
people who, as their bosses or faculty advisors, tell the other
|
334 |
+
programmers what language to use.The opinion of expert hackers is not the only force that determines
|
335 |
+
the relative popularity of programming languages — legacy software
|
336 |
+
(Cobol) and hype (Ada, Java) also play a role — but I think it is
|
337 |
+
the most powerful force over the long term. Given an initial critical
|
338 |
+
mass and enough time, a programming language probably becomes about
|
339 |
+
as popular as it deserves to be. And popularity further separates
|
340 |
+
good languages from bad ones, because feedback from real live users
|
341 |
+
always leads to improvements. Look at how much any popular language
|
342 |
+
has changed during its life. Perl and Fortran are extreme cases,
|
343 |
+
but even Lisp has changed a lot. Lisp 1.5 didn't have macros, for
|
344 |
+
example; these evolved later, after hackers at MIT had spent a
|
345 |
+
couple years using Lisp to write real programs. [1]So whether or not a language has to be good to be popular, I think
|
346 |
+
a language has to be popular to be good. And it has to stay popular
|
347 |
+
to stay good. The state of the art in programming languages doesn't
|
348 |
+
stand still. And yet the Lisps we have today are still pretty much
|
349 |
+
what they had at MIT in the mid-1980s, because that's the last time
|
350 |
+
Lisp had a sufficiently large and demanding user base.Of course, hackers have to know about a language before they can
|
351 |
+
use it. How are they to hear? From other hackers. But there has to
|
352 |
+
be some initial group of hackers using the language for others even
|
353 |
+
to hear about it. I wonder how large this group has to be; how many
|
354 |
+
users make a critical mass? Off the top of my head, I'd say twenty.
|
355 |
+
If a language had twenty separate users, meaning twenty users who
|
356 |
+
decided on their own to use it, I'd consider it to be real.Getting there can't be easy. I would not be surprised if it is
|
357 |
+
harder to get from zero to twenty than from twenty to a thousand.
|
358 |
+
The best way to get those initial twenty users is probably to use
|
359 |
+
a trojan horse: to give people an application they want, which
|
360 |
+
happens to be written in the new language.

2 External Factors

Let's start by acknowledging one external factor that does affect
|
361 |
+
the popularity of a programming language. To become popular, a
|
362 |
+
programming language has to be the scripting language of a popular
|
363 |
+
system. Fortran and Cobol were the scripting languages of early
|
364 |
+
IBM mainframes. C was the scripting language of Unix, and so, later,
|
365 |
+
was Perl. Tcl is the scripting language of Tk. Java and Javascript
|
366 |
+
are intended to be the scripting languages of web browsers.Lisp is not a massively popular language because it is not the
|
367 |
+
scripting language of a massively popular system. What popularity
|
368 |
+
it retains dates back to the 1960s and 1970s, when it was the
|
369 |
+
scripting language of MIT. A lot of the great programmers of the
|
370 |
+
day were associated with MIT at some point. And in the early 1970s,
|
371 |
+
before C, MIT's dialect of Lisp, called MacLisp, was one of the
|
372 |
+
only programming languages a serious hacker would want to use.Today Lisp is the scripting language of two moderately popular
|
373 |
+
systems, Emacs and Autocad, and for that reason I suspect that most
|
374 |
+
of the Lisp programming done today is done in Emacs Lisp or AutoLisp.Programming languages don't exist in isolation. To hack is a
|
375 |
+
transitive verb — hackers are usually hacking something — and in
|
376 |
+
practice languages are judged relative to whatever they're used to
|
377 |
+
hack. So if you want to design a popular language, you either have
|
378 |
+
to supply more than a language, or you have to design your language
|
379 |
+
to replace the scripting language of some existing system.Common Lisp is unpopular partly because it's an orphan. It did
|
380 |
+
originally come with a system to hack: the Lisp Machine. But Lisp
|
381 |
+
Machines (along with parallel computers) were steamrollered by the
|
382 |
+
increasing power of general purpose processors in the 1980s. Common
|
383 |
+
Lisp might have remained popular if it had been a good scripting
|
384 |
+
language for Unix. It is, alas, an atrociously bad one.One way to describe this situation is to say that a language isn't
|
385 |
+
judged on its own merits. Another view is that a programming language
|
386 |
+
really isn't a programming language unless it's also the scripting
|
387 |
+
language of something. This only seems unfair if it comes as a
|
388 |
+
surprise. I think it's no more unfair than expecting a programming
|
389 |
+
language to have, say, an implementation. It's just part of what
|
390 |
+
a programming language is.A programming language does need a good implementation, of course,
|
391 |
+
and this must be free. Companies will pay for software, but individual
|
392 |
+
hackers won't, and it's the hackers you need to attract.A language also needs to have a book about it. The book should be
|
393 |
+
thin, well-written, and full of good examples. K&R is the ideal
|
394 |
+
here. At the moment I'd almost say that a language has to have a
|
395 |
+
book published by O'Reilly. That's becoming the test of mattering
|
396 |
+
to hackers.There should be online documentation as well. In fact, the book
|
397 |
+
can start as online documentation. But I don't think that physical
|
398 |
+
books are outmoded yet. Their format is convenient, and the de
|
399 |
+
facto censorship imposed by publishers is a useful if imperfect
|
400 |
+
filter. Bookstores are one of the most important places for learning
|
401 |
+
about new languages.

3 Brevity

Given that you can supply the three things any language needs — a
|
402 |
+
free implementation, a book, and something to hack — how do you
|
403 |
+
make a language that hackers will like?One thing hackers like is brevity. Hackers are lazy, in the same
|
404 |
+
way that mathematicians and modernist architects are lazy: they
|
405 |
+
hate anything extraneous. It would not be far from the truth to
|
406 |
+
say that a hacker about to write a program decides what language
|
407 |
+
to use, at least subconsciously, based on the total number of
|
408 |
+
characters he'll have to type. If this isn't precisely how hackers
|
409 |
+
think, a language designer would do well to act as if it were.It is a mistake to try to baby the user with long-winded expressions
|
410 |
+
that are meant to resemble English. Cobol is notorious for this
|
411 |
+
flaw. A hacker would consider being asked to write

  add x to y giving z

instead of

  z = x+y

as something between an insult to his intelligence and a sin against
|
412 |
+
God.It has sometimes been said that Lisp should use first and rest
|
413 |
+
instead of car and cdr, because it would make programs easier to
|
414 |
+
read. Maybe for the first couple hours. But a hacker can learn
|
415 |
+
quickly enough that car means the first element of a list and cdr
|
416 |
+
means the rest. Using first and rest means 50% more typing. And
|
417 |
+
they are also different lengths, meaning that the arguments won't
|
418 |
+
line up when they're called, as car and cdr often are, in successive
|
419 |
+
lines. I've found that it matters a lot how code lines up on the
|
420 |
+
page. I can barely read Lisp code when it is set in a variable-width
|
421 |
+
font, and friends say this is true for other languages too.
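
To make the alignment point concrete (an illustrative snippet of my own, not code from the text), successive accessors line up on the page when the operators are the same length:

  ;; car/cdr: successive calls line up in successive lines
  (car (cdr x))
  (car (cdr (cdr x)))

  ;; first/rest: same meaning, more typing, and the arguments drift
  (first (rest x))
  (first (rest (rest x)))

Brevity is one place where strongly typed languages lose. All other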
|
422 |
+
things being equal, no one wants to begin a program with a bunch
|
423 |
+
of declarations. Anything that can be implicit, should be.The individual tokens should be short as well. Perl and Common Lisp
|
424 |
+
occupy opposite poles on this question. Perl programs can be almost
|
425 |
+
cryptically dense, while the names of built-in Common Lisp operators
|
426 |
+
are comically long. The designers of Common Lisp probably expected
|
427 |
+
users to have text editors that would type these long names for
|
428 |
+
them. But the cost of a long name is not just the cost of typing
|
429 |
+
it. There is also the cost of reading it, and the cost of the space
|
430 |
+
it takes up on your screen.

4 Hackability

There is one thing more important than brevity to a hacker: being
|
431 |
+
able to do what you want. In the history of programming languages
|
432 |
+
a surprising amount of effort has gone into preventing programmers
|
433 |
+
from doing things considered to be improper. This is a dangerously
|
434 |
+
presumptuous plan. How can the language designer know what the
|
435 |
+
programmer is going to need to do? I think language designers would
|
436 |
+
do better to consider their target user to be a genius who will
|
437 |
+
need to do things they never anticipated, rather than a bumbler
|
438 |
+
who needs to be protected from himself. The bumbler will shoot
|
439 |
+
himself in the foot anyway. You may save him from referring to
|
440 |
+
variables in another package, but you can't save him from writing
|
441 |
+
a badly designed program to solve the wrong problem, and taking
|
442 |
+
forever to do it.Good programmers often want to do dangerous and unsavory things.
|
443 |
+
By unsavory I mean things that go behind whatever semantic facade
|
444 |
+
the language is trying to present: getting hold of the internal
|
445 |
+
representation of some high-level abstraction, for example. Hackers
|
446 |
+
like to hack, and hacking means getting inside things and second
|
447 |
+
guessing the original designer.Let yourself be second guessed. When you make any tool, people use
|
448 |
+
it in ways you didn't intend, and this is especially true of a
|
449 |
+
highly articulated tool like a programming language. Many a hacker
|
450 |
+
will want to tweak your semantic model in a way that you never
|
451 |
+
imagined. I say, let them; give the programmer access to as much
|
452 |
+
internal stuff as you can without endangering runtime systems like
|
453 |
+
the garbage collector.In Common Lisp I have often wanted to iterate through the fields
|
454 |
+
of a struct — to comb out references to a deleted object, for example,
|
455 |
+
or find fields that are uninitialized. I know the structs are just
|
456 |
+
vectors underneath. And yet I can't write a general purpose function
|
457 |
+
that I can call on any struct. I can only access the fields by
|
458 |
+
name, because that's what a struct is supposed to mean.A hacker may only want to subvert the intended model of things once
|
459 |
+
or twice in a big program. But what a difference it makes to be
|
460 |
+
able to. And it may be more than a question of just solving a
|
461 |
+
problem. There is a kind of pleasure here too. Hackers share the
|
462 |
+
surgeon's secret pleasure in poking about in gross innards, the
|
463 |
+
teenager's secret pleasure in popping zits. [2] For boys, at least,
|
464 |
+
certain kinds of horrors are fascinating. Maxim magazine publishes
|
465 |
+
an annual volume of photographs, containing a mix of pin-ups and
|
466 |
+
grisly accidents. They know their audience.Historically, Lisp has been good at letting hackers have their way.
|
467 |
+
The political correctness of Common Lisp is an aberration. Early
|
468 |
+
Lisps let you get your hands on everything. A good deal of that
|
469 |
+
spirit is, fortunately, preserved in macros. What a wonderful thing,
|
470 |
+
to be able to make arbitrary transformations on the source code.Classic macros are a real hacker's tool — simple, powerful, and
|
471 |
+
dangerous. It's so easy to understand what they do: you call a
|
472 |
+
function on the macro's arguments, and whatever it returns gets
|
473 |
+
inserted in place of the macro call. Hygienic macros embody the
|
474 |
+
opposite principle. They try to protect you from understanding what
|
475 |
+
they're doing. I have never heard hygienic macros explained in one
|
476 |
+
sentence. And they are a classic example of the dangers of deciding
|
477 |
+
what programmers are allowed to want. Hygienic macros are intended
|
478 |
+
to protect me from variable capture, among other things, but variable
|
479 |
+
capture is exactly what I want in some macros.
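
As a concrete illustration of wanting capture (a sketch of my own in Common Lisp, not code from the text): the classic anaphoric-if idiom deliberately captures the name IT so the body can refer to the value of the test, which is exactly what a hygienic macro system would prevent.

  ;; Anaphoric if: bind the result of TEST to IT, then run THEN or ELSE.
  ;; The capture of IT is the whole point of the macro.
  (defmacro aif (test then &optional else)
    `(let ((it ,test))
       (if it ,then ,else)))

  ;; Example use: print the value only when the lookup succeeds.
  ;; (aif (gethash 'key table)
  ;;      (print it)
  ;;      (print "not found"))

A really good language should be both clean and dirty: cleanly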
|
480 |
+
designed, with a small core of well understood and highly orthogonal
|
481 |
+
operators, but dirty in the sense that it lets hackers have their
|
482 |
+
way with it. C is like this. So were the early Lisps. A real hacker's
|
483 |
+
language will always have a slightly raffish character.A good programming language should have features that make the kind
|
484 |
+
of people who use the phrase "software engineering" shake their
|
485 |
+
heads disapprovingly. At the other end of the continuum are languages
|
486 |
+
like Ada and Pascal, models of propriety that are good for teaching
|
487 |
+
and not much else.

5 Throwaway Programs

To be attractive to hackers, a language must be good for writing
|
488 |
+
the kinds of programs they want to write. And that means, perhaps
|
489 |
+
surprisingly, that it has to be good for writing throwaway programs.A throwaway program is a program you write quickly for some limited
|
490 |
+
task: a program to automate some system administration task, or
|
491 |
+
generate test data for a simulation, or convert data from one format
|
492 |
+
to another. The surprising thing about throwaway programs is that,
|
493 |
+
like the "temporary" buildings built at so many American universities
|
494 |
+
during World War II, they often don't get thrown away. Many evolve
|
495 |
+
into real programs, with real features and real users.I have a hunch that the best big programs begin life this way,
|
496 |
+
rather than being designed big from the start, like the Hoover Dam.
|
497 |
+
It's terrifying to build something big from scratch. When people
|
498 |
+
take on a project that's too big, they become overwhelmed. The
|
499 |
+
project either gets bogged down, or the result is sterile and
|
500 |
+
wooden: a shopping mall rather than a real downtown, Brasilia rather
|
501 |
+
than Rome, Ada rather than C.Another way to get a big program is to start with a throwaway
|
502 |
+
program and keep improving it. This approach is less daunting, and
|
503 |
+
the design of the program benefits from evolution. I think, if one
|
504 |
+
looked, that this would turn out to be the way most big programs
|
505 |
+
were developed. And those that did evolve this way are probably
|
506 |
+
still written in whatever language they were first written in,
|
507 |
+
because it's rare for a program to be ported, except for political
|
508 |
+
reasons. And so, paradoxically, if you want to make a language that
|
509 |
+
is used for big systems, you have to make it good for writing
|
510 |
+
throwaway programs, because that's where big systems come from.Perl is a striking example of this idea. It was not only designed
|
511 |
+
for writing throwaway programs, but was pretty much a throwaway
|
512 |
+
program itself. Perl began life as a collection of utilities for
|
513 |
+
generating reports, and only evolved into a programming language
|
514 |
+
as the throwaway programs people wrote in it grew larger. It was
|
515 |
+
not until Perl 5 (if then) that the language was suitable for
|
516 |
+
writing serious programs, and yet it was already massively popular.What makes a language good for throwaway programs? To start with,
|
517 |
+
it must be readily available. A throwaway program is something that
|
518 |
+
you expect to write in an hour. So the language probably must
|
519 |
+
already be installed on the computer you're using. It can't be
|
520 |
+
something you have to install before you use it. It has to be there.
|
521 |
+
C was there because it came with the operating system. Perl was
|
522 |
+
there because it was originally a tool for system administrators,
|
523 |
+
and yours had already installed it.Being available means more than being installed, though. An
|
524 |
+
interactive language, with a command-line interface, is more
|
525 |
+
available than one that you have to compile and run separately. A
|
526 |
+
popular programming language should be interactive, and start up
|
527 |
+
fast.Another thing you want in a throwaway program is brevity. Brevity
|
528 |
+
is always attractive to hackers, and never more so than in a program
|
529 |
+
they expect to turn out in an hour.

6 Libraries

Of course the ultimate in brevity is to have the program already
|
530 |
+
written for you, and merely to call it. And this brings us to what
|
531 |
+
I think will be an increasingly important feature of programming
|
532 |
+
languages: library functions. Perl wins because it has large
|
533 |
+
libraries for manipulating strings. This class of library functions
|
534 |
+
are especially important for throwaway programs, which are often
|
535 |
+
originally written for converting or extracting data. Many Perl
|
536 |
+
programs probably begin as just a couple library calls stuck
|
537 |
+
together.I think a lot of the advances that happen in programming languages
|
538 |
+
in the next fifty years will have to do with library functions. I
|
539 |
+
think future programming languages will have libraries that are as
|
540 |
+
carefully designed as the core language. Programming language design
|
541 |
+
will not be about whether to make your language strongly or weakly
|
542 |
+
typed, or object oriented, or functional, or whatever, but about
|
543 |
+
how to design great libraries. The kind of language designers who
|
544 |
+
like to think about how to design type systems may shudder at this.
|
545 |
+
It's almost like writing applications! Too bad. Languages are for
|
546 |
+
programmers, and libraries are what programmers need.It's hard to design good libraries. It's not simply a matter of
|
547 |
+
writing a lot of code. Once the libraries get too big, it can
|
548 |
+
sometimes take longer to find the function you need than to write
|
549 |
+
the code yourself. Libraries need to be designed using a small set
|
550 |
+
of orthogonal operators, just like the core language. It ought to
|
551 |
+
be possible for the programmer to guess what library call will do
|
552 |
+
what he needs.Libraries are one place Common Lisp falls short. There are only
|
553 |
+
rudimentary libraries for manipulating strings, and almost none
|
554 |
+
for talking to the operating system. For historical reasons, Common
|
555 |
+
Lisp tries to pretend that the OS doesn't exist. And because you
|
556 |
+
can't talk to the OS, you're unlikely to be able to write a serious
|
557 |
+
program using only the built-in operators in Common Lisp. You have
|
558 |
+
to use some implementation-specific hacks as well, and in practice
|
559 |
+
these tend not to give you everything you want. Hackers would think
|
560 |
+
a lot more highly of Lisp if Common Lisp had powerful string
|
561 |
+
libraries and good OS support.

7 Syntax

Could a language with Lisp's syntax, or more precisely, lack of
|
562 |
+
syntax, ever become popular? I don't know the answer to this
|
563 |
+
question. I do think that syntax is not the main reason Lisp isn't
|
564 |
+
currently popular. Common Lisp has worse problems than unfamiliar
|
565 |
+
syntax. I know several programmers who are comfortable with prefix
|
566 |
+
syntax and yet use Perl by default, because it has powerful string
|
567 |
+
libraries and can talk to the OS.

There are two possible problems with prefix notation: that it is
|
568 |
+
unfamiliar to programmers, and that it is not dense enough. The
|
569 |
+
conventional wisdom in the Lisp world is that the first problem is
|
570 |
+
the real one. I'm not so sure. Yes, prefix notation makes ordinary
|
571 |
+
programmers panic. But I don't think ordinary programmers' opinions
|
572 |
+
matter. Languages become popular or unpopular based on what expert
|
573 |
+
hackers think of them, and I think expert hackers might be able to
|
574 |
+
deal with prefix notation. Perl syntax can be pretty incomprehensible,
|
575 |
+
but that has not stood in the way of Perl's popularity. If anything
|
576 |
+
it may have helped foster a Perl cult.A more serious problem is the diffuseness of prefix notation. For
|
577 |
+
expert hackers, that really is a problem. No one wants to write
|
578 |
+
(aref a x y) when they could write a[x,y].In this particular case there is a way to finesse our way out of
|
579 |
+
the problem. If we treat data structures as if they were functions
|
580 |
+
on indexes, we could write (a x y) instead, which is even shorter
|
581 |
+
than the Perl form. Similar tricks may shorten other types of
|
582 |
+
expressions.
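
One way the data-structures-as-functions idea could be approximated today (a minimal sketch of my own; the names and approach are assumptions, not anything proposed in the text) is to wrap an array in a closure so it can be applied to its indexes:

  ;; Return an accessor for a fresh array: a closure that takes indexes.
  (defun make-callable-array (dims)
    (let ((a (make-array dims :initial-element 0)))
      (lambda (&rest indexes) (apply #'aref a indexes))))

  ;; Usage: (funcall (make-callable-array '(10 10)) 3 4) reads element
  ;; (3,4). In the hypothetical syntax the FUNCALL would disappear and
  ;; (a 3 4) would work directly.

We can get rid of (or make optional) a lot of parentheses by making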
|
583 |
+
indentation significant. That's how programmers read code anyway:
|
584 |
+
when indentation says one thing and delimiters say another, we go
|
585 |
+
by the indentation. Treating indentation as significant would
|
586 |
+
eliminate this common source of bugs as well as making programs
|
587 |
+
shorter.Sometimes infix syntax is easier to read. This is especially true
|
588 |
+
for math expressions. I've used Lisp my whole programming life and
|
589 |
+
I still don't find prefix math expressions natural. And yet it is
|
590 |
+
convenient, especially when you're generating code, to have operators
|
591 |
+
that take any number of arguments. So if we do have infix syntax,
|
592 |
+
it should probably be implemented as some kind of read-macro.I don't think we should be religiously opposed to introducing syntax
|
593 |
+
into Lisp, as long as it translates in a well-understood way into
|
594 |
+
underlying s-expressions. There is already a good deal of syntax
|
595 |
+
in Lisp. It's not necessarily bad to introduce more, as long as no
|
596 |
+
one is forced to use it. In Common Lisp, some delimiters are reserved
|
597 |
+
for the language, suggesting that at least some of the designers
|
598 |
+
intended to have more syntax in the future.One of the most egregiously unlispy pieces of syntax in Common Lisp
|
599 |
+
occurs in format strings; format is a language in its own right,
|
600 |
+
and that language is not Lisp. If there were a plan for introducing
|
601 |
+
more syntax into Lisp, format specifiers might be able to be included
|
602 |
+
in it. It would be a good thing if macros could generate format
|
603 |
+
specifiers the way they generate any other kind of code.An eminent Lisp hacker told me that his copy of CLTL falls open to
|
604 |
+
the section format. Mine too. This probably indicates room for
|
605 |
+
improvement. It may also mean that programs do a lot of I/O.

8 Efficiency

A good language, as everyone knows, should generate fast code. But
|
606 |
+
in practice I don't think fast code comes primarily from things
|
607 |
+
you do in the design of the language. As Knuth pointed out long
|
608 |
+
ago, speed only matters in certain critical bottlenecks. And as
|
609 |
+
many programmers have observed since, one is very often mistaken
|
610 |
+
about where these bottlenecks are.So, in practice, the way to get fast code is to have a very good
|
611 |
+
profiler, rather than by, say, making the language strongly typed.
|
612 |
+
You don't need to know the type of every argument in every call in
|
613 |
+
the program. You do need to be able to declare the types of arguments
|
614 |
+
in the bottlenecks. And even more, you need to be able to find out
|
615 |
+
where the bottlenecks are.
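
For instance (an illustrative sketch of mine, using standard Common Lisp declarations rather than anything specified in the text), most of a program can stay untyped while a measured bottleneck gets explicit types and optimization settings:

  ;; A hot inner loop with type declarations confined to this one function.
  (defun dot-product (a b)
    (declare (type (simple-array double-float (*)) a b)
             (optimize (speed 3) (safety 0)))
    (let ((sum 0d0))
      (declare (type double-float sum))
      (dotimes (i (length a) sum)
        (incf sum (* (aref a i) (aref b i))))))

One complaint people have had with Lisp is that it's hard to tell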
|
616 |
+
what's expensive. This might be true. It might also be inevitable,
|
617 |
+
if you want to have a very abstract language. And in any case I
|
618 |
+
think good profiling would go a long way toward fixing the problem:
|
619 |
+
you'd soon learn what was expensive.Part of the problem here is social. Language designers like to
|
620 |
+
write fast compilers. That's how they measure their skill. They
|
621 |
+
think of the profiler as an add-on, at best. But in practice a good
|
622 |
+
profiler may do more to improve the speed of actual programs written
|
623 |
+
in the language than a compiler that generates fast code. Here,
|
624 |
+
again, language designers are somewhat out of touch with their
|
625 |
+
users. They do a really good job of solving slightly the wrong
|
626 |
+
problem.It might be a good idea to have an active profiler — to push
|
627 |
+
performance data to the programmer instead of waiting for him to
|
628 |
+
come asking for it. For example, the editor could display bottlenecks
|
629 |
+
in red when the programmer edits the source code. Another approach
|
630 |
+
would be to somehow represent what's happening in running programs.
|
631 |
+
This would be an especially big win in server-based applications,
|
632 |
+
where you have lots of running programs to look at. An active
|
633 |
+
profiler could show graphically what's happening in memory as a
|
634 |
+
program's running, or even make sounds that tell what's happening.Sound is a good cue to problems. In one place I worked, we had a
|
635 |
+
big board of dials showing what was happening to our web servers.
|
636 |
+
The hands were moved by little servomotors that made a slight noise
|
637 |
+
when they turned. I couldn't see the board from my desk, but I
|
638 |
+
found that I could tell immediately, by the sound, when there was
|
639 |
+
a problem with a server.It might even be possible to write a profiler that would automatically
|
640 |
+
detect inefficient algorithms. I would not be surprised if certain
|
641 |
+
patterns of memory access turned out to be sure signs of bad
|
642 |
+
algorithms. If there were a little guy running around inside the
|
643 |
+
computer executing our programs, he would probably have as long
|
644 |
+
and plaintive a tale to tell about his job as a federal government
|
645 |
+
employee. I often have a feeling that I'm sending the processor on
|
646 |
+
a lot of wild goose chases, but I've never had a good way to look
|
647 |
+
at what it's doing.A number of Lisps now compile into byte code, which is then executed
|
648 |
+
by an interpreter. This is usually done to make the implementation
|
649 |
+
easier to port, but it could be a useful language feature. It might
|
650 |
+
be a good idea to make the byte code an official part of the
|
651 |
+
language, and to allow programmers to use inline byte code in
|
652 |
+
bottlenecks. Then such optimizations would be portable too.The nature of speed, as perceived by the end-user, may be changing.
|
653 |
+
With the rise of server-based applications, more and more programs
|
654 |
+
may turn out to be i/o-bound. It will be worth making i/o fast.
|
655 |
+
The language can help with straightforward measures like simple,
|
656 |
+
fast, formatted output functions, and also with deep structural
|
657 |
+
changes like caching and persistent objects.Users are interested in response time. But another kind of efficiency
|
658 |
+
will be increasingly important: the number of simultaneous users
|
659 |
+
you can support per processor. Many of the interesting applications
|
660 |
+
written in the near future will be server-based, and the number of
|
661 |
+
users per server is the critical question for anyone hosting such
|
662 |
+
applications. In the capital cost of a business offering a server-based
|
663 |
+
application, this is the divisor.For years, efficiency hasn't mattered much in most end-user
|
664 |
+
applications. Developers have been able to assume that each user
|
665 |
+
would have an increasingly powerful processor sitting on their
|
666 |
+
desk. And by Parkinson's Law, software has expanded to use the
|
667 |
+
resources available. That will change with server-based applications.
|
668 |
+
In that world, the hardware and software will be supplied together.
|
669 |
+
For companies that offer server-based applications, it will make
|
670 |
+
a very big difference to the bottom line how many users they can
|
671 |
+
support per server.In some applications, the processor will be the limiting factor,
|
672 |
+
and execution speed will be the most important thing to optimize.
|
673 |
+
But often memory will be the limit; the number of simultaneous
|
674 |
+
users will be determined by the amount of memory you need for each
|
675 |
+
user's data. The language can help here too. Good support for
|
676 |
+
threads will enable all the users to share a single heap. It may
|
677 |
+
also help to have persistent objects and/or language level support
|
678 |
+
for lazy loading.

9 Time

The last ingredient a popular language needs is time. No one wants
|
679 |
+
to write programs in a language that might go away, as so many
|
680 |
+
programming languages do. So most hackers will tend to wait until
|
681 |
+
a language has been around for a couple years before even considering
|
682 |
+
using it.Inventors of wonderful new things are often surprised to discover
|
683 |
+
this, but you need time to get any message through to people. A
|
684 |
+
friend of mine rarely does anything the first time someone asks
|
685 |
+
him. He knows that people sometimes ask for things that they turn
|
686 |
+
out not to want. To avoid wasting his time, he waits till the third
|
687 |
+
or fourth time he's asked to do something; by then, whoever's asking
|
688 |
+
him may be fairly annoyed, but at least they probably really do
|
689 |
+
want whatever they're asking for.Most people have learned to do a similar sort of filtering on new
|
690 |
+
things they hear about. They don't even start paying attention
|
691 |
+
until they've heard about something ten times. They're perfectly
|
692 |
+
justified: the majority of hot new whatevers do turn out to be a
|
693 |
+
waste of time, and eventually go away. By delaying learning VRML,
|
694 |
+
I avoided having to learn it at all.So anyone who invents something new has to expect to keep repeating
|
695 |
+
their message for years before people will start to get it. We
|
696 |
+
wrote what was, as far as I know, the first web-server based
|
697 |
+
application, and it took us years to get it through to people that
|
698 |
+
it didn't have to be downloaded. It wasn't that they were stupid.
|
699 |
+
They just had us tuned out.The good news is, simple repetition solves the problem. All you
|
700 |
+
have to do is keep telling your story, and eventually people will
|
701 |
+
start to hear. It's not when people notice you're there that they
|
702 |
+
pay attention; it's when they notice you're still there.It's just as well that it usually takes a while to gain momentum.
|
703 |
+
Most technologies evolve a good deal even after they're first
|
704 |
+
launched — programming languages especially. Nothing could be better,
|
705 |
+
for a new technology, than a few years of being used only by a small
|
706 |
+
number of early adopters. Early adopters are sophisticated and
|
707 |
+
demanding, and quickly flush out whatever flaws remain in your
|
708 |
+
technology. When you only have a few users you can be in close
|
709 |
+
contact with all of them. And early adopters are forgiving when
|
710 |
+
you improve your system, even if this causes some breakage.There are two ways new technology gets introduced: the organic
|
711 |
+
growth method, and the big bang method. The organic growth method
|
712 |
+
is exemplified by the classic seat-of-the-pants underfunded garage
|
713 |
+
startup. A couple guys, working in obscurity, develop some new
|
714 |
+
technology. They launch it with no marketing and initially have
|
715 |
+
only a few (fanatically devoted) users. They continue to improve
|
716 |
+
the technology, and meanwhile their user base grows by word of
|
717 |
+
mouth. Before they know it, they're big.The other approach, the big bang method, is exemplified by the
|
718 |
+
VC-backed, heavily marketed startup. They rush to develop a product,
|
719 |
+
launch it with great publicity, and immediately (they hope) have
|
720 |
+
a large user base.Generally, the garage guys envy the big bang guys. The big bang
|
721 |
+
guys are smooth and confident and respected by the VCs. They can
|
722 |
+
afford the best of everything, and the PR campaign surrounding the
|
723 |
+
launch has the side effect of making them celebrities. The organic
|
724 |
+
growth guys, sitting in their garage, feel poor and unloved. And
|
725 |
+
yet I think they are often mistaken to feel sorry for themselves.
|
726 |
+
Organic growth seems to yield better technology and richer founders
|
727 |
+
than the big bang method. If you look at the dominant technologies
|
728 |
+
today, you'll find that most of them grew organically.This pattern doesn't only apply to companies. You see it in sponsored
|
729 |
+
research too. Multics and Common Lisp were big-bang projects, and
|
730 |
+
Unix and MacLisp were organic growth projects.

10 Redesign

"The best writing is rewriting," wrote E. B. White. Every good
|
731 |
+
writer knows this, and it's true for software too. The most important
|
732 |
+
part of design is redesign. Programming languages, especially,
|
733 |
+
don't get redesigned enough.To write good software you must simultaneously keep two opposing
|
734 |
+
ideas in your head. You need the young hacker's naive faith in
|
735 |
+
his abilities, and at the same time the veteran's skepticism. You
|
736 |
+
have to be able to think
|
737 |
+
how hard can it be? with one half of
|
738 |
+
your brain while thinking
|
739 |
+
it will never work with the other.The trick is to realize that there's no real contradiction here.
|
740 |
+
You want to be optimistic and skeptical about two different things.
|
741 |
+
You have to be optimistic about the possibility of solving the
|
742 |
+
problem, but skeptical about the value of whatever solution you've
|
743 |
+
got so far.People who do good work often think that whatever they're working
|
744 |
+
on is no good. Others see what they've done and are full of wonder,
|
745 |
+
but the creator is full of worry. This pattern is no coincidence:
|
746 |
+
it is the worry that made the work good.If you can keep hope and worry balanced, they will drive a project
|
747 |
+
forward the same way your two legs drive a bicycle forward. In the
|
748 |
+
first phase of the two-cycle innovation engine, you work furiously
|
749 |
+
on some problem, inspired by your confidence that you'll be able
|
750 |
+
to solve it. In the second phase, you look at what you've done in
|
751 |
+
the cold light of morning, and see all its flaws very clearly. But
|
752 |
+
as long as your critical spirit doesn't outweigh your hope, you'll
|
753 |
+
be able to look at your admittedly incomplete system, and think,
|
754 |
+
how hard can it be to get the rest of the way?, thereby continuing
|
755 |
+
the cycle.It's tricky to keep the two forces balanced. In young hackers,
|
756 |
+
optimism predominates. They produce something, are convinced it's
|
757 |
+
great, and never improve it. In old hackers, skepticism predominates,
|
758 |
+
and they won't even dare to take on ambitious projects.Anything you can do to keep the redesign cycle going is good. Prose
|
759 |
+
can be rewritten over and over until you're happy with it. But
|
760 |
+
software, as a rule, doesn't get redesigned enough. Prose has
|
761 |
+
readers, but software has users. If a writer rewrites an essay,
|
762 |
+
people who read the old version are unlikely to complain that their
|
763 |
+
thoughts have been broken by some newly introduced incompatibility.Users are a double-edged sword. They can help you improve your
|
764 |
+
language, but they can also deter you from improving it. So choose
|
765 |
+
your users carefully, and be slow to grow their number. Having
|
766 |
+
users is like optimization: the wise course is to delay it. Also,
|
767 |
+
as a general rule, you can at any given time get away with changing
|
768 |
+
more than you think. Introducing change is like pulling off a
|
769 |
+
bandage: the pain is a memory almost as soon as you feel it.Everyone knows that it's not a good idea to have a language designed
|
770 |
+
by a committee. Committees yield bad design. But I think the worst
|
771 |
+
danger of committees is that they interfere with redesign. It is
|
772 |
+
so much work to introduce changes that no one wants to bother.
|
773 |
+
Whatever a committee decides tends to stay that way, even if most
|
774 |
+
of the members don't like it.Even a committee of two gets in the way of redesign. This happens
|
775 |
+
particularly in the interfaces between pieces of software written
|
776 |
+
by two different people. To change the interface both have to agree
|
777 |
+
to change it at once. And so interfaces tend not to change at all,
|
778 |
+
which is a problem because they tend to be one of the most ad hoc
|
779 |
+
parts of any system.One solution here might be to design systems so that interfaces
|
780 |
+
are horizontal instead of vertical — so that modules are always
|
781 |
+
vertically stacked strata of abstraction. Then the interface will
|
782 |
+
tend to be owned by one of them. The lower of two levels will either
|
783 |
+
be a language in which the upper is written, in which case the
|
784 |
+
lower level will own the interface, or it will be a slave, in which
|
785 |
+
case the interface can be dictated by the upper level.

11 Lisp

What all this implies is that there is hope for a new Lisp. There
|
786 |
+
is hope for any language that gives hackers what they want, including
|
787 |
+
Lisp. I think we may have made a mistake in thinking that hackers
|
788 |
+
are turned off by Lisp's strangeness. This comforting illusion may
|
789 |
+
have prevented us from seeing the real problem with Lisp, or at
|
790 |
+
least Common Lisp, which is that it sucks for doing what hackers
|
791 |
+
want to do. A hacker's language needs powerful libraries and
|
792 |
+
something to hack. Common Lisp has neither. A hacker's language is
|
793 |
+
terse and hackable. Common Lisp is not.The good news is, it's not Lisp that sucks, but Common Lisp. If we
|
794 |
+
can develop a new Lisp that is a real hacker's language, I think
|
795 |
+
hackers will use it. They will use whatever language does the job.
|
796 |
+
All we have to do is make sure this new Lisp does some important
|
797 |
+
job better than other languages.History offers some encouragement. Over time, successive new
|
798 |
+
programming languages have taken more and more features from Lisp.
|
799 |
+
There is no longer much left to copy before the language you've
|
800 |
+
made is Lisp. The latest hot language, Python, is a watered-down
|
801 |
+
Lisp with infix syntax and no macros. A new Lisp would be a natural
|
802 |
+
step in this progression.I sometimes think that it would be a good marketing trick to call
|
803 |
+
it an improved version of Python. That sounds hipper than Lisp. To
|
804 |
+
many people, Lisp is a slow AI language with a lot of parentheses.
|
805 |
+
Fritz Kunze's official biography carefully avoids mentioning the
|
806 |
+
L-word. But my guess is that we shouldn't be afraid to call the
|
807 |
+
new Lisp Lisp. Lisp still has a lot of latent respect among the
|
808 |
+
very best hackers — the ones who took 6.001 and understood it, for
|
809 |
+
example. And those are the users you need to win.In "How to Become a Hacker," Eric Raymond describes Lisp as something
|
810 |
+
like Latin or Greek — a language you should learn as an intellectual
|
811 |
+
exercise, even though you won't actually use it:
|
812 |
+
|
813 |
+
Lisp is worth learning for the profound enlightenment experience
|
814 |
+
you will have when you finally get it; that experience will make
|
815 |
+
you a better programmer for the rest of your days, even if you
|
816 |
+
never actually use Lisp itself a lot.
|
817 |
+
|
818 |
+
If I didn't know Lisp, reading this would set me asking questions.
|
819 |
+
A language that would make me a better programmer, if it means
|
820 |
+
anything at all, means a language that would be better for programming.
|
821 |
+
And that is in fact the implication of what Eric is saying.As long as that idea is still floating around, I think hackers will
|
822 |
+
be receptive enough to a new Lisp, even if it is called Lisp. But
|
823 |
+
this Lisp must be a hacker's language, like the classic Lisps of
|
824 |
+
the 1970s. It must be terse, simple, and hackable. And it must have
|
825 |
+
powerful libraries for doing what hackers want to do now.In the matter of libraries I think there is room to beat languages
|
826 |
+
like Perl and Python at their own game. A lot of the new applications
|
827 |
+
that will need to be written in the coming years will be
|
828 |
+
server-based
|
829 |
+
applications. There's no reason a new Lisp shouldn't have string
|
830 |
+
libraries as good as Perl, and if this new Lisp also had powerful
|
831 |
+
libraries for server-based applications, it could be very popular.
|
832 |
+
Real hackers won't turn up their noses at a new tool that will let
|
833 |
+
them solve hard problems with a few library calls. Remember, hackers
|
834 |
+
are lazy.It could be an even bigger win to have core language support for
|
835 |
+
server-based applications. For example, explicit support for programs
|
836 |
+
with multiple users, or data ownership at the level of type tags.Server-based applications also give us the answer to the question
|
837 |
+
of what this new Lisp will be used to hack. It would not hurt to
|
838 |
+
make Lisp better as a scripting language for Unix. (It would be
|
839 |
+
hard to make it worse.) But I think there are areas where existing
|
840 |
+
languages would be easier to beat. I think it might be better to
|
841 |
+
follow the model of Tcl, and supply the Lisp together with a complete
|
842 |
+
system for supporting server-based applications. Lisp is a natural
|
843 |
+
fit for server-based applications. Lexical closures provide a way
|
844 |
+
to get the effect of subroutines when the ui is just a series of
|
845 |
+
web pages. S-expressions map nicely onto html, and macros are good
|
846 |
+
at generating it. There need to be better tools for writing
|
847 |
+
server-based applications, and there needs to be a new Lisp, and
|
848 |
+
the two would work very well together.12 The Dream LanguageBy way of summary, let's try describing the hacker's dream language.
12 The Dream Language

By way of summary, let's try describing the hacker's dream language. The dream language is beautiful, clean, and terse. It has an interactive toplevel that starts up fast. You can write programs to solve common problems with very little code. Nearly all the code in any program you write is code that's specific to your application. Everything else has been done for you.

The syntax of the language is brief to a fault. You never have to type an unnecessary character, or even to use the shift key much.

Using big abstractions you can write the first version of a program very quickly. Later, when you want to optimize, there's a really good profiler that tells you where to focus your attention. You can make inner loops blindingly fast, even writing inline byte code if you need to.

There are lots of good examples to learn from, and the language is intuitive enough that you can learn how to use it from examples in a couple minutes. You don't need to look in the manual much. The manual is thin, and has few warnings and qualifications.

The language has a small core, and powerful, highly orthogonal libraries that are as carefully designed as the core language. The libraries all work well together; everything in the language fits together like the parts in a fine camera. Nothing is deprecated, or retained for compatibility. The source code of all the libraries is readily available. It's easy to talk to the operating system and to applications written in other languages.

The language is built in layers. The higher-level abstractions are built in a very transparent way out of lower-level abstractions, which you can get hold of if you want.

Nothing is hidden from you that doesn't absolutely have to be. The language offers abstractions only as a way of saving you work, rather than as a way of telling you what to do. In fact, the language encourages you to be an equal participant in its design. You can change everything about it, including even its syntax, and anything you write has, as much as possible, the same status as what comes predefined.

Notes

[1] Macros very close to the modern idea were proposed by Timothy Hart in 1964, two years after Lisp 1.5 was released. What was missing, initially, were ways to avoid variable capture and multiple evaluation; Hart's examples are subject to both.

[2] In When the Air Hits Your Brain, neurosurgeon Frank Vertosick recounts a conversation in which his chief resident, Gary, talks about the difference between surgeons and internists ("fleas"):

Gary and I ordered a large pizza and found an open booth. The chief lit a cigarette. "Look at those goddamn fleas, jabbering about some disease they'll see once in their lifetimes. That's the trouble with fleas, they only like the bizarre stuff. They hate their bread and butter cases. That's the difference between us and the fucking fleas. See, we love big juicy lumbar disc herniations, but they hate hypertension...."

It's hard to think of a lumbar disc herniation as juicy (except literally). And yet I think I know what they mean. I've often had a juicy bug to track down. Someone who's not a programmer would find it hard to imagine that there could be pleasure in a bug. Surely it's better if everything just works. In one way, it is. And yet there is undeniably a grim satisfaction in hunting down certain sorts of bugs.
```
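The essay excerpt above claims that s-expression-like trees map naturally onto HTML and are easy to generate programmatically. As a rough illustration of that structural point only (a minimal sketch in Python rather than Lisp; the function and variable names are hypothetical and not from the essay or this repository), nested tuples can stand in for s-expressions and a small recursive function can render them to markup:

```python
# Hedged sketch: render ('tag', {attrs}, child, ...) tuples -- an
# s-expression-style tree -- into an HTML string. Hypothetical names.

def render(node) -> str:
    """Recursively turn a nested-tuple tree into HTML text."""
    if isinstance(node, str):
        return node  # leaf: plain text content
    tag, attrs, *children = node
    attr_text = "".join(f' {k}="{v}"' for k, v in attrs.items())
    inner = "".join(render(child) for child in children)
    return f"<{tag}{attr_text}>{inner}</{tag}>"

page = ("html", {},
        ("body", {},
         ("h1", {"class": "title"}, "Hello"),
         ("p", {}, "Generated from an s-expression-style tree.")))

print(render(page))
# <html><body><h1 class="title">Hello</h1>
# <p>Generated from an s-expression-style tree.</p></body></html>
```

This only shows the structural correspondence the essay mentions; the further advantage it points at is that a macro system could build and transform the same tree syntax before it is ever rendered.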
out_tensor/lm_head.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:960dd347a1d286416936631b5cc2a316e3e67a3e98b995963e78789e7fb270bd
+size 103953008

out_tensor/model.layers.0.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4540add4d390fe1c68bc51092927d72ca5c67b3a60e198e0ac39bad37b18020f
+size 39546008

out_tensor/model.layers.0.mlp.gate_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7736ca0fde60e33a5e1c50595a23af5f7b3705c52c897492e3c74df93b6d208a
+size 38380824

out_tensor/model.layers.0.mlp.up_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8ef367e27c78dfac1dbca7dca5c34800847151955995a34f971cb4e16d09cc0
+size 39470352

out_tensor/model.layers.0.self_attn.k_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:794e73a6f31b470f15dd2f20f4dddba36445a8d0815f93255f20089b9bf6f47d
+size 3228960

out_tensor/model.layers.0.self_attn.o_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2955c47b51e4dc73e99b673a3568dd7430ad85a53e2f90ab96eda2730b87c01f
+size 12862760

out_tensor/model.layers.0.self_attn.q_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d182602dc5ccfa3ebca7689543861b14a7889bfb22b12b6be099eedbe1ce1673
+size 12862760

out_tensor/model.layers.0.self_attn.v_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:195c18c8746ba519136fea054587686567bcc5689bfba081b9c23a4dbc1ddb5c
+size 4277536
out_tensor/model.layers.1.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8a6b43122334a204d0538b1b27618c8baa5d68211e679de26acba678b47c43
+size 25619608

out_tensor/model.layers.1.mlp.gate_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fd05d1a2a3aacd4eb855f8bff97e16c46a826029f8bef986184d0f1542652c1
+size 23700760

out_tensor/model.layers.1.mlp.up_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8562c0f410344976ae61582fa65b77d2e9979259b9ee4f9e7e63b28f4c0b844
+size 24790288

out_tensor/model.layers.1.self_attn.k_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:583d93844dfe9fcff32c3acfdb98a14ad2b47e0dc3315aa57fce0c6f61d99ac6
+size 3228960

out_tensor/model.layers.1.self_attn.o_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91a20eef7842748ade00466d840a0a25437120c2b134ff5f2decf6463360d605
+size 12862760

out_tensor/model.layers.1.self_attn.q_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ace6e512e1b28ed44e0913fcafda77a8b43d1b53593fd115a04b943e2148fed
+size 12862760

out_tensor/model.layers.1.self_attn.v_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:965179024657ede6735183b5f5f1ed9369083470da5c7349646b65544c9aec73
+size 4277536
out_tensor/model.layers.10.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be065acb17d2b30e5f7fc64f4162852bbb06bdb422a0832a00ce22b70bbcc3ec
+size 25619616

out_tensor/model.layers.10.mlp.gate_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f210d34ba8a6dd886422f4fef3e81032742fffbf88f71b69c08a0e14662d0d6c
+size 23700768

out_tensor/model.layers.10.mlp.up_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1a197f77b4a79a8bc71308293891bdb8ef6239bba3991cf5e0d18c3487c623
+size 24790288

out_tensor/model.layers.10.self_attn.k_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9690b1ad76e5f0da516d3f849ee5c21958fea43017beea730c0d53d1393f47d0
+size 1156000

out_tensor/model.layers.10.self_attn.o_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16551a4f801e0c2805379f180ae7ad8bacd3efe5de9f5744fb3a496dd6ce5efc
+size 4572064

out_tensor/model.layers.10.self_attn.q_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d51f74b264dd1b7c98b601ce13dfd062481d480e3251dfb654be339acb1387c5
+size 4572064

out_tensor/model.layers.10.self_attn.v_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8e855e22790285dc4c35dcd5a72025138d427221bd28fa16332672eb65be747
+size 1671904
out_tensor/model.layers.11.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d59cc1531442d4b0e6e9b83256926c8356041084d1082f630c6caad7399fe3d
+size 32582816

out_tensor/model.layers.11.mlp.gate_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:410f2aa5ab1152923279adad567b990bf6f7f9b68c36bb97f8a3a6f1a79f910b
+size 31040800

out_tensor/model.layers.11.mlp.up_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7106ef91edaf3f6e0987e13f957b802da3549469109f7b0e44fd6d7644eb714e
+size 32130320

out_tensor/model.layers.11.self_attn.k_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bf86bb51499378cc9b26659d83ca1941e1ac2664d7f74cd8e22ab9c5127e16c
+size 2204576

out_tensor/model.layers.11.self_attn.o_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cae492285d3b271f02973f2a311b3c3dcac34cd60982052e753c97b5f90efb62
+size 8766368

out_tensor/model.layers.11.self_attn.q_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68a1a8a91fbcd029ee7e0b321084519b85fe2e7ca26180899573a32430b03198
+size 8766368

out_tensor/model.layers.11.self_attn.v_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbec78df2323e72c6f25ce089328d757e8d802a1128617c526f59e3e2f43d187
+size 2233632
out_tensor/model.layers.12.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b6c015507a2463975e7752bac34fa9ee1d991c3098860d62dbe01382795e41b
+size 39546016

out_tensor/model.layers.12.mlp.gate_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f135c797037ce482713c3d4841a94dc7757b70dbe92f2621eb190f12ad3366db
+size 38380832

out_tensor/model.layers.12.mlp.up_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0024b549236f4f5373889ff6a926c7bca6e423fb8358973860f10174e5582c1b
+size 39470352

out_tensor/model.layers.12.self_attn.k_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9164cd4dfb8fbea4534e1704a13725e8f33ceb1f124735964fc729d9703686c
+size 2233632

out_tensor/model.layers.12.self_attn.o_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2544d77548e85b34fb784888b15c748251d89489389339ecb87e562201d353f
+size 8881448

out_tensor/model.layers.12.self_attn.q_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7040498aa8c6ba79c072ff2993ff56fc93fd35f23941ae0597d7652a0fb18e11
+size 8881448

out_tensor/model.layers.12.self_attn.v_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c174bec64fe0a43792b3c73fe110242dcd1efc48f91c4aff2aabc11b04a23713
+size 2704672
out_tensor/model.layers.13.mlp.down_proj.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7840f5af1411b097c8307ae52f24e560bd5a10d8ebdbf4679d2341a4052a9519
+size 39546016