# Ref: Ouyang, A. (2023). Understanding the Performance of Transformer Inference (Doctoral dissertation, Massachusetts Institute of Technology).

import streamlit as st
import pandas as pd
from model_util import fetch_dictionary_content, load_parameter
from calc_util import (
    prefilling_operation,
    prefilling_activation_memory,
    generation_operation,
    generation_activation_memory,
)
from render_util import create_table, header4, header5
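# Streamlit calculator that estimates transformer inference cost in the style of a
# roofline analysis: parameter counts, per-phase FLOPs, activation traffic, and
# compute vs. memory latency for prefill and token-by-token generation.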


st.set_page_config(layout='wide')
if 'model_config' not in st.session_state:
    st.session_state['model_config'] = {}


def load_model_config(model_id):
    if 'model_id' in st.session_state['model_config'] and st.session_state['model_config']['model_id'] == model_id:
        return st.session_state['model_config']
    model_config = {}
    dictionary_content = fetch_dictionary_content(model_id)
    if dictionary_content:
        model_config['model_id'] = model_id
        model_config['hidden_size'] = dictionary_content['hidden_size']
        model_config['num_attention_heads'] = dictionary_content['num_attention_heads']
        model_config['num_hidden_layers'] = dictionary_content['num_hidden_layers']
        model_config['intermediate_size'] = load_parameter(dictionary_content, ['intermediate_size', 'ffn_dim'])
        model_config['vocab_size'] = dictionary_content['vocab_size']
        model_config['max_position_embeddings'] = dictionary_content['max_position_embeddings']
        model_config['layernorm_operation'] = 2  # layernorms per transformer block (pre-attention and pre-MLP)
    else:
        st.warning("Model config is not publicly available; falling back to opt-1.3b defaults.")
        model_config['model_id'] = 'opt-1.3b'
        model_config['hidden_size'] = 2048
        model_config['num_attention_heads'] = 32
        model_config['num_hidden_layers'] = 24
        model_config['intermediate_size'] = 8192
        model_config['vocab_size'] = 50272
        model_config['max_position_embeddings'] = 2048
        model_config['layernorm_operation'] = 2

    st.session_state['model_config'] = model_config
    return model_config


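# Aggregate keys: these rows are split out of the itemized tables into the "Summary" tables below.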
subtotal_parameters = [
    'embedding_weights',
    'attention_weights',
    'mlp_weights',
    'model_total_size'
]

subtotal_operations = [
    'embeddings',
    'attention',
    'mlp',
    'total',
]



col1, col2, col3, col4, col5 = st.columns([1,1.5,2.3,2.3,0.1])

inference_config = {}
parameter_count = {}
cached_parameter_count = {}

prefilling_operation_count = {}
generation_operation_count = {}
prefilling_memory_count = {}
generation_memory_count = {}

gpu_config = {}
inference_info = {}

with col1:
    header4("Model")
    model_id = st.text_input("huggingface model id", 'ArthurZ/opt-13b')
    model_config = load_model_config(model_id)
    model_config['hidden_size'] = st.number_input('hidden size', value=model_config['hidden_size'], format="%d")
    model_config['num_attention_heads'] = st.number_input('num attention heads', value=model_config['num_attention_heads'], format="%d")
    model_config['num_hidden_layers'] = st.number_input('num hidden layers', value=model_config['num_hidden_layers'], format="%d")
    model_config['intermediate_size'] = st.number_input('intermediate size', value=model_config['intermediate_size'], format="%d")
    model_config['vocab_size'] = st.number_input('vocab size', value=model_config['vocab_size'], format="%d")
    model_config['max_position_embeddings'] = st.number_input('max position embeddings', value=model_config['max_position_embeddings'], format="%d")
    model_config['hidden_size_per_head'] = model_config['hidden_size'] / model_config['num_attention_heads']  # per-head dimension

    header4("Inference Setting")
    inference_config['batchsize'] = st.number_input('batch size', value=1, format="%d")
    inference_config['input_seq_length'] = st.number_input('input seq length', value=1, format="%d")
    inference_config['output_seq_length'] = st.number_input('output seq length', value=1, format="%d")
    inference_config['byte_per_parameter'] = st.number_input('bytes per parameter', value=2, format="%d")
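    # 2 bytes per parameter corresponds to fp16/bf16 weights; 4 would be fp32, 1 int8.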
    inference_config['KV_cache'] = st.checkbox("Use KV cache", value=True)

    header4("GPU Setting")
    gpu_config['Name'] = st.text_input('GPU Type', value="A6000")
    gpu_config['TFLOP'] = st.number_input('TFLOPS', value=38.7, format="%.2f")
    gpu_config['memory_bandwidth'] = st.number_input('memory bandwidth (GB/s)', value=768, format="%d")
    # Peak FLOP/s over peak bytes/s; decimal units (10**12 FLOP/TFLOP, 10**9 B/GB)
    # are used consistently throughout, matching vendor spec sheets.
    gpu_config['arithmetic_intensity'] = gpu_config['TFLOP'] * 10**12 / (gpu_config['memory_bandwidth'] * 10**9)
    st.write(f"arithmetic intensity (FLOPs/Byte): {gpu_config['arithmetic_intensity']:.3f}")
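    # Phases whose FLOPs-per-byte ratio falls below this ridge point run memory-bound
    # on this GPU; those above it run compute-bound (roofline model; Ouyang, 2023).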

with col2:
    parameter_count['word_embedding'] = model_config['vocab_size']*model_config['hidden_size']
    parameter_count['positional_embedding'] = model_config['max_position_embeddings']*model_config['hidden_size']
    
    # Q, K, V, and output projections: each is a (hidden_size x hidden_size) weight
    # matrix per layer; per-head dimensions multiply back to hidden_size. Biases are ignored.
    parameter_count['attention_Q']   = model_config['num_hidden_layers']*model_config['hidden_size']**2
    parameter_count['attention_K']   = model_config['num_hidden_layers']*model_config['hidden_size']**2
    parameter_count['attention_V']   = model_config['num_hidden_layers']*model_config['hidden_size']**2
    parameter_count['attention_out'] = model_config['num_hidden_layers']*model_config['hidden_size']**2
    
    # Weight and bias (factor 2) for each of the 'layernorm_operation' layernorms per layer.
    parameter_count['layernorm'] = 2*model_config['layernorm_operation']*model_config['num_hidden_layers']*model_config['hidden_size']
    parameter_count['mlp1'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
    parameter_count['mlp2'] = model_config['num_hidden_layers']*model_config['hidden_size']*model_config['intermediate_size']
    parameter_count['embedding_weights'] = parameter_count['word_embedding'] + parameter_count['positional_embedding']
    parameter_count['attention_weights'] = parameter_count['attention_out'] + parameter_count['attention_Q'] + parameter_count['attention_K'] + parameter_count['attention_V']
    parameter_count['mlp_weights'] = parameter_count['mlp1'] + parameter_count['mlp2']
    # Total model footprint in bytes (parameter count times byte_per_parameter).
    parameter_count['model_total_size'] = inference_config['byte_per_parameter'] * (
        parameter_count['embedding_weights'] +
        parameter_count['attention_weights'] +
        parameter_count['mlp_weights'] +
        parameter_count['layernorm'])
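
    # Closed form of the count above (biases ignored):
    #   params ~ (V + P)*h + L*(4*h**2 + 2*h*i + 4*h)
    # with V = vocab_size, P = max_position_embeddings, h = hidden_size,
    # L = num_hidden_layers, i = intermediate_size. For the opt-1.3b fallback config
    # this gives ~1.32e9 parameters, consistent with the model name.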



    parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key not in subtotal_parameters}
    subtotal_parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key in subtotal_parameters}

    # Convert dictionaries to pandas dataframes for table display
    df_parameters_items = pd.DataFrame(list(parameters_items.items()), columns=["Parameter", "Count"])
    df_subtotal_parameters_items = pd.DataFrame(list(subtotal_parameters_items.items()), columns=["Parameter", "Count"])

    header4("Model Parameters")
    st.markdown(create_table(df_parameters_items))

    header4("Parameters Summary")
    st.markdown(create_table(df_subtotal_parameters_items))


with col3: # Prefilling
    prefilling_operation_count = prefilling_operation(model_config, inference_config)
    prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
    # Compute-side latency: total prefill FLOPs over peak FLOP/s (decimal units, as above).
    inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP'] * 10**12)
    inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
    # Memory-side latency: total activation traffic over peak bytes/s.
    inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth'] * 10**9)
    # KV cache after prefill: 2 (K and V) * batch * hidden * layers * input tokens,
    # times byte_per_parameter so the value matches the "(bytes)" label below.
    cached_parameter_count['kv_cache'] = 2 * inference_config['byte_per_parameter'] * inference_config['batchsize'] * model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']

    operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
    subtotal_operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key in subtotal_operations}
    prefilling_arithmetic_intensity = {key: "{:.3f}".format(prefilling_operation_count[key]/prefilling_activation_memory_count[key]) for key in prefilling_activation_memory_count}
    # Format for display under a new name so the raw counts above are not clobbered.
    prefilling_activation_memory_items = {key: "{:,}".format(int(value)) for key, value in prefilling_activation_memory_count.items()}


    # Convert dictionaries to pandas dataframes for table display
    df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPs"])
    df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPs"])
    
    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(prefilling_activation_memory_items)
    df_operation_count["Arithmetic Intensity"] = df_operation_count["Operation"].map(prefilling_arithmetic_intensity)
    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(prefilling_activation_memory_items)
    df_subtotal_operation_count["Arithmetic Intensity"] = df_subtotal_operation_count["Operation"].map(prefilling_arithmetic_intensity)
    
    header4("Inference Ops: Prefilling")
    st.markdown(create_table(df_operation_count))

    header5("Summary: Prefilling")
    st.markdown(create_table(df_subtotal_operation_count))
    st.write(f"Prefilling throughput (tokens/s): {inference_info['inference_prefilling_throughput']:.2f}")
    st.write(f"Compute (FLOPs) latency (s): {inference_info['inference_prefilling_time']:.6f}")
    st.write(f"Memory latency (s): {inference_info['prefilling_memory_latency']:.6f}")
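    # A roofline estimate of prefill latency is max(compute latency, memory latency);
    # with long inputs prefill is usually compute-bound, so the FLOPs term dominates.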

    if inference_config['KV_cache']:
        st.write(f"KV cache (bytes): {cached_parameter_count['kv_cache']:,}")
        


with col4: # Generation
    generation_operation_count = generation_operation(model_config, inference_config)
    generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
    inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP'] * 10**12)
    inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
    # Client-visible throughput counts prefill time as well as generation time.
    inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
    inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth'] * 10**9)
    # KV cache after generation spans input + output tokens; reported in bytes, as above.
    cached_parameter_count['kv_cache'] = 2 * inference_config['byte_per_parameter'] * inference_config['batchsize'] * model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length'] + inference_config['output_seq_length'])
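    # Decode emits one token per forward pass, so its arithmetic intensity is low; the
    # generation phase is typically memory-bandwidth-bound, not FLOPs-bound (Ouyang, 2023).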

    operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
    subtotal_operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key in subtotal_operations}
    generation_activation_memory_items = {key: "{:,}".format(int(value)) for key, value in generation_activation_memory_count.items()}

    # Convert dictionaries to pandas dataframes for table display
    df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPs"])
    df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPs"])
   
    #df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_items)
    #df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_items)

    header4("Inference Ops: Generation")
    st.markdown(create_table(df_operation_count))

    header5("Summary: Generation")
    st.markdown(create_table(df_subtotal_operation_count))
    st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
    st.write(f"Compute (FLOPs) latency (s): {inference_info['inference_generation_time']:.6f}")
    #st.write(f"Memory latency: {inference_info['generation_memory_latency']}")

    if inference_config['KV_cache']:
        st.write(f"KV cache (bytes): {cached_parameter_count['kv_cache']:,}")
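        # Example with the fallback opt-1.3b config and the default settings above:
        # 2 * 2 bytes * batch 1 * hidden 2048 * 24 layers * (1 + 1) tokens = 393,216 bytes.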