Hzfinfdu committed on
Commit
f1d3e02
1 Parent(s): d3282ea

Upload custom_autotune.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. custom_autotune.py +167 -0
custom_autotune.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #https://github.com/fpgaminer/GPTQ-triton
2
+ """
3
+ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
4
+ """
5
+
6
+ import builtins
7
+ import math
8
+ import time
9
+ from typing import Dict
10
+
11
+ import triton
12
+
13
+
14
class Autotuner(triton.KernelInterface):
    """Pick the fastest ``triton.Config`` for a kernel by benchmarking candidates.

    The first time ``run`` sees a given combination of key-argument values it
    benchmarks the (optionally pruned) configs, stores the winner in
    ``self.cache`` and launches the kernel with it; later calls with the same
    key reuse the cached config.

    Attributes written by ``run``:
        best_config: the config used for the most recent launch.
        configs_timings: ``{config: timing}`` from the most recent benchmark.
        bench_time: wall-clock seconds spent in the most recent benchmark.
    """

    def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False):
        '''
        :param fn: the kernel object; its ``run`` and ``warmup`` methods are called with the chosen config.
        :param arg_names: positional argument names of ``fn``, used to locate ``key`` and ``reset_to_zero`` arguments by index.
        :param configs: candidate ``triton.Config`` objects; when empty, a single default config (4 warps, 2 stages) is used.
        :param key: names of the arguments whose values form the cache key.
        :param reset_to_zero: names of arguments that get ``.zero_()`` called on them before every benchmarked launch, or None.
        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
            'perf_model': performance model used to predicate running time with different configs, returns running time
            'top_k': number of configs to bench
            'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It is called with the config list and the named arguments, and returns pruned configs.
        :param nearest_power_of_two: whether to round key arguments to the nearest power of two when caching tuning results
        '''
        if not configs:
            self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
        else:
            self.configs = configs
        self.key_idx = [arg_names.index(k) for k in key]
        self.nearest_power_of_two = nearest_power_of_two
        self.cache = {}
        # hook to reset all required tensor to zeros before relaunching a kernel
        self.hook = lambda args: 0
        if reset_to_zero is not None:
            self.reset_idx = [arg_names.index(k) for k in reset_to_zero]

            def _hook(args):
                for i in self.reset_idx:
                    args[i].zero_()
            self.hook = _hook
        self.arg_names = arg_names
        # prune configs
        if prune_configs_by:
            perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k']
            # 'early_config_prune' is optional: default to None instead of
            # leaving the local unbound (which raised UnboundLocalError below).
            early_config_prune = prune_configs_by.get('early_config_prune')
        else:
            perf_model, top_k, early_config_prune = None, None, None
        self.perf_model, self.configs_top_k = perf_model, top_k
        self.early_config_prune = early_config_prune
        self.fn = fn

    @staticmethod
    def _round_to_power_of_two(value):
        """Round ``value`` to a power of two (rounding happens in log2 space).

        Values that cannot be rounded this way (zero, negatives, non-numeric
        key arguments, infinities/NaN) are returned unchanged so that they can
        still be used as part of the cache key.
        """
        try:
            return 2 ** int(math.log2(value) + 0.5)
        except (TypeError, ValueError, OverflowError):
            return value

    def _bench(self, *args, config, **meta):
        """Benchmark ``self.fn`` with ``config`` and return the measured timing.

        :param args: positional kernel arguments.
        :param config: the ``triton.Config`` to evaluate.
        :param meta: meta-parameters supplied by the caller as keyword arguments.
        :return: the result of ``triton.testing.do_bench``, or ``float('inf')``
            when the config raises ``triton.compiler.OutOfResources``.
        :raises ValueError: if a meta-parameter is given both by the caller and by ``config``.
        """
        # check for conflicts, i.e. meta-parameters both provided
        # as kwargs and by the autotuner
        conflicts = meta.keys() & config.kwargs.keys()
        if conflicts:
            raise ValueError(
                f"Conflicting meta-parameters: {', '.join(conflicts)}."
                " Make sure that you don't re-define auto-tuned symbols."
            )
        # augment meta-parameters with tunable ones
        current = dict(meta, **config.kwargs)

        def kernel_call():
            if config.pre_hook:
                config.pre_hook(self.nargs)
            # zero the reset_to_zero arguments so each launch starts from the same state
            self.hook(args)
            self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
        try:
            # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
            # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
            return triton.testing.do_bench(kernel_call, rep=40)
        except triton.compiler.OutOfResources:
            # a config that does not fit on the device can never be selected
            return float('inf')

    def run(self, *args, **kwargs):
        """Launch the kernel with the best known config, tuning first if needed.

        Tuning only happens when more than one config is available and the
        key built from the key arguments is not yet in ``self.cache``.

        :return: whatever ``self.fn.run`` returns.
        """
        self.nargs = dict(zip(self.arg_names, args))
        if len(self.configs) > 1:
            key = tuple(args[i] for i in self.key_idx)

            # This reduces the amount of autotuning by rounding the keys to the nearest power of two
            # In my testing this gives decent results, and greatly reduces the amount of tuning required
            if self.nearest_power_of_two:
                key = tuple(self._round_to_power_of_two(x) for x in key)

            if key not in self.cache:
                # prune configs
                pruned_configs = self.prune_configs(kwargs)
                bench_start = time.time()
                timings = {config: self._bench(*args, config=config, **kwargs)
                           for config in pruned_configs}
                bench_end = time.time()
                self.bench_time = bench_end - bench_start
                self.cache[key] = builtins.min(timings, key=timings.get)
                # benchmarking launched the kernel repeatedly; reset the
                # reset_to_zero arguments before the real launch below
                self.hook(args)
                self.configs_timings = timings
            config = self.cache[key]
        else:
            config = self.configs[0]
        self.best_config = config
        if config.pre_hook is not None:
            config.pre_hook(self.nargs)
        return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)

    def prune_configs(self, kwargs):
        """Return the configs that should actually be benchmarked.

        Applies ``early_config_prune`` first (if set), then, if a performance
        model is set and more than ``top_k`` configs remain, keeps the
        ``top_k`` configs with the lowest estimated running time.

        :param kwargs: caller-supplied meta-parameters, forwarded to the performance model.
        """
        pruned_configs = self.configs
        if self.early_config_prune:
            pruned_configs = self.early_config_prune(self.configs, self.nargs)
        if self.perf_model:
            top_k = self.configs_top_k
            # a float top_k <= 1.0 is a fraction of the full (unpruned) config list
            if isinstance(top_k, float) and top_k <= 1.0:
                top_k = int(len(self.configs) * top_k)
            if len(pruned_configs) > top_k:
                est_timing = {
                    config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages,
                                            num_warps=config.num_warps)
                    for config in pruned_configs
                }
                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
        return pruned_configs

    def warmup(self, *args, **kwargs):
        """Call ``self.fn.warmup`` once for every config that survives pruning."""
        self.nargs = dict(zip(self.arg_names, args))
        for config in self.prune_configs(kwargs):
            self.fn.warmup(
                *args,
                num_warps=config.num_warps,
                num_stages=config.num_stages,
                **kwargs,
                **config.kwargs,
            )
        self.nargs = None
132
+
133
+
134
def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
    """
    Decorator that wraps a :code:`triton.jit`'d function in an :code:`Autotuner`.

    .. highlight:: python
    .. code-block:: python

        @autotune(configs=[
            triton.Config({'BLOCK_SIZE': 128}, num_warps=4),
            triton.Config({'BLOCK_SIZE': 1024}, num_warps=8),
          ],
          key=['x_size']  # the two above configs will be evaluated anytime
                          # the value of x_size changes
        )
        @triton.jit
        def kernel(x_ptr, x_size, **META):
            BLOCK_SIZE = META['BLOCK_SIZE']

    :note: Evaluating all the configurations means the kernel runs multiple times,
           so any value the kernel updates is updated multiple times as well.
           Use the `reset_to_zero` argument to have the listed tensors reset to
           zero before each configuration is run.

    :param configs: a list of :code:`triton.Config` objects
    :type configs: list[triton.Config]
    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
    :type key: list[str]
    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
        'perf_model': performance model used to predicate running time with different configs, returns running time
        'top_k': number of configs to bench
        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs.
    :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
    :type reset_to_zero: list[str]
    :param nearest_power_of_two: passed through to :code:`Autotuner`; whether key values are rounded to the nearest power of two when caching tuning results.
    :type nearest_power_of_two: bool
    """
    def wrap(kernel_fn):
        # The argument names come from the jit'd function itself.
        return Autotuner(
            kernel_fn,
            kernel_fn.arg_names,
            configs,
            key,
            reset_to_zero,
            prune_configs_by=prune_configs_by,
            nearest_power_of_two=nearest_power_of_two,
        )

    return wrap