CUDA L2 Persistent Cache

Starting with CUDA 11.0, devices of compute capability 8.0 and above can influence the persistence of data in the L2 cache. Because the L2 cache is on-chip, it can potentially provide higher-bandwidth, lower-latency access to global memory.

Reposted from: CUDA L2 Persistent Cache


Introduction

In this blog post, I create a CUDA example to demonstrate how to use the persistent L2 cache to accelerate data traffic.

CUDA L2 Persistent Cache

In this example, a small constant buffer holding a few values is used to reset a large streaming buffer. For example, if the constant buffer has size 4 with values [5, 2, 1, 4], and the large streaming buffer to reset has size 100, then after the reset the streaming buffer contains [5, 2, 1, 4, 5, 2, 1, 4, …], i.e., the values of the constant buffer repeated.
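
To make the reset semantics concrete, here is a minimal host-side sketch; repeat_fill is a hypothetical helper name used only for illustration, and the CUDA kernel later in this post implements the same i % lut_size indexing in parallel.

#include <cstddef>

// Fill data[0..data_size) with lut[0..lut_size) repeated:
// data[i] = lut[i % lut_size].
void repeat_fill(int* data, std::size_t data_size, int const* lut,
                 std::size_t lut_size)
{
    for (std::size_t i{0}; i < data_size; ++i)
    {
        data[i] = lut[i % lut_size];
    }
}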

Because the streaming buffer is much larger than the constant buffer, each element of the constant buffer is accessed far more frequently than each element of the streaming buffer. Accessing buffers in global memory is expensive. If we can keep the frequently accessed constant buffer resident in the L2 cache, accesses to it can be accelerated.

CUDA Data Reset

For the data reset CUDA kernel, I create a baseline that launches the kernel without a persistent L2 cache, a variant that launches the kernel with a 3 MB persistent L2 cache but suffers from cache thrashing once the constant buffer grows beyond 3 MB, and an optimized variant that launches the kernel with a 3 MB persistent L2 cache and eliminates the thrashing.

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <cuda_runtime.h>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
void check(cudaError_t err, char const* const func, char const* const file,
           int const line)
{
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(char const* const file, int const line)
{
    cudaError_t const err{cudaGetLastError()};
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

template <class T>
float measure_performance(std::function<T(cudaStream_t)> bound_function,
                          cudaStream_t stream, int num_repeats = 100,
                          int num_warmups = 100)
{
    cudaEvent_t start, stop;
    float time;

    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));

    for (int i{0}; i < num_warmups; ++i)
    {
        bound_function(stream);
    }

    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));

    CHECK_CUDA_ERROR(cudaEventRecord(start, stream));
    for (int i{0}; i < num_repeats; ++i)
    {
        bound_function(stream);
    }
    CHECK_CUDA_ERROR(cudaEventRecord(stop, stream));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    CHECK_LAST_CUDA_ERROR();
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&time, start, stop));
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));

    float const latency{time / num_repeats};

    return latency;
}

// Grid-stride kernel: fill data_streaming with repeated copies of
// lut_persistent.
__global__ void reset_data(int* data_streaming, int const* lut_persistent,
                           size_t data_streaming_size,
                           size_t lut_persistent_size)
{
    size_t const idx{blockDim.x * blockIdx.x + threadIdx.x};
    size_t const stride{blockDim.x * gridDim.x};
    for (size_t i{idx}; i < data_streaming_size; i += stride)
    {
        data_streaming[i] = lut_persistent[i % lut_persistent_size];
    }
}

/**
 * @brief Reset data_streaming using lut_persistent so that data_streaming
 * becomes lut_persistent repeated.
 *
 * @param data_streaming The data to reset.
 * @param lut_persistent The values for resetting data_streaming.
 * @param data_streaming_size The size of data_streaming.
 * @param lut_persistent_size The size of lut_persistent.
 * @param stream The CUDA stream.
 */
void launch_reset_data(int* data_streaming, int const* lut_persistent,
                       size_t data_streaming_size, size_t lut_persistent_size,
                       cudaStream_t stream)
{
    dim3 const threads_per_block{1024};
    dim3 const blocks_per_grid{32};
    reset_data<<<blocks_per_grid, threads_per_block, 0, stream>>>(
        data_streaming, lut_persistent, data_streaming_size,
        lut_persistent_size);
    CHECK_LAST_CUDA_ERROR();
}

bool verify_data(int* data, int n, size_t size)
{
    for (size_t i{0}; i < size; ++i)
    {
        if (data[i] != i % n)
        {
            return false;
        }
    }
    return true;
}

int main(int argc, char* argv[])
{
    size_t num_megabytes_persistent_data{3};
    if (argc == 2)
    {
        num_megabytes_persistent_data = std::atoi(argv[1]);
    }

    constexpr int const num_repeats{100};
    constexpr int const num_warmups{10};

    cudaDeviceProp device_prop{};
    int current_device{0};
    CHECK_CUDA_ERROR(cudaGetDevice(&current_device));
    CHECK_CUDA_ERROR(cudaGetDeviceProperties(&device_prop, current_device));
    std::cout << "GPU: " << device_prop.name << std::endl;
    std::cout << "L2 Cache Size: " << device_prop.l2CacheSize / 1024 / 1024
              << " MB" << std::endl;
    std::cout << "Max Persistent L2 Cache Size: "
              << device_prop.persistingL2CacheMaxSize / 1024 / 1024 << " MB"
              << std::endl;

    size_t const num_megabytes_streaming_data{1024};
    if (num_megabytes_persistent_data > num_megabytes_streaming_data)
    {
        throw std::runtime_error(
            "Try setting persistent data size smaller than 1024 MB.");
    }
    size_t const size_persistent(num_megabytes_persistent_data * 1024 * 1024 /
                                 sizeof(int));
    size_t const size_streaming(num_megabytes_streaming_data * 1024 * 1024 /
                                sizeof(int));
    std::cout << "Persistent Data Size: " << num_megabytes_persistent_data
              << " MB" << std::endl;
    std::cout << "Streaming Data Size: " << num_megabytes_streaming_data
              << " MB" << std::endl;
    cudaStream_t stream;

    std::vector<int> lut_persistent_vec(size_persistent, 0);
    for (size_t i{0}; i < lut_persistent_vec.size(); ++i)
    {
        lut_persistent_vec[i] = i;
    }
    std::vector<int> data_streaming_vec(size_streaming, 0);

    int* d_lut_persistent;
    int* d_data_streaming;
    int* h_lut_persistent = lut_persistent_vec.data();
    int* h_data_streaming = data_streaming_vec.data();

    CHECK_CUDA_ERROR(
        cudaMalloc(&d_lut_persistent, size_persistent * sizeof(int)));
    CHECK_CUDA_ERROR(
        cudaMalloc(&d_data_streaming, size_streaming * sizeof(int)));
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
    CHECK_CUDA_ERROR(cudaMemcpy(d_lut_persistent, h_lut_persistent,
                                size_persistent * sizeof(int),
                                cudaMemcpyHostToDevice));

    launch_reset_data(d_data_streaming, d_lut_persistent, size_streaming,
                      size_persistent, stream);
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    CHECK_CUDA_ERROR(cudaMemcpy(h_data_streaming, d_data_streaming,
                                size_streaming * sizeof(int),
                                cudaMemcpyDeviceToHost));
    assert(verify_data(h_data_streaming, size_persistent, size_streaming));

    std::function<void(cudaStream_t)> const function{
        std::bind(launch_reset_data, d_data_streaming, d_lut_persistent,
                  size_streaming, size_persistent, std::placeholders::_1)};
    float const latency{
        measure_performance(function, stream, num_repeats, num_warmups)};
    std::cout << std::fixed << std::setprecision(3)
              << "Latency Without Using Persistent L2 Cache: " << latency
              << " ms" << std::endl;

    // Start to use persistent cache.
    cudaStream_t stream_persistent_cache;
    size_t const num_megabytes_persistent_cache{3};
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream_persistent_cache));

    // Reserve a portion of the L2 cache for persisting accesses.
    CHECK_CUDA_ERROR(
        cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
                           num_megabytes_persistent_cache * 1024 * 1024));

    // hitRatio == 1.0 marks every access in the window as persisting; if the
    // window is larger than the persistent L2 carve-out, this can thrash.
    cudaStreamAttrValue stream_attribute_thrashing;
    stream_attribute_thrashing.accessPolicyWindow.base_ptr =
        reinterpret_cast<void*>(d_lut_persistent);
    stream_attribute_thrashing.accessPolicyWindow.num_bytes =
        num_megabytes_persistent_data * 1024 * 1024;
    stream_attribute_thrashing.accessPolicyWindow.hitRatio = 1.0;
    stream_attribute_thrashing.accessPolicyWindow.hitProp =
        cudaAccessPropertyPersisting;
    stream_attribute_thrashing.accessPolicyWindow.missProp =
        cudaAccessPropertyStreaming;

    CHECK_CUDA_ERROR(cudaStreamSetAttribute(
        stream_persistent_cache, cudaStreamAttributeAccessPolicyWindow,
        &stream_attribute_thrashing));

    float const latency_persistent_cache_thrashing{measure_performance(
        function, stream_persistent_cache, num_repeats, num_warmups)};
    std::cout << std::fixed << std::setprecision(3) << "Latency With Using "
              << num_megabytes_persistent_cache
              << " MB Persistent L2 Cache (Potentially Thrashing): "
              << latency_persistent_cache_thrashing << " ms" << std::endl;

    // Scale hitRatio so that hitRatio * num_bytes does not exceed the
    // persistent L2 cache carve-out, which avoids thrashing.
    cudaStreamAttrValue stream_attribute_non_thrashing{
        stream_attribute_thrashing};
    stream_attribute_non_thrashing.accessPolicyWindow.hitRatio =
        std::min(static_cast<double>(num_megabytes_persistent_cache) /
                     num_megabytes_persistent_data,
                 1.0);
    CHECK_CUDA_ERROR(cudaStreamSetAttribute(
        stream_persistent_cache, cudaStreamAttributeAccessPolicyWindow,
        &stream_attribute_non_thrashing));

    float const latency_persistent_cache_non_thrashing{measure_performance(
        function, stream_persistent_cache, num_repeats, num_warmups)};
    std::cout << std::fixed << std::setprecision(3) << "Latency With Using "
              << num_megabytes_persistent_cache
              << " MB Persistent L2 Cache (Non-Thrashing): "
              << latency_persistent_cache_non_thrashing << " ms" << std::endl;

    CHECK_CUDA_ERROR(cudaFree(d_lut_persistent));
    CHECK_CUDA_ERROR(cudaFree(d_data_streaming));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream_persistent_cache));
}

To avoid thrashing, the product of accessPolicyWindow.hitRatio and accessPolicyWindow.num_bytes should be less than or equal to cudaLimitPersistingL2CacheSize. The accessPolicyWindow.hitRatio parameter specifies the fraction of accesses in the window that receive the accessPolicyWindow.hitProp property, which is usually cudaAccessPropertyPersisting. The accessPolicyWindow.num_bytes parameter specifies the number of bytes covered by the access policy window, which is usually the size of the persistent data.

In practice, we can set accessPolicyWindow.hitRatio to the ratio of the persistent L2 cache size to the persistent data size. For example, if the persistent L2 cache size is 3 MB and the persistent data size is 4 MB, we can set accessPolicyWindow.hitRatio to 3 / 4 = 0.75.
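
As a compact sketch of the non-thrashing configuration (mirroring the full listing above; persistent_cache_bytes and persistent_data_bytes are illustrative names, not variables from the listing):

// Avoid thrashing: hitRatio * num_bytes should not exceed
// cudaLimitPersistingL2CacheSize.
// E.g., a 3 MB persistent L2 carve-out over a 4 MB window:
// hitRatio = 3.0 / 4.0 = 0.75.
cudaStreamAttrValue attribute{};
attribute.accessPolicyWindow.base_ptr =
    reinterpret_cast<void*>(d_lut_persistent);
attribute.accessPolicyWindow.num_bytes = persistent_data_bytes;
attribute.accessPolicyWindow.hitRatio =
    std::min(static_cast<double>(persistent_cache_bytes) /
                 persistent_data_bytes,
             1.0);
attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
CHECK_CUDA_ERROR(cudaStreamSetAttribute(
    stream_persistent_cache, cudaStreamAttributeAccessPolicyWindow,
    &attribute));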

Running the CUDA Data Reset

We can build and run the example on an NVIDIA Ampere GPU. In my case, I used an NVIDIA RTX 3090 GPU.

$ nvcc l2-persistent.cu -o l2-persistent -std=c++14 --gpu-architecture=compute_80
$ ./l2-persistent
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 3 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 3.071 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 2.436 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 2.443 ms

We can see that when the persistent data size is 3 MB and the persistent L2 cache is 3 MB, the application runs roughly 20% faster ((3.071 - 2.436) / 3.071 ≈ 20.7%).

Benchmarking

We can also run some small benchmarks by varying the persistent data size.
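
For convenience, the individual runs below can also be scripted with a simple shell loop:

$ for size_mb in 1 2 3 4 5 6; do ./l2-persistent ${size_mb}; done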

$ ./l2-persistent 1
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 1 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 1.754 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 1.685 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 1.674 ms

$ ./l2-persistent 2
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 2 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 2.158 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 1.997 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 2.002 ms

$ ./l2-persistent 3
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 3 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 3.095 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 2.510 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 2.533 ms

$ ./l2-persistent 4
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 4 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 3.906 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 3.632 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 3.706 ms

$ ./l2-persistent 5
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 5 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 4.120 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 4.554 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 3.920 ms

$ ./l2-persistent 6
GPU: NVIDIA GeForce RTX 3090
L2 Cache Size: 6 MB
Max Persistent L2 Cache Size: 4 MB
Persistent Data Size: 6 MB
Streaming Data Size: 1024 MB
Latency Without Using Persistent L2 Cache: 4.194 ms
Latency With Using 3 MB Persistent L2 Cache (Potentially Thrashing): 4.583 ms
Latency With Using 3 MB Persistent L2 Cache (Non-Thrashing): 4.255 ms

We can see that even when the persistent data size exceeds the persistent L2 cache, the latency of the non-thrashing persistent L2 cache configuration is usually no worse than the baseline.
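
One related detail from the CUDA programming guide: cache lines marked as persisting stay resident in L2 until they are evicted or reset, so between workloads it can be worth releasing the carve-out. A minimal sketch, reusing the stream attribute from the listing above:

// Shrink the access policy window to zero bytes and reset persisting lines
// so that later kernels get the full L2 capacity for normal accesses.
stream_attribute_non_thrashing.accessPolicyWindow.num_bytes = 0;
CHECK_CUDA_ERROR(cudaStreamSetAttribute(
    stream_persistent_cache, cudaStreamAttributeAccessPolicyWindow,
    &stream_attribute_non_thrashing));
CHECK_CUDA_ERROR(cudaCtxResetPersistingL2Cache());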

FAQ

Q: Persistent L2 cache vs. shared memory?

The persistent L2 cache is not the same as shared memory. The persistent cache is visible to all threads on the GPU, whereas shared memory is visible only to the threads within the same block. For small, frequently accessed data, we can also use shared memory to accelerate access, as sketched below. However, shared memory is limited to roughly 48 to 96 KB per thread block depending on the GPU, whereas the persistent L2 cache is limited to a few MB per GPU.
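
For comparison, here is a hypothetical shared-memory variant of the reset kernel. It is a sketch that is not benchmarked in this post, and it assumes lut_size * sizeof(int) fits within the per-block shared memory limit.

// Shared-memory variant of reset_data (illustrative only).
__global__ void reset_data_shared(int* data_streaming, int const* lut,
                                  size_t data_streaming_size, size_t lut_size)
{
    extern __shared__ int lut_shared[];
    // Cooperatively stage the small LUT into shared memory.
    for (size_t i{threadIdx.x}; i < lut_size; i += blockDim.x)
    {
        lut_shared[i] = lut[i];
    }
    __syncthreads();
    size_t const idx{blockDim.x * blockIdx.x + threadIdx.x};
    size_t const stride{blockDim.x * gridDim.x};
    for (size_t i{idx}; i < data_streaming_size; i += stride)
    {
        data_streaming[i] = lut_shared[i % lut_size];
    }
}

// Launch with dynamic shared memory sized to the LUT:
// reset_data_shared<<<32, 1024, lut_size * sizeof(int), stream>>>(
//     d_data_streaming, d_lut_persistent, size_streaming, lut_size);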

References

L2 Cache Window
Function Binding and Performance Measurement