Hi there 👋

Chance favors the bold.

cuCollections

still draft, to be updated 实现了一个dynamic_map的retrieve_all 访存算子，经典过滤问题，predicate为true的元素拷贝到out数组，重点是需要维护一个 atomic 的out index, https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/, 翻译：https://zhuanlan.zhihu.com/p/581078557 host端： template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> template <typename KeyOut, typename ValueOut> std::pair<KeyOut, ValueOut> dynamic_map<Key, Value, Scope, Allocator>::retrieve_all( KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const { auto constexpr block_size = 128; auto constexpr stride = 1; auto const capacity = get_capacity(); auto grid_size = (capacity + stride * block_size - 1) / (stride * block_size); std::vector<size_t> submap_cap_prefix(submaps_.size()); std::inclusive_scan(submaps_.begin(), submaps_.end(), submap_cap_prefix.begin(), [](auto const& sum, auto const& submap) { return sum + submap->get_capacity(); }, (size_t)0); thrust::device_vector<size_t> submap_cap_prefix_d(submap_cap_prefix); // 复用alloc_（用于slots_的alloc_）会比直接cudaMalloc快一个数量级，不需要重新分配内存 // 单纯cudaMalloc会触发GPU driver/runtime 的 allocation 初始化、页表建立等 using temp_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<char>; auto temp_allocator = temp_allocator_type{alloc_}; auto d_num_out = reinterpret_cast<unsigned long long*>( std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(unsigned long long))); CUCO_CUDA_TRY(cudaMemsetAsync(d_num_out, 0, sizeof(unsigned long long), stream)); detail::retrieve_all<block_size><<<grid_size, block_size, 0, stream>>>( keys_out, values_out, submap_views_.data().get(), submaps_.size(), capacity, d_num_out, submap_cap_prefix_d.data().get(), empty_key_sentinel_, erased_key_sentinel_); size_t h_num_out; CUCO_CUDA_TRY( cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); CUCO_CUDA_TRY(cudaFree(d_num_out)) return {keys_out + 
h_num_out, values_out + h_num_out}; } naive实现，一个全局atomic template <uint32_t block_size, typename OutputIt, typename viewT, typename PrefixT, typename Key> CUCO_KERNEL void retrieve_all(OutputIt keys_out, OutputIt values_out, viewT* submap_views, uint32_t num_submaps, uint64_t capacity, unsigned long long* d_num_out, PrefixT* prefix_sum, Key empty_key_sentinel, Key erased_key_sentinel) { auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto stride = blockDim.x * gridDim.x; for (; tid < capacity; tid += stride) { uint32_t submap_idx = 0; uint32_t submap_offset = tid; // prefix_sum长度一般就10以内，不需要二分之类的操作 while (tid >= prefix_sum[submap_idx] && submap_idx < num_submaps ) ++submap_idx; if (submap_idx > 0) { submap_offset = tid - prefix_sum[submap_idx - 1]; } auto const &current_slot = submap_views[submap_idx].get_slots()[submap_offset]; Key const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); auto const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); if (is_filled) { auto idx = atomicAdd(d_num_out, static_cast<unsigned long long>(1)); auto value = current_slot.second.load(cuda::std::memory_order_relaxed); keys_out[idx] = existing_key; values_out[idx] = value; } } } block内atomicAdd + 全局atomicAdd // 一个block内用一个__shared__ local_count表示这个block中predict为true的数量 // local_pos表示当前线程在该block内第几个predict为true template <uint32_t block_size, typename OutputIt, typename viewT, typename PrefixT, typename Key> CUCO_KERNEL void retrieve_all(OutputIt keys_out, OutputIt values_out, viewT* submap_views, uint32_t num_submaps, uint64_t capacity, unsigned long long* d_num_out, PrefixT* prefix_sum, Key empty_key_sentinel, Key erased_key_sentinel) { auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto stride = blockDim.x * gridDim.x; __shared__ unsigned int local_count; if (threadIdx.x == 0) { local_count = 0; } __syncthreads(); for (; tid < capacity; tid += stride) { 
uint32_t submap_idx = 0; uint32_t submap_offset = tid; while (tid >= prefix_sum[submap_idx] && submap_idx < num_submaps ) ++submap_idx; if (submap_idx > 0) { submap_offset = tid - prefix_sum[submap_idx - 1]; } auto const &current_slot = submap_views[submap_idx].get_slots()[submap_offset]; Key const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); auto const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); unsigned int local_pos = 0; if (is_filled) { local_pos = atomicAdd_block(&local_count, 1); } __syncthreads(); if (threadIdx.x == 0) { local_count = atomicAdd(d_num_out, local_count); } __syncthreads(); if (is_filled) { auto value = current_slot.second.load(cuda::std::memory_order_relaxed); keys_out[local_count + local_pos] = existing_key; values_out[local_count + local_pos] = value; } } } // 类似原理，但是用cub::BlockScan实现 template <uint32_t block_size, typename OutputIt, typename viewT, typename PrefixT, typename Key> CUCO_KERNEL void retrieve_all(OutputIt keys_out, OutputIt values_out, viewT* submap_views, uint32_t num_submaps, uint64_t capacity, unsigned long long* d_num_out, PrefixT* prefix_sum, Key empty_key_sentinel, Key erased_key_sentinel) { using BlockScan = cub::BlockScan<unsigned int, block_size>; // Shared memory __shared__ typename BlockScan::TempStorage scan_temp_storage; __shared__ unsigned int block_base; auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto stride = blockDim.x * gridDim.x; for (; tid < capacity; tid += stride) { // Compute submap index and offset uint32_t submap_idx = 0; uint32_t submap_offset = tid; while (tid >= prefix_sum[submap_idx] && submap_idx < num_submaps) ++submap_idx; if (submap_idx > 0) { submap_offset = tid - prefix_sum[submap_idx - 1]; } auto const& current_slot = submap_views[submap_idx].get_slots()[submap_offset]; Key const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); // 
Check key validity bool is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) || cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); // Perform block-wide exclusive scan to compute local write index unsigned int local_idx = 0; unsigned int total_valid = 0; BlockScan(scan_temp_storage).ExclusiveSum(is_filled ? 1u : 0u, local_idx, total_valid); // Block leader calculates global offset if (threadIdx.x == 0) { block_base = atomicAdd(d_num_out, total_valid); } __syncthreads(); if (is_filled) { auto value = current_slot.second.load(cuda::std::memory_order_relaxed); keys_out[block_base + local_idx] = existing_key; values_out[block_base + local_idx] = value; } } } 3. warp-aggregated atomics: warp(or cooperative group)粒度atomicAdd + block内atomicAdd+全局atomicAdd ```cpp template <uint32_t block_size, typename OutputIt, typename viewT, typename PrefixT, typename Key> CUCO_KERNEL void retrieve_all(OutputIt keys_out, OutputIt values_out, viewT* submap_views, uint32_t num_submaps, uint64_t capacity, unsigned long long* d_num_out, PrefixT* prefix_sum, Key empty_key_sentinel, Key erased_key_sentinel) { auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto stride = blockDim.x * gridDim.x; __shared__ unsigned int block_count; __shared__ unsigned int block_base; if (threadIdx.x == 0) { block_count = 0; block_base = 0; } __syncthreads(); unsigned int local_idx = 0; for (; tid < capacity; tid += stride) { uint32_t submap_idx = 0; uint32_t submap_offset = tid; while (tid >= prefix_sum[submap_idx] && submap_idx < num_submaps ) ++submap_idx; if (submap_idx > 0) { submap_offset = tid - prefix_sum[submap_idx - 1]; } auto const &current_slot = submap_views[submap_idx].get_slots()[submap_offset]; Key const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); auto const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); unsigned mask = 
__ballot_sync(0xffffffff, is_filled); int lane = threadIdx.x & 0x1f; int warp_prefix = __popc(mask & ((1u << lane) - 1)); if (is_filled) local_idx = warp_prefix; unsigned int warp_vote = __popc(mask); unsigned int warp_base = 0; if (lane == 0 && warp_vote) { warp_base = atomicAdd_block(&block_count, warp_vote); } warp_base = __shfl_sync(0xffffffff, warp_base, 0); __syncthreads(); if (threadIdx.x == 0) { block_base = atomicAdd(d_num_out, block_count); } __syncthreads(); if (is_filled) { auto value = current_slot.second.load(cuda::std::memory_order_relaxed); keys_out[block_base + warp_base + local_idx] = existing_key; values_out[block_base + warp_base + local_idx] = value; } } } // 类似的，但是用cooperative group实现,实测还是tile_size=32最快，和warp没区别 // 写起来更modern一点 template <uint32_t block_size, uint32_t tile_size, typename OutputIt, typename viewT, typename PrefixT, typename Key> CUCO_KERNEL void retrieve_all(OutputIt keys_out, OutputIt values_out, viewT* submap_views, uint32_t num_submaps, uint64_t capacity, unsigned long long* d_num_out, PrefixT* prefix_sum, Key empty_key_sentinel, Key erased_key_sentinel) { auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block()); auto block = cg::this_thread_block(); auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto stride = blockDim.x * gridDim.x; __shared__ unsigned int block_count; __shared__ unsigned int block_base; if (threadIdx.x == 0) { block_count = 0; block_base = 0; } block.sync(); for (; tid < capacity; tid += stride) { uint32_t submap_idx = 0; uint32_t submap_offset = tid; while (tid >= prefix_sum[submap_idx] && submap_idx < num_submaps ) ++submap_idx; if (submap_idx > 0) { submap_offset = tid - prefix_sum[submap_idx - 1]; } auto const &current_slot = submap_views[submap_idx].get_slots()[submap_offset]; Key const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); auto const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or 
cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); unsigned int tile_mask = tile.ballot(is_filled); unsigned int tile_rank = tile.thread_rank(); unsigned int tile_vote = __popc(tile_mask); unsigned int tile_prefix = __popc(tile_mask & ((1u << tile_rank) - 1)); unsigned int tile_base = 0; if (tile_rank == 0 && tile_mask) { tile_base = atomicAdd_block(&block_count, tile_vote); } tile_base = tile.shfl(tile_base, 0); block.sync(); if (block.thread_rank() == 0) { block_base = atomicAdd(d_num_out, block_count); } block.sync(); if (is_filled) { auto value = current_slot.second.load(cuda::std::memory_order_relaxed); keys_out[block_base + tile_base + tile_prefix] = existing_key; values_out[block_base + tile_base + tile_prefix] = value; } } } 实测2/3速度差不多，在插入1亿数据后（实际总cap达到2亿），<key, value>都是cuda::atomic<int64_t>的情况下，retrieve_all cost 3ms左右； ...

TinyMl —— 模型量化(quantization)

Overview 模型量化（quantization）指的是用更少的bit表示模型参数，从而减少模型的大小，加速推理过程的技术。一种常见的量化方式是线性量化(linear quantization)，也叫仿射量化(affine quantization)。其实就是按比例将tensor（一般为fp32）放缩到 $2^{bitwidth}$ 的范围内，比如8bit等。我们很容易给出量化公式： $$ r = s(q - z) $$ 其中，r(real value)指的是量化前的值，q(quantized value)是量化后的值，s(scale)是放缩比例，z(zero point)相当于是一个偏移量。如何求出$s$和$z$呢？一种简单且常见的方式是通过最大最小值来估计，即： $$ s = \frac{r_{max} - r_{min}}{q_{max} - q_{min}} $$ $r_{max}$就是这个tensor的最大值，$r_{min}$是最小值，$q_{max}$和$q_{min}$是我们指定的量化后的最大最小值。如下图所示：有了scale, 容易得到 $z = q_{min} - \frac{r_{min}}{s}$。在实际操作中，z一般会被round到最近的整数$z = round(q_{min} - \frac{r_{min}}{s})$（有很多不同的round规则，这个由具体实现决定）。得到量化方程： $$ q = clip(round(\frac{r}{s}) + z, q_{min}, q_{max}) $$ 代码示意如下：（实际会用pytorch已有的quantize api或者其他推理框架） def get_quantized_range(bitwidth): quantized_max = (1 << (bitwidth - 1)) - 1 quantized_min = -(1 << (bitwidth - 1)) return quantized_min, quantized_max def linear_quantize(fp_tensor, bitwidth, scale, zero_point, dtype=torch.int8) -> torch.Tensor: rounded_tensor = torch.round(fp_tensor / scale).to(dtype) shifted_tensor = rounded_tensor + zero_point quantized_min, quantized_max = get_quantized_range(bitwidth) quantized_tensor = shifted_tensor.clamp_(quantized_min, quantized_max) return quantized_tensor 上述过程被称为非对称量化(asymmetric quantization)。 ...

TinyMl —— 模型剪枝(pruning)

最近正在学习 MIT 6.5940, 韩松老师的课，做deep learning compression的应该都知道。课程分为三个部分，efficient inference, domain-specific optimization, efficient training。有完整的课件，视频和实验。最后一个lab是将llama2部署在个人电脑上，非常有意思（谁不想要个自己的大模型呢）。其余lab也都可以白嫖google colab的gpu Introduction 正式介绍pruning and sparsity之前，我们先来聊聊为什么要做model compression这个事情。 Today’s Model is Too Big! 随着Large language model的出现，如GPT-3，如今的模型参数量已经达到了上百billion，别说训练，我们甚至无法在一个gpu上对其进行推理。更别提如果我们想要将其部署在其他边缘设备上。所以当前在做inference之前，一般都会有个model-compression的过程，包括pruning（剪枝），quantization（量化），distillation（蒸馏）等。这些方法都是为了减少模型的大小，加速推理过程。这些方法也被广泛地集成到了各种加速卡，gpu中。例如nv的A100就支持structured sparsity（[N:M]形式的，具体含义下文会详细介绍）。 Efficiency Metrics 我们再来看看一些 efficiency metrics，这也是我们在做inference过程中需要考虑的指标： Memory-Related Metrics # parameters model size total/peak #activations Computation-Related Metrics MACs FLOP, FLOPs # parameters 下表是一些常见结构的参数数量： Model #Parameters Linear Layer(FC) $feature_{in} * feature_{out}$ Conv Layer $c_{i} * c_{o} * k_{h} * k_{w} $ Grouped Conv Layer $c_{i} * c_{o} * k_{h} * k_{w} / g$ Depthwise Conv Layer $c_{o} * k_{h} * k_{w}$ 其中，Grouped Conv指的是将输入在channel维度进行分组，然后分别进行卷积，最后concatenate。Depthwise Conv是分组个数 $g$ 等于输入channel数的情况。 ...

CUDA编程模型

本系列笔记主要参考了 “Programming Massively Parallel Processors”这本书，以及网上相关资料；不会特别详细，当作个人整理的面经 CUDA软件架构 CUDA从软件层面上提供了三层结构包括grid, block和thread。每个kernel内启动的所有线程在一个grid内。启动kernel时指定<<<dimGrid, dimBlock>>>，都是一个dim3结构。 gridDim的最大值范围： (x,y,z): (2^31 - 1, 65535, 65535) blockDim的最大值范围：(x,y,z): (1024, 1024, 64) 且还要同时满足：一个block内的threads数量不能超过1024(从kepler开始)。即：$ blockDim.x * blockDim.y * blockDim.z <= 1024 $ CUDA内存架构 变量声明 所在内存 作用域 生命周期 kernel内除了array的变量 register thread grid kernel内的array 变量 local thread grid __shared__ 修饰的kernel内的变量 shared block grid __device__ 修饰的全局变量 global grid application __device__ __constant__ 修饰的全局变量 constant grid application 其中寄存器是GPU上运行速度最快的内存空间，延迟为1个时钟周期。接下来是共享内存，共享内存是GPU上可受用户控制的一级缓存。共享内存类似于CPU的缓存，不过与CPU的缓存不同，GPU的共享内存可以由CUDA内核直接编程控制。延迟为1～32个时钟周期。 local memory实际上就在global memory上，只是通过编译器处理成私有的、每个线程独立的一块内存区域。一般一个kernel内的数组会被处理成local memory。延迟和global memory类似。还有texture memory，但是和科学计算相关不大。 CUDA硬件结构 一个GPU可以看作是SM(streaming multiprocessor)的集合，每个SM包含多个SP(streaming processor，或者现在一般叫CUDA cores)。 ...

Back to Basics -- c++ move semantics

1.3 Copying as a Fallback 1.4 const Return Values const std::string get_value();不再是guideline，因为const disables 移动语义例如： const std::string getValue(); std::vector<std::string> coll; coll.push_back(getValue()); // copies(because the return value is const) const std::string getValue(); // BAD: disables move semantics for return values const std::string& getRef(); // OK const std::string* getPtr(); // OK Summary Move semantics allows us to optimize the copying of objects, where we no longer need the value. It can be used implicitly (for unnamed temporary objects or local return values) or explicitly (with std::move()). std::move() means I no longer need this value here. It marks the object as movable. An object marked with std::move() is not (partially) destroyed (the destructor still will be called). By declaring a function with a non-const rvalue reference (such as std::string&&), you define an interface where the caller semantically claims that it no longer needs the passed value. The implementer of the function can use this information to optimize its task by “stealing” the value or do any other modification with the passed argument. Usually, the implementer also has to ensure that the passed argument is in a valid state after the call. Moved-from objects of the C++ standard library are still valid objects, but you no longer know their value. Copy semantics is used as a fallback for move semantics (if copy semantics is supported). If there is no implementation taking an rvalue reference, any implementation taking an ordinary const lvalue reference (such as const std::string&) is used. This fallback is then used even if the object is explicitly marked with std::move(). Calling std::move() for a const object usually has no effect. If you return by value (not by reference), do not declare the return value as a whole to be const. Moved-from objects Valid but Unspecified State ...

Computer Architecture —— 分支预测

H&P那本关于分支预测的部分比较简短且表述有点晦涩，（顺便吐槽一下第五版的中文翻译，建议看英文原版）本文主要参考超标量处理器设计，国人写的，用语符合习惯，强烈推荐！ Motivation 在处理器中，除了cache之外，另一个重要的内容就是分支预测，它和cache一起左右处理器的性能。以SPECint95作为benchmark，完美的cache和BP(branch-predictor)能使IPC提高两倍左右：图片来自论文SSMT。当然，这是21世纪之前的结果了。现代处理器分支预测普遍能达到97%~98%以上的精度，在多数浮点benchmark中基本都是99%的准确率。为什么需要这么高的精度呢？一般情况下，分支指令的占比通常在 15% 到 30% 之间。对于经典五级流水线无分支预测cpu，一个branch会造成一次stall；而对于现代的superscalar且流水线级数远高于5的（一般是二十级以上）cpu，其misprediction penalty是 $M * N$ 的（M = fetch group内指令数, N = branch resolution latency，就是决定分支最终是否跳转需要多少周期）。如下图所示：我们再做一个定量实验：假设我们有一个 $ N = 20 (20\ pipe stages), W = 5 (5\ wide fetch) $ 1 out of 5 instructions is a branch Each 5 instruction-block ends with a branch 的CPU，那么我们取出500条指令需要多少个周期呢？ 100% 预测正确率 100 个时钟周期 (all instructions fetched on the correct path) 无额外工作 99% 预测正确率 100 (correct path) + 20 (wrong path) = 120 个时钟周期 20% 额外指令被取出 98% 预测正确率 100 (correct path) + 20 * 2 (wrong path) = 140 个时钟周期 40% 额外指令被取出 95% 预测正确率 100 (correct path) + 20 * 5 (wrong path) = 200 个时钟周期 100% 额外指令被取出。可以看出，分支预测失败在现代的超标量多流水线cpu中的penalty被极大地放大了。所以分支预测的正确性就显得格外重要。 ...

C++内存模型 —— 现代Architecture的妥协

介绍 什么是内存模型(Memory Model)呢？这里介绍的内存模型并非C++对象的内存排布模型，而是一个非编程语言层面的概念。我们知道在C++11中，标准引入了 std::atomic<>原子对象，同时还引入了 memory_order_relaxed memory_order_consume memory_order_acquire memory_order_release memory_order_acq_rel memory_order_seq_cst 这六种 memory order。它们的引入可以让我们进行无锁编程，而如果你想要更高性能的程序，你就必须深挖这六种 memory order 的含义并正确应用。（当然，在不显式指明memory order的情况下，你能保证获得正确的代码，但存在性能损失）内存模型在介绍C++ memory order之前，我们先回答另一个问题。你的计算机执行的程序就是你写的程序吗? —— 显然不是的。原因也很简单，为了更高效的执行指令，编译器、CPU结构、缓存及其他硬件系统都会对指令进行增删，修改，重排。但要回答具体进行了什么样的修改，又是一个极其复杂的问题。或者说，整个现代体系结构，就是在保证程序正确性的前提下利用各种手段对程序优化。我们可以粗略的将其分成几个部分： source code order: 程序员在源代码中指定的顺序 program code order: 基本上可以看成汇编/机器码的顺序，它可以由编译器优化后得到 execution code order: CPU执行指令顺序也不见得与汇编相同，不同CPU在执行相同机器码时仍然存在优化空间。 perceived order/physical order: 最终的执行顺序。即便CPU按照某种确定指令执行，物理时间上的执行顺序仍然可能不同。例如，在超标量CPU中，一次可以fetch and decode多个指令，这些指令之间的物理执行顺序就是不确定的；由于不同层级缓存之间延时不同，以及缓存之间的通信需要等带来的不确定的执行顺序等。上图简要说明了你的源代码可能经历的优化步骤。这些优化的一个主要原因在于掩盖memory access操作与CPU执行速度上的巨大鸿沟。如果没有cache，CPU每个访存指令都需要stall一两百个时钟周期，这是不可接受的。但是引入cache的同时又会带来 cache coherence等问题，这也是造成x初始为0，两个线程同时执行 x++，而x最终不一定为 2的元凶。而一个内存模型则对上述并发程序的同一块内存进行了一定的限制，它给出了在并发程序下，任意一组写操作时，可能读到的值。不同体系结构(x86, arm, power…)通过不同的内存模型来保证程序的正确性。 bonus question: 不同等级的cache latency？ answer: l1: 1ns, l2: 5ns, l3: 50~100ns, main memory: 200ns ...

Computer Architecture —— 高级缓存技术

本文不会介绍cache的组织形式等基本内容，但也算不上什么"Advanced"。主要包含一些从硬件层面优化cache的手段。优化cache的几种方法 pipeline caches 上图为教科书上经常出现的cache形式(2-way associative为例)，它很精炼的解释了cache的实现。但也稍微引入了些“误导”：图中v、tag和data部分画在连续的一行上，仿佛硬件上他们就是同一块 SRAM 的不同bit 图中识别tag与data是并行完成的，这很好，某种意义上能降低时延；但我们经常遗忘一个事实，只有读cache的时候我们才能这么操作（或者说在写cache时，读取data block是没有意义的）对于第一点，在实际的实现当中，tag和data部分都是分开放置的，tag一般是由一种叫CAM(Content-Addressable Memory)的存储器构成。当然，这与pi不pipeline没什么关系；读cache主要就两个部分：比较tag，获取data；我们暂且不考虑以pipeline的方式优化，那么serial的先比较tag再读data一定不如parallel的方式进行吗？当我们并行的读取tag和data的时候，我们会发现，读出来的data有可能没用（没有匹配的tag）；并且，在n-way set-associative cache中，我们会浪费的读出$n-1$个data项；这给我们什么启示呢？如果我们串行的读cache，那么我们可以在比较tag阶段就知道我们想要的数据在不在cache当中；更有意义的是，根据tag比较的结果，我们就知道哪一路的数据是需要被访问的（提前知道了在n-way中的哪一way），那么我们访问data block时，就无需多路选择器，直接访问指定的way，将其他way的data访问的使能信号置为无效，这种做法的优点在于有效减小功耗。 serial的做法肯定比parallel的延时要大，若这时访问cache处于处理器的critical path(关键路径)上，我们可以再将其进行流水线化。我们现在再来看看写cache时的情况：写cache时，只有通过tag比较，确认要写的地址在cache中后，才可以写data SRAM，在主频较高的处理器中，这些操作很难在一个周期内完成，这也要求我们将其流水线化。下图为对cache进行写操作使用的流水线示意图：在上图的实现方式中，store第一个周期读取Tag并进行比较，根据比较的结果，在第二个周期选择是否将数据写到Data SRAM中。还需要注意的是，当执行load指令时，它想要的数据可能正好在store指令的流水线寄存器中（RAW的情况；上图中的DelayedStoreData寄存器），而不是来自于Data SRAM。因此需要一种机制检测这种情况，这需要将load指令所携带的地址和store指令的流水线寄存器(即DelayedStoreAddr寄存器)进行比较，如果相等，那么就将store指令的数据作为load指令的结果。由此可以看出，对写D-Cache使用流水线之后，不仅增加了流水线本身的硬件，也带来了其他一些额外的硬件开销。其实，不仅在Cache中有这种现象，在处理器的其他部分增加流水线的级数，也会伴随着其他方面的硬件需求,因此过深的流水线带来的硬件复杂度是非常高的，就像Intel的Pentium 4处理器，并不会从过深的流水线中得到预想的好处。当然，cache的流水线化已经是一种广泛使用的用于降低latency的方法了。 write buffers 我愿称之为buffer of buffer，本来cache就起buffer的作用了，但我们再加一个buffer，如下图所示：这和多一级的cache有什么不同呢？这是一个专门为写操作设计的buffer（注意：load也可能造成写操作）。原因在于我们知道写通常比读更慢，特别对于write-through来说；其次，当上层cache满后，需要先将dirty cache line写回下层cache，再读取下层cache中的数据。若下层cache只有一个读写端口，那么这种串行的过程导致D-Cache发生缺失的处理时间变得很长，此时就可以采用write buffer来解决这个问题。脏状态的cache-line会首先放到写缓存中，等到下级存储器有空闲的时候，才会将写缓存中的数据写到下级存储器中。对于write buffer，我们还可以对其进行合并(merging) 操作。所谓merging，指的是将在同一个cache-line上的数据一并写入下层cache中，而非多次写入同一个cache-line。上图中的右侧表示了一个采用了merging write buffer策略的写缓冲区。 critical word first and 
early restart 先来看一下cache miss时的cpu：图中展示了一个blocking cache在cache miss时，cpu stall，而后cache将需要取得的cache-line放入后，cpu resume的timeline。我们可以发现，若我们只需要cache-line中的第3个word，cpu完全可以提早resume。如下图所示： ...

phmap —— 缓存友好的高效hashtable

众所周知，C++标准库的 unordered_map在性能上向来不是一个好的选择。开源市场上有非常多的高性能哈希表可供选择，phmap继承自 absl-hashmap，有着非常好的插入、查找性能。在著名的Comprehensive C++ Hashmap Benchmarks 2022榜单中名列前茅。事实上，我比对了 phmap::flat_hash_map与榜单中综合性能第一的 ankerl::unordered_dense::map，我的benchmark中只有遍历哈希表时，flat_hash_map的性能低于 unordered_dense::map，其余无论是插入还是随机查找，大部分情况下 flat_hash_map的性能都更优。本文简单介绍了 flat_hash_map相关情况，以及一些使用上的建议与坑点。 flat_hash_map 和 node_hash_map区别 phmap中提供了两类哈希表，其内部布局示意图如下：由上图（忽略了bucket的细节）可以看出，flat_hash_map的最大的优点在于 node之间的内存是连续的(虽然可能中间存在空node)，遍历的时候对cache更加友好并且相比于 node_hash_map版少一次寻址过程（std::unordered_map的设计与 node_hash_map相同）。而 flat_*系列的缺点就是在 rehash的时候: 会引发原来的value失效（这里的失效指的是原来的那个对象所对应的内存失效，而不是value所包含的内容失效，例如，value是一个指针，那它的值——所指向的对象，不会受到影响）。举个例子： flat_hash_map<int, Data> mp; node_hash_map<int, Data> nodemp; mp[0] = Data(); nodemp[0] = Data(); auto& mp0 = mp[0]; auto& nodemp0 = nodemp[0]; // trigger rehash for (int i = 1; i <= 10; i ++) { mp[i] = Data(); nodemp[i] = Data(); } assert(std::addressof(mp[0]) != std::addressof(mp0)); assert(std::addressof(nodemp[0]) == std::addressof(nodemp0)); 原因就是 flat_hash_map的内存布局导致的。而 node_hash_map或者 std::unordered_map就保证不会出现这种情况，因为当他们rehash的时候，只需要将bucket内的指针重新分配，指针的值还是指向原来的 node<key, value>. ...