template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR> void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCopyRectCommand( conststd::function<void*(size_t)>& append, consthsa_pitched_ptr_t* dst, consthsa_dim3_t* dst_offset, consthsa_pitched_ptr_t* src, consthsa_dim3_t* src_offset, consthsa_dim3_t* range) { // Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides // width), the largest element that perfectly covers width. // width | 16 ensures that we don't return a higher element than is supported and avoids // issues with 0. auto maxAlignedElement = [](size_t width) { return __builtin_ctz(width | 16); };
// Find maximum element that describes the pitch and slice. // Pitch and slice must both be represented in units of elements. No element larger than this // may be used in any tile as the pitches would not be exactly represented. int max_ele = Min(maxAlignedElement(src->pitch), maxAlignedElement(dst->pitch)); if (range->z != 1) // Only need to consider slice if HW will copy along Z. max_ele = Min(max_ele, maxAlignedElement(src->slice), maxAlignedElement(dst->slice));
/* Find the minimum element size that will be needed for any tile.
No subdivision of a range admits a larger element size for the smallest element in any subdivision than the element size that covers the whole range, though some can be worse (this is easily model checked). Subdividing with any element larger than the covering element won't change the covering element of the remainder ( Range%Element = (Range-N*LargerElement)%Element since LargerElement%Element=0 ). Ex. range->x=71, assume max range is 16 elements: We can break at 64 giving tiles: [0,63], [64,70] (widths 64 and 7). 64 is covered by element 4 (16B) and 7 is covered by element 0 (1B). Exactly covering 71 requires using element 0.
Base addresses in each tile must be DWORD aligned; if not, then the offset from an aligned address must be represented in elements. This may reduce the size of the element, but since elements are integer multiples of each other this is harmless.
src and dst base has already been checked for DWORD alignment so we only need to consider the offset here. */ int min_ele = Min(max_ele, maxAlignedElement(range->x), maxAlignedElement(src_offset->x % 4), maxAlignedElement(dst_offset->x % 4));
// Check that pitch and slice can be represented in the tile with the smallest element if ((src->pitch >> min_ele) > max_pitch || (dst->pitch >> min_ele) > max_pitch) throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch out of limits.\n"); if (range->z != 1) { // Only need to consider slice if HW will copy along Z. if ((src->slice >> min_ele) > max_slice || (dst->slice >> min_ele) > max_slice) throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice out of limits.\n"); }
// Break copy into tiles for (uint32_t z = 0; z < range->z; z += max_z) { for (uint32_t y = 0; y < range->y; y += max_y) { uint32_t x = 0; while (x < range->x) { uint32_t width = range->x - x;
// Get largest element which describes the start of this tile after its base address has // been aligned. Base addresses must be DWORD (4 byte) aligned. int aligned_ele = Min(maxAlignedElement((src_offset->x + x) % 4), maxAlignedElement((dst_offset->x + x) % 4), max_ele);
// Get largest permissible element which exactly covers width int element = Min(maxAlignedElement(width), aligned_ele); int xcount = width >> element;
// If width is too large then width is at least max_x bytes (bigger than any element) so // drop the width restriction and clip element count to max_x. if (xcount > max_x) { element = aligned_ele; xcount = Min(width >> element, max_x); }