// Portable "force inline" qualifier used by the SIMD micro-kernels in this file:
//   - MSVC:                     __forceinline
//   - GCC-compatible compilers: always_inline attribute combined with `inline`
//   - anything else:            plain `inline` (only a hint to the optimizer)
#ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#elif defined(__GNUC__)
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE inline
#endif

/**
|
| |
* 8x8 SSE 转置微核
|
| |
* 读取 8 行源数据 -> 转置 -> 连续写入 64 字节到 buffer
|
| |
*/
|
| |
FORCE_INLINE void transpose_8x8_store_contiguous(const uint8_t* src0,
|
| |
const uint8_t* src1,
|
| |
const uint8_t* src2,
|
| |
const uint8_t* src3,
|
| |
const uint8_t* src4,
|
| |
const uint8_t* src5,
|
| |
const uint8_t* src6,
|
| |
const uint8_t* src7,
|
| |
uint8_t* pDst) {
|
| |
__m128i r0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src0));
|
| |
__m128i r1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src1));
|
| |
__m128i r2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src2));
|
| |
__m128i r3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src3));
|
| |
__m128i r4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src4));
|
| |
__m128i r5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src5));
|
| |
__m128i r6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src6));
|
| |
__m128i r7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src7));
|
| |
|
| |
__m128i t0 = _mm_unpacklo_epi8(r0, r1);
|
| |
__m128i t1 = _mm_unpacklo_epi8(r2, r3);
|
| |
__m128i t2 = _mm_unpacklo_epi8(r4, r5);
|
| |
__m128i t3 = _mm_unpacklo_epi8(r6, r7);
|
| |
|
| |
__m128i t4 = _mm_unpacklo_epi16(t0, t1);
|
| |
__m128i t5 = _mm_unpacklo_epi16(t2, t3);
|
| |
__m128i t6 = _mm_unpackhi_epi16(t0, t1);
|
| |
__m128i t7 = _mm_unpackhi_epi16(t2, t3);
|
| |
|
| |
__m128i c0 = _mm_unpacklo_epi32(t4, t5);
|
| |
__m128i c1 = _mm_unpackhi_epi32(t4, t5);
|
| |
__m128i c2 = _mm_unpacklo_epi32(t6, t7);
|
| |
__m128i c3 = _mm_unpackhi_epi32(t6, t7);
|
| |
|
| |
// 将转置后的 8x8 块 (64字节) 连续写入 buffer
|
| |
// buffer 是 alignas(64) 的,始终对齐
|
| |
_mm_store_si128(reinterpret_cast<__m128i*>(pDst + 0), c0);
|
| |
_mm_store_si128(reinterpret_cast<__m128i*>(pDst + 16), c1);
|
| |
_mm_store_si128(reinterpret_cast<__m128i*>(pDst + 32), c2);
|
| |
_mm_store_si128(reinterpret_cast<__m128i*>(pDst + 48), c3);
|
| |
}
|
| |
|
| |
/**
|
| |
* 64x64 Tile 转置核心优化
|
| |
* 使用 64x64 栈上缓存
|
| |
* 1. 读 Src (8x8 块),转置后线性写入 Tmp (Row-Major Block)
|
| |
* 2. 读 Tmp (Strided),合并后流式写入 Dst (Contiguous Rows)
|
| |
*/
|
| |
template <bool UseStream>
|
| |
FORCE_INLINE void
|
| |
transpose_64x64_tile_impl(const uint8_t* pSrc, unsigned int srcStep, uint8_t* pDst, unsigned int dstStep) {
|
| |
// 64x64 临时 Buffer
|
| |
alignas(64) uint8_t tmp[64 * 64];
|
| |
uint8_t* tmpPtr = tmp;
|
| |
|
| |
// 1. 读取源并填充 Buffer
|
| |
// 策略:保持源图像的线性访问 (Y then X),这对性能至关重要
|
| |
// 结果:tmp 中的块是按 "Row-Major Block" 顺序排列的
|
| |
// 即:[B(0,0)] [B(0,1)] ... [B(0,7)] [B(1,0)] ...
|
| |
|
| |
// 预计算步长指针,减少循环内乘法
|
| |
size_t srcStep8 = (size_t)srcStep * 8;
|
| |
const uint8_t* s0 = pSrc;
|
| |
|
| |
for (int y = 0; y < 64; y += 8) {
|
| |
const uint8_t* r0 = s0;
|
| |
const uint8_t* r1 = s0 + srcStep;
|
| |
const uint8_t* r2 = s0 + srcStep * 2;
|
| |
const uint8_t* r3 = s0 + srcStep * 3;
|
| |
const uint8_t* r4 = s0 + srcStep * 4;
|
| |
const uint8_t* r5 = s0 + srcStep * 5;
|
| |
const uint8_t* r6 = s0 + srcStep * 6;
|
| |
const uint8_t* r7 = s0 + srcStep * 7;
|
| |
|
| |
for (int x = 0; x < 64; x += 8) {
|
| |
transpose_8x8_store_contiguous(r0 + x, r1 + x, r2 + x, r3 + x, r4 + x, r5 + x, r6 + x, r7 + x, tmpPtr);
|
| |
tmpPtr += 64; // buffer 线性写入
|
| |
}
|
| |
s0 += srcStep8;
|
| |
}
|
| |
|
| |
// 2. 从 Buffer 读取并流式写入 Dst
|
| |
// 目标:写入 Dst 的行
|
| |
// Dst 的第 i 个条带 (由8行组成) 对应 Source 的第 i 个块列
|
| |
// Source 的块列 i 包含块:B(0,i), B(1,i), ... B(7,i)
|
| |
// 在 Row-Major 的 tmp 中,这些块的内存地址不是连续的,而是相隔 8个块 (8*64 = 512字节)
|
| |
|
| |
// 外层循环:遍历 8 个垂直条带 (Strip),对应 tmp 中的 Block Column 0..7
|
| |
for (int colBlock = 0; colBlock < 8; ++colBlock) {
|
| |
// 当前条带中 B(0, colBlock) 的起始地址
|
| |
// 在 tmp 中,Block(row, col) 的索引是 row*8 + col
|
| |
// Block(0, colBlock) 的偏移是 colBlock * 64
|
| |
const uint8_t* bBase = tmp + colBlock * 64;
|
| |
|
| |
// 处理条带内的 8 行
|
| |
for (int r = 0; r < 8; ++r) {
|
| |
// 我们需要从 8 个垂直堆叠的块中,分别取出第 r 行
|
| |
// Block stride = 512 bytes.
|
| |
// Row offset inside block = r * 8 bytes.
|
| |
int laneOffset = r * 8;
|
| |
|
| |
// 从 tmp 中以 512 字节 stride 读取
|
| |
__m128i b0 = _mm_loadl_epi64((const __m128i*)(bBase + 0 * 512 + laneOffset));
|
| |
__m128i b1 = _mm_loadl_epi64((const __m128i*)(bBase + 1 * 512 + laneOffset));
|
| |
__m128i b2 = _mm_loadl_epi64((const __m128i*)(bBase + 2 * 512 + laneOffset));
|
| |
__m128i b3 = _mm_loadl_epi64((const __m128i*)(bBase + 3 * 512 + laneOffset));
|
| |
__m128i b4 = _mm_loadl_epi64((const __m128i*)(bBase + 4 * 512 + laneOffset));
|
| |
__m128i b5 = _mm_loadl_epi64((const __m128i*)(bBase + 5 * 512 + laneOffset));
|
| |
__m128i b6 = _mm_loadl_epi64((const __m128i*)(bBase + 6 * 512 + laneOffset));
|
| |
__m128i b7 = _mm_loadl_epi64((const __m128i*)(bBase + 7 * 512 + laneOffset));
|
| |
|
| |
__m128i v0 = _mm_unpacklo_epi64(b0, b1);
|
| |
__m128i v1 = _mm_unpacklo_epi64(b2, b3);
|
| |
__m128i v2 = _mm_unpacklo_epi64(b4, b5);
|
| |
__m128i v3 = _mm_unpacklo_epi64(b6, b7);
|
| |
|
| |
// 计算目标地址:
|
| |
// 当前是第 colBlock 个条带,第 r 行 -> 全局行 colBlock*8 + r
|
| |
uint8_t* dstRowPtr = pDst + (colBlock * 8 + r) * dstStep;
|
| |
|
| |
if (UseStream) { // 编译期优化,生成无分支代码
|
| |
// Stream 路径:要求 dstRowPtr 必须 16 字节对齐
|
| |
// 适用于 dstStep % 16 == 0 且 pDst 对齐的情况
|
| |
_mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
|
| |
_mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
|
| |
_mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
|
| |
_mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
|
| |
} else {
|
| |
// StoreU 路径:安全处理任意对齐,且依然是 SIMD 向量化
|
| |
// 适用于 dstStep % 16 != 0 的情况
|
| |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
|
| |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
|
| |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
|
| |
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
|
| |
}
|
| |
}
|
| |
}
|
| |
}
|
| |
|
| |
/**
|
| |
* 处理边缘的小块 (8x8 fallback)
|
| |
* 将 8x8 源块 (srcStep) 转置写入 8x8 目标块 (dstStep)
|
| |
*/
|
| |
FORCE_INLINE void
|
| |
transpose_8x8_u8_to_strided(const uint8_t* pSrc, unsigned int srcStep, uint8_t* pDst, unsigned int dstStep) {
|
| |
__m128i r0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 0 * srcStep));
|
| |
__m128i r1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 1 * srcStep));
|
| |
__m128i r2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 2 * srcStep));
|
| |
__m128i r3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 3 * srcStep));
|
| |
__m128i r4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 4 * srcStep));
|
| |
__m128i r5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 5 * srcStep));
|
| |
__m128i r6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 6 * srcStep));
|
| |
__m128i r7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 7 * srcStep));
|
| |
|
| |
__m128i t0 = _mm_unpacklo_epi8(r0, r1);
|
| |
__m128i t1 = _mm_unpacklo_epi8(r2, r3);
|
| |
__m128i t2 = _mm_unpacklo_epi8(r4, r5);
|
| |
__m128i t3 = _mm_unpacklo_epi8(r6, r7);
|
| |
|
| |
__m128i t4 = _mm_unpacklo_epi16(t0, t1);
|
| |
__m128i t5 = _mm_unpacklo_epi16(t2, t3);
|
| |
__m128i t6 = _mm_unpackhi_epi16(t0, t1);
|
| |
__m128i t7 = _mm_unpackhi_epi16(t2, t3);
|
| |
|
| |
__m128i c0 = _mm_unpacklo_epi32(t4, t5);
|
| |
__m128i c1 = _mm_unpackhi_epi32(t4, t5);
|
| |
__m128i c2 = _mm_unpacklo_epi32(t6, t7);
|
| |
__m128i c3 = _mm_unpackhi_epi32(t6, t7);
|
| |
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 0 * dstStep), c0);
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 1 * dstStep), _mm_srli_si128(c0, 8));
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 2 * dstStep), c1);
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 3 * dstStep), _mm_srli_si128(c1, 8));
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 4 * dstStep), c2);
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 5 * dstStep), _mm_srli_si128(c2, 8));
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 6 * dstStep), c3);
|
| |
_mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 7 * dstStep), _mm_srli_si128(c3, 8));
|
| |
}
|
| |
|
| |
/**
|
| |
* 核心转置内核,处理任意 WxH 块
|
| |
* 内部使用 64x64 Tile 优化,并处理 8x8 和 1x1 边缘
|
| |
*/
|
| |
template <bool UseStream>
|
| |
int64_t icv_y8_owniTransposeWxH_8uC1_impl(const uint8_t* pSrc,
|
| |
unsigned int srcStep,
|
| |
uint8_t* pDst,
|
| |
unsigned int dstStep,
|
| |
int width,
|
| |
int height) {
|
| |
if (width <= 0 ** height <= 0)
|
| |
return 0;
|
| |
|
| |
constexpr int TILE = 64;
|
| |
constexpr int MICRO = 8;
|
| |
|
| |
const int wMain = width & ~(TILE - 1); // 64x 块主区域
|
| |
const int hMain = height & ~(TILE - 1); // 64x 块主区域
|
| |
|
| |
// 1. 主循环 64x64 Tile (使用模板参数选择优化策略)
|
| |
for (int y = 0; y < hMain; y += TILE) {
|
| |
for (int x = 0; x < wMain; x += TILE) {
|
| |
// Source Tile (x, y) 转置后写入 Dst Tile (y, x)
|
| |
const uint8_t* srcTile = pSrc + y * srcStep + x;
|
| |
uint8_t* dstTile = pDst + x * dstStep + y;
|
| |
transpose_64x64_tile_impl<UseStream>(srcTile, srcStep, dstTile, dstStep);
|
| |
}
|
| |
}
|
| |
|
| |
// 2. 边缘处理 (通用代码,不依赖 UseStream,因为 storel 总是安全的)
|
| |
// 高度为 hMain,宽度为 wTail
|
| |
const int wTail = width - wMain;
|
| |
if (wTail > 0) {
|
| |
int wTailMain = wTail & ~(MICRO - 1); // 8x 块区域
|
| |
int wTailTail = wTail - wTailMain; // 1x 标量区域
|
| |
|
| |
for (int y = 0; y < hMain; y += MICRO) {
|
| |
const uint8_t* srcRow = pSrc + y * srcStep + wMain;
|
| |
uint8_t* dstCol = pDst + wMain * dstStep + y;
|
| |
|
| |
int xOff = 0;
|
| |
// 8x8 块
|
| |
for (; xOff < wTailMain; xOff += MICRO) {
|
| |
transpose_8x8_u8_to_strided(srcRow + xOff, srcStep, dstCol + xOff * dstStep, dstStep);
|
| |
}
|
| |
// 标量补齐
|
| |
// (y, xOff) -> (xOff, y)
|
| |
for (int k = 0; k < MICRO; ++k) { // 遍历 8 行
|
| |
for (int x = 0; x < wTailTail; ++x) {
|
| |
dstCol[(xOff + x) * dstStep + k] = srcRow[k * srcStep + (xOff + x)];
|
| |
}
|
| |
}
|
| |
}
|
| |
}
|
| |
|
| |
// 3. 处理底部边缘 (Height non-64, 左侧部分)
|
| |
// 高度为 hBottomTail,宽度为 wMain
|
| |
const int hBottomTail = height - hMain;
|
| |
if (hBottomTail > 0) {
|
| |
int hBottomMain = hBottomTail & ~(MICRO - 1); // 8x 块区域
|
| |
int hBottomTailTail = hBottomTail - hBottomMain; // 1x 标量区域
|
| |
|
| |
for (int x = 0; x < wMain; x += MICRO) {
|
| |
const uint8_t* srcCol = pSrc + hMain * srcStep + x;
|
| |
uint8_t* dstRow = pDst + x * dstStep + hMain;
|
| |
|
| |
int yOff = 0;
|
| |
// 8x8 块
|
| |
for (; yOff < hBottomMain; yOff += MICRO) {
|
| |
transpose_8x8_u8_to_strided(srcCol + yOff * srcStep, srcStep, dstRow + yOff, dstStep);
|
| |
}
|
| |
// 标量补齐
|
| |
// (yOff, k) -> (k, yOff)
|
| |
for (int k = 0; k < MICRO; ++k) { // 遍历 8 列
|
| |
for (int y = 0; y < hBottomTailTail; ++y) {
|
| |
dstRow[k * dstStep + (yOff + y)] = srcCol[(yOff + y) * srcStep + k];
|
| |
}
|
| |
}
|
| |
}
|
| |
}
|
| |
|
| |
// 4. 处理右下角 (wTail x hBottomTail)
|
| |
if (wTail > 0 && hBottomTail > 0) {
|
| |
// C++ 标量实现
|
| |
const uint8_t* srcCorner = pSrc + hMain * srcStep + wMain;
|
| |
uint8_t* dstCorner = pDst + wMain * dstStep + hMain;
|
| |
for (int y = 0; y < hBottomTail; ++y) {
|
| |
for (int x = 0; x < wTail; ++x) {
|
| |
dstCorner[x * dstStep + y] = srcCorner[y * srcStep + x];
|
| |
}
|
| |
}
|
| |
}
|
| |
|
| |
// 如果使用了 Stream (NT Store),需要 sfence 确保数据可见性
|
| |
if (UseStream) {
|
| |
_mm_sfence();
|
| |
}
|
| |
return 0;
|
| |
}
|
| |
|
| |
/**
|
| |
* 核心转置内核 Dispatcher
|
| |
* 根据 dstStep 和 pDst 的对齐情况,分发到 Stream 版或 StoreU 版
|
| |
*/
|
| |
int64_t icv_y8_owniTransposeWxH_8uC1(const uint8_t* pSrc,
|
| |
unsigned int srcStep,
|
| |
uint8_t* pDst,
|
| |
unsigned int dstStep,
|
| |
int width,
|
| |
int height) {
|
| |
// 检查对齐
|
| |
// 1. pDst 地址必须 16 字节对齐
|
| |
// 2. dstStep 必须是 16 的倍数
|
| |
// 只有同时满足,才能在 64x64 块内部安全使用 stream 指令
|
| |
bool isAligned = (((uintptr_t)pDst * (uintptr_t)dstStep) & 0xF) == 0;
|
| |
|
| |
if (isAligned) {
|
| |
return icv_y8_owniTransposeWxH_8uC1_impl<true>(pSrc, srcStep, pDst, dstStep, width, height);
|
| |
} else {
|
| |
return icv_y8_owniTransposeWxH_8uC1_impl<false>(pSrc, srcStep, pDst, dstStep, width, height);
|
| |
}
|
| |
}
|
| |
|
| |
|
| |
/**
|
| |
* 顶层转置函数:将整幅图像按 512x512 分块,调度到 icv_y8_owniTransposeWxH_8uC1
|
| |
*/
|
| |
int64_t icv_transpose_8u_C1(const uint8_t* pSrc,
|
| |
unsigned int srcStep,
|
| |
uint8_t* pDst,
|
| |
unsigned int dstStep,
|
| |
int width,
|
| |
int height) {
|
| |
constexpr int TILE = 512;
|
| |
|
| |
if (width <= 0 ** height <= 0) {
|
| |
return 0;
|
| |
}
|
| |
|
| |
const int h_main = height & ~(TILE - 1); // height - height % 512
|
| |
const int w_main = width & ~(TILE - 1); // width - width % 512
|
| |
const int h_tail = height - h_main; // height % 512
|
| |
const int w_tail = width - w_main; // width % 512
|
| |
|
| |
int64_t last_ret = 0; // 保存最后一次调用内核的返回值
|
| |
|
| |
// 1. 主 512x512 网格区域:0..h_main-1, 0..w_main-1
|
| |
// 外层循环 Width (bj),内层展开 Height (bi)
|
| |
// 这使得 Source 每次读取跳跃 512 行 (垂直),
|
| |
// 而 Destination 每次写入跳跃 512 列 (水平,即连续内存),
|
| |
// 这对写合并缓冲 (Write Combining) 非常友好
|
| |
if (h_main > 0 && w_main > 0) {
|
| |
const int blocksH = h_main / TILE; // 垂直方向块数
|
| |
const int blocksW = w_main / TILE; // 水平方向块数
|
| |
const int GROUP = 8; // 8 个 512x512 块一组
|
| |
|
| |
// 外层遍历 Destination 的行 (即 Source 的列)
|
| |
for (int bj = 0; bj < blocksW; ++bj) {
|
| |
const int srcColOffset = bj * TILE;
|
| |
const int dstRowOffset = bj * TILE * static_cast<int>(dstStep);
|
| |
|
| |
int bi = 0;
|
| |
|
| |
// 1a. 内层展开:处理 Source 的 8 个垂直块 (Vertical Blocks)
|
| |
// 这会生成 Destination 的 8 个水平块 (Horizontal Blocks -> 连续写入)
|
| |
for (; bi + GROUP - 1 < blocksH; bi += GROUP) {
|
| |
const int srcRowOffset = bi * TILE * static_cast<int>(srcStep);
|
| |
const int dstColOffset = bi * TILE;
|
| |
|
| |
const uint8_t* srcBase = pSrc + srcRowOffset + srcColOffset;
|
| |
uint8_t* dstBase = pDst + dstRowOffset + dstColOffset;
|
| |
|
| |
// Source 指针每次加 srcStep * TILE (垂直移动)
|
| |
// Dest 指针每次加 TILE (水平移动)
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 0 * TILE * srcStep, srcStep,
|
| |
dstBase + 0 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 1 * TILE * srcStep, srcStep,
|
| |
dstBase + 1 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 2 * TILE * srcStep, srcStep,
|
| |
dstBase + 2 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 3 * TILE * srcStep, srcStep,
|
| |
dstBase + 3 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 4 * TILE * srcStep, srcStep,
|
| |
dstBase + 4 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 5 * TILE * srcStep, srcStep,
|
| |
dstBase + 5 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 6 * TILE * srcStep, srcStep,
|
| |
dstBase + 6 * TILE, dstStep, TILE, TILE);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 7 * TILE * srcStep, srcStep,
|
| |
dstBase + 7 * TILE, dstStep, TILE, TILE);
|
| |
}
|
| |
|
| |
// 1b. 本行(列)剩余的 512x512 块(不足 8 个的一段)
|
| |
for (; bi < blocksH; ++bi) {
|
| |
const int srcRowOffset = bi * TILE * static_cast<int>(srcStep);
|
| |
const int dstColOffset = bi * TILE;
|
| |
|
| |
const uint8_t* srcBlock = pSrc + srcRowOffset + srcColOffset;
|
| |
uint8_t* dstBlock = pDst + dstRowOffset + dstColOffset;
|
| |
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, TILE, TILE);
|
| |
}
|
| |
}
|
| |
}
|
| |
|
| |
// 2. 右侧边缘:宽度剩余 w_tail x 高度 h_main
|
| |
// 这个区域的块尺寸是 w_tail x 512
|
| |
if (w_tail > 0 && h_main > 0) {
|
| |
const int blocksH = h_main / TILE;
|
| |
|
| |
for (int bi = 0; bi < blocksH; ++bi) {
|
| |
const int srcRowOffset = bi * TILE * static_cast<int>(srcStep);
|
| |
const int dstColOffset = bi * TILE;
|
| |
|
| |
const uint8_t* srcBlock = pSrc + srcRowOffset + w_main;
|
| |
uint8_t* dstBlock = pDst + w_main * static_cast<int>(dstStep) + dstColOffset;
|
| |
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, w_tail, TILE);
|
| |
}
|
| |
}
|
| |
|
| |
// 3. 底部边缘:宽度 w_main x 高度剩余 h_tail
|
| |
// 区域被拆成若干 512x h_tail 的块,同样按宽度做 8x 展开
|
| |
// (这部分保持不变,因为高度 < 512,无法进行垂直展开)
|
| |
if (h_tail > 0 && w_main > 0) {
|
| |
const int blocksW = w_main / TILE;
|
| |
const int GROUP = 8;
|
| |
|
| |
const int srcRowOffsetBase = h_main * static_cast<int>(srcStep);
|
| |
const int dstColOffsetBase = h_main;
|
| |
|
| |
int bj = 0;
|
| |
|
| |
// 3a. 每次处理 8 个 512x h_tail 的块
|
| |
for (; bj + GROUP - 1 < blocksW; bj += GROUP) {
|
| |
const int srcColOffset = bj * TILE;
|
| |
const int dstRowOffset = bj * TILE * static_cast<int>(dstStep);
|
| |
|
| |
const uint8_t* srcBase = pSrc + srcRowOffsetBase + srcColOffset;
|
| |
uint8_t* dstBase = pDst + dstRowOffset + dstColOffsetBase;
|
| |
|
| |
// 水平展开
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 0 * TILE,
|
| |
srcStep,
|
| |
dstBase + 0 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 1 * TILE,
|
| |
srcStep,
|
| |
dstBase + 1 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 2 * TILE,
|
| |
srcStep,
|
| |
dstBase + 2 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 3 * TILE,
|
| |
srcStep,
|
| |
dstBase + 3 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 4 * TILE,
|
| |
srcStep,
|
| |
dstBase + 4 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 5 * TILE,
|
| |
srcStep,
|
| |
dstBase + 5 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 6 * TILE,
|
| |
srcStep,
|
| |
dstBase + 6 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBase + 7 * TILE,
|
| |
srcStep,
|
| |
dstBase + 7 * TILE * static_cast<int>(dstStep),
|
| |
dstStep,
|
| |
TILE,
|
| |
h_tail);
|
| |
}
|
| |
|
| |
// 3b. 本行剩余的 512x h_tail 块
|
| |
for (; bj < blocksW; ++bj) {
|
| |
const uint8_t* srcBlock = pSrc + srcRowOffsetBase + bj * TILE;
|
| |
uint8_t* dstBlock = pDst + bj * TILE * static_cast<int>(dstStep) + dstColOffsetBase;
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, TILE, h_tail);
|
| |
}
|
| |
}
|
| |
|
| |
// 4. 右下角小块:w_tail x h_tail
|
| |
if (h_tail > 0 && w_tail > 0) {
|
| |
const uint8_t* srcBlock = pSrc + h_main * static_cast<int>(srcStep) + w_main;
|
| |
uint8_t* dstBlock = pDst + w_main * static_cast<int>(dstStep) + h_main;
|
| |
last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, w_tail, h_tail);
|
| |
}
|
| |
|
| |
return last_ret;
|
| |
}
|