Transpose8x8

1H:Transpose8x8.cpp
// Transpose an 8x8 matrix of bytes (one byte per pixel) in place.
//
// dat: 64 bytes in row-major order. On return dat[r*8+c] holds the value
//      that was in dat[c*8+r]. The buffer must be 16-byte aligned because
//      every load and store below uses movdqa.
//
// Requires SSSE3 (pshufb). Touches eax and xmm0-xmm6.
// Written as 32-bit MSVC inline assembly (built with Visual Studio 2010).
// The function is naked, so the prologue and epilogue are written by hand
// and the argument is fetched from the stack at [ebp+8].
__declspec(naked) void Transpose8x8(unsigned char dat[64]) {
    // Byte-shuffle indexes for the final pshufb step. Read low byte first the
    // sequence is 00,08,02,0A,04,0C,06,0E,01,09,03,0B,05,0D,07,0F.
    static __declspec(align(16)) unsigned int pidx[4] = {
        0x0A020800, 0x0E060C04, 0x0B030901, 0x0F070D05};
    _asm {
        push ebp
        mov ebp, esp
        mov eax, [ebp+8]            // eax = address of the matrix data

        // Load all 64 bytes: two matrix rows per register.
        movdqa xmm0, [eax   ]
        movdqa xmm1, [eax+16]
        movdqa xmm2, [eax+32]
        movdqa xmm3, [eax+48]
        // We start with
        // XMM0 = 0F0E0D0C0B0A09080706050403020100
        // XMM1 = 1F1E1D1C1B1A19181716151413121110
        // XMM2 = 2F2E2D2C2B2A29282726252423222120
        // XMM3 = 3F3E3D3C3B3A39383736353433323130
        // After the transpose permutation we should get
        // XMM0 = 39312921191109013830282018100800
        // XMM1 = 3B332B231B130B033A322A221A120A02
        // XMM2 = 3D352D251D150D053C342C241C140C04
        // XMM3 = 3F372F271F170F073E362E261E160E06

        // Round 1: interleave 16-bit words of register pairs (0,1) and (2,3).
        // Only the destination operand is overwritten by punpck*, so only
        // xmm0 and xmm2 need a copy for the "high" variant.
        movdqa xmm4, xmm0
        movdqa xmm6, xmm2
        punpcklwd xmm0, xmm1
        punpckhwd xmm4, xmm1
        punpcklwd xmm2, xmm3
        punpckhwd xmm6, xmm3
        // XMM0 = 17160706151405041312030211100100
        // XMM2 = 37362726353425243332232231302120
        // XMM4 = 1F1E0F0E1D1C0D0C1B1A0B0A19180908
        // XMM6 = 3F3E2F2E3D3C2D2C3B3A2B2A39382928

        // Round 2: interleave 32-bit dwords.
        movdqa xmm1, xmm0
        movdqa xmm5, xmm4
        punpckldq xmm0, xmm2
        punpckhdq xmm1, xmm2
        punpckldq xmm4, xmm6
        punpckhdq xmm5, xmm6
        // XMM0 = 33322322131203023130212011100100
        // XMM1 = 37362726171607063534252415140504
        // XMM4 = 3B3A2B2A1B1A0B0A3938292819180908
        // XMM5 = 3F3E2F2E1F1E0F0E3D3C2D2C1D1C0D0C

        // Round 3: interleave 64-bit qwords.
        movdqa xmm2, xmm0
        movdqa xmm3, xmm1
        // xmm6 is free from here on, so use it for the shuffle indexes.
        movdqa xmm6, pidx
        // XMM6 = 0F070D050B0309010E060C040A020800
        punpcklqdq xmm0, xmm4
        punpckhqdq xmm2, xmm4
        punpcklqdq xmm1, xmm5
        punpckhqdq xmm3, xmm5
        // XMM0 = 39382928191809083130212011100100
        // XMM1 = 3D3C2D2C1D1C0D0C3534252415140504
        // XMM2 = 3B3A2B2A1B1A0B0A3332232213120302
        // XMM3 = 3F3E2F2E1F1E0F0E3736272617160706

        // Almost there: each register holds the right 16 bytes, but they
        // still need to be reordered within the register.
        pshufb xmm0, xmm6
        pshufb xmm2, xmm6
        pshufb xmm1, xmm6
        pshufb xmm3, xmm6
        // XMM0 = 39312921191109013830282018100800
        // XMM1 = 3D352D251D150D053C342C241C140C04
        // XMM2 = 3B332B231B130B033A322A221A120A02
        // XMM3 = 3F372F271F170F073E362E261E160E06

        // Store all four registers back. Note the register order 0,2,1,3:
        // xmm2 holds output rows 2-3 and xmm1 holds output rows 4-5.
        movdqa [eax   ], xmm0
        movdqa [eax+16], xmm2
        movdqa [eax+32], xmm1
        movdqa [eax+48], xmm3

        // No emms here: only XMM registers were used, so there is no
        // MMX/x87 state to restore.
        pop ebp
        ret
    }
}
// Returns 0 for no error, none-zero for error.
int TestTranspose() {
__declspec(align(16)) unsigned char dat[64];
for (int i=0; i<64; i++) dat[i] = i;
Transpose8x8(dat);
int ret = 0; // Test to confirm it works.
for (int i=0; i<64; i++) {
// (actual - expected) should always be zero.
ret |= dat[i] - (((i&7)<<3) | (i>>3));
}
return ret;
}

Transpose8x8

Recommended

Recommended

More Related Content

Viewers also liked

Viewers also liked (10)

Similar to Transpose8x8

Similar to Transpose8x8 (20)

Transpose8x8