SlideShare a Scribd company logo
1 of 2
Download to read offline
1H:Transpose8x8.cpp
// Transpose an 8x8 matrix in which each element is a single byte (one pixel).
// The code below was compiled with Visual Studio 2010 on my PC.
// ---------------------------------------------------------------------------
// void Transpose8x8(unsigned char dat[64])
//
// Transposes, in place, an 8x8 matrix of bytes stored row-major in dat:
// on return dat[r*8 + c] holds the value that was at dat[c*8 + r].
//
// Target : 32-bit x86, MSVC inline assembly, naked function (the prologue
//          and epilogue below are written by hand).
// In     : [ebp+8] = dat, pointer to 64 bytes. No alignment is required;
//          the data is accessed with MOVDQU.
// Out    : dat overwritten with its transpose.
// Clobb  : eax, xmm0-xmm6.
// Needs  : SSE2 (punpck*/movdq*) and SSSE3 (pshufb).
//
// Method : three rounds of unpack (word, dword, qword granularity) bring
//          the bytes of each pair of output rows into one register, then a
//          single PSHUFB per register puts them in their final order.
//
// Notation in the register comments: values are written most significant
// byte first, and every byte is shown as its original index in dat (hex).
// ---------------------------------------------------------------------------
__declspec(naked) void Transpose8x8(unsigned char dat[64]) {
    // PSHUFB control mask: result byte i = source byte pidx_byte[i].
    // As bytes (low to high): 00 08 02 0A 04 0C 06 0E | 01 09 03 0B 05 0D 07 0F,
    // i.e. the even-numbered source bytes are gathered into the low qword
    // and the odd-numbered ones into the high qword.
    static __declspec(align(16)) unsigned int pidx[4] = {
        0x0A020800, 0x0E060C04, 0x0B030901, 0x0F070D05};
    _asm {
        push ebp
        mov ebp, esp
        mov eax, [ebp+8]            // eax = dat

        // Load the 64 bytes (two matrix rows per register).
        // MOVDQU: the caller's buffer is not guaranteed to be 16-byte aligned.
        movdqu xmm0, [eax   ]
        movdqu xmm1, [eax+16]
        movdqu xmm2, [eax+32]
        movdqu xmm3, [eax+48]
        // Start:
        // XMM0 = 0F0E0D0C0B0A09080706050403020100
        // XMM1 = 1F1E1D1C1B1A19181716151413121110
        // XMM2 = 2F2E2D2C2B2A29282726252423222120
        // XMM3 = 3F3E3D3C3B3A39383736353433323130
        // Goal (memory order, 16 bytes each):
        //        39312921191109013830282018100800
        //        3B332B231B130B033A322A221A120A02
        //        3D352D251D150D053C342C241C140C04
        //        3F372F271F170F073E362E261E160E06

        // Round 1: interleave 16-bit words of (xmm0,xmm1) and (xmm2,xmm3).
        // The unpack destination is overwritten, so keep a copy for the
        // high-half unpack.
        movdqa xmm4, xmm0
        movdqa xmm6, xmm2
        punpcklwd xmm0, xmm1
        punpckhwd xmm4, xmm1
        punpcklwd xmm2, xmm3
        punpckhwd xmm6, xmm3
        // XMM0 = 17160706151405041312030211100100
        // XMM2 = 37362726353425243332232231302120
        // XMM4 = 1F1E0F0E1D1C0D0C1B1A0B0A19180908
        // XMM6 = 3F3E2F2E3D3C2D2C3B3A2B2A39382928

        // Round 2: interleave 32-bit dwords of (xmm0,xmm2) and (xmm4,xmm6).
        movdqa xmm1, xmm0
        movdqa xmm5, xmm4
        punpckldq xmm0, xmm2
        punpckhdq xmm1, xmm2
        punpckldq xmm4, xmm6
        punpckhdq xmm5, xmm6
        // XMM0 = 33322322131203023130212011100100
        // XMM1 = 37362726171607063534252415140504
        // XMM4 = 3B3A2B2A1B1A0B0A3938292819180908
        // XMM5 = 3F3E2F2E1F1E0F0E3D3C2D2C1D1C0D0C

        // xmm6 is free from here on: preload the shuffle mask into it.
        movdqa xmm6, pidx
        // XMM6 = 0F070D050B0309010E060C040A020800

        // Round 3: interleave 64-bit qwords of (xmm0,xmm4) and (xmm1,xmm5).
        movdqa xmm2, xmm0
        movdqa xmm3, xmm1
        punpcklqdq xmm0, xmm4
        punpckhqdq xmm2, xmm4
        punpcklqdq xmm1, xmm5
        punpckhqdq xmm3, xmm5
        // XMM0 = 39382928191809083130212011100100
        // XMM1 = 3D3C2D2C1D1C0D0C3534252415140504
        // XMM2 = 3B3A2B2A1B1A0B0A3332232213120302
        // XMM3 = 3F3E2F2E1F1E0F0E3736272617160706

        // Each register now holds exactly the bytes of two output rows;
        // only the order inside the register is still wrong.
        pshufb xmm0, xmm6
        pshufb xmm2, xmm6
        pshufb xmm1, xmm6
        pshufb xmm3, xmm6
        // XMM0 = 39312921191109013830282018100800
        // XMM1 = 3D352D251D150D053C342C241C140C04
        // XMM2 = 3B332B231B130B033A322A221A120A02
        // XMM3 = 3F372F271F170F073E362E261E160E06

        // Store back. Note the register order 0,2,1,3: xmm2 carries output
        // rows 2-3 and xmm1 carries output rows 4-5.
        movdqu [eax   ], xmm0
        movdqu [eax+16], xmm2
        movdqu [eax+32], xmm1
        movdqu [eax+48], xmm3

        // No EMMS here: only XMM registers were used, so no MMX state
        // needs to be cleared.
        pop ebp
        ret
    }
}
// Returns 0 if the transpose is correct, non-zero on error.
int TestTranspose() {
    // Self-check for Transpose8x8: fill a matrix so that every byte equals
    // its own index, transpose it, and compare against the expected layout.
    __declspec(align(16)) unsigned char dat[64];

    int idx = 0;
    while (idx < 64) {
        dat[idx] = idx;
        ++idx;
    }

    Transpose8x8(dat);

    // Accumulate (actual - expected) for every element; any non-zero
    // difference leaves bits set in the result.
    int ret = 0;
    for (int pos = 0; pos < 64; ++pos) {
        const int col = pos & 7;   // column of this element after the transpose
        const int row = pos >> 3;  // row of this element after the transpose
        // The byte that lands at (row, col) started at (col, row).
        ret |= dat[pos] - ((col << 3) | row);
    }
    return ret;
}

More Related Content

Viewers also liked

Подводные камни System.Security.Cryptography
Подводные камни System.Security.CryptographyПодводные камни System.Security.Cryptography
Подводные камни System.Security.CryptographyVladimir Kochetkov
 
Movember Facebook Campaign
Movember Facebook CampaignMovember Facebook Campaign
Movember Facebook CampaignKyle Wood, MBA
 
Evaluation questionnaire croatia
Evaluation questionnaire   croatiaEvaluation questionnaire   croatia
Evaluation questionnaire croatiaAndrea Ljubej
 
Learning to Love De Bruijn Graphs
Learning to Love De Bruijn GraphsLearning to Love De Bruijn Graphs
Learning to Love De Bruijn Graphsbenjwoodcroft
 
Finding Corrections Today in Gale
Finding Corrections Today in GaleFinding Corrections Today in Gale
Finding Corrections Today in GaleNancy Little
 
Brecha digital e inclusión educativa
Brecha digital e inclusión educativaBrecha digital e inclusión educativa
Brecha digital e inclusión educativaISa Qe Más Quieres
 
Rapport 2010 CDHP de la Loire
Rapport 2010 CDHP de la LoireRapport 2010 CDHP de la Loire
Rapport 2010 CDHP de la LoireCCDH75
 

Viewers also liked (10)

So what if it's a bubble?
So what if it's a bubble?So what if it's a bubble?
So what if it's a bubble?
 
KASP
KASPKASP
KASP
 
Подводные камни System.Security.Cryptography
Подводные камни System.Security.CryptographyПодводные камни System.Security.Cryptography
Подводные камни System.Security.Cryptography
 
Movember Facebook Campaign
Movember Facebook CampaignMovember Facebook Campaign
Movember Facebook Campaign
 
Evaluation questionnaire croatia
Evaluation questionnaire   croatiaEvaluation questionnaire   croatia
Evaluation questionnaire croatia
 
Learning to Love De Bruijn Graphs
Learning to Love De Bruijn GraphsLearning to Love De Bruijn Graphs
Learning to Love De Bruijn Graphs
 
Finding Corrections Today in Gale
Finding Corrections Today in GaleFinding Corrections Today in Gale
Finding Corrections Today in Gale
 
Brecha digital e inclusión educativa
Brecha digital e inclusión educativaBrecha digital e inclusión educativa
Brecha digital e inclusión educativa
 
Tarea equipo 7
Tarea equipo 7Tarea equipo 7
Tarea equipo 7
 
Rapport 2010 CDHP de la Loire
Rapport 2010 CDHP de la LoireRapport 2010 CDHP de la Loire
Rapport 2010 CDHP de la Loire
 

Similar to Transpose8x8

Vectorization on x86: all you need to know
Vectorization on x86: all you need to knowVectorization on x86: all you need to know
Vectorization on x86: all you need to knowRoberto Agostino Vitillo
 
The forgotten art of assembly
The forgotten art of assemblyThe forgotten art of assembly
The forgotten art of assemblyMarian Marinov
 
Ethereum virtual machine for Developers Part 1
Ethereum virtual machine for Developers Part 1Ethereum virtual machine for Developers Part 1
Ethereum virtual machine for Developers Part 1ArcBlock
 
java memory management & gc
java memory management & gcjava memory management & gc
java memory management & gcexsuns
 
A Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with MultithreadingA Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with MultithreadingMatsuo and Tsumura lab.
 
Floating point basicsThe core idea of floating-point representatio.pdf
Floating point basicsThe core idea of floating-point representatio.pdfFloating point basicsThe core idea of floating-point representatio.pdf
Floating point basicsThe core idea of floating-point representatio.pdfinfo235816
 
Potapenko, vyukov forewarned is forearmed. a san and tsan
Potapenko, vyukov   forewarned is forearmed. a san and tsanPotapenko, vyukov   forewarned is forearmed. a san and tsan
Potapenko, vyukov forewarned is forearmed. a san and tsanDefconRussia
 
Simple, fast, and scalable torch7 tutorial
Simple, fast, and scalable torch7 tutorialSimple, fast, and scalable torch7 tutorial
Simple, fast, and scalable torch7 tutorialJin-Hwa Kim
 
20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugsComputer Science Club
 
Devirtualizing FinSpy
Devirtualizing FinSpyDevirtualizing FinSpy
Devirtualizing FinSpyjduart
 
Pf: the OpenBSD packet filter
Pf: the OpenBSD packet filterPf: the OpenBSD packet filter
Pf: the OpenBSD packet filterGiovanni Bechis
 
Vectorization vs Compilation
Vectorization vs CompilationVectorization vs Compilation
Vectorization vs CompilationAlex Averbuch
 
Multilayer Neuronal network hardware implementation
Multilayer Neuronal network hardware implementation Multilayer Neuronal network hardware implementation
Multilayer Neuronal network hardware implementation Nabil Chouba
 
Microprocessor 8086 instructions
Microprocessor 8086 instructionsMicroprocessor 8086 instructions
Microprocessor 8086 instructionsRavi Anand
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...Positive Hack Days
 
Lrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with rLrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with rFerdinand Jamitzky
 
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...Anne Nicolas
 
Address/Thread/Memory Sanitizer
Address/Thread/Memory SanitizerAddress/Thread/Memory Sanitizer
Address/Thread/Memory SanitizerPlatonov Sergey
 
Datasheet of SEN-10061(JPEG Camera)
Datasheet of SEN-10061(JPEG Camera)Datasheet of SEN-10061(JPEG Camera)
Datasheet of SEN-10061(JPEG Camera)Tsuyoshi Horigome
 
8086 Micro-processor and MDA 8086 Trainer Kit
8086 Micro-processor and MDA 8086 Trainer Kit8086 Micro-processor and MDA 8086 Trainer Kit
8086 Micro-processor and MDA 8086 Trainer KitAmit Kumer Podder
 

Similar to Transpose8x8 (20)

Vectorization on x86: all you need to know
Vectorization on x86: all you need to knowVectorization on x86: all you need to know
Vectorization on x86: all you need to know
 
The forgotten art of assembly
The forgotten art of assemblyThe forgotten art of assembly
The forgotten art of assembly
 
Ethereum virtual machine for Developers Part 1
Ethereum virtual machine for Developers Part 1Ethereum virtual machine for Developers Part 1
Ethereum virtual machine for Developers Part 1
 
java memory management & gc
java memory management & gcjava memory management & gc
java memory management & gc
 
A Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with MultithreadingA Speculative Technique for Auto-Memoization Processor with Multithreading
A Speculative Technique for Auto-Memoization Processor with Multithreading
 
Floating point basicsThe core idea of floating-point representatio.pdf
Floating point basicsThe core idea of floating-point representatio.pdfFloating point basicsThe core idea of floating-point representatio.pdf
Floating point basicsThe core idea of floating-point representatio.pdf
 
Potapenko, vyukov forewarned is forearmed. a san and tsan
Potapenko, vyukov   forewarned is forearmed. a san and tsanPotapenko, vyukov   forewarned is forearmed. a san and tsan
Potapenko, vyukov forewarned is forearmed. a san and tsan
 
Simple, fast, and scalable torch7 tutorial
Simple, fast, and scalable torch7 tutorialSimple, fast, and scalable torch7 tutorial
Simple, fast, and scalable torch7 tutorial
 
20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs
 
Devirtualizing FinSpy
Devirtualizing FinSpyDevirtualizing FinSpy
Devirtualizing FinSpy
 
Pf: the OpenBSD packet filter
Pf: the OpenBSD packet filterPf: the OpenBSD packet filter
Pf: the OpenBSD packet filter
 
Vectorization vs Compilation
Vectorization vs CompilationVectorization vs Compilation
Vectorization vs Compilation
 
Multilayer Neuronal network hardware implementation
Multilayer Neuronal network hardware implementation Multilayer Neuronal network hardware implementation
Multilayer Neuronal network hardware implementation
 
Microprocessor 8086 instructions
Microprocessor 8086 instructionsMicroprocessor 8086 instructions
Microprocessor 8086 instructions
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
 
Lrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with rLrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with r
 
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...
Kernel Recipes 2014 - x86 instruction encoding and the nasty hacks we do in t...
 
Address/Thread/Memory Sanitizer
Address/Thread/Memory SanitizerAddress/Thread/Memory Sanitizer
Address/Thread/Memory Sanitizer
 
Datasheet of SEN-10061(JPEG Camera)
Datasheet of SEN-10061(JPEG Camera)Datasheet of SEN-10061(JPEG Camera)
Datasheet of SEN-10061(JPEG Camera)
 
8086 Micro-processor and MDA 8086 Trainer Kit
8086 Micro-processor and MDA 8086 Trainer Kit8086 Micro-processor and MDA 8086 Trainer Kit
8086 Micro-processor and MDA 8086 Trainer Kit
 

Transpose8x8

  • 1. 1H:Transpose8x8.cpp // Transpose an 8x8 matrix with each element a byte for pixel. // Below is compiled using visual studio 2010 on my PC __declspec(naked) void Transpose8x8(unsigned char dat[64]) { // This constant indexes are used for final shuffling of bytes static __declspec(align(16)) unsigned int pidx[4] = { 0x0A020800, 0x0E060C04, 0x0B030901, 0x0F070D05}; _asm { push ebp mov ebp, esp mov eax, [ebp+8] // Address of the matrix data // Load the 64 bytes data into xmm0 to xmm3 movdqa xmm0, [eax ] movdqa xmm1, [eax+16] movdqa xmm2, [eax+32] movdqa xmm3, [eax+48] // We starts with // XMM0 = 0F0E0D0C0B0A09080706050403020100 // XMM1 = 1F1E1D1C1B1A19181716151413121110 // XMM2 = 2F2E2D2C2B2A29282726252423222120 // XMM3 = 3F3E3D3C3B3A39383736353433323130 // After transpose permutation should get // XMM0 = 39312921191109013830282018100800 // XMM1 = 3B332B231B130B033A322A221A120A02 // XMM2 = 3D352D251D150D053C342C241C140C04 // XMM3 = 3F372F271F170F073E362E261E160E06 // Duplicate them, so we can un-pack movdqa xmm4, xmm0 //movdqa xmm5, xmm1 movdqa xmm6, xmm2 //movdqa xmm7, xmm3 // Permutate bytes by unpacking instructions. 0x1, 2x3 punpcklwd xmm0, xmm1 punpckhwd xmm4, xmm1 punpcklwd xmm2, xmm3 punpckhwd xmm6, xmm3 // XMM0 = 17160706151405041312030211100100 // XMM2 = 37362726353425243332232231302120 // XMM4 = 1F1E0F0E1D1C0D0C1B1A0B0A19180908 // XMM6 = 3F3E2F2E3D3C2D2C3B3A2B2A39382928 // Duplicate again for next round permutation movdqa xmm1, xmm0 //movdqa xmm3, xmm2 movdqa xmm5, xmm4 //movdqa xmm7, xmm6 // Permutate bytes by unpacking instructions. 0x2, 4x6 punpckldq xmm0, xmm2 punpckhdq xmm1, xmm2 punpckldq xmm4, xmm6 punpckhdq xmm5, xmm6 // XMM0 = 33322322131203023130212011100100 // XMM1 = 37362726171607063534252415140504 // XMM4 = 3B3A2B2A1B1A0B0A3938292819180908 // XMM5 = 3F3E2F2E1F1E0F0E3D3C2D2C1D1C0D0C // Duplicate again for next round permutation movdqa xmm2, xmm0 movdqa xmm3, xmm1
  • 2. 2H:Transpose8x8.cpp //movdqa xmm6, xmm4 //movdqa xmm7, xmm5 // Pre-load the permutation indexes. Use now spared xmm6 //movaps xmm6, pidx (or) movdqa xmm6, pidx movdqa xmm6, pidx // pidx is permutation index. // XMM6 = 0F070D050B0309010E060C040A020800 // Permutate bytes by unpacking instructions. 0x4, 1x5 punpcklqdq xmm0, xmm4 punpckhqdq xmm2, xmm4 punpcklqdq xmm1, xmm5 punpckhqdq xmm3, xmm5 // XMM0 = 39382928191809083130212011100100 // XMM1 = 3D3C2D2C1D1C0D0C3534252415140504 // XMM2 = 3B3A2B2A1B1A0B0A3332232213120302 // XMM3 = 3F3E2F2E1F1E0F0E3736272617160706 // We are almost there. Need to permutate within each xmm //movaps xmm5, pidx (or) movdqa xmm5, pidx movdqa xmm5, pidx // pidx is permutation index. // XMM5 = 0F070D050B0309010E060C040A020800 // Now shuffle bytes within each xmm register pshufb xmm0, xmm6 pshufb xmm2, xmm6 pshufb xmm1, xmm6 pshufb xmm3, xmm6 // XMM0 = 39312921191109013830282018100800 // XMM1 = 3D352D251D150D053C342C241C140C04 // XMM2 = 3B332B231B130B033A322A221A120A02 // XMM3 = 3F372F271F170F073E362E261E160E06 // ==> Great. Exactly what I want. Order=0,2,1,3 // Now save the XMM registers back to memory movdqa [eax ], xmm0 movdqa [eax+16], xmm2 movdqa [eax+32], xmm1 movdqa [eax+48], xmm3 emms // Recover the floating point states. pop ebp ret } } // Returns 0 for no error, none-zero for error. int TestTranspose() { __declspec(align(16)) unsigned char dat[64]; for (int i=0; i<64; i++) dat[i] = i; Transpose8x8(dat); int ret = 0; // Test to confirm it works. for (int i=0; i<64; i++) { // (actual - expected) should always be zero. ret |= dat[i] - (((i&7)<<3) | (i>>3)); } return ret; }