Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.
MUDA
   Syoyo Fujita
twitter.com/syoyo

           LT
   2010/01/31
Agenda
• MUDA
•
• MUDA
• MUDA
• MUDA
            .
MUDA
• MUltiple Data Acceleration environment
• HPU


 • MUDA
 • MUDA
HPU
•   HPU =

    •   High-perfoemance Processing Unit

    •   Heterogeneous Processing Unit

•               , GPGPU


...
•       ...

    •   (SSE/AVX, etc)



    •
• ->
•   HPU

    •   C/C++

        •                          SIMD

    •   CUDA

        •        GPU        (3.0          C...
...
•
                  !

    • DSL             !
MUDA
•
    • CUDA, OpenCL, SSE/AVX, LLVM IR, etc.
•
    • Windows, Linux, Mac, iPhone(ARM)
•                , ISA

•      ...
Frontend                    Backend


                            SSE/AVX


                            LLVM IR
MUDA lang ...
void BlackScholesMUDA(
        out vec retCall,
        out vec retPut,
        vec S,                 //   Stock price
  ...
void BlackScholesMUDA (float * retCall, float * retPut, const float * S,
const float * X, const float * T, const float * R...
void
isect(dvec rox, dvec roy,   dvec   roz,
      dvec rdx, dvec rdy,   dvec   rdz,
      dvec v0x, dvec v0y,   dvec   v0...
MUDA_STATIC MUDA_INLINE __m256d cross4 (const double *   a, const double *
b, const double * c, const double * d)
{
    co...
define internal <4xdouble> @cross4 (<4xdouble> %a, <4xdouble> %b,
<4xdouble> %c
, <4xdouble> %d)
{
    %a.addr = alloca <4...
MUDA runtime API
class MUDADevice
{
public:
    MUDADevice(MUDADeviceTarget target);
    ~MUDADevice();

    bool initiali...
MUDA
•
•

    •
        MUDA   (Erlang
        ?)
MUDA BlackScholes
•   [Perf] CPU = 48.667000 (msec)
                                           MUDA        CPU
    •   16....
MUDA Pathtracer
• CUDA, OpenCL backend
OpenSource
•   http://lucille.sourceforge.net/muda/
    •   MUDA              .

    •                 Haskell            ...
•
•
•       ISA,

    •    ..    ...   ..   .....

• DSL            !
References
• GPU Ocelet
• MCUDA
 • CUDA kernel       CPU(x86)
        .

 • http://impact.crhc.illinois.edu/mcuda.php
MUDA
MUDA
MUDA
MUDA
MUDA
Upcoming SlideShare
Loading in …5
×

MUDA

3,390 views

Published on

第一回並カンでの MUDA 発表資料です.

Published in: Technology, Business
  • Be the first to comment

MUDA

  1. 1. MUDA Syoyo Fujita twitter.com/syoyo LT 2010/01/31
  2. 2. Agenda • MUDA • • MUDA • MUDA • MUDA .
  3. 3. MUDA • MUltiple Data Acceleration environment • HPU • MUDA • MUDA
  4. 4. HPU • HPU = • High-perfoemance Processing Unit • Heterogeneous Processing Unit • , GPGPU • , • • .
  5. 5. • ... • (SSE/AVX, etc) • • ->
  6. 6. • HPU • C/C++ • SIMD • CUDA • GPU (3.0 CUDA C -> LLVM(plang) CPU ) • OpenCL • ? • .
  7. 7. ... • ! • DSL !
  8. 8. MUDA • • CUDA, OpenCL, SSE/AVX, LLVM IR, etc. • • Windows, Linux, Mac, iPhone(ARM) • , ISA • DSL
  9. 9. Frontend Backend SSE/AVX LLVM IR MUDA lang MUDA compiler CUDA OpenCL
  10. 10. void BlackScholesMUDA( out vec retCall, out vec retPut, vec S, // Stock price vec X, // Option strike vec T, // Option years vec R, // Riskfree rate vec V) // Volatility rate { vec sqrtT = sqrt(T); vec d1 = (log(S /. X) + (R + 0.5f * V * V) * T) /. (V * sqrtT); vec d2 = d1 - V * sqrtT; vec cnd_d1 = cndMUDA(d1); vec cnd_d2 = cndMUDA(d2); vec expRT = exp(-1.0f * R * T); retCall = S * cnd_d1 - X * expRT * cnd_d2; retPut = X * expRT * (1.0f - cnd_d2) - S * (1.0f - cnd_d1); }
  11. 11. void BlackScholesMUDA (float * retCall, float * retPut, const float * S, const float * X, const float * T, const float * R, const float * V) { const __m128 S_ld_1 = _mm_loadu_ps(S + 0) ; const __m128 X_ld_1 = _mm_loadu_ps(X + 0) ; const __m128 T_ld_1 = _mm_loadu_ps(T + 0) ; const __m128 R_ld_1 = _mm_loadu_ps(R + 0) ; const __m128 V_ld_1 = _mm_loadu_ps(V + 0) ; const __m128 t_vec78_1 = T_ld_1 ; const __m128 t_vec77_1 = sqrtmu( (float *) &( t_vec78_1) ) ; __m128 sqrtT_1 = t_vec77_1 ; const __m128 t_vec79_1 = S_ld_1 ; const __m128 t_vec80_1 = X_ld_1 ; const __m128 Vect_vec80_rcp = _mm_rcp_ps( t_vec80_1) ; ...
  12. 12. void isect(dvec rox, dvec roy, dvec roz, dvec rdx, dvec rdy, dvec rdz, dvec v0x, dvec v0y, dvec v0z, dvec e1x, dvec e1y, dvec e1z, dvec e2x, dvec e2y, dvec e2z, out dvec inoutMask, out dvec inoutT, out dvec inoutU, out dvec inoutV) { dvec px, py, pz; // p = d x e2 px = cross4(e2z, e2y, rdy, rdz); py = cross4(e2x, e2z, rdz, rdx); pz = cross4(e2y, e2x, rdx, rdy); ...
  13. 13. MUDA_STATIC MUDA_INLINE __m256d cross4 (const double * a, const double * b, const double * c, const double * d) { const __m256d a_ld_1 = _mm256_load_pd(a + 0) ; const __m256d b_ld_1 = _mm256_load_pd(b + 0) ; const __m256d c_ld_1 = _mm256_load_pd(c + 0) ; const __m256d d_ld_1 = _mm256_load_pd(d + 0) ; const __m256d t_dvec2_1 = a_ld_1 ; const __m256d t_dvec1_1 = c_ld_1 ; const __m256d t_dvec3_1 = _mm256_mul_pd(t_dvec2_1, t_dvec1_1) ; const __m256d t_dvec4_1 = b_ld_1 ; const __m256d t_dvec5_1 = d_ld_1 ; const __m256d t_dvec6_1 = _mm256_mul_pd(t_dvec4_1, t_dvec5_1) ; const __m256d t_dvec7_1 = _mm256_sub_pd(t_dvec3_1, t_dvec6_1) ; return t_dvec7_1 ; } ...
  14. 14. define internal <4xdouble> @cross4 (<4xdouble> %a, <4xdouble> %b, <4xdouble> %c , <4xdouble> %d) { %a.addr = alloca <4xdouble> ; store <4xdouble> %a, <4xdouble>* %a.addr ; %b.addr = alloca <4xdouble> ; store <4xdouble> %b, <4xdouble>* %b.addr ; %c.addr = alloca <4xdouble> ; store <4xdouble> %c, <4xdouble>* %c.addr ; %d.addr = alloca <4xdouble> ; store <4xdouble> %d, <4xdouble>* %d.addr ; %t_dvec2 = load <4xdouble>* %a.addr ; %t_dvec1 = load <4xdouble>* %c.addr ; %t_dvec3 = mul <4xdouble> %t_dvec2 , %t_dvec1 ; %t_dvec4 = load <4xdouble>* %b.addr ; %t_dvec5 = load <4xdouble>* %d.addr ; ...
  15. 15. MUDA runtime API class MUDADevice { public: MUDADevice(MUDADeviceTarget target); ~MUDADevice(); bool initialize(); int getNumDevices(); bool loadKernelSource(const char *filename, const char **headers, const char *options); MUDAKernel createKernel(const char *functionName); MUDAMemory alloc(MUDAMemoryType memType, MUDAMemoryAttrib memAttrib, size_t memSize); bool write(int ID, MUDAMemory mem, size_t size, const void *ptr); bool read(int deviceID, MUDAMemory mem, size_t size, void *ptr); bool bindMemoryObject(MUDAKernel kernel, int argNum, MUDAMemory mem); bool execute(int deviceID, MUDAKernel kernel, int dimension, size_t sizeX, size_t sizeY, size_t sizeZ);
  16. 16. MUDA • • • MUDA (Erlang ?)
  17. 17. MUDA BlackScholes • [Perf] CPU = 48.667000 (msec) MUDA CPU • 16.438244 MOps/sec 40 • [Perf] MUDA = 21.705400 (msec) 30 • 36.857188 MOps/sec 20 10 Better 0 2.16GHz C2D
  18. 18. MUDA Pathtracer • CUDA, OpenCL backend
  19. 19. OpenSource • http://lucille.sourceforge.net/muda/ • MUDA . • Haskell ! • MUDA (CUDA, OCL backend) •
  20. 20. • • • ISA, • .. ... .. ..... • DSL !
  21. 21. References • GPU Ocelet
  22. 22. • MCUDA • CUDA kernel CPU(x86) . • http://impact.crhc.illinois.edu/mcuda.php

×