Università Milano Bicocca Studio di Tecniche di compilazione  parallela  per architetture riconfigurabili Pavesi Lorenzo 071042
Agenda Processore ibrido XiRisc PiCoGa e GriffyC Suif  Compilatore x PiCoGa  Risultati sperimentali
Hybrid Processors Semplice core non più sufficiente  Incremento delle performance Riduzione dei consumi (potenza, area) Core Configurabili Specializzazione ISA (Xtensa Tensilica, ARC) Ideali per applicazioni di Digital Signal Processing e Bit Level Manipulation Sviluppi futuri: Core riconfigurabile GarpChip, ASH
XiRisc+PiCoGa e GriffyC •  Microcontrollore RISC 32bit •  Architettura VLIW a 2 issues •  Pipeline a 5 stadi  •  ISA Configurabile •  Componente riconfigurabile - PiCoGa 16x24 RLC GriffyC superset Ansi-C - Stile DFG
PGAop DFG Multi contesto (4 configurazione, 1 esecuzione) + + + A B C D Y A B C D Y
GriffyC L1 : sub  a,a,2 rol  b,b,a add  d,d,a add  c,b,d add  i,i,1 bnz  c,L1  sub  a,a,2 add  d,d,a rol  b,b,a add  c,b,d add  i,i,1 A D I B L1 : sub  a,2 rol  b,a add  d,a add  c,b,d add  i,i,1 bnz  c,L1  PGAop a,b,d,i [..] for(;c!=0;i++)[ a=a-2;  b=b<<a; d=d+a; c=b+d; ] [..] [..] PD_0=pga_allocate(myPGAop); [..] for(;c!=0;i++)[ pgadirect1(PD_0, a,i,b,d); ] [..] pga_deallocate(myPGAop);  [..]
SUIF Infrastruttura per compilatori  (  http://suif.stanford.edu/  ) Orientata alla ricerca e sviluppo  Passi di compilazione modulari Sistema estendibile Suifdriver Pass - analyses - optimization IR - suifnodes - basicnodes Kernel - suifkernel - iokernel MODULES
Machine SUIF Optimization & Analysis Algorithms O P I Target Machines Compilation Environment ( SUIF ) Permette la costruzione di “back ends”  Machine level intermediate forms Descrizione architettura target Suif (v.2.1) Machine SUIF-IR  (qui è definito machine ir.hoof file) OPI cfa bvd suifvm x86 alpha cma / ssa picovm ksta ex1 m2gc Parametrized Target dependent Compilation  Environment is defined Str.Anl machine cfg ssa
Flusso di compilazione per PiCoGA C to SUIF LIR MACHINE-SUIF CFG STRUCTURAL   ANALYSIS KERNEL IDENTIFICATION Innermost while-region; “ PiCoGa basic block” marking; selezione di sub-trees while-region contenenti solo PiCoGa Basic Block; PiCoGa Kernel translation SSA representation Cti → Cmove replacement Independent from Identification - manually selected kernels translation   GRIFFY–C  COMPILER Kernel ranking Kernel encapsulation KERNEL EXTRACTION 1 2 3
Generazione del GriffyC ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ C SUIF SUIF (LIR) Dismantling delle strutture di controllo FileSetBlock FileBlock procedure procedure procedure FileBlock procedure procedure Machine SUIF CFG
Generazione del GriffyC picovm Control Tree ANNOTED Mach – SUIF ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ PICOHEADER ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ FileSetBlock FileBlock procedure procedure procedure FileBlock procedure procedure kernel Ottimizzazioni sul tipo di  selezione ottimizzazioni sul body del kernel Selezione 2 3 Ranking & Estrazione SSA M2GC ...... ........ 
......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double  _vr1; float  _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5  +  _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif  /*PICOHEADER__provaTmp1*/ Structural Analysis 1 X
Test e Risultati Applicazioni di codifica video iDCT, quantizzazione Block division DCT Storage DCT Quantize Entropy Encoder IDCT Entropy Decoder Immagine  Reconstruct Dequantize originale Immagine
Test e Risultati
Conclusioni Realizzazione di un flusso di Compilazione completa Buon numero di kernel identificati Kernel di medie-piccole dimensioni Prototipo stabile e sufficientemente efficiente
Sviluppi Futuri Strategie di selezione più evolute Integrazione con il compilatore FastGriffy Nuovi passi di ottimizzazione Analisi Interprocedurali Incremento della dimensione media dei kernel accelerabili Aggiornamento ad evoluzione del PiCoGa
Domande?

3D-DRESD Lorenzo Pavesi

  • 1.
    Università Milano Bicocca Studio di Tecniche di compilazione parallela per architetture riconfigurabili Pavesi Lorenzo 071042
  • 2.
    Agenda Processore ibrido XiRisc PiCoGa e GriffyC Suif Compilatore x PiCoGa Risultati sperimentali
  • 3.
    Hybrid Processors Semplice core non più sufficiente Incremento delle performance Riduzione dei consumi (potenza, area) Core Configurabili Specializzazione ISA (Xtensa Tensilica, ARC) Ideali per applicazioni di Digital Signal Processing e Bit Level Manipulation Sviluppi futuri: Core riconfigurabile GarpChip, ASH
  • 4.
    XiRisc+PiCoGa e GriffyC • Microcontrollore RISC 32bit • Architettura VLIW a 2 issues • Pipeline a 5 stadi • ISA Configurabile • Componente riconfigurabile - PiCoGa 16x24 RLC GriffyC superset Ansi-C - Stile DFG
  • 5.
    PGAop DFG Multicontesto (4 configurazione, 1 esecuzione) + + + A B C D Y A B C D Y
  • 6.
    GriffyC L1 : sub a,a,2 rol b,b,a add d,d,a add c,b,d add i,i,1 bnz c,L1 sub a,a,2 add d,d,a rol b,b,a add c,b,d add i,i,1 A D I B L1 : sub a,2 rol b,a add d,a add c,b,d add i,i,1 bnz c,L1 PGAop a,b,d,i [..] for(;c!=0;i++)[ a=a-2; b=b<<a; d=d+a; c=b+d; ] [..] [..] PD_0=pga_allocate(myPGAop); [..] for(;c!=0;i++)[ pgadirect1(PD_0, a,i,b,d); ] [..] pga_deallocate(myPGAop); [..]
  • 7.
    SUIF Infrastruttura per compilatori ( http://suif.stanford.edu/ ) Orientata alla ricerca e sviluppo Passi di compilazione modulari Sistema estendibile Suifdriver Pass - analyses - optimization IR - suifnodes - basicnodes Kernel - suifkernel - iokernel MODULES
  • 8.
    Machine SUIF Optimization & Analysis Algorithms O P I Target Machines Compilation Environment ( SUIF ) Permette la costruzione di “back ends” Machine level intermediate forms Descrizione architettura target Suif (v.2.1) Machine SUIF-IR (qui è definito machine ir.hoof file) OPI cfa bvd suifvm x86 alpha cma / ssa picovm ksta ex1 m2gc Parametrized Target dependent Compilation Environment is defined Str.Anl machine cfg ssa
  • 9.
    Flusso di compilazione per PiCoGA C to SUIF LIR MACHINE-SUIF CFG STRUCTURAL ANALYSIS KERNEL IDENTIFICATION Innermost while-region; “ PiCoGa basic block” marking; selezione di sub-trees while-region contenenti solo PiCoGa Basic Block; PiCoGa Kernel translation SSA representation Cti → Cmove replacement Independent from Identification - manually selected kernels translation GRIFFY–C COMPILER Kernel ranking Kernel encapsulation KERNEL EXTRACTION 1 2 3
  • 10.
    Generazione del GriffyC...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ C SUIF SUIF (LIR) Dismantling delle strutture di controllo FileSetBlock FileBlock procedure procedure procedure FileBlock procedure procedure Machine SUIF CFG
  • 11.
    Generazione del GriffyCpicovm Control Tree ANNOTED Mach – SUIF ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ PICOHEADER ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ FileSetBlock FileBlock procedure procedure procedure FileBlock procedure procedure kernel Ottimizzazioni sul tipo di selezione ottimizzazioni sul body del kernel Selezione 2 3 Ranking & Estrazione SSA M2GC ...... ........ 
......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ ...... ........ ......#ifndef PICOHEADER__provaTmp1 #define PICOHEADER__provaTmp1 #pragma fpga _provaTmp1 0x00 0 0 { /* Virtual register declarations */ void * _vr0; double _vr1; float _vr2; _vr4 = (float (*)[1])part_amplitude; _vr5 = (float *)_vr4; _vr6 = (float *)((char *)_vr5 + _vr3); _vr7 = *_vr6; _vr2 = (float)_vr7; _vr1 = (double)_vr2; printf(_vr0, i, _vr1); } #pragma end #endif /*PICOHEADER__provaTmp1*/ Structural Analysis 1 X
  • 12.
    Test e Risultati Applicazioni di codifica video iDCT, quantizzazione Block division DCT Storage DCT Quantize Entropy Encoder IDCT Entropy Decoder Immagine Reconstruct Dequantize originale Immagine
  • 13.
  • 14.
    Conclusioni Realizzazione di un flusso di Compilazione completa Buon numero di kernel identificati Kernel di medie-piccole dimensioni Prototipo stabile e sufficientemente efficiente
  • 15.
    Sviluppi Futuri Strategie di selezione più evolute Integrazione con il compilatore FastGriffy Nuovi passi di ottimizzazione Analisi Interprocedurali Incremento della dimensione media dei kernel accelerabili Aggiornamento ad evoluzione del PiCoGa
  • 16.