Write an MPI program that implements a shell-sort like parallel algorithm that sorts an array of
integers. The initial array is partitioned into equal size sub-arrays which are distributed to the
processes (one per process). The parallel algorithm is described in Section 9.3.2 of the textbook
(pages 398-399). It consists of two phases: (i) the processes that are far away from each other
compare and split their assigned sub-arrays (using a hypercube pattern of communication); (ii)
perform odd-even transposition iterations as long as the sub-arrays are changing. The following
is a high-level pseudocode description of the algorithm: Algorithm 1 Shell-sort like parallel
algorithm 1: {Phase I: Hypercube Compare-Exchange.} 2: for i = (d − 1) down to 0 do 3: if (i-th bit of
rank) = 1 then 4: compare-split-hi(i); 5: else 6: compare-split-low(i); 7: {Phase II: Odd-even
Transposition Iterations.} 8: done = FALSE 9: while done = FALSE do 10: {Perform odd-even
iterations} 11: if received items need to be passed further then 12: broadcast FALSE to all
processes; 13: else 14: broadcast TRUE to all processes; 15: if all processes broadcast TRUE
then 16: done = TRUE In the algorithm description: d - the number of bits required to represent
the ID’s of the processes (d=3 for 8 processes). compare-split-hi(i) - performs a compare-and-
split operation so that processor i keeps half of the merged sub-arrays containing the greatest
integers. compare-split-low(i) - performs a compare-and-split operation so that processor i keeps
half of the merged sub-arrays containing the smallest integers. Test the program on 8 processes.
The input array should consist of 128 random integers from the range [0, 128]. The array is
generated at process 0 which is responsible for partitioning the array and sending the sub-arrays
to the other processors. Process 0 will keep its corresponding sub-array, so that it can participate
in the algorithm. At the end of the computation, process 0 collects all the sub-arrays and displays
the sorted array. Compare the execution times for your parallel shell-sort implementation with
those of the standard odd-even transposition sort (given in the textbook, section 6.3.5, pages 248-
250) and the serial quicksort. For this performance comparison you should use 8 processors and
randomly generated integer arrays of sizes: 2^16, 2^20, 2^24, and 2^30. The random integers should
be in the range [0, 128]. Produce a plot showing the execution times of the three algorithms.
Produce another plot to show the speedup obtained by the parallel shell-sort with respect to the
sequential quicksort. Write a short (max. 2 pages) report describing the implementation and the
obtained results. The report should be typeset using Latex and the plots should be generated
using gnuplot.
Solution
Answer:
Assembly Language Code (x86-64, GCC -O0, Intel syntax). NOTE(review): this is compiler output for a *serial* C++ shell sort of a hard-coded 10-element array; it does not implement the requested MPI parallel algorithm (8 processes, 128 random integers) — verify against the assignment requirements:
.zero 1
.LC0:
.string " "
print_ar(int*, int):
push rbp
mov rbp, rsp
sub rsp, 32
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-4], 0
.L3:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-28]
jge .L2
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov esi, eax
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >::operator<<(int)
mov esi, OFFSET FLAT:.LC0
mov rdi, rax
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
add DWORD PTR [rbp-4], 1
jmp .L3
.L2:
mov esi, OFFSET FLAT:std::basic_ostream >& std::endl >(std::basic_ostream >&)
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >::operator<<(std::basic_ostream >& (*)(std::basic_ostream >&))
nop
leave
ret
shell_sort(int*, int):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov eax, DWORD PTR [rbp-28]
mov edx, eax
shr edx, 31
add eax, edx
sar eax
mov DWORD PTR [rbp-8], eax
.L10:
cmp DWORD PTR [rbp-8], 0
jle .L11
mov eax, DWORD PTR [rbp-8]
mov DWORD PTR [rbp-12], eax
.L9:
mov eax, DWORD PTR [rbp-12]
cmp eax, DWORD PTR [rbp-28]
jge .L6
mov eax, DWORD PTR [rbp-12]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-16], eax
mov eax, DWORD PTR [rbp-12]
mov DWORD PTR [rbp-4], eax
.L8:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
jl .L7
mov eax, DWORD PTR [rbp-4]
sub eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
cmp eax, DWORD PTR [rbp-16]
jle .L7
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-4]
sub eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rdx], eax
mov eax, DWORD PTR [rbp-8]
sub DWORD PTR [rbp-4], eax
jmp .L8
.L7:
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rdx], eax
add DWORD PTR [rbp-12], 1
jmp .L9
.L6:
mov eax, DWORD PTR [rbp-8]
mov edx, eax
shr edx, 31
add eax, edx
sar eax
mov DWORD PTR [rbp-8], eax
jmp .L10
.L11:
nop
pop rbp
ret
.LC1:
.string "Intial Array : "
.LC2:
.string "Sorted Array : "
main:
push rbp
mov rbp, rsp
sub rsp, 48
mov DWORD PTR [rbp-48], 1
mov DWORD PTR [rbp-44], 4
mov DWORD PTR [rbp-40], 16
mov DWORD PTR [rbp-36], 30
mov DWORD PTR [rbp-32], 29
mov DWORD PTR [rbp-28], 18
mov DWORD PTR [rbp-24], 100
mov DWORD PTR [rbp-20], 2
mov DWORD PTR [rbp-16], 43
mov DWORD PTR [rbp-12], 1
mov esi, OFFSET FLAT:.LC1
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call print_ar(int*, int)
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call shell_sort(int*, int)
mov esi, OFFSET FLAT:.LC2
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call print_ar(int*, int)
mov eax, 0
leave
ret
__static_initialization_and_destruction_0(int, int):
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
cmp DWORD PTR [rbp-4], 1
jne .L16
cmp DWORD PTR [rbp-8], 65535
jne .L16
mov edi, OFFSET FLAT:std::__ioinit
call std::ios_base::Init::Init()
mov edx, OFFSET FLAT:__dso_handle
mov esi, OFFSET FLAT:std::__ioinit
mov edi, OFFSET FLAT:std::ios_base::Init::~Init()
call __cxa_atexit
.L16:
nop
leave
ret
push rbp
mov rbp, rsp
mov esi, 65535
mov edi, 1
call __static_initialization_and_destruction_0(int, int)
pop rbp
ret

Write an MPI program that implements a shell-sort like parallel algo.pdf

  • 1.
    Write an MPIprogram that implements a shell-sort like parallel algorithm that sorts an array of integers. The initial array is partitioned into equal size sub-arrays which are distributed to the processes (one per process). The parallel algorithm is described in Section 9.3.2 of the textbook (pages 398-399). It consists of two phases: (i) the processes that are far away from each other compare and split their assigned sub-arrays (using a hypercube pattern of communication); (ii) perform odd-even transposition iterations as long as the sub-arrays are changing. The following is a high-level pseudocode description of the algorithm: Algorithm 1 Shell-sort like parallel algorithm 1: {Phase I: Hypercube Compare-Exchange.} 2: for i = (d 1) to 0 do 3: if (i-th bit of rank) = 1 then 4: compare-split-hi(i); 5: else 6: compare-split-low(i); 7: {Phase II: Odd-even Transposition Iterations.} 8: done = FALSE 9: while done = FALSE do 10: {Perform odd-even iterations} 11: if received items need to be passed further then 12: broadcast FALSE to all processes; 13: else 14: broadcast TRUE to all processes; 15: if all processes broadcast TRUE then 16: done = TRUE In the algorithm description: d - the number of bits required to represent the ID’s of the processes (d=3 for 8 processes). compare-split-hi(i) - performs a compare-and- split operation so that processor i keeps half of the merged sub-arrays containing the greatest integers. compare-split-low(i) - performs a compare-and-split operation so that processor i keeps half of the merged sub-arrays containing the smallest integers. Test the program on 8 processes. The input array should consist of 128 random integers from the range [0, 128]. The array is generated at process 0 which is responsible for partitioning the array and sending the sub-arrays to the other processors. Process 0 will keep its corresponding sub-array, so that it can participate in the algorithm. 
At the end of the computation, process 0 collects all the sub-arrays and displays the sorted array. Compare the execution times for your parallel shell-sort implementation with those of the standard odd-even transposition sort (given in the textbook, section 6.3.5, pages 248- 250) and the serial quicksort. For this performance comparison you should use 8 processors and randomly generated integer arrays of sizes: 216, 220, 224, and 230. The random integers should be in the range [0, 128]. Produce a plot showing the execution times of the three algorithms. Produce another plot to show the speedup obtained by the parallel shell-sort with respect to the sequential quicksort. Write a short (max. 2 pages) report describing the implementation and the obtained results. The report should be typeset using Latex and the plots should be generated using gnuplot. Solution Answer: Assembly Language Code :
  • 2.
    .zero 1 .LC0: .string "" print_ar(int*, int): push rbp mov rbp, rsp sub rsp, 32 mov QWORD PTR [rbp-24], rdi mov DWORD PTR [rbp-28], esi mov DWORD PTR [rbp-4], 0 .L3: mov eax, DWORD PTR [rbp-4] cmp eax, DWORD PTR [rbp-28] jge .L2 mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] mov esi, eax mov edi, OFFSET FLAT:std::cout call std::basic_ostream >::operator<<(int) mov esi, OFFSET FLAT:.LC0 mov rdi, rax call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) add DWORD PTR [rbp-4], 1 jmp .L3 .L2: mov esi, OFFSET FLAT:std::basic_ostream >& std::endl >(std::basic_ostream >&) mov edi, OFFSET FLAT:std::cout call std::basic_ostream >::operator<<(std::basic_ostream >& (*)(std::basic_ostream >&)) nop leave ret shell_sort(int*, int):
  • 3.
    push rbp mov rbp,rsp mov QWORD PTR [rbp-24], rdi mov DWORD PTR [rbp-28], esi mov eax, DWORD PTR [rbp-28] mov edx, eax shr edx, 31 add eax, edx sar eax mov DWORD PTR [rbp-8], eax .L10: cmp DWORD PTR [rbp-8], 0 jle .L11 mov eax, DWORD PTR [rbp-8] mov DWORD PTR [rbp-12], eax .L9: mov eax, DWORD PTR [rbp-12] cmp eax, DWORD PTR [rbp-28] jge .L6 mov eax, DWORD PTR [rbp-12] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] mov DWORD PTR [rbp-16], eax mov eax, DWORD PTR [rbp-12] mov DWORD PTR [rbp-4], eax .L8: mov eax, DWORD PTR [rbp-4] cmp eax, DWORD PTR [rbp-8] jl .L7 mov eax, DWORD PTR [rbp-4] sub eax, DWORD PTR [rbp-8] cdqe lea rdx, [0+rax*4]
  • 4.
    mov rax, QWORDPTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] cmp eax, DWORD PTR [rbp-16] jle .L7 mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rdx, rax mov eax, DWORD PTR [rbp-4] sub eax, DWORD PTR [rbp-8] cdqe lea rcx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rcx mov eax, DWORD PTR [rax] mov DWORD PTR [rdx], eax mov eax, DWORD PTR [rbp-8] sub DWORD PTR [rbp-4], eax jmp .L8 .L7: mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rdx, rax mov eax, DWORD PTR [rbp-16] mov DWORD PTR [rdx], eax add DWORD PTR [rbp-12], 1 jmp .L9 .L6: mov eax, DWORD PTR [rbp-8] mov edx, eax shr edx, 31 add eax, edx
  • 5.
    sar eax mov DWORDPTR [rbp-8], eax jmp .L10 .L11: nop pop rbp ret .LC1: .string "Intial Array : " .LC2: .string "Sorted Array : " main: push rbp mov rbp, rsp sub rsp, 48 mov DWORD PTR [rbp-48], 1 mov DWORD PTR [rbp-44], 4 mov DWORD PTR [rbp-40], 16 mov DWORD PTR [rbp-36], 30 mov DWORD PTR [rbp-32], 29 mov DWORD PTR [rbp-28], 18 mov DWORD PTR [rbp-24], 100 mov DWORD PTR [rbp-20], 2 mov DWORD PTR [rbp-16], 43 mov DWORD PTR [rbp-12], 1 mov esi, OFFSET FLAT:.LC1 mov edi, OFFSET FLAT:std::cout call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call print_ar(int*, int) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call shell_sort(int*, int)
  • 6.
    mov esi, OFFSETFLAT:.LC2 mov edi, OFFSET FLAT:std::cout call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call print_ar(int*, int) mov eax, 0 leave ret __static_initialization_and_destruction_0(int, int): push rbp mov rbp, rsp sub rsp, 16 mov DWORD PTR [rbp-4], edi mov DWORD PTR [rbp-8], esi cmp DWORD PTR [rbp-4], 1 jne .L16 cmp DWORD PTR [rbp-8], 65535 jne .L16 mov edi, OFFSET FLAT:std::__ioinit call std::ios_base::Init::Init() mov edx, OFFSET FLAT:__dso_handle mov esi, OFFSET FLAT:std::__ioinit mov edi, OFFSET FLAT:std::ios_base::Init::~Init() call __cxa_atexit .L16: nop leave ret push rbp mov rbp, rsp mov esi, 65535 mov edi, 1 call __static_initialization_and_destruction_0(int, int) pop rbp
  • 7.