Intel® C++ Compiler 19.0 Developer Guide and Reference
Example 1 demonstrates the efficiency of using a Structure of Arrays (SoA) approach by comparing the assembly generated from a simple SIMD loop using an Array of Structures (AoS) approach with the assembly generated using the SoA approach of SDLT.
Source:
#include <stdio.h>
#include <iostream>  // std::cout / std::endl used in main()

#define N 1024

// Array-of-Structures (AoS) element: the three float members of one element
// are stored next to each other, so the same member of consecutive elements
// is sizeof(RGBTy) bytes apart.
typedef struct RGBs {
    float r;
    float g;
    float b;
} RGBTy;

// Fills a local AoS array of N elements inside an OpenMP SIMD loop and
// prints the members of element 10.
int main()
{
    RGBTy a[N];
    #pragma omp simd
    for (int k = 0; k<N; ++k) {
        a[k].r = k*1.5;  // non-unit stride access
        a[k].g = k*2.5;  // non-unit stride access
        a[k].b = k*3.5;  // non-unit stride access
    }
    std::cout << "k =" << 10 <<
        ", a[k].r =" << a[10].r <<
        ", a[k].g =" << a[10].g <<
        ", a[k].b =" << a[10].b << std::endl;
    return 0;
}
AVX2 assembly generated (69 instructions):
# AoS loop body. Each trip advances the counter in %edx by 16 and the byte
# offset in %rax by 192 (16 elements * 12 bytes), i.e. two 8-float vectors.
# Every result lane is written with its own scalar store, 12 bytes apart.
..TOP_OF_LOOP:
# Convert two vectors of 32-bit integers (presumably the loop indices k) to
# floats, keep the current byte offset in %rcx, and advance both integer
# vectors by %ymm3.
vcvtdq2ps %ymm7, %ymm1
lea (%rax), %rcx
vcvtdq2ps %ymm5, %ymm2
vpaddd %ymm3, %ymm7, %ymm7
vpaddd %ymm3, %ymm5, %ymm5
# Multiply the converted values by %ymm4, %ymm6 and %ymm0 (presumably the
# broadcast constants for r, g and b).
vmulps %ymm1, %ymm4, %ymm8
vmulps %ymm1, %ymm6, %ymm12
vmulps %ymm1, %ymm0, %ymm1
vmulps %ymm2, %ymm6, %ymm14
vmulps %ymm2, %ymm4, %ymm10
# Advance the element counter by 16.
addl $16, %edx
# Store the 8 lanes of %ymm8 one float at a time to offsets 0, 12, ..., 84
# (presumably the r members of the first 8 elements). %ymm8 is reused for a
# new product once its low half has been stored.
vextractf128 $1, %ymm8, %xmm9
vmovss %xmm8, (%rsp,%rcx)
vmovss %xmm9, 48(%rsp,%rcx)
vextractps $1, %xmm8, 12(%rsp,%rcx)
vextractps $2, %xmm8, 24(%rsp,%rcx)
vextractps $3, %xmm8, 36(%rsp,%rcx)
vmulps %ymm2, %ymm0, %ymm8
vextractps $1, %xmm9, 60(%rsp,%rcx)
vextractps $2, %xmm9, 72(%rsp,%rcx)
vextractps $3, %xmm9, 84(%rsp,%rcx)
# Split the remaining products into 128-bit halves for lane-by-lane stores.
vextractf128 $1, %ymm12, %xmm13
vextractf128 $1, %ymm14, %xmm15
vextractf128 $1, %ymm1, %xmm2
vextractf128 $1, %ymm8, %xmm9
# %ymm12 lanes -> offsets 4, 16, ..., 88 (presumably g of the first 8 elements).
vmovss %xmm12, 4(%rsp,%rax)
vmovss %xmm13, 52(%rsp,%rax)
vextractps $1, %xmm12, 16(%rsp,%rax)
vextractps $2, %xmm12, 28(%rsp,%rax)
vextractps $3, %xmm12, 40(%rsp,%rax)
vextractps $1, %xmm13, 64(%rsp,%rax)
vextractps $2, %xmm13, 76(%rsp,%rax)
vextractps $3, %xmm13, 88(%rsp,%rax)
# %ymm14 lanes -> offsets 100, 112, ..., 184 (presumably g of the next 8 elements).
vmovss %xmm14, 100(%rsp,%rax)
vextractps $1, %xmm14, 112(%rsp,%rax)
vextractps $2, %xmm14, 124(%rsp,%rax)
vextractps $3, %xmm14, 136(%rsp,%rax)
vmovss %xmm15, 148(%rsp,%rax)
vextractps $1, %xmm15, 160(%rsp,%rax)
vextractps $2, %xmm15, 172(%rsp,%rax)
vextractps $3, %xmm15, 184(%rsp,%rax)
# %ymm1 lanes -> offsets 8, 20, ..., 92 (presumably b of the first 8 elements).
vmovss %xmm1, 8(%rsp,%rax)
vextractps $1, %xmm1, 20(%rsp,%rax)
vextractps $2, %xmm1, 32(%rsp,%rax)
vextractps $3, %xmm1, 44(%rsp,%rax)
vmovss %xmm2, 56(%rsp,%rax)
vextractps $1, %xmm2, 68(%rsp,%rax)
vextractps $2, %xmm2, 80(%rsp,%rax)
vextractps $3, %xmm2, 92(%rsp,%rax)
# %ymm8 lanes -> offsets 104, 116, ..., 188 (presumably b of the next 8 elements).
vmovss %xmm8, 104(%rsp,%rax)
vextractps $1, %xmm8, 116(%rsp,%rax)
vextractps $2, %xmm8, 128(%rsp,%rax)
vextractps $3, %xmm8, 140(%rsp,%rax)
vmovss %xmm9, 152(%rsp,%rax)
vextractps $1, %xmm9, 164(%rsp,%rax)
vextractps $2, %xmm9, 176(%rsp,%rax)
vextractps $3, %xmm9, 188(%rsp,%rax)
# Advance the byte offset by 192; the stores below still use the old offset
# saved in %rcx.
addq $192, %rax
# %ymm10 lanes -> offsets 96, 108, ..., 180 (presumably r of the next 8 elements).
vextractf128 $1, %ymm10, %xmm11
vmovss %xmm10, 96(%rsp,%rcx)
vmovss %xmm11, 144(%rsp,%rcx)
vextractps $1, %xmm10, 108(%rsp,%rcx)
vextractps $2, %xmm10, 120(%rsp,%rcx)
vextractps $3, %xmm10, 132(%rsp,%rcx)
vextractps $1, %xmm11, 156(%rsp,%rcx)
vextractps $2, %xmm11, 168(%rsp,%rcx)
vextractps $3, %xmm11, 180(%rsp,%rcx)
# Loop while the element counter is below 1024.
cmpl $1024, %edx
jb ..TOP_OF_LOOP
To introduce the use of SDLT, the code below will:
declare a primitive,
use an soa1d_container instead of an array,
use an accessor inside a SIMD loop to generate efficient code,
and use a proxy object’s data member interface to access individual data members of an element inside the container.
Source:
#include <stdio.h>
#include <iostream>  // std::cout / std::endl used in main()
#include <sdlt/sdlt.h>

#define N 1024

// Element type whose members r, g and b are accessed through SDLT below.
typedef struct RGBs {
    float r;
    float g;
    float b;
} RGBTy;

// Declare RGBTy as an SDLT primitive, listing its data members.
SDLT_PRIMITIVE(RGBTy, r, g, b)

// Fills an SDLT soa1d_container of N elements inside an OpenMP SIMD loop
// through an accessor, then prints the members of element 10.
int main()
{
    // Use SDLT to get SOA data layout
    sdlt::soa1d_container<RGBTy> aContainer(N);
    auto a = aContainer.access();

    // use SDLT Data Member Interface to access struct members r, g, and b.
    // achieve unit-stride access after vectorization
    #pragma omp simd
    for (int k = 0; k<N; k++) {
        a[k].r() = k*1.5;
        a[k].g() = k*2.5;
        a[k].b() = k*3.5;
    }
    std::cout << "k =" << 10 <<
        ", a[k].r =" << a[10].r() <<
        ", a[k].g =" << a[10].g() <<
        ", a[k].b =" << a[10].b() << std::endl;
    return 0;
}
AVX2 assembly generated (19 instructions):
# SoA loop body. Each trip advances the element index in %rax by 16, i.e.
# two 8-float vectors. Each product is written with one 32-byte vector store.
..TOP_OF_LOOP:
# %ymm12 = %ymm3 + %ymm4 (presumably the indices of the second group of 8
# elements); convert both integer vectors to floats.
vpaddd %ymm4, %ymm3, %ymm12
vcvtdq2ps %ymm3, %ymm7
vcvtdq2ps %ymm12, %ymm10
# Multiply each converted vector by %ymm2, %ymm1 and %ymm0 (presumably the
# broadcast constants for r, g and b).
vmulps %ymm7, %ymm2, %ymm5
vmulps %ymm7, %ymm1, %ymm6
vmulps %ymm7, %ymm0, %ymm8
vmulps %ymm10, %ymm2, %ymm3
vmulps %ymm10, %ymm1, %ymm9
vmulps %ymm10, %ymm0, %ymm11
# Contiguous stores to three separate base addresses (%r13, %r15, %rbx),
# presumably the r, g and b arrays of the container; the +32 stores cover
# the second group of 8 floats.
vmovups %ymm5, (%r13,%rax,4)
vmovups %ymm6, (%r15,%rax,4)
vmovups %ymm8, (%rbx,%rax,4)
vmovups %ymm3, 32(%r13,%rax,4)
vmovups %ymm9, 32(%r15,%rax,4)
vmovups %ymm11, 32(%rbx,%rax,4)
# Rebuild the integer vector in %ymm3 for the next trip, advance the index
# by 16, and loop while it is below 1024.
vpaddd %ymm4, %ymm12, %ymm3
addq $16, %rax
cmpq $1024, %rax
jb ..TOP_OF_LOOP
Both versions appear to have unrolled the loop by a factor of two (each trip through the loop processes 16 elements, that is, two 8-float AVX2 vectors). When examining the assembly generated for the AVX2 instruction set, we can see a measurable reduction in the number of instructions (19 vs. 69) when we are able to perform unit-stride access using SDLT. Also, at runtime, the soa1d_container aligns its data allocation and so gains any of the architectural advantages that come with using aligned instead of unaligned SIMD stores.