Un code plus complet

Le code source de cette section illustre les différentes façons de manipuler un registre 128 bits en le considérant comme un vecteur composé de mots de 64 bits, 32 bits, 16 bits ou 8 bits.

Afin de gérer l’affichage en fonction de l’interprétation du registre de 128 bits, une solution élégante est de déclarer un nouveau type __reg128 en utilisant une union.

/*
 * One 128-bit SIMD register seen through several element widths.
 * Every member overlays the same storage, so a value written through one
 * member can be read back through any other.
 *
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation in C; a name such as reg128_t would be safer.
 */
typedef union
{
  __m128i vec128;     /* the whole register, as taken by the _mm_* intrinsics */
  uint64_t vec64[2];  /* 2 elements of 64 bits */
  uint32_t vec32[4];  /* 4 elements of 32 bits */
  uint16_t vec16[8];  /* 8 elements of 16 bits */
  uint8_t vec8[16];   /* 16 elements of 8 bits */
} __reg128;

Ce type permet de définir une fonction print128, dont le paramètre mode (8, 16, 32 ou 64) indique la largeur en bits des éléments à afficher :

/*
 * Print the 128-bit value A on stdout as "<s>=(e0,e1,...,eN)" followed by a
 * newline, each element being shown as an unsigned decimal number.
 *
 * s    : label printed before the '=' sign; it is only read.
 * mode : width in bits of the elements A is split into: 8, 16, 32 or 64.
 *        Any other value prints a diagnostic on stderr and nothing on stdout.
 * A    : the register to display, passed by value.
 */
void print128(const char *s, int mode, __reg128 A)
{
  switch(mode)
  {
      case 8 :
            printf("%s=(",s);
            /* the last element is printed apart so that it is followed by ')' */
            for(int i = 0; i < 15; i++)
            {
                printf("%3u,",(unsigned)A.vec8[i]);
            }
            printf("%3u)\n",(unsigned)A.vec8[15]);
            break;
      case 16 :
            printf("%s=(",s);
            for(int i = 0; i < 7; i++)
            {
                printf("%3u,",(unsigned)A.vec16[i]);
            }
            printf("%3u)\n",(unsigned)A.vec16[7]);
            break;
      case 32 :
            printf("%s=(",s);
            for(int i = 0; i < 3; i++)
            {
                printf("%3u,",(unsigned)A.vec32[i]);
            }
            printf("%3u)\n",(unsigned)A.vec32[3]);
            break;
      case 64 :
            /* uint64_t is not unsigned long on every platform, so convert
               explicitly to the type that %llu expects */
            printf("%s=(%3llu,%3llu)\n",s,
                   (unsigned long long)A.vec64[0],
                   (unsigned long long)A.vec64[1]);
            break;
      default :
            /* report an unknown width instead of silently printing nothing */
            fprintf(stderr,"print128: unsupported element width %d\n",mode);
            break;
  }
}

Le code final :

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/*
 * One 128-bit SIMD register seen through several element widths.
 * Every member overlays the same storage, so a value written through one
 * member can be read back through any other.
 *
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation in C; a name such as reg128_t would be safer.
 */
typedef union
{
   __m128i vec128;     /* the whole register, as taken by the _mm_* intrinsics */
   uint64_t vec64[2];  /* 2 elements of 64 bits */
   uint32_t vec32[4];  /* 4 elements of 32 bits */
   uint16_t vec16[8];  /* 8 elements of 16 bits */
   uint8_t vec8[16];   /* 16 elements of 8 bits */
} __reg128;

/*
 * Print the 128-bit value A on stdout as "<s>=(e0,e1,...,eN)" followed by a
 * newline, each element being shown as an unsigned decimal number.
 * The field width grows with the element size (3, 5, 10 and 20 characters)
 * so that the largest value of each element type fits in its column.
 *
 * s    : label printed before the '=' sign; it is only read.
 * mode : width in bits of the elements A is split into: 8, 16, 32 or 64.
 *        Any other value prints a diagnostic on stderr and nothing on stdout.
 * A    : the register to display, passed by value.
 */
void print128(const char *s, int mode, __reg128 A)
{
  switch(mode)
  {
      case 8 :
            printf("%s=(",s);
            /* the last element is printed apart so that it is followed by ')' */
            for(int i = 0; i < 15; i++)
            {
                printf("%3u,",(unsigned)A.vec8[i]);
            }
            printf("%3u)\n",(unsigned)A.vec8[15]);
            break;
      case 16 :
            printf("%s=(",s);
            for(int i = 0; i < 7; i++)
            {
                printf("%5u,",(unsigned)A.vec16[i]);
            }
            printf("%5u)\n",(unsigned)A.vec16[7]);
            break;
      case 32 :
            printf("%s=(",s);
            for(int i = 0; i < 3; i++)
            {
                printf("%10u,",(unsigned)A.vec32[i]);
            }
            printf("%10u)\n",(unsigned)A.vec32[3]);
            break;
      case 64 :
            /* uint64_t is not unsigned long on every platform, so convert
               explicitly to the type that %llu expects */
            printf("%s=(%20llu,%20llu)\n",s,
                   (unsigned long long)A.vec64[0],
                   (unsigned long long)A.vec64[1]);
            break;
      default :
            /* report an unknown width instead of silently printing nothing */
            fprintf(stderr,"print128: unsupported element width %d\n",mode);
            break;
  }
}

/*
 * Demonstration driver: fills two registers with test values, adds them with
 * the SSE2 integer addition matching each element width (8, 16, 32 and
 * 64 bits) and prints both operands and the sum with print128.
 */
int main(void)
{
    __reg128 V1, V2, S;

    /* 16 elements of 8 bits: V1 = 1..16, V2 = twice V1 */
    for (int k = 0; k < 16; ++k) {
        const int base = k + 1;

        V1.vec8[k] = base;
        V2.vec8[k] = 2 * base;
    }
    S.vec128 = _mm_add_epi8(V1.vec128, V2.vec128);
    print128("V1", 8, V1);
    print128("V2", 8, V2);
    print128("S ", 8, S);

    /* 8 elements of 16 bits */
    for (int k = 0; k < 8; ++k) {
        const int base = 1500 * k + 1;

        V1.vec16[k] = base;
        V2.vec16[k] = 2 * base;
    }
    S.vec128 = _mm_add_epi16(V1.vec128, V2.vec128);
    print128("\nV1", 16, V1);   /* the leading newline separates the groups */
    print128("V2", 16, V2);
    print128("S ", 16, S);

    /* 4 elements of 32 bits */
    for (int k = 0; k < 4; ++k) {
        const int base = 107370000 * k + 1;

        V1.vec32[k] = base;
        V2.vec32[k] = 2 * base;
    }
    S.vec128 = _mm_add_epi32(V1.vec128, V2.vec128);
    print128("\nV1", 32, V1);
    print128("V2", 32, V2);
    print128("S ", 32, S);

    /* 2 elements of 64 bits, set by hand */
    V1.vec64[0] = 1152921504606846976;
    V1.vec64[1] = 144115188075855875;
    V2.vec64[0] = 100;
    V2.vec64[1] = 60000000000000000;
    S.vec128 = _mm_add_epi64(V1.vec128, V2.vec128);
    print128("\nV1", 64, V1);
    print128("V2", 64, V2);
    print128("S ", 64, S);

    return 0;
}

et le résultat d’exécution :

V1=(  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16)
V2=(  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32)
S =(  3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48)

V1=(    1, 1501, 3001, 4501, 6001, 7501, 9001,10501)
V2=(    2, 3002, 6002, 9002,12002,15002,18002,21002)
S =(    3, 4503, 9003,13503,18003,22503,27003,31503)

V1=(         1, 107370001, 214740001, 322110001)
V2=(         2, 214740002, 429480002, 644220002)
S =(         3, 322110003, 644220003, 966330003)

V1=( 1152921504606846976,  144115188075855875)
V2=(                 100,   60000000000000000)
S =( 1152921504606847076,  204115188075855875)