I needed the other day some quaternion multiplication code for a certain pet project of mine. A quick Google search later, I came to this one at StackOverlow:
How to multiply two quaternions with minimal instructions?
Also 18 insns, and, according to my rough calculations and timing with `rdtsc` - 18 cycles per quaternion multiplication. In the meantime, I also tweaked the other code a bit, replacing `shufps` with `pshufd`, where possible, reducing it to 16 insns. I'm not entirely sure, which version is faster, but the difference seems insignificant for any application of this code.
Cheers.
#include <pmmintrin.h> /* SSE3 intrinsics */ /* multiplication of two quaternions (x, y, z, w) x (a, b, c, d) */ __m128 _mm_cross4_ps(__m128 xyzw, __m128 abcd) { /* The product of two quaternions is: */ /* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */ __m128 wzyx = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(0,1,2,3)); __m128 baba = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0,1,0,1)); __m128 dcdc = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2,3,2,3)); /* variable names below are for parts of componens of result (X,Y,Z,W) */ /* nX stands for -X and similarly for the other components */ /* znxwy = (xb - ya, zb - wa, wd - zc, yd - xc) */ __m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw, baba), _mm_mul_ps(wzyx, dcdc)); /* xzynw = (xd + yc, zd + wc, wb + za, yb + xa) */ __m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw, dcdc), _mm_mul_ps(wzyx, baba)); /* _mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)) */ /* = (xd + yc, zd + wc, wd - zc, yd - xc) */ /* _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)) */ /* = (zb - wa, xb - ya, yb + xa, wb + za) */ /* _mm_addsub_ps adds elements 1 and 3 and subtracts elements 0 and 2, so we get: */ /* _mm_addsub_ps(*, *) = (xd+yc-zb+wa, xb-ya+zd+wc, wd-zc+yb+xa, yd-xc+wb+za) */ __m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)), _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1))); /* now we only need to shuffle the components in place and return the result */ return _mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0)); /* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions */ }Which is pretty pretty nice, compiling only to 18 instructions. There are some things of note, though:
- the `_mm_shuffle_ps` intrinsic generates the `shufps` insn, which on recent Intel processors has 1 cycle latency and 1 cycle throughput. When swizzling a single register, though, the `_mm_shuffle_epi32` (the `pshufd` insn) seems preferable, because it has one cycle latency, but 0.5 cycles throughput, which, in my understanding, means that in a single cycle two of these insns can be issued to the same port;
- the `_mm_hadd_ps/_mm_hsub_ps` are a bit on the heavy side for addition/subtraction with 5 cycles of latency and 2 cycles throughput, i.e. such insn can be issued every other cycle and it takes 5 cycles until their result is available for subsequent insns. Other addition/subtraction insns (`addps`, `addsubps`, etc) take 3 cycles with 1 cycle of throughput.
#include <immintrin.h> #define _mm_pshufd(r,i) __m128 (_mm_shuffle_epi32 (__m128i (r), i)) /* Nehalem/Westmere/SandyBidge/IvyBridge insn timings. */ __m128 qmul (__m128 abcd, __m128 xyzw) { __m128 t0 = _mm_pshufd (abcd, _MM_SHUFFLE (3, 3, 3, 3)); /* 1, 0.5 */ __m128 t1 = _mm_pshufd (xyzw, _MM_SHUFFLE (2, 3, 0, 1)); /* 1, 0.5 */ __m128 t3 = _mm_pshufd (abcd, _MM_SHUFFLE (0, 0, 0, 0)); /* 1, 0.5 */ __m128 t4 = _mm_pshufd (xyzw, _MM_SHUFFLE (1, 0, 3, 2)); /* 1, 0.5 */ __m128 t5 = _mm_pshufd (abcd, _MM_SHUFFLE (1, 1, 1, 1)); /* 1, 0.5 */ __m128 t6 = _mm_pshufd (xyzw, _MM_SHUFFLE (2, 0, 3, 1)); /* 1, 0.5 */ /* [d,d,d,d]*[z,w,x,y] = [dz,dw,dx,dy] */ __m128 m0 = _mm_mul_ps (t0, t1); /* 5/4, 1 */ /* [a,a,a,a]*[y,x,w,z] = [ay,ax,aw,az]*/ __m128 m1 = _mm_mul_ps (t3, t4); /* 5/4, 1 */ /* [b,b,b,b]*[z,x,w,y] = [bz,bx,bw,by]*/ __m128 m2 = _mm_mul_ps (t5, t6); /* 5/4, 1 */ /* [c,c,c,c]*[w,z,x,y] = [cw,cz,cx,cy] */ __m128 t7 = _mm_pshufd (abcd, _MM_SHUFFLE (2, 2, 2, 2)); /* 1, 0.5 */ __m128 t8 = _mm_pshufd (xyzw, _MM_SHUFFLE (3, 2, 0, 1)); /* 1, 0.5 */ __m128 m3 = _mm_mul_ps (t7, t8); /* 5/4, 1 */ /* 1 */ /* [dz,dw,dx,dy]+-[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az] */ __m128 e = _mm_addsub_ps (m0, m1); /* 3, 1 */ /* 2 */ /* [dx+aw,dz+ay,dy-az,dw-ax] */ e = _mm_pshufd (e, _MM_SHUFFLE (1, 3, 0, 2)); /* 1, 0.5 */ /* [dx+aw,dz+ay,dy-az,dw-ax]+-[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]*/ e = _mm_addsub_ps (e, m2); /* 3, 1 */ /* 2 */ /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] */ e = _mm_pshufd (e, _MM_SHUFFLE (2, 0, 1, 3)); /* 1, 0.5 */ /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]+-[cw,cz,cx,cy] = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy] */ e = _mm_addsub_ps (e, m3); /* 3, 1 */ /* 2 */ /* [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy] */ e = _mm_pshufd (e, _MM_SHUFFLE (2, 3, 1, 0)); /* 1, 0.5 */ return e; }
Also 18 insns, and, according to my rough calculations and timing with `rdtsc` - 18 cycles per quaternion multiplication. In the meantime, I also tweaked the other code a bit, replacing `shufps` with `pshufd`, where possible, reducing it to 16 insns. I'm not entirely sure, which version is faster, but the difference seems insignificant for any application of this code.
Cheers.
2 comments:
It's been so useful for me.
thanks, bro.
Thanks )
Post a Comment