#ifndef _SSE_MATH_FUN_ #define _SSE_MATH_FUN_ #include /* yes I know, the top of this file is quite ugly */ #ifdef _MSC_VER /* visual c++ */ # define ALIGN16_BEG __declspec(align(16)) # define ALIGN16_END #else /* gcc or icc */ # define ALIGN16_BEG # define ALIGN16_END __attribute__((aligned(16))) #endif /* __m128 is ugly to write */ typedef __m128 v4sf; // vector of 4 float (sse1) #ifdef USE_SSE2 # include typedef __m128i v4si; // vector of 4 int (sse2) #else typedef __m64 v2si; // vector of 2 int (mmx) #endif /* natural logarithm computed for 4 simultaneous float return NaN for x <= 0 */ v4sf log_ps(v4sf x); #ifndef USE_SSE2 typedef union xmm_mm_union { __m128 xmm; __m64 mm[2]; } xmm_mm_union; #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ xmm_mm_union u; u.xmm = xmm_; \ mm0_ = u.mm[0]; \ mm1_ = u.mm[1]; \ } #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ } #endif // USE_SSE2 v4sf exp_ps(v4sf x); v4sf sin_ps(v4sf x); /* almost the same as sin_ps */ v4sf cos_ps(v4sf x); /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos_ps(v4sf x, v4sf *s, v4sf *c); #endif