initial implementation for SSE

2014-11-13 23:16:18 +01:00
parent 24a72dbffb
commit 23452bcec5
14 changed files with 904 additions and 70 deletions

.gitignore (vendored, 3 additions)

@@ -1,2 +1,5 @@
NN.kdev4
.kdev4
*.o
*.a
*.nm


@@ -40,6 +40,7 @@ genetics_build:
clean:
@make -C src/Genetics clean
@make -C src/NeuronNetwork clean
@make -C tests clean
#@rm -f ./*.so ./*.a ./*.nm
@rm -f ./lib/*.so ./lib/*.a ./lib/*.nm
@echo "Cleaned....."


@@ -2,7 +2,7 @@ CXX=g++ -m64
CXXFLAGS+= -Wall -Wextra -pedantic -Weffc++ -Wshadow -Wstrict-aliasing -ansi -Woverloaded-virtual -Wdelete-non-virtual-dtor
#CXXFLAGS+=-Werror
CXXFLAGS+= -g
CXXFLAGS+= -O3
CXXFLAGS+= -O3 -msse4.2 -mfpmath=sse -march=native -mtune=native
CXXFLAGS+= -std=c++14
#CXXFLAGS+= -pg -fPIC
CXXFLAGS+= -fPIC -pthread
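
The added flags turn on SSE4.2 code generation and route scalar float math through SSE registers; -march=native additionally ties the binary to the build machine's ISA. A minimal, hypothetical runtime probe (GCC-specific, not part of this commit) to confirm the target CPU actually has SSE4.2 before running such a binary:

#include <cstdio>

int main() {
    // __builtin_cpu_supports is a GCC builtin; it returns non-zero if the
    // running CPU advertises the named feature.
    if (__builtin_cpu_supports("sse4.2"))
        std::printf("SSE4.2 available\n");
    else
        std::printf("no SSE4.2: a -march=native build from this Makefile may fault here\n");
    return 0;
}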


@@ -1,5 +1,5 @@
#include "FeedForwardQuick"
#include <thread>
#include <pmmintrin.h>
using namespace Shin::NeuronNetwork;
@@ -59,10 +59,31 @@ FeedForwardNetworkQuick::~FeedForwardNetworkQuick()
}
}
#define _LOOP(FROM,TO) {}
void FeedForwardNetworkQuick::solvePart(float *newSolution, size_t begin, size_t steps,size_t prevSize, float *sol,size_t layer)
{
register size_t end=begin+steps;
for( size_t j=begin;j<end;j++)
{
newSolution[j]=sol[0]*weights[layer][j][0];
for(register size_t k=1;k<prevSize;k++)
{
if(layer==0)
{
newSolution[j]+=sol[k]*weights[layer][j][k];
}else
{
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[layer][j][k];
}
}
}
}
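
For reference, a hedged reading of solvePart: for every output neuron j of the layer it accumulates

    s_j = \sum_k \sigma(x_k)\, w_{jk}, \qquad \sigma(x) = \frac{1}{1 + e^{-\lambda x}},

where the x_k are the previous layer's sums; for the input layer (layer == 0) the raw x_k are used instead of \sigma(x_k). The SSE path below vectorizes exactly this inner sum, four floats at a time.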
Solution FeedForwardNetworkQuick::solve(const Problem& p)
{
std::vector<bool> solution(p);
register double* sol=sums[0];//new bool[solution.size()];
register float* sol=sums[0];//new bool[solution.size()];
for(size_t i=0;i<solution.size();i++)
{
@@ -72,8 +93,8 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
register size_t prevSize=layerSizes[0];
for(register size_t i=0;i<layers;i++)
{
double* newSolution= sums[i+1];//new bool[layerSizes[i]];
if(threads > 1 && layerSizes[i] > 600)
float* newSolution= sums[i+1];//new bool[layerSizes[i]];
if(threads > 1 && layerSizes[i] > 600) // 600 is a guess at the layer size above which spawning threads gives a speedup
{
std::vector<std::thread> th;
size_t s=1;
@@ -85,22 +106,54 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
if(s>=layerSizes[i])
break;
th.push_back(std::thread([i,this,newSolution,prevSize,sol](size_t from, size_t to)->void{
_LOOP(from,to<(from+4)?to:(from+4));
for( size_t j=from;j<to;j++)
register size_t max= (int)(to-4) < 0?0:(to-4);
for( size_t j=from+4;j<max;j+=4)
{
newSolution[j]=sol[0]*weights[i][j][0];
register size_t k;
for(k=1;k<prevSize;k++)
newSolution[j+1]=sol[0]*weights[i][j+1][0];
newSolution[j+2]=sol[0]*weights[i][j+2][0];
newSolution[j+3]=sol[0]*weights[i][j+3][0];
__m128 partialSolution = _mm_load_ps(newSolution+j);
register size_t upper_limit=prevSize-4;
for(register size_t k=from;k< 4 && k <prevSize;k++)
{
if(i==0)
{
newSolution[j]+=sol[k]*weights[i][j][k];
}else
{
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
for(register size_t k=4;k<upper_limit;k+=4)
{
__m128 w = _mm_loadr_ps((this->weights[i][j])+k);
__m128 sols = _mm_loadr_ps(sol+k);
if(i!=0)
{
__m128 tmp = _mm_set1_ps(-lambda);
sols=_mm_mul_ps(tmp,sols); //-lambda*sol[k]
sols=exp_ps(sols); //exp(sols)
tmp = _mm_set1_ps(1.0);
sols= _mm_add_ps(sols,tmp); //1+exp()
sols= _mm_div_ps(tmp,sols);//1/....
}
w=_mm_mul_ps(w,sols);
partialSolution=_mm_add_ps(partialSolution,w);
}
for(register size_t k=upper_limit;k<prevSize;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
}
if(max!=0)
_LOOP(max,to);
},s,t==threads?layerSizes[i]:s+step));//{}
s+=step;
}
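
The chunking above hands each thread a contiguous row range [s, s+step) of the layer and joins them all before moving to the next layer. A condensed sketch of the same scheme (hypothetical helper, not in this commit):

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Split rows [begin, end) into roughly equal chunks, run fn(from, to) on each
// chunk in its own thread, and join them all before returning.
template <typename Fn>
void for_each_chunk(std::size_t begin, std::size_t end, unsigned threads, Fn fn) {
    if (threads == 0) threads = 1;
    std::vector<std::thread> pool;
    std::size_t step = (end - begin) / threads + 1;
    for (std::size_t s = begin; s < end; s += step)
        pool.emplace_back(fn, s, std::min(s + step, end));
    for (auto &t : pool) t.join();
}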
@@ -109,20 +162,55 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
thr.join();
}else
{
for( size_t j=1;j<layerSizes[i];j++)
if(1)
{
newSolution[j]=sol[0]*weights[i][j][0];
register size_t k;
for(k=1;k<prevSize;k++)
solvePart(newSolution,1,layerSizes[i]-1,prevSize,sol,i);
}else
{
_LOOP(1,layerSizes[i]<4?layerSizes[i]:4);
register size_t max= (int)(layerSizes[i]-4) < 0?0:(layerSizes[i]-4);
for( size_t j=4;j<max;j=j+4)
{
if(i==0)
newSolution[j]=sol[0]*weights[i][j][0];
newSolution[j+1]=sol[0]*weights[i][j+1][0];
newSolution[j+2]=sol[0]*weights[i][j+2][0];
newSolution[j+3]=sol[0]*weights[i][j+3][0];
__m128 partialSolution = _mm_load_ps(newSolution+j);
register size_t upper_limit=prevSize-prevSize%4;
for(register size_t k=1;k<prevSize && k <4;k++)
{
newSolution[j]+=sol[k]*weights[i][j][k];
}else
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
for(register size_t k=4;k<upper_limit;k+=4)
{
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
__m128 w = _mm_loadr_ps((this->weights[i][j])+k);
__m128 sols = _mm_loadr_ps(sol+k);
if(i!=0)
{
__m128 tmp = _mm_set1_ps(-lambda);
sols=_mm_mul_ps(tmp,sols); //-lambda*sol[k]
sols=exp_ps(sols); //exp(sols)
tmp = _mm_set1_ps(1.0);
sols= _mm_add_ps(sols,tmp); //1+exp()
sols= _mm_div_ps(tmp,sols);//1/....
}
w=_mm_mul_ps(w,sols);
partialSolution=_mm_add_ps(partialSolution,w);
}
for(register size_t k=upper_limit;k<prevSize;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
}
if(max!=0)
_LOOP(max,layerSizes[i]);
}
}
prevSize=layerSizes[i];
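
The heart of both SSE branches is the four-wide sigmoid accumulation: load 4 weights and 4 previous-layer sums, form 1/(1+exp_ps(-lambda*x)) in one shot, multiply, and add into a partial vector. A self-contained sketch of that idea (hypothetical helper: it uses unaligned _mm_loadu_ps where the code above uses the aligned, order-reversing _mm_loadr_ps, which is harmless for a dot product since summation order does not matter, and it applies the sigmoid unconditionally, i.e. the i != 0 case):

#include <cmath>
#include <cstddef>
#include <xmmintrin.h>
#include "sse_mathfun.h"

static float dot_sigmoid(const float *w, const float *x, std::size_t n, float lambda) {
    __m128 acc = _mm_setzero_ps();
    std::size_t k = 0;
    for (; k + 4 <= n; k += 4) {
        __m128 ws  = _mm_loadu_ps(w + k);                          // 4 weights
        __m128 xs  = _mm_loadu_ps(x + k);                          // 4 previous-layer sums
        __m128 e   = exp_ps(_mm_mul_ps(_mm_set1_ps(-lambda), xs)); // exp(-lambda*x)
        __m128 sig = _mm_div_ps(_mm_set1_ps(1.0f),
                                _mm_add_ps(_mm_set1_ps(1.0f), e)); // 1/(1+exp(..))
        acc = _mm_add_ps(acc, _mm_mul_ps(ws, sig));                // accumulate 4 products
    }
    float lane[4];
    _mm_storeu_ps(lane, acc);
    float sum = lane[0] + lane[1] + lane[2] + lane[3];             // horizontal add
    for (; k < n; ++k)                                             // scalar tail
        sum += w[k] * (1.0f / (1.0f + std::exp(-lambda * x[k])));
    return sum;
}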


@@ -8,10 +8,17 @@
#include <vector>
#include <initializer_list>
#include <thread>
#include <iostream>
#include <math.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include "../sse_mathfun.h"
#define LAMBDA 0.8
namespace Shin
@@ -24,20 +31,20 @@ namespace NeuronNetwork
FFNeuron() = delete;
FFNeuron(const FFNeuron&) = delete;
FFNeuron& operator=(const FFNeuron&) = delete;
FFNeuron(double *pot, double *w, double*s,double lam):potential(pot),weights(w),sum(s),lambda(lam) { }
FFNeuron(float *pot, float *w, float*s,float lam):potential(pot),weights(w),sum(s),lambda(lam) { }
double getPotential() {return *potential;}
float getPotential() {return *potential;}
void setPotential(double p) { *potential=p;}
double getWeight(unsigned int i ) { return weights[i];}
void setWeight(unsigned int i,double p) { weights[i]=p; }
inline double output() const { return 1.0/(1.0+(exp(-lambda*input()))); }
inline double input() const { return *sum; }
inline double derivatedOutput() const { return lambda*output()*(1.0-output()); }
float getWeight(unsigned int i ) { return weights[i];}
void setWeight(unsigned int i,float p) { weights[i]=p; }
inline float output() const { return 1.0/(1.0+(exp(-lambda*input()))); }
inline float input() const { return *sum; }
inline float derivatedOutput() const { return lambda*output()*(1.0-output()); }
protected:
double *potential;
double *weights;
double *sum;
double lambda;
float *potential;
float *weights;
float *sum;
float lambda;
private:
};
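
The output()/derivatedOutput() pair above encodes the usual logistic identity, which is what keeps backpropagation cheap here: the derivative is recovered from the already-computed output,

    \sigma(x) = \frac{1}{1 + e^{-\lambda x}}
    \quad\Rightarrow\quad
    \sigma'(x) = \lambda\,\sigma(x)\,\bigl(1 - \sigma(x)\bigr).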
@@ -46,17 +53,17 @@ namespace NeuronNetwork
public:
FFLayer(const FFLayer &) =delete;
FFLayer operator=(const FFLayer &) = delete;
FFLayer(size_t s, double *p,double **w,double *su,double lam): neurons(nullptr),layerSize(s),potentials(p),weights(w),sums(su),lambda(lam) {}
FFLayer(size_t s, float *p,float **w,float *su,float lam): neurons(nullptr),layerSize(s),potentials(p),weights(w),sums(su),lambda(lam) {}
~FFLayer();
FFNeuron* operator[](int neuron);
size_t size() const {return layerSize;};
protected:
FFNeuron **neurons;
size_t layerSize;
double *potentials;
double **weights;
double *sums;
double lambda;
float *potentials;
float **weights;
float *sums;
float lambda;
};
class FeedForwardNetworkQuick:public ACyclicNetwork
@@ -64,12 +71,13 @@ namespace NeuronNetwork
public:
FeedForwardNetworkQuick(const FeedForwardNetworkQuick &f) = delete; //TODO
FeedForwardNetworkQuick operator=(const FeedForwardNetworkQuick &f)=delete;
template<typename... Args>inline FeedForwardNetworkQuick(std::initializer_list<int> s, double lam=LAMBDA):ffLayers(nullptr),weights(nullptr),potentials(nullptr),sums(nullptr),layerSizes(nullptr),layers(s.size()),lambda(lam)
template<typename... Args>inline FeedForwardNetworkQuick(std::initializer_list<int> s, double lam=LAMBDA):ffLayers(nullptr),weights(nullptr),potentials(nullptr),sums(nullptr),inputs(nullptr),layerSizes(nullptr),layers(s.size()),lambda(lam)
{
weights= new double**[s.size()];
potentials= new double*[s.size()];
weights= new float**[s.size()];
potentials= new float*[s.size()];
layerSizes= new size_t[s.size()];
sums= new double*[s.size()+1];
sums= new float*[s.size()+1];
inputs= new float*[s.size()+1];
int i=0;
int prev_size=1;
for(int layeSize:s) // TODO rename
@@ -78,22 +86,23 @@ namespace NeuronNetwork
if(i==0)
{
prev_size=layeSize;
sums[0]= new double[layeSize];
sums[0]= new float[layeSize];
sums[0][0]=1.0;
}
layerSizes[i]=layeSize;
weights[i]= new double*[layeSize];
potentials[i]= new double[layeSize];
sums[i+1]= new double[layeSize];
weights[i]= new float*[layeSize];
potentials[i]= new float[layeSize];
sums[i+1]= new float[layeSize];
inputs[i+1]= new float[layeSize];
potentials[i][0]=1.0;
sums[i+1][0]=1.0;
for (int j=1;j<layeSize;j++)
{
potentials[i][j]=1.0;
weights[i][j]= new double[prev_size];
weights[i][j]= new float[prev_size];
for(int k=0;k<prev_size;k++)
{
weights[i][j][k]=1.0-((double)(rand()%2001))/1000.0;
weights[i][j][k]=1.0-((float)(rand()%2001))/1000.0;
}
}
i++;
@@ -106,16 +115,18 @@ namespace NeuronNetwork
FFLayer* operator[](int l);
void setThreads(unsigned t) {threads=t;}
protected:
void solvePart(float *newSolution, size_t begin, size_t steps,size_t prevSize, float* sol,size_t layer);
private:
FFLayer **ffLayers;
double ***weights;
double **potentials;
float ***weights;
float **potentials;
float **sums;
float **inputs;
public:
double **sums;
private:
size_t *layerSizes;
size_t layers;
double lambda;
float lambda;
unsigned threads=1;
};
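
A note on the initialization above: weights[i][j][k] = 1.0 - (rand()%2001)/1000.0 draws each starting weight from the 2001 evenly spaced values

    w \in \{-1.000, -0.999, \dots, +1.000\},

i.e. roughly uniform on [-1, 1] with a 0.001 step (modulo the usual rand() modulo bias).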


@@ -8,11 +8,11 @@ Shin::NeuronNetwork::Learning::BackPropagation::BackPropagation(FeedForwardNetwo
void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::NeuronNetwork::Solution& expectation)
{
double **deltas;
deltas=new double*[network.size()];
float **deltas;
deltas=new float*[network.size()];
for(int i=(int)network.size()-1;i>=0;i--)
{
deltas[i]=new double[network[i]->size()];
deltas[i]=new float[network[i]->size()];
deltas[i][0]=0.0;
if(i==(int)network.size()-1)
{
@@ -37,7 +37,7 @@ void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::Neuro
th.push_back(std::thread([&i,this,&deltas](size_t from, size_t to)->void{
for(size_t j=from;j<to;j++)
{
register double deltasWeight = 0;
register float deltasWeight = 0;
for(size_t k=1;k<this->network[i+1]->size();k++)
{
deltasWeight+=deltas[i+1][k]*this->network[i+1]->operator[](k)->getWeight(j);
@@ -53,7 +53,7 @@ void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::Neuro
{
for(size_t j=0;j<network[i]->size();j++)
{
register double deltasWeight = 0;
register float deltasWeight = 0;
for(size_t k=1;k<this->network[i+1]->size();k++)
{
deltasWeight+=deltas[i+1][k]*this->network[i+1]->operator[](k)->getWeight(j);
@@ -79,7 +79,7 @@ void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::Neuro
{
network[i]->operator[](j)->setWeight(k,
network[i]->operator[](j)->getWeight(k)+learningCoeficient* deltas[i][j]*
(i==0? network.sums[0][k]:(double)network[i-1]->operator[](k)->output()));
(i==0? network.sums[0][k]:(float)network[i-1]->operator[](k)->output()));
}
}
}
@@ -91,7 +91,7 @@ void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::Neuro
}
double Shin::NeuronNetwork::Learning::BackPropagation::teach(const Shin::NeuronNetwork::Problem& p, const Shin::NeuronNetwork::Solution& solution)
float Shin::NeuronNetwork::Learning::BackPropagation::teach(const Shin::NeuronNetwork::Problem& p, const Shin::NeuronNetwork::Solution& solution)
{
Shin::NeuronNetwork::Solution a=network.solve(p);
double error=calculateError(solution,a);
@@ -115,7 +115,7 @@ double Shin::NeuronNetwork::Learning::BackPropagation::teach(const Shin::NeuronN
}
void Shin::NeuronNetwork::Learning::BackPropagation::setLearningCoeficient(double c)
void Shin::NeuronNetwork::Learning::BackPropagation::setLearningCoeficient(float c)
{
learningCoeficient=c;
}
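
For orientation, the propagate/teach pair above follows the textbook gradient-descent rule; in the notation used here (a hedged reconstruction from the loops above), hidden-layer deltas and the weight update are

    \delta_j^{(l)} = \sigma'\!\bigl(x_j^{(l)}\bigr) \sum_k \delta_k^{(l+1)} w_{kj}^{(l+1)},
    \qquad
    w_{jk} \leftarrow w_{jk} + \eta\, \delta_j\, o_k,

with \eta the learningCoeficient and o_k the previous layer's output (or the raw input for the first layer, matching the i==0 branch).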


@@ -31,14 +31,14 @@ namespace Learning
public:
BackPropagation(FeedForwardNetworkQuick &n);
virtual void propagate(const Shin::NeuronNetwork::Solution& expectation);
double teach(const Shin::NeuronNetwork::Problem &p,const Solution &solution);
float teach(const Shin::NeuronNetwork::Problem &p,const Solution &solution);
void setLearningCoeficient (double);
void setLearningCoeficient (float);
void allowEntropy() {entropy=1;}
void setEntropySize(int milipercents) { entropySize=milipercents; }
inline void allowThreading() {allowThreads=1; }
protected:
double learningCoeficient=0.4;
float learningCoeficient=0.4;
bool entropy=0;
bool allowThreads=0;
int entropySize=500;


@@ -5,9 +5,9 @@ Shin::NeuronNetwork::Learning::Supervised::Supervised(Shin::NeuronNetwork::FeedF
}
double Shin::NeuronNetwork::Learning::Supervised::calculateError(const Shin::NeuronNetwork::Solution& expectation, const Shin::NeuronNetwork::Solution& solution)
float Shin::NeuronNetwork::Learning::Supervised::calculateError(const Shin::NeuronNetwork::Solution& expectation, const Shin::NeuronNetwork::Solution& solution)
{
register double a=0;
register float a=0;
for (size_t i=0;i<expectation.size();i++)
{
a+=pow(expectation[i]-solution[i],2)/2;
@@ -15,7 +15,7 @@ double Shin::NeuronNetwork::Learning::Supervised::calculateError(const Shin::Neu
return a;
}
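
calculateError above is the standard half squared error over the output vector:

    E(t, y) = \frac{1}{2} \sum_i \bigl(t_i - y_i\bigr)^2.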
double Shin::NeuronNetwork::Learning::Supervised::teachSet(std::vector< Shin::NeuronNetwork::Problem* >& p, std::vector< Shin::NeuronNetwork::Solution* >& solution)
float Shin::NeuronNetwork::Learning::Supervised::teachSet(std::vector< Shin::NeuronNetwork::Problem* >& p, std::vector< Shin::NeuronNetwork::Solution* >& solution)
{
double error=0;
for (register size_t i=0;i<p.size();i++)


@@ -19,9 +19,9 @@ namespace Learning
Supervised() =delete;
Supervised(FeedForwardNetworkQuick &n);
virtual ~Supervised() {};
double calculateError(const Solution &expectation,const Solution &solution);
virtual double teach(const Shin::NeuronNetwork::Problem &p,const Solution &solution)=0;
double teachSet(std::vector<Shin::NeuronNetwork::Problem*> &p,std::vector<Shin::NeuronNetwork::Solution*> &solution);
float calculateError(const Solution &expectation,const Solution &solution);
virtual float teach(const Shin::NeuronNetwork::Problem &p,const Solution &solution)=0;
float teachSet(std::vector<Shin::NeuronNetwork::Problem*> &p,std::vector<Shin::NeuronNetwork::Solution*> &solution);
void debugOn();
void debugOff();
protected:


@@ -3,6 +3,8 @@ OBJFILES= Neuron.o Network.o FeedForward.o FeedForwardQuick.o\
Learning/Unsupervised.o Learning/Reinforcement.o\
Solution.o Problem.o
LINKFILES= ../sse_mathfun.o
LIBNAME=NeuronNetwork
include ../../Makefile.const
@@ -12,11 +14,11 @@ all: lib
lib: $(LIBNAME).so $(LIBNAME).a
$(LIBNAME).so: $(OBJFILES)
$(CXX) -shared $(CXXFLAGS) $(OBJFILES) -o $(LIBNAME).so
$(CXX) -shared $(CXXFLAGS) $(OBJFILES) $(LINKFILES) -o $(LIBNAME).so
$(LIBNAME).a: $(OBJFILES)
rm -f $(LIBNAME).a # create new library
ar rcv $(LIBNAME).a $(OBJFILES)
ar rcv $(LIBNAME).a $(OBJFILES) $(LINKFILES)
ranlib $(LIBNAME).a
nm --demangle $(LIBNAME).a > $(LIBNAME).nm

src/sse_mathfun.cpp (new file, 664 lines)

@@ -0,0 +1,664 @@
#include "./sse_mathfun.h"
#include <xmmintrin.h>
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
/* natural logarithm computed for 4 simultaneous floats
return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
v4si emm0;
#else
v2si mm0, mm1;
#endif
v4sf one = *(v4sf*)_ps_1;
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
#ifndef USE_SSE2
/* part 1: x = frexpf(x, &e); */
COPY_XMM_TO_MM(x, mm0, mm1);
mm0 = _mm_srli_pi32(mm0, 23);
mm1 = _mm_srli_pi32(mm1, 23);
#else
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
/* keep only the fractional part */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
#ifndef USE_SSE2
/* now e=mm0:mm1 contain the really base-2 exponent */
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
_mm_empty(); /* bye bye mmx */
#else
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
v4sf e = _mm_cvtepi32_ps(emm0);
#endif
e = _mm_add_ps(e, one);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
v4sf tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);
v4sf z = _mm_mul_ps(x,x);
v4sf y = *(v4sf*)_ps_cephes_log_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
v4sf exp_ps(v4sf x) {
v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
v4si emm0;
#else
v2si mm0, mm1;
#endif
v4sf one = *(v4sf*)_ps_1;
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
/* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
/* step 1 : cast to int */
tmp = _mm_movehl_ps(tmp, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(tmp);
/* step 2 : cast back to float */
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
#endif
/* if greater, subtract 1 */
v4sf mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x,x);
v4sf y = *(v4sf*)_ps_cephes_exp_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);
/* build 2^n */
#ifndef USE_SSE2
z = _mm_movehl_ps(z, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(z);
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
mm0 = _mm_slli_pi32(mm0, 23);
mm1 = _mm_slli_pi32(mm1, 23);
v4sf pow2n;
COPY_MM_TO_XMM(mm0, mm1, pow2n);
_mm_empty();
#else
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
y = _mm_mul_ps(y, pow2n);
return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
it also runs on old Athlon XPs and the Pentium III of your
grandmother.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..
On my core 1 duo, the execution of this function takes approximately 95 cycles.
From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
v4si emm0, emm2;
#else
v2si mm0, mm1, mm2, mm3;
#endif
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
/* get the swap sign flag */
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4<x<=Pi/2
Both branches will be computed.
*/
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
/* get the swap sign flag */
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
/* get the polynom selection mask */
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf swap_sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
v4si emm0, emm2;
#else
v2si mm0, mm1, mm2, mm3;
#endif
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
/* get the swap sign flag */
emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask */
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
/* get the swap sign flag in mm0:mm1 and the
polynom selection mask in mm2:mm3 */
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
v4si emm0, emm2, emm4;
#else
v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynom selection mask for the sine*/
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm2:mm3 */
xmm3 = _mm_movehl_ps(xmm3, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm3);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm4 = mm2;
mm5 = mm3;
/* get the swap sign flag for the sine */
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
v4sf swap_sign_bit_sin;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
/* get the polynom selection mask for the sine */
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf poly_mask;
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
/* get the sign flag for the cosine */
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
mm4 = _mm_slli_pi32(mm4, 29);
mm5 = _mm_slli_pi32(mm5, 29);
v4sf sign_bit_cos;
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
_mm_empty(); /* good-bye mmx */
#endif
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v4sf z = _mm_mul_ps(x,x);
y = *(v4sf*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
v4sf ysin2 = _mm_and_ps(xmm3, y2);
v4sf ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
/* update the sign */
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}
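
A quick, hypothetical sanity check for the function this commit actually leans on, exp_ps, comparing one 4-lane evaluation against libm (not part of the commit):

#include <cmath>
#include <cstdio>
#include <xmmintrin.h>
#include "sse_mathfun.h"

int main() {
    float in[4] = {-1.0f, 0.0f, 0.5f, 2.0f};
    float out[4];
    _mm_storeu_ps(out, exp_ps(_mm_loadu_ps(in)));  // 4 exponentials in one call
    for (int i = 0; i < 4; ++i)
        std::printf("exp_ps(%g) = %.7f   libm exp = %.7f\n",
                    (double)in[i], (double)out[i], std::exp((double)in[i]));
    return 0;
}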

src/sse_mathfun.h (new file, 63 lines)

@@ -0,0 +1,63 @@
#ifndef _SSE_MATH_FUN_
#define _SSE_MATH_FUN_
#include <xmmintrin.h>
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
/* natural logarithm computed for 4 simultaneous floats
return NaN for x <= 0
*/
v4sf log_ps(v4sf x);
#ifndef USE_SSE2
typedef union xmm_mm_union {
__m128 xmm;
__m64 mm[2];
} xmm_mm_union;
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
xmm_mm_union u; u.xmm = xmm_; \
mm0_ = u.mm[0]; \
mm1_ = u.mm[1]; \
}
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
v4sf exp_ps(v4sf x);
v4sf sin_ps(v4sf x);
/* almost the same as sin_ps */
v4sf cos_ps(v4sf x);
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c);
#endif


@@ -17,7 +17,6 @@ CXXFLAGS += -I$(LIB_DIR)
all:| lib $(ALL_TESTS);
gen: $(GEN_TESTS)
test: all
@@ -31,3 +30,6 @@ nn-%: nn-%.cpp $(LIB_DIR)/NeuronNetwork.a
lib:
make -C ../
clean:
@for i in $(ALL_TESTS);do rm -f $$i;done;


@@ -31,7 +31,7 @@ int main(int argc)
s.push_back(Shin::NeuronNetwork::Solution(std::vector<double>({0})));
p.push_back(X(std::vector<bool>({1})));
Shin::NeuronNetwork::FeedForwardNetworkQuick q({1,5000,5000,5000,500,500,500,500});
Shin::NeuronNetwork::FeedForwardNetworkQuick q({1,5000,5000,5000,5000});
Shin::NeuronNetwork::Learning::BackPropagation b(q);
if(argc > 1)
{