SSE4.2-ready FFQ; BackPropagation changed to fit the new FeedForward

2014-11-15 00:52:12 +01:00
parent 23452bcec5
commit dddf96da5b
5 changed files with 76 additions and 140 deletions

View File

@@ -22,7 +22,7 @@ FFNeuron* FFLayer::operator[](int neuron)
neurons=new FFNeuron*[layerSize];
for(size_t i=0;i<layerSize;i++)
{
neurons[i]=new FFNeuron(&potentials[i],weights[i],&sums[i],lambda);
neurons[i]=new FFNeuron(potentials[i],weights[i],sums[i],inputs[i],lambda);
}
}
return neurons[neuron];
@@ -34,14 +34,14 @@ FeedForwardNetworkQuick::~FeedForwardNetworkQuick()
{
for(size_t i=0;i<layers;i++)
{
for (size_t j=0;j<layerSizes[i];j++)
for (size_t j=1;j<layerSizes[i];j++)
{
if(j!=0)
delete[] weights[i][j];
}
delete[] weights[i];
delete[] potentials[i];
delete[] sums[i];
if(i!=layers-1)
delete[] sums[i];
}
delete[] sums[layers];
delete[] weights;
@@ -59,23 +59,58 @@ FeedForwardNetworkQuick::~FeedForwardNetworkQuick()
}
}
#define _LOOP(FROM,TO) {}
void FeedForwardNetworkQuick::solvePart(float *newSolution, size_t begin, size_t steps,size_t prevSize, float *sol,size_t layer)
void FeedForwardNetworkQuick::solvePart(float *newSolution, register size_t begin, size_t end,size_t prevSize, float *sol,size_t layer)
{
register size_t end=begin+steps;
for( size_t j=begin;j<end;j++)
if(prevSize > 8) // SSE path; narrow layers fall through to the scalar loop below
{
newSolution[j]=sol[0]*weights[layer][j][0];
for(register size_t k=1;k<prevSize;k++)
for( size_t j=begin;j<end;j++)
{
if(layer==0)
register size_t alignedPrev=prevSize>8?(prevSize-(prevSize%8)):0;
__m128 partialSolution = _mm_setzero_ps();
__m128 partialSolution2 = _mm_setzero_ps();
__m128 w;
__m128 sols;
__m128 w2;
__m128 sols2;
for(register size_t k=0;k<alignedPrev;k+=8)
{
newSolution[j]+=sol[k]*weights[layer][j][k];
}else
{
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[layer][j][k];
w = _mm_load_ps(this->weights[layer][j]+k);
w2 = _mm_load_ps(this->weights[layer][j]+k+4);
sols = _mm_load_ps(sol+k);
sols2 = _mm_load_ps(sol+k+4);
w=_mm_mul_ps(w,sols);
w2=_mm_mul_ps(w2,sols2);
partialSolution=_mm_add_ps(partialSolution,w);
partialSolution2=_mm_add_ps(partialSolution2,w2);
}
partialSolution = _mm_hadd_ps(partialSolution, partialSolution2);
partialSolution = _mm_hadd_ps(partialSolution, partialSolution);
partialSolution = _mm_hadd_ps(partialSolution, partialSolution);
_mm_store_ss(inputs[layer]+j,partialSolution);
for(register size_t k=alignedPrev;k<prevSize;k++)
{
inputs[layer][j]+=sol[k]*weights[layer][j][k];
}
partialSolution=_mm_load_ss(inputs[layer]+j);
__m128 temporaryConst = _mm_set1_ps(-lambda);
partialSolution=_mm_mul_ps(temporaryConst,partialSolution); // -lambda*input
partialSolution=exp_ps(partialSolution); // exp(-lambda*input)
temporaryConst = _mm_set1_ps(1.0);
partialSolution= _mm_add_ps(partialSolution,temporaryConst); // 1+exp(-lambda*input)
partialSolution= _mm_div_ps(temporaryConst,partialSolution); // 1/(1+exp(-lambda*input))
_mm_store_ss(newSolution+j,partialSolution);
}
}else
{
for( size_t j=begin;j<end;j++)
{
register float tmp=0;
for(register size_t k=0;k<prevSize;k++)
{
tmp+=sol[k]*weights[layer][j][k];
}
newSolution[j]=(1.0/(1.0+exp(-lambda*tmp)));
inputs[layer][j]=tmp;
}
}
}
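The rewritten solvePart keeps the raw weighted input of neuron j in inputs[layer][j] and its activated value in newSolution[j]: eight products are accumulated per iteration in two SSE registers, reduced with three _mm_hadd_ps steps, the unaligned tail is finished in scalar code, and the sigmoid is applied at the end. The following minimal, self-contained sketch shows that pattern; dotSigmoid and its parameters are illustrative names, the loads are unaligned for simplicity (the commit assumes 16-byte aligned buffers and uses _mm_load_ps), and a scalar std::exp stands in for the project's exp_ps.

// Build with SSE3 or later enabled, e.g. -msse4.2 (_mm_hadd_ps needs SSE3).
#include <immintrin.h>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

static float dotSigmoid(const float* w, const float* x, size_t n, float lambda)
{
    size_t alignedN = n > 8 ? n - (n % 8) : 0;      // largest multiple of 8 not above n
    __m128 acc0 = _mm_setzero_ps();
    __m128 acc1 = _mm_setzero_ps();
    for (size_t k = 0; k < alignedN; k += 8)
    {
        acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(w + k),     _mm_loadu_ps(x + k)));
        acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(w + k + 4), _mm_loadu_ps(x + k + 4)));
    }
    __m128 acc = _mm_hadd_ps(acc0, acc1);           // pairwise sums of both accumulators
    acc = _mm_hadd_ps(acc, acc);
    acc = _mm_hadd_ps(acc, acc);                    // total now replicated in every lane
    float sum = _mm_cvtss_f32(acc);
    for (size_t k = alignedN; k < n; ++k)           // scalar tail for the leftover elements
        sum += w[k] * x[k];
    return 1.0f / (1.0f + std::exp(-lambda * sum)); // logistic activation
}

int main()
{
    std::vector<float> w(13, 0.1f), x(13, 1.0f);
    std::printf("%f\n", dotSigmoid(w.data(), x.data(), w.size(), 1.0f)); // ~0.785835
}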
@@ -94,7 +129,7 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
for(register size_t i=0;i<layers;i++)
{
float* newSolution= sums[i+1];//new bool[layerSizes[i]];
if(threads > 1 && layerSizes[i] > 600) // 600 is an guess about actual size, when creating thread has some speedup
if(threads > 1 && (layerSizes[i] > 700 || prevSize > 700)) // 700 is a guess at the size where spawning threads starts to pay off
{
std::vector<std::thread> th;
size_t s=1;
@@ -106,54 +141,7 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
if(s>=layerSizes[i])
break;
th.push_back(std::thread([i,this,newSolution,prevSize,sol](size_t from, size_t to)->void{
_LOOP(from,to<(from+4)?to:(from+4));
register size_t max= (int)(to-4) < 0?0:(to-4);
for( size_t j=from+4;j<max;j+=4)
{
newSolution[j]=sol[0]*weights[i][j][0];
newSolution[j+1]=sol[0]*weights[i][j+1][0];
newSolution[j+2]=sol[0]*weights[i][j+2][0];
newSolution[j+3]=sol[0]*weights[i][j+3][0];
__m128 partialSolution = _mm_load_ps(newSolution+j);
register size_t upper_limit=prevSize-4;
for(register size_t k=from;k< 4 && k <prevSize;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
for(register size_t k=4;k<upper_limit;k+=4)
{
__m128 w = _mm_loadr_ps((this->weights[i][j])+k);
__m128 sols = _mm_loadr_ps(sol+k);
if(i!=0)
{
__m128 tmp = _mm_set1_ps(-lambda);
sols=_mm_mul_ps(tmp,sols); //-lambda*sol[k]
sols=exp_ps(sols); //exp(sols)
tmp = _mm_set1_ps(1.0);
sols= _mm_add_ps(sols,tmp); //1+exp()
sols= _mm_div_ps(tmp,sols);//1/....
}
w=_mm_mul_ps(w,sols);
partialSolution=_mm_add_ps(partialSolution,w);
}
for(register size_t k=upper_limit;k<prevSize;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
}
if(max!=0)
_LOOP(max,to);
solvePart(newSolution,from,to,prevSize,sol,i);
},s,t==threads?layerSizes[i]:s+step));//{}
s+=step;
}
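The threading branch cuts the neuron range [1, layerSize) into roughly equal chunks and hands each chunk to a std::thread running solvePart; the last worker runs to the end of the layer so rounding never drops a neuron. A sketch of that partitioning follows, with solvePartStub as a stand-in for the member function and step assumed to be layerSize/threads (the step computation is outside this hunk).

#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for FeedForwardNetworkQuick::solvePart(newSolution, begin, end, ...).
static void solvePartStub(size_t begin, size_t end)
{
    std::printf("worker handles neurons [%zu, %zu)\n", begin, end);
}

int main()
{
    const size_t   layerSize = 1000;                 // index 0 is the bias neuron and is skipped
    const unsigned threads   = 4;
    const size_t   step      = layerSize / threads;  // assumed; not visible in the hunk

    std::vector<std::thread> th;
    size_t s = 1;
    for (unsigned t = 1; t <= threads; ++t)
    {
        if (s >= layerSize)
            break;
        // the last worker takes everything up to layerSize
        th.emplace_back(solvePartStub, s, t == threads ? layerSize : s + step);
        s += step;
    }
    for (auto& worker : th)
        worker.join();
}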
@@ -162,56 +150,7 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
thr.join();
}else
{
if(1)
{
solvePart(newSolution,1,layerSizes[i]-1,prevSize,sol,i);
}else
{
_LOOP(1,layerSizes[i]<4?layerSizes[i]:4);
register size_t max= (int)(layerSizes[i]-4) < 0?0:(layerSizes[i]-4);
for( size_t j=4;j<max;j=j+4)
{
newSolution[j]=sol[0]*weights[i][j][0];
newSolution[j+1]=sol[0]*weights[i][j+1][0];
newSolution[j+2]=sol[0]*weights[i][j+2][0];
newSolution[j+3]=sol[0]*weights[i][j+3][0];
__m128 partialSolution = _mm_load_ps(newSolution+j);
register size_t upper_limit=prevSize-prevSize%4;
for(register size_t k=1;k<prevSize && k <4;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
for(register size_t k=4;k<upper_limit;k+=4)
{
__m128 w = _mm_loadr_ps((this->weights[i][j])+k);
__m128 sols = _mm_loadr_ps(sol+k);
if(i!=0)
{
__m128 tmp = _mm_set1_ps(-lambda);
sols=_mm_mul_ps(tmp,sols); //-lambda*sol[k]
sols=exp_ps(sols); //exp(sols)
tmp = _mm_set1_ps(1.0);
sols= _mm_add_ps(sols,tmp); //1+exp()
sols= _mm_div_ps(tmp,sols);//1/....
}
w=_mm_mul_ps(w,sols);
partialSolution=_mm_add_ps(partialSolution,w);
}
for(register size_t k=upper_limit;k<prevSize;k++)
{
if(i==0)
newSolution[j]+=sol[k]*weights[i][j][k];
else
newSolution[j]+=(1.0/(1.0+exp(-lambda*sol[k])))*weights[i][j][k];
}
}
if(max!=0)
_LOOP(max,layerSizes[i]);
}
solvePart(newSolution,1,layerSizes[i],prevSize,sol,i);
}
prevSize=layerSizes[i];
sol=newSolution;
@@ -219,7 +158,7 @@ Solution FeedForwardNetworkQuick::solve(const Problem& p)
std::vector<double> ret;
for(size_t i=1;i<prevSize;i++)
{
ret.push_back((1.0/(1.0+exp(-lambda*sol[i]))));
ret.push_back(sol[i]);
}
return ret;
}
@@ -231,7 +170,7 @@ FFLayer* FeedForwardNetworkQuick::operator[](int l)
ffLayers=new FFLayer*[layers];
for(size_t i=0;i<layers;i++)
{
ffLayers[i]=new FFLayer(layerSizes[i],potentials[i],weights[i],sums[i+1],lambda);
ffLayers[i]=new FFLayer(layerSizes[i],potentials[i],weights[i],sums[i+1],inputs[i],lambda);
}
}
return ffLayers[l];

View File

@@ -31,19 +31,20 @@ namespace NeuronNetwork
FFNeuron() = delete;
FFNeuron(const FFNeuron&) = delete;
FFNeuron& operator=(const FFNeuron&) = delete;
FFNeuron(float *pot, float *w, float*s,float lam):potential(pot),weights(w),sum(s),lambda(lam) { }
FFNeuron(float &pot, float *w, float &s, float &i,float lam):potential(pot),weights(w),sum(s),inputs(i),lambda(lam) { }
float getPotential() {return *potential;}
void setPotential(double p) { *potential=p;}
float getPotential() {return potential;}
void setPotential(double p) { potential=p;}
float getWeight(unsigned int i ) { return weights[i];}
void setWeight(unsigned int i,float p) { weights[i]=p; }
inline float output() const { return 1.0/(1.0+(exp(-lambda*input()))); }
inline float input() const { return *sum; }
inline float output() const { return sum; }
inline float input() const { return inputs; }
inline float derivatedOutput() const { return lambda*output()*(1.0-output()); }
protected:
float *potential;
float &potential;
float *weights;
float *sum;
float &sum;
float &inputs;
float lambda;
private:
};
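FFNeuron now stores references instead of pointers, so each neuron is a thin view over the network's flat arrays: getPotential/setPotential, input() and output() read and write potentials[layer], inputs[layer] and sums[layer+1] directly, and output() simply returns the cached activation instead of recomputing the sigmoid. A small illustrative sketch of that view pattern (NeuronView is a hypothetical name, not the repository's class):

#include <cstdio>

struct NeuronView
{
    float& potential;   // one slot of potentials[layer]
    float& sum;         // activated output, one slot of sums[layer+1]
    float& input;       // raw weighted sum, one slot of inputs[layer]
    float output() const { return sum; }   // no sigmoid here any more: the value is cached
};

int main()
{
    float potentials[3] = {1.0f, 0.2f, 0.3f};
    float sums[3]       = {1.0f, 0.7f, 0.6f};
    float inputs[3]     = {0.0f, 0.9f, 0.4f};

    NeuronView n{potentials[1], sums[1], inputs[1]};
    n.potential = 0.5f;                                  // writes straight into the array
    std::printf("%f %f\n", potentials[1], n.output());   // 0.500000 0.700000
}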
@@ -53,7 +54,7 @@ namespace NeuronNetwork
public:
FFLayer(const FFLayer &) =delete;
FFLayer operator=(const FFLayer &) = delete;
FFLayer(size_t s, float *p,float **w,float *su,float lam): neurons(nullptr),layerSize(s),potentials(p),weights(w),sums(su),lambda(lam) {}
FFLayer(size_t s, float *p,float **w,float *su,float *in,float lam): neurons(nullptr),layerSize(s),potentials(p),weights(w),sums(su),inputs(in),lambda(lam) {}
~FFLayer();
FFNeuron* operator[](int neuron);
size_t size() const {return layerSize;};
@@ -63,6 +64,7 @@ namespace NeuronNetwork
float *potentials;
float **weights;
float *sums;
float *inputs;
float lambda;
};
@@ -77,7 +79,7 @@ namespace NeuronNetwork
potentials= new float*[s.size()];
layerSizes= new size_t[s.size()];
sums= new float*[s.size()+1];
inputs= new float*[s.size()+1];
inputs= new float*[s.size()];
int i=0;
int prev_size=1;
for(int layeSize:s) // TODO rename
@@ -93,7 +95,7 @@ namespace NeuronNetwork
weights[i]= new float*[layeSize];
potentials[i]= new float[layeSize];
sums[i+1]= new float[layeSize];
inputs[i+1]= new float[layeSize];
inputs[i]= new float[layeSize];
potentials[i][0]=1.0;
sums[i+1][0]=1.0;
for (int j=1;j<layeSize;j++)
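After this constructor change, sums keeps layers+1 rows (sums[0] holds the network input, sums[i+1] the activated output of layer i, with sums[i+1][0] pinned to 1.0 as the bias), while inputs now has exactly one row per layer, indexed by i instead of i+1, holding the raw weighted sums. A hedged sketch of that layout, using std::vector in place of the raw new[] arrays and an illustrative input size:

#include <cstddef>
#include <vector>

int main()
{
    std::vector<size_t> layerSizes = {3, 5, 2};                // example topology
    std::vector<std::vector<float>> sums(layerSizes.size() + 1);
    std::vector<std::vector<float>> inputs(layerSizes.size()); // one row per layer now
    sums[0].assign(4, 0.0f);                                   // problem input plus bias slot (illustrative size)
    for (size_t i = 0; i < layerSizes.size(); ++i)
    {
        sums[i + 1].assign(layerSizes[i], 0.0f);               // activated outputs of layer i
        inputs[i].assign(layerSizes[i], 0.0f);                 // raw weighted sums of layer i
        sums[i + 1][0] = 1.0f;                                  // bias fed to the next layer
    }
}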
@@ -115,14 +117,14 @@ namespace NeuronNetwork
FFLayer* operator[](int l);
void setThreads(unsigned t) {threads=t;}
protected:
void solvePart(float *newSolution, size_t begin, size_t steps,size_t prevSize, float* sol,size_t layer);
void solvePart(float *newSolution, size_t begin, size_t end,size_t prevSize, float* sol,size_t layer);
private:
FFLayer **ffLayers;
float ***weights;
float **potentials;
public:
float **sums;
float **inputs;
public:
private:
size_t *layerSizes;
size_t layers;

View File

@@ -79,7 +79,7 @@ void Shin::NeuronNetwork::Learning::BackPropagation::propagate(const Shin::Neuro
{
network[i]->operator[](j)->setWeight(k,
network[i]->operator[](j)->getWeight(k)+learningCoeficient* deltas[i][j]*
(i==0? network.sums[0][k]:(float)network[i-1]->operator[](k)->output()));
(i==0? network.sums[0][k]:network[i-1]->operator[](k)->output()));
}
}
}
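The update applied in this hunk is the usual gradient step w[j][k] += learningCoeficient * deltas[i][j] * o[k], where o[k] is the previous layer's activated output, or the raw network input sums[0][k] for the first layer; the explicit (float) cast is gone because output() now returns the cached float directly. A compact sketch of that rule, with illustrative names (updateWeights, eta):

#include <cstddef>
#include <vector>

// w[j][k]: weight from neuron k of the previous layer into neuron j of this layer.
static void updateWeights(std::vector<std::vector<float>>& w,
                          const std::vector<float>& delta,       // deltas for this layer
                          const std::vector<float>& prevOutput,  // previous layer's outputs (or sums[0])
                          float eta)                             // learning coefficient
{
    for (size_t j = 0; j < w.size(); ++j)
        for (size_t k = 0; k < w[j].size(); ++k)
            w[j][k] += eta * delta[j] * prevOutput[k];
}

int main()
{
    std::vector<std::vector<float>> w = {{0.1f, 0.2f}};
    updateWeights(w, {0.5f}, {1.0f, -1.0f}, 0.1f);   // w becomes {{0.15, 0.15}}
}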

View File

@@ -31,23 +31,18 @@ int main(int argc)
s.push_back(Shin::NeuronNetwork::Solution(std::vector<double>({0})));
p.push_back(X(std::vector<bool>({1})));
Shin::NeuronNetwork::FeedForwardNetworkQuick q({1,5000,5000,5000,5000});
Shin::NeuronNetwork::FeedForwardNetworkQuick q({1,20000,20000,20000});
Shin::NeuronNetwork::Learning::BackPropagation b(q);
if(argc > 1)
{
std::cerr << "THREADING\n";
q.setThreads(4);
}
for(int i=0;i<5;i++)
for(int i=0;i<100;i++)
{
//b.teach(p[i%2],s[i%2]);
std::cerr << i%2 <<". FOR: [" << p[i%2].representation()[0] << "] res: " << q.solve(p[i%2])[0] << " should be " << s[i%2][0]<<"\n";
}
for(int i=0;i<5;i++)
{
//b.teach(p[i%2],s[i%2]);
std::cerr << i%2 <<". FOR: [" << p[i%2].representation()[0] << "] res: " << q.solve(p[i%2])[0] << " should be " << s[i%2][0]<<"\n";
q.solve(p[i%2])[0];
//std::cerr << i%2 <<". FOR: [" << p[i%2].representation()[0] << "] res: " << q.solve(p[i%2])[0] << " should be " << s[i%2][0]<<"\n";
}
for(int i=0;i<2;i++)
{

View File

@@ -22,7 +22,7 @@ int main()
for (int test=0;test<2;test++)
{
Shin::NeuronNetwork::FeedForwardNetworkQuick q({2,4,1});
Shin::NeuronNetwork::FeedForwardNetworkQuick q({2,40,1});
Shin::NeuronNetwork::Learning::BackPropagation b(q);
srand(time(NULL));