Moved the Linear SIMD (intrinsics) implementation to the .cpp file and fixed an error in neuron weights

This commit is contained in:
2016-02-16 23:23:48 +01:00
parent 5bb132d84e
commit 69b530d597
6 changed files with 96 additions and 88 deletions

View File

@@ -7,6 +7,8 @@
#include <pmmintrin.h>
#include <immintrin.h>
#include <cassert>
#include "./StreamingBasisFunction.h"
#include "../../sse_mathfun.h"
@@ -18,74 +20,7 @@ namespace BasisFunction {
public:
// Default constructor; the body is empty, so construction performs no work of its own.
Linear() {}
/**
 * Computes the weighted sum (dot product) of @p weights and @p input.
 *
 * Exactly input.size() products are accumulated, so @p weights must hold at
 * least input.size() elements; any extra weights are ignored.
 *
 * Three implementations are selected at compile time:
 *  - USE_AVX: 8 floats per iteration using 256-bit registers,
 *  - USE_SSE: 4 floats per iteration using 128-bit registers
 *    (with USE_SSE2 the final reduction avoids the SSE3 hadd instruction),
 *  - otherwise a plain scalar loop.
 *
 * @param weights weights, one per input element
 * @param input   input values
 * @return sum over k of weights[k] * input[k], 0 for an empty input
 */
inline virtual float operator()(const std::vector<float>& weights, const std::vector<float>& input) const override {
    // Every path below reads weights[k] for each k < input.size(); a shorter
    // weight vector would be read out of bounds.
    assert(weights.size() >= input.size());
    const std::size_t inputSize = input.size();
#ifdef USE_AVX
    // Largest multiple of 8 that fits into the input.
    const std::size_t alignedPrev = inputSize - inputSize % 8;
    const float* weightsData = weights.data();
    const float* inputData = input.data();
    __m256 partialSolution = _mm256_setzero_ps();
    // Unaligned loads: std::vector storage is not guaranteed to be 32-byte aligned.
    for (std::size_t k = 0; k < alignedPrev; k += 8) {
        partialSolution = _mm256_add_ps(partialSolution, _mm256_mul_ps(_mm256_loadu_ps(weightsData + k), _mm256_loadu_ps(inputData + k)));
    }
    // Remaining (< 8) elements: each product is accumulated into the highest lane only.
    for (std::size_t k = alignedPrev; k < inputSize; k++) {
        partialSolution = _mm256_add_ps(partialSolution, _mm256_mul_ps(_mm256_set_ps(weightsData[k], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f), _mm256_set_ps(inputData[k], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)));
    }
    // Horizontal sum: fold the upper 128 bits onto the lower ones, then two
    // pairwise adds leave the total in lane 0.
    partialSolution = _mm256_add_ps(partialSolution, _mm256_permute2f128_ps(partialSolution, partialSolution, 1));
    partialSolution = _mm256_hadd_ps(partialSolution, partialSolution);
    partialSolution = _mm256_hadd_ps(partialSolution, partialSolution);
    // Extract lane 0 with an intrinsic instead of reading through a union.
    return _mm_cvtss_f32(_mm256_castps256_ps128(partialSolution));
#elif defined(USE_SSE)
    // Largest multiple of 4 that fits into the input.
    const std::size_t alignedPrev = inputSize - inputSize % 4;
    const float* weightsData = weights.data();
    const float* inputData = input.data();
    __m128 partialSolution = _mm_setzero_ps();
    //TODO prefetch ??
    // _mm_loadu_ps instead of _mm_load_ps: the aligned load faults when the
    // address is not 16-byte aligned, which std::vector does not guarantee.
    for (std::size_t k = 0; k < alignedPrev; k += 4) {
        partialSolution = _mm_add_ps(partialSolution, _mm_mul_ps(_mm_loadu_ps(weightsData + k), _mm_loadu_ps(inputData + k)));
    }
    // Remaining (< 4) elements: _mm_load_ss fills lane 0 and zeroes the others.
    for (std::size_t k = alignedPrev; k < inputSize; k++) {
        partialSolution = _mm_add_ps(partialSolution, _mm_mul_ps(_mm_load_ss(weightsData + k), _mm_load_ss(inputData + k)));
    }
#ifdef USE_SSE2 //pre-SSE3 solution
    // lanes {0,1} += lanes {2,3}, then lane 0 += lane 1.
    partialSolution = _mm_add_ps(_mm_movehl_ps(partialSolution, partialSolution), partialSolution);
    partialSolution = _mm_add_ss(partialSolution, _mm_shuffle_ps(partialSolution, partialSolution, 1));
#else
    partialSolution = _mm_hadd_ps(partialSolution, partialSolution);
    partialSolution = _mm_hadd_ps(partialSolution, partialSolution);
#endif
    // Extract lane 0 with an intrinsic instead of reading through a union.
    return _mm_cvtss_f32(partialSolution);
#else
    // Portable scalar fallback.
    float tmp = 0;
    for (std::size_t k = 0; k < inputSize; k++) {
        tmp += input[k] * weights[k];
    }
    return tmp;
#endif
}
// Evaluates this basis function for the given weights and input and returns a single float.
// Declaration only: the body is defined out of line (the commit message says it moved to the .cpp file).
// NOTE(review): presumably still the weighted sum of weights[k]*input[k] — confirm against the .cpp.
virtual float operator()(const std::vector<float>& weights, const std::vector<float>& input) const override;
virtual BasisFunction* clone() const override {
return new Linear();