AVX fix for non-FMA CPU
This commit is contained in:
@@ -18,13 +18,26 @@ float NeuralNetwork::BasisFunction::Linear::operator()(const std::vector<float>
|
||||
|
||||
partialSolution.avx=_mm256_setzero_ps();
|
||||
|
||||
#ifndef USE_FMA
|
||||
__m256 tmp;
|
||||
#endif
|
||||
for(size_t k=0;k<alignedPrev;k+=8) {
|
||||
//TODO: assignment!! -- possible speedup
|
||||
#ifdef USE_FMA
|
||||
partialSolution.avx=_mm256_fmadd_ps(_mm256_loadu_ps(weightsData+k),_mm256_loadu_ps(inputData+k),partialSolution.avx);
|
||||
#else
|
||||
tmp=_mm256_mul_ps(_mm256_loadu_ps(weightsData+k),_mm256_loadu_ps(inputData+k));
|
||||
partialSolution.avx=_mm256_add_ps(tmp,partialSolution.avx);
|
||||
#endif
|
||||
}
|
||||
|
||||
for(size_t k=alignedPrev;k<inputSize;k++) {
|
||||
#ifdef USE_FMA
|
||||
partialSolution.avx=_mm256_fmadd_ps(_mm256_set_ps(weightsData[k],0,0,0,0,0,0,0),_mm256_set_ps(inputData[k],0,0,0,0,0,0,0),partialSolution.avx);
|
||||
#else
|
||||
tmp=_mm256_mul_ps(_mm256_set_ps(weightsData[k],0,0,0,0,0,0,0),_mm256_set_ps(inputData[k],0,0,0,0,0,0,0));
|
||||
partialSolution.avx=_mm256_add_ps(tmp,partialSolution.avx);
|
||||
#endif
|
||||
}
|
||||
|
||||
partialSolution.avx = _mm256_add_ps(partialSolution.avx, _mm256_permute2f128_ps(partialSolution.avx , partialSolution.avx , 1));
|
||||
|
||||
Reference in New Issue
Block a user