AVX fix for non-FMA CPU
This commit is contained in:
@@ -18,7 +18,6 @@ if(CPU_SSE3_AVAILABLE)
|
|||||||
|
|
||||||
endif(CPU_SSE3_AVAILABLE)
|
endif(CPU_SSE3_AVAILABLE)
|
||||||
|
|
||||||
|
|
||||||
OPTION(USE_AVX "If AVX instruction set should be used." OFF)
|
OPTION(USE_AVX "If AVX instruction set should be used." OFF)
|
||||||
OPTION(USE_SSE "If SSE instruction set should be used." OFF)
|
OPTION(USE_SSE "If SSE instruction set should be used." OFF)
|
||||||
OPTION(USE_SSE2 "If SSE 2 instruction set should be used." ON)
|
OPTION(USE_SSE2 "If SSE 2 instruction set should be used." ON)
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ macro(CHECK_CPU)
|
|||||||
MESSAGE("-- -- Found SSE2")
|
MESSAGE("-- -- Found SSE2")
|
||||||
set(CPU_SSE2_AVAILABLE "true" INTERNAL BOOL "SSE2 available on host")
|
set(CPU_SSE2_AVAILABLE "true" INTERNAL BOOL "SSE2 available on host")
|
||||||
ELSE ()
|
ELSE ()
|
||||||
|
MESSAGE("-- -- Not found SSE2")
|
||||||
ENDIF (SSE2_TRUE)
|
ENDIF (SSE2_TRUE)
|
||||||
|
|
||||||
STRING(REGEX REPLACE "^.*(sse3).*$" "\\1" SSE_THERE ${CPUINFO})
|
STRING(REGEX REPLACE "^.*(sse3).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||||
@@ -18,6 +19,7 @@ macro(CHECK_CPU)
|
|||||||
MESSAGE("-- -- Found SSE3")
|
MESSAGE("-- -- Found SSE3")
|
||||||
set(CPU_SSE3_AVAILABLE "true" INTERNAL BOOL "SSE3 available on host")
|
set(CPU_SSE3_AVAILABLE "true" INTERNAL BOOL "SSE3 available on host")
|
||||||
ELSE ()
|
ELSE ()
|
||||||
|
MESSAGE("-- -- Not found SSE3")
|
||||||
ENDIF (SSE3_TRUE)
|
ENDIF (SSE3_TRUE)
|
||||||
|
|
||||||
STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
|
STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||||
@@ -26,6 +28,7 @@ macro(CHECK_CPU)
|
|||||||
MESSAGE("-- -- Found SSE4.2")
|
MESSAGE("-- -- Found SSE4.2")
|
||||||
set(CPU_SSE4.2_AVAILABLE "true" INTERNAL BOOL "SSE4.2 available on host")
|
set(CPU_SSE4.2_AVAILABLE "true" INTERNAL BOOL "SSE4.2 available on host")
|
||||||
ELSE ()
|
ELSE ()
|
||||||
|
MESSAGE("-- -- Not found SSE4.2")
|
||||||
ENDIF (SSE4.2_TRUE)
|
ENDIF (SSE4.2_TRUE)
|
||||||
|
|
||||||
STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO})
|
STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||||
@@ -34,7 +37,18 @@ macro(CHECK_CPU)
|
|||||||
MESSAGE("-- -- Found AVX")
|
MESSAGE("-- -- Found AVX")
|
||||||
set(CPU_AVX_AVAILABLE "TRUE" INTERNAL BOOL "AVX available on host")
|
set(CPU_AVX_AVAILABLE "TRUE" INTERNAL BOOL "AVX available on host")
|
||||||
ELSE ()
|
ELSE ()
|
||||||
|
MESSAGE("-- -- Not found AVX")
|
||||||
ENDIF (AVX_TRUE)
|
ENDIF (AVX_TRUE)
|
||||||
|
|
||||||
|
STRING(REGEX REPLACE "^.*(fma).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||||
|
STRING(COMPARE EQUAL "fma" "${SSE_THERE}" FMA_TRUE)
|
||||||
|
IF (FMA_TRUE)
|
||||||
|
MESSAGE("-- -- Found FMA")
|
||||||
|
set(CPU_FMA_AVAILABLE "TRUE" INTERNAL BOOL "FMA available on host")
|
||||||
|
ELSE ()
|
||||||
|
MESSAGE("-- -- Not found FMA")
|
||||||
|
ENDIF (FMA_TRUE)
|
||||||
|
|
||||||
else()
|
else()
|
||||||
MESSAGE("Error detecting CPU features")
|
MESSAGE("Error detecting CPU features")
|
||||||
endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||||
|
|||||||
@@ -18,13 +18,26 @@ float NeuralNetwork::BasisFunction::Linear::operator()(const std::vector<float>
|
|||||||
|
|
||||||
partialSolution.avx=_mm256_setzero_ps();
|
partialSolution.avx=_mm256_setzero_ps();
|
||||||
|
|
||||||
|
#ifndef USE_FMA
|
||||||
|
__m256 tmp;
|
||||||
|
#endif
|
||||||
for(size_t k=0;k<alignedPrev;k+=8) {
|
for(size_t k=0;k<alignedPrev;k+=8) {
|
||||||
//TODO: assignment!! -- possible speedup
|
//TODO: assignment!! -- possible speedup
|
||||||
|
#ifdef USE_FMA
|
||||||
partialSolution.avx=_mm256_fmadd_ps(_mm256_loadu_ps(weightsData+k),_mm256_loadu_ps(inputData+k),partialSolution.avx);
|
partialSolution.avx=_mm256_fmadd_ps(_mm256_loadu_ps(weightsData+k),_mm256_loadu_ps(inputData+k),partialSolution.avx);
|
||||||
|
#else
|
||||||
|
tmp=_mm256_mul_ps(_mm256_loadu_ps(weightsData+k),_mm256_loadu_ps(inputData+k));
|
||||||
|
partialSolution.avx=_mm256_add_ps(tmp,partialSolution.avx);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
for(size_t k=alignedPrev;k<inputSize;k++) {
|
for(size_t k=alignedPrev;k<inputSize;k++) {
|
||||||
|
#ifdef USE_FMA
|
||||||
partialSolution.avx=_mm256_fmadd_ps(_mm256_set_ps(weightsData[k],0,0,0,0,0,0,0),_mm256_set_ps(inputData[k],0,0,0,0,0,0,0),partialSolution.avx);
|
partialSolution.avx=_mm256_fmadd_ps(_mm256_set_ps(weightsData[k],0,0,0,0,0,0,0),_mm256_set_ps(inputData[k],0,0,0,0,0,0,0),partialSolution.avx);
|
||||||
|
#else
|
||||||
|
tmp=_mm256_mul_ps(_mm256_set_ps(weightsData[k],0,0,0,0,0,0,0),_mm256_set_ps(inputData[k],0,0,0,0,0,0,0));
|
||||||
|
partialSolution.avx=_mm256_add_ps(tmp,partialSolution.avx);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
partialSolution.avx = _mm256_add_ps(partialSolution.avx, _mm256_permute2f128_ps(partialSolution.avx , partialSolution.avx , 1));
|
partialSolution.avx = _mm256_add_ps(partialSolution.avx, _mm256_permute2f128_ps(partialSolution.avx , partialSolution.avx , 1));
|
||||||
|
|||||||
Reference in New Issue
Block a user