diff --git a/.gitignore b/.gitignore index 10031a9..d2a5fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,3 @@ -NN.kdev4 -.kdev4 -*.o -*.a -*.so -*.nm /doc/html/* !/doc/html/doxy-boot.js !/doc/html/jquery.powertip.min.js diff --git a/CMakeLists.txt b/CMakeLists.txt index e2cd7d4..9710b71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,22 +2,27 @@ cmake_minimum_required(VERSION 3.2) project(NeuralNetwork CXX) OPTION(BUILD_SHARED_LIBS "Build also shared library." ON) +OPTION(USE_AVX "IF avx should be used." ON) OPTION(USE_SSE "IF sse should be used." ON) OPTION(USE_SSE2 "IF only sse2 should be used." OFF) + OPTION(ENABLE_TESTS "enables tests" ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -Weffc++ -Wshadow -Wstrict-aliasing -ansi -Woverloaded-virtual -Wdelete-non-virtual-dtor -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native -O3") -if(USE_SSE) +if(USE_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX") +elseif(USE_SSE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -DUSE_SSE") if(USE_SSE2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SSE2") endif(USE_SSE2) -endif(USE_SSE) +endif(USE_AVX) include_directories(./include/) diff --git a/include/NeuralNetwork/BasisFunction/Linear.h b/include/NeuralNetwork/BasisFunction/Linear.h index e4a007d..f3905e8 100644 --- a/include/NeuralNetwork/BasisFunction/Linear.h +++ b/include/NeuralNetwork/BasisFunction/Linear.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "./StreamingBasisFunction.h" @@ -18,8 +19,37 @@ namespace BasisFunction { Linear() {} inline virtual float operator()(const std::vector& weights, const std::vector& input) const override { +#ifdef USE_AVX + //TODO: check sizes!!! + std::size_t inputSize=input.size(); + size_t alignedPrev=inputSize-inputSize%8; -#ifdef USE_SSE + const float* weightsData=weights.data(); + const float* inputData=input.data(); + + union { + __m256 avx; + float f[8]; + } partialSolution; + + partialSolution.avx=_mm256_setzero_ps(); + + for(size_t k=0;k