From c9d14547434d81f8eb49a69ca0494c383ae18fba Mon Sep 17 00:00:00 2001 From: raver119 Date: Mon, 27 Apr 2020 17:37:53 +0300 Subject: [PATCH] MKLDNN tweaks (#415) * one simple test Signed-off-by: raver119 * fix Signed-off-by: raver119 * hmmmm... Signed-off-by: raver119 * mkl matmul skip tweaks Signed-off-by: raver119 * minor fix for MemoryTracker * long shapes in matmul * - 2 new tests for mkldnn tanh - mkldnn isn't used for scalar tanh --- libnd4j/include/memory/impl/MemoryTracker.cpp | 3 + .../ops/declarable/platform/mkldnn/matmul.cpp | 60 ++++++++++++------- .../ops/declarable/platform/mkldnn/tanh.cpp | 4 +- .../tests_cpu/layers_tests/MklDnnTests.cpp | 35 ++++++++--- .../layers_tests/PerformanceTests.cpp | 46 ++++++++++++++ 5 files changed, 118 insertions(+), 30 deletions(-) diff --git a/libnd4j/include/memory/impl/MemoryTracker.cpp b/libnd4j/include/memory/impl/MemoryTracker.cpp index be3019b08..5ebb4fd16 100644 --- a/libnd4j/include/memory/impl/MemoryTracker.cpp +++ b/libnd4j/include/memory/impl/MemoryTracker.cpp @@ -90,6 +90,9 @@ namespace sd { return result; } } + + // safe return + return std::string(""); } #endif diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp index f3ef84e2f..0dd3b21f7 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp @@ -31,6 +31,20 @@ namespace sd { namespace ops { namespace platforms { + dnnl::memory::format_tag get_format_tag(const sd::NDArray &array) { + switch (array.rankOf()) { + case 1: + return dnnl::memory::format_tag::ab; + case 2: + return array.ordering() == 'c' ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; + case 3: + return array.ordering() == 'c' ? 
dnnl::memory::format_tag::abc : dnnl::memory::format_tag::cba; + default: + throw std::runtime_error("MKLDNN matmul only supports 2D/3D arrays"); + } + } + + ////////////////////////////////////////////////////////////////////////// static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const bool transX, const bool transY, float alpha = 1.f, float beta = 0.f) { @@ -69,17 +83,15 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b NDArray* zR = xRank <= 3 ? z : new NDArray(z->reshape(z->ordering(), {z->lengthOf() / (z->sizeAt(-2) * z->sizeAt(-1)), z->sizeAt(-2), z->sizeAt(-1)})/*, false*/); // [M,K] x [K,N] = [M,N] - const int M = (xRank > 1) ? xTR->sizeAt(-2) : 1; - const int K = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf(); - const int N = (yRank > 1) ? yTR->sizeAt(-1) : 1; - const int bS = (xRank > 2) ? xTR->sizeAt(0) : 1; // [bS, M,K] x [bS, K,N] = [bS, M,N] + const int64_t M = (xRank > 1) ? xTR->sizeAt(-2) : 1; + const int64_t K = (xRank > 1) ? xTR->sizeAt(-1) : xTR->lengthOf(); + const int64_t N = (yRank > 1) ? yTR->sizeAt(-1) : 1; + const int64_t bS = (xRank > 2) ? xTR->sizeAt(0) : 1; // [bS, M,K] x [bS, K,N] = [bS, M,N] dnnl::memory::dims xShape = xRank < 3 ? dnnl::memory::dims({M, K}) : dnnl::memory::dims({bS, M, K}); dnnl::memory::dims yShape = xRank < 3 ? dnnl::memory::dims({K, N}) : dnnl::memory::dims({bS, K, N}); dnnl::memory::dims zShape = xRank < 3 ? dnnl::memory::dims({M, N}) : dnnl::memory::dims({bS, M, N}); - dnnl::memory::format_tag format = xRank < 3 ? 
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc; - // x type dnnl::memory::data_type xType; if(x->dataType() == DataType::FLOAT32) @@ -114,9 +126,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b // memory descriptors for arrays // x - dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, dnnl::memory::format_tag::any); - dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, format); - if(xTR->ews() != 1 || xTR->ordering() != 'c') { + dnnl::memory::desc x_mkl_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR)); + dnnl::memory::desc x_user_md = dnnl::memory::desc(xShape, xType, get_format_tag(*xTR)); + if(xTR->ews() != 1) { x_user_md.data.format_kind = dnnl_blocked; // overrides format x_user_md.data.format_desc.blocking.strides[0] = xRank == 1 ? 1 : xTR->strideAt(0); x_user_md.data.format_desc.blocking.strides[1] = xRank == 1 ? xTR->strideAt(0) : xTR->strideAt(1); @@ -125,9 +137,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b } // y - dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, dnnl::memory::format_tag::any); - dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, format); - if(yTR->ews() != 1 || yTR->ordering() != 'c') { + dnnl::memory::desc y_mkl_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR)); + dnnl::memory::desc y_user_md = dnnl::memory::desc(yShape, yType, get_format_tag(*yTR)); + if(yTR->ews() != 1) { y_user_md.data.format_kind = dnnl_blocked; // overrides format y_user_md.data.format_desc.blocking.strides[0] = yRank == 1 ? 1 : yTR->strideAt(0); y_user_md.data.format_desc.blocking.strides[1] = yRank == 1 ? 
yTR->strideAt(0) : yTR->strideAt(1); @@ -136,9 +148,9 @@ static void matmulMKLDNN(const NDArray* x, const NDArray* y, NDArray* z, const b } // z - dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, dnnl::memory::format_tag::any); - dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, format); - if(zR->ews() != 1 || zR->ordering() != 'c') { + dnnl::memory::desc z_mkl_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR)); + dnnl::memory::desc z_user_md = dnnl::memory::desc(zShape, zType, get_format_tag(*zR)); + if(zR->ews() != 1) { z_user_md.data.format_kind = dnnl_blocked; // overrides format z_user_md.data.format_desc.blocking.strides[0] = zRank == 1 ? 1 : zR->strideAt(0); z_user_md.data.format_desc.blocking.strides[1] = zRank == 1 ? zR->strideAt(0) : zR->strideAt(1); @@ -289,14 +301,20 @@ PLATFORM_CHECK(matmul, ENGINE_CPU) { auto z = OUTPUT_VARIABLE(0); - const DataType xType = x->dataType(); - const DataType yType = y->dataType(); - const DataType zType = z->dataType(); + const auto xType = x->dataType(); + const auto yType = y->dataType(); + const auto zType = z->dataType(); - float alpha = block.numT() > 0 ? T_ARG(0) : 1.0; - float beta = block.numT() > 1 ? T_ARG(1) : 0.0; + float alpha = block.numT() > 0 ? T_ARG(0) : 1.0f; + float beta = block.numT() > 1 ? 
T_ARG(1) : 0.0f; - return !(z->ordering() == 'f' && beta != 0.f) && block.isUseMKLDNN() && x->rankOf() < 3 && + // we're skipping if result order is F or arrays are not continuous + bool skip2D = z->rankOf() == 2 && (z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1); + + // we're skipping 3D cases if they are not C continuous + bool skip3D = z->rankOf() == 3 && (x->ordering() == 'f' || y->ordering() == 'f' || z->ordering() == 'f' || x->ews() != 1 || y->ews() != 1 || z->ews() != 1); + + return !skip2D && !skip3D && block.isUseMKLDNN() && x->rankOf() < 3 && ( (xType==DataType::FLOAT32 && yType==DataType::FLOAT32 && zType==DataType::FLOAT32) || (xType==DataType::HALF && yType==DataType::HALF && zType==DataType::FLOAT32) || diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp index a82bc2706..fab32f280 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/tanh.cpp @@ -109,7 +109,7 @@ namespace sd { const DataType zType = z->dataType(); const int xRank = x->rankOf(); - bool bSupportedRanks = !x->isEmpty() && xRank < 7 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32); + bool bSupportedRanks = !x->isEmpty() && xRank < 7 && xRank > 0 && (xType == DataType::FLOAT32 && zType == DataType::FLOAT32); /* Source Destination f32 f32 @@ -214,7 +214,7 @@ namespace sd { const int xRank = x->rankOf(); const int dLdzRank = dLdz->rankOf(); - bool bSupportedRanks = xRank < 7 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty()); + bool bSupportedRanks = xRank < 7 && xRank > 0 && dLdzRank == xRank && (!x->isEmpty() && !dLdz->isEmpty()); bSupportedRanks &= (xType == DataType::FLOAT32 && dLdzType == DataType::FLOAT32 && dLdxType == DataType::FLOAT32); if (bSupportedRanks) { diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp index bb3934994..c91c1c5c7 
100644 --- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp @@ -19,15 +19,17 @@ // @author raver119@gmail.com // +#ifdef HAVE_MKLDNN + #include "testlayers.h" #include #include - -#ifdef HAVE_MKLDNN - #include +#include +#include -#endif + +using namespace sd; class MklDnnTests : public testing::Test { public: @@ -44,7 +46,6 @@ static void printer(std::initializer_list h TEST_F(MklDnnTests, helpers_includer) { // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker -#ifdef HAVE_MKLDNN sd::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d; sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp; @@ -83,6 +84,26 @@ TEST_F(MklDnnTests, helpers_includer) { printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul, &softmax, &softmax_bp, &tanh, &tanh_bp, &xw_plus_b, &xw_plus_b_bp }); - -#endif } + +TEST_F(MklDnnTests, test_tanh_1) { + auto x = NDArrayFactory::create(1.0f); + auto z = NDArrayFactory::create(0.0f); + + sd::ops::tanh op; + auto status = op.execute({&x}, {&z}); + + ASSERT_EQ(Status::OK(), status); +} + +TEST_F(MklDnnTests, test_tanh_2) { + auto x = NDArrayFactory::create('c', {1}, {1.0f}); + auto z = NDArrayFactory::create('c', {1}, {0.0f}); + + sd::ops::tanh op; + auto status = op.execute({&x}, {&z}); + + ASSERT_EQ(Status::OK(), status); +} + +#endif \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp index f00536e58..c6155eb0c 100644 --- a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -60,6 +60,52 @@ public: #ifdef RELEASE_BUILD +TEST_F(PerformanceTests, test_matmul_c_f_1) { + int iterations = 500; + std::vector valuesC, valuesF; + for (int e = 0; e < 
iterations; e++) { + auto xc = NDArrayFactory::create('c', {512, 2048}); + auto yc = NDArrayFactory::create('c', {2048, 512}); + auto zc = NDArrayFactory::create('c', {512, 512}); + + auto xf = NDArrayFactory::create('f', {512, 2048}); + auto yf = NDArrayFactory::create('f', {2048, 512}); + auto zf = NDArrayFactory::create('f', {512, 512}); + + auto warm = xc.like(); + warm.linspace(1.0); + + //zc.linspace(1.0); + //zf.linspace(1.0); + + sd::ops::matmul op; + + auto timeStartF = std::chrono::system_clock::now(); + + op.execute({&xf, &yf}, {&zf}); + + auto timeEndF = std::chrono::system_clock::now(); + auto outerTimeF = std::chrono::duration_cast(timeEndF - timeStartF).count(); + + + auto timeStartC = std::chrono::system_clock::now(); + + op.execute({&xc, &yc}, {&zc}); + + auto timeEndC = std::chrono::system_clock::now(); + auto outerTimeC = std::chrono::duration_cast(timeEndC - timeStartC).count(); + + valuesF.emplace_back(outerTimeF); + valuesC.emplace_back(outerTimeC); + } + + std::sort(valuesC.begin(), valuesC.end()); + std::sort(valuesF.begin(), valuesF.end()); + + + nd4j_printf("Median time C: [%lld]; Median time F: [%lld];", valuesC[valuesC.size() / 2], valuesF[valuesF.size() / 2]); +} + TEST_F(PerformanceTests, test_maxpooling2d_1) { std::vector valuesX; // auto x = NDArrayFactory::create('c', {32, 3, 224, 224});