diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java
index db51cb499..dbebac7b9 100644
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java
@@ -21,7 +21,7 @@ package org.deeplearning4j.nn.modelimport.keras.exceptions;
* Indicates that user is attempting to import a Keras model configuration that
* is malformed or invalid in some other way.
*
- * See https://deeplearning4j.org/docs/latest/keras-import-overview for more information.
+ * See https://deeplearning4j.konduit.ai/keras-import/overview for more information.
*
* @author dave@skymind.io
*/
@@ -40,6 +40,6 @@ public class InvalidKerasConfigurationException extends Exception {
}
private static String appendDocumentationURL(String message) {
- return message + ". For more information, see http://deeplearning4j.org/docs/latest/keras-import-overview";
+ return message + ". For more information, see https://deeplearning4j.konduit.ai/keras-import/overview";
}
}
diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java
index 6244cf1e8..999682a8b 100644
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java
@@ -21,7 +21,7 @@ package org.deeplearning4j.nn.modelimport.keras.exceptions;
* Indicates that user is attempting to import a Keras model configuration that
* is not currently supported.
*
- * See https://deeplearning4j.org/docs/latest/keras-import-overview
+ * See https://deeplearning4j.konduit.ai/keras-import/overview
* for more information and file an issue at https://github.com/eclipse/deeplearning4j/issues.
*
* @author dave@skymind.io
diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java
index f5a29d4f5..03f7ada88 100644
--- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java
+++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java
@@ -103,7 +103,7 @@ public class KerasEmbedding extends KerasLayer {
"on Embedding layers. Zero Masking for the Embedding layer only works with unidirectional LSTM for now."
+ " If you want to have this behaviour for your imported model " +
"in DL4J, apply masking as a pre-processing step to your input." +
- "See http://deeplearning4j.org/docs/latest/deeplearning4j-nn-recurrent#masking for more on this.");
+ "See https://deeplearning4j.konduit.ai/models/recurrent#masking-one-to-many-many-to-one-and-sequence-classification for more on this.");
IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_EMBEDDING_INIT(),
enforceTrainingConfig, conf, kerasMajorVersion);
diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/WorkspaceMode.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/WorkspaceMode.java
index 8824ad2ca..2a754d07d 100644
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/WorkspaceMode.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/WorkspaceMode.java
@@ -17,10 +17,10 @@
package org.deeplearning4j.nn.conf;
/**
- * Workspace mode to use. See https://deeplearning4j.org/docs/latest/deeplearning4j-config-workspaces
+ * Workspace mode to use. See https://deeplearning4j.konduit.ai/config/config-memory/config-workspaces
*
* NONE: No workspaces will be used for the network. Highest memory use, least performance.
- * ENABLED: Use workspaces.
+ * ENABLED: Use workspaces. This is the default and should almost always be used.
* SINGLE: Deprecated. Now equivalent to ENABLED, which should be used instead.
* SEPARATE: Deprecated. Now equivalent to ENABLED, which should be used instead.
*
diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LSTM.java
index 684f0df71..ba0b52d8c 100644
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LSTM.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/LSTM.java
@@ -38,7 +38,7 @@ import java.util.Map;
/**
* LSTM recurrent neural network layer without peephole connections. Supports CuDNN acceleration - see https://deeplearning4j.org/docs/latest/deeplearning4j-config-cudnn for details
+ * href="https://deeplearning4j.konduit.ai/config/backends/config-cudnn">https://deeplearning4j.konduit.ai/config/backends/config-cudnn for details
*
* @author Alex Black
* @see GravesLSTM GravesLSTM class for an alternative LSTM (with peephole connections)
diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java
index c648efe7b..571afea7b 100755
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java
@@ -1540,8 +1540,8 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork {
* (not) clearing the layer input arrays.
* Note: this method should NOT be used with clearInputs = true, unless you know what you are doing. Specifically:
* when using clearInputs=false, in combination with workspaces, the layer input fields may leak outside of the
- * workspaces in which they were defined - potentially causing a crash. See
- * https://deeplearning4j.org/docs/latest/deeplearning4j-config-workspaces
+ * workspaces in which they were defined - potentially causing a crash. See
+ * https://deeplearning4j.konduit.ai/config/config-memory/config-workspaces
* for more details
*
* @param input An array of ComputationGraph inputs
diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java
index 81804a31f..d6a3bc58e 100644
--- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java
+++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/convolution/ConvolutionLayer.java
@@ -86,7 +86,7 @@ public class ConvolutionLayer extends BaseLayerhttps://deeplearning4j.org/docs/latest/deeplearning4j-nn-recurrent
- * READ THIS FIRST if you want to understand what the heck is happening here.
+ * RNN tutorial: https://deeplearning4j.konduit.ai/models/recurrent
+ * READ THIS FIRST if you want to understand this code.
*
* Shared code for the standard "forwards" LSTM RNN and the bidirectional LSTM RNN
* This was extracted from GravesLSTM and refactored into static helper functions. The general reasoning for this was
diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
index 8d303d391..00ca839f7 100644
--- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
+++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-scaleout-parallelwrapper/src/main/java/org/deeplearning4j/parallelism/ParallelWrapper.java
@@ -826,7 +826,7 @@ public class ParallelWrapper implements AutoCloseable {
/**
* This method allows you to specify training mode for this instance of PW.
* 1) AVERAGING - stands for parameters averaging. Each X epochs weights and updaters state will be averaged across all models
- * 2) SHARED_GRADIENTS - stands for gradients sharing - more details available here: https://deeplearning4j.org/docs/latest/deeplearning4j-scaleout-intro
+ * 2) SHARED_GRADIENTS - stands for gradients sharing - more details available here: https://deeplearning4j.konduit.ai/distributed-deep-learning/intro
* 3) CUSTOM - this method allows you to specify custom gradients accumulator, this giving you better control of configuration params for training.
*
* @param mode
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/main/java/org/deeplearning4j/spark/util/SparkUtils.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/main/java/org/deeplearning4j/spark/util/SparkUtils.java
index d28f0fa59..0bfad5a8a 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/main/java/org/deeplearning4j/spark/util/SparkUtils.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/main/java/org/deeplearning4j/spark/util/SparkUtils.java
@@ -71,7 +71,7 @@ public class SparkUtils {
+ "for ND4J INDArrays.\nWhen using Kryo, An appropriate Kryo registrator must be used to avoid"
+ " serialization issues (NullPointerException) with off-heap data in INDArrays.\n"
+ "Use nd4j-kryo_2.10 or _2.11 artifact, with sparkConf.set(\"spark.kryo.registrator\", \"org.nd4j.kryo.Nd4jRegistrator\");\n"
- + "See https://deeplearning4j.org/docs/latest/deeplearning4j-scaleout-howto#kryo for more details";
+ + "See https://deeplearning4j.konduit.ai/distributed-deep-learning/howto#how-to-use-kryo-serialization-with-dl-4-j-and-nd-4-j for more details";
private static String sparkExecutorId;
diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java
index 38a15ef8d..4903091c6 100644
--- a/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java
+++ b/deeplearning4j/deeplearning4j-scaleout/spark/dl4j-spark/src/test/java/org/deeplearning4j/spark/impl/multilayer/TestSparkDl4jMultiLayer.java
@@ -108,7 +108,7 @@ public class TestSparkDl4jMultiLayer extends BaseSparkTest {
.activation(Activation.SOFTMAX).nIn(100).nOut(10).build())
.build();
- //Configuration for Spark training: see https://deeplearning4j.org/docs/latest/deeplearning4j-scaleout-howto for explanation of these configuration options
+ //Configuration for Spark training: see https://deeplearning4j.konduit.ai/distributed-deep-learning/howto for explanation of these configuration options
TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(batchSizePerWorker)
.averagingFrequency(2)
diff --git a/deeplearning4j/dl4j-integration-tests/src/test/java/org/deeplearning4j/integration/testcases/dl4j/misc/CharacterIterator.java b/deeplearning4j/dl4j-integration-tests/src/test/java/org/deeplearning4j/integration/testcases/dl4j/misc/CharacterIterator.java
index 1020f1706..110bfb731 100644
--- a/deeplearning4j/dl4j-integration-tests/src/test/java/org/deeplearning4j/integration/testcases/dl4j/misc/CharacterIterator.java
+++ b/deeplearning4j/dl4j-integration-tests/src/test/java/org/deeplearning4j/integration/testcases/dl4j/misc/CharacterIterator.java
@@ -195,7 +195,7 @@ public class CharacterIterator implements DataSetIterator {
// dimension 0 = number of examples in minibatch
// dimension 1 = size of each vector (i.e., number of characters)
// dimension 2 = length of each time series/example
- //Why 'f' order here? See https://deeplearning4j.org/docs/latest/deeplearning4j-nn-recurrent data section "Alternative: Implementing a custom DataSetIterator"
+ //Why 'f' order here? See https://deeplearning4j.konduit.ai/models/recurrent data section "Alternative: Implementing a custom DataSetIterator"
INDArray input = Nd4j.create(new int[]{currMinibatchSize, validCharacters.length, exampleLength}, 'f');
INDArray labels = Nd4j.create(new int[]{currMinibatchSize, validCharacters.length, exampleLength}, 'f');
diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt
index a12b70194..8c8d5fb22 100755
--- a/libnd4j/blas/CMakeLists.txt
+++ b/libnd4j/blas/CMakeLists.txt
@@ -231,30 +231,34 @@ if(SD_CUDA)
file(GLOB_RECURSE CUSTOMOPS_CUDNN_SOURCES false ../include/ops/declarable/platform/cudnn/*.cu)
endif()
- add_library(nd4jobj OBJECT ${LOOPS_SOURCES_CUDA} ${LEGACY_SOURCES}
+ add_library(samediff_obj OBJECT ${LOOPS_SOURCES_CUDA} ${LEGACY_SOURCES}
${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES}
${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES})
- # Don't output dynamic linked lib when a static lib build is specified unless the tests are built
- if(NOT SD_STATIC_LIB OR SD_BUILD_TESTS)
- add_library(${SD_LIBRARY_NAME} SHARED $)
- endif()
-
-
if (WIN32)
message("MSVC runtime for library: ${MSVC_RT_LIB}")
endif()
- # static library is built only if we're going to build tests, skip otherwise
- if (SD_BUILD_TESTS OR SD_STATIC_LIB)
- add_library(${SD_LIBRARY_NAME}static STATIC $)
+ # build shared library by default or when it's explicitly requested
+ if(NOT SD_STATIC_LIB OR SD_SHARED_LIB)
+ add_library(${SD_LIBRARY_NAME} SHARED $)
+ endif()
+
+ if (SD_STATIC_LIB AND SD_SHARED_LIB)
+ # if both static and shared library are going to be built - static library will have special suffix
+ add_library(${SD_LIBRARY_NAME}static STATIC $)
set_property(TARGET ${SD_LIBRARY_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
install(TARGETS ${SD_LIBRARY_NAME}static DESTINATION .)
+ elseif(SD_STATIC_LIB)
+ # if we only build static library - use this name
+ add_library(${SD_LIBRARY_NAME} STATIC $)
+ set_property(TARGET ${SD_LIBRARY_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
+ install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .)
endif()
# on windows we want to make sure we use MT or MD, but since we use it in one lib, we must use it everywhere to avoid conflicts
- set_property(TARGET nd4jobj PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
+ set_property(TARGET samediff_obj PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
set_property(TARGET ${SD_LIBRARY_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
if(WIN32)
@@ -324,20 +328,28 @@ elseif(SD_CPU)
message("CPU BLAS")
add_definitions(-D__CPUBLAS__=true)
- add_library(nd4jobj OBJECT ${LEGACY_SOURCES}
+ add_library(samediff_obj OBJECT ${LEGACY_SOURCES}
${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES}
${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES}
${OPS_SOURCES} ${PERF_SOURCES})
if(IOS)
- add_library(${SD_LIBRARY_NAME} STATIC $)
+ add_library(${SD_LIBRARY_NAME} STATIC $)
else()
- # static library is built only if we're going to build tests, skip otherwise
- if (SD_BUILD_TESTS OR SD_STATIC_LIB)
- add_library(${SD_LIBRARY_NAME}static STATIC $)
+ # build shared library by default or when it's explicitly requested
+ if(NOT SD_STATIC_LIB OR SD_SHARED_LIB)
+ add_library(${SD_LIBRARY_NAME} SHARED $)
endif()
- if(SD_BUILD_TESTS OR NOT SD_STATIC_LIB)
- add_library(${SD_LIBRARY_NAME} SHARED $)
+ if (SD_STATIC_LIB AND SD_SHARED_LIB)
+ # if both static and shared library are going to be built - static library will have special suffix
+ add_library(${SD_LIBRARY_NAME}static STATIC $)
+ set_property(TARGET ${SD_LIBRARY_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
+ install(TARGETS ${SD_LIBRARY_NAME}static DESTINATION .)
+ elseif(SD_STATIC_LIB)
+ # if we only build static library - use this name
+ add_library(${SD_LIBRARY_NAME} STATIC $)
+ set_property(TARGET ${SD_LIBRARY_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>")
+ install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .)
endif()
endif()
@@ -350,7 +362,7 @@ elseif(SD_CPU)
if ("${SD_ALL_OPS}" AND "${SD_BUILD_MINIFIER}")
message(STATUS "Building minifier...")
add_executable(minifier ../minifier/minifier.cpp ../minifier/graphopt.cpp)
- target_link_libraries(minifier ${SD_LIBRARY_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES})
+ target_link_libraries(minifier samediff_obj ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES})
endif()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 4.9)
diff --git a/libnd4j/include/array/NDArray.h b/libnd4j/include/array/NDArray.h
index 7936f6688..ae4df227d 100644
--- a/libnd4j/include/array/NDArray.h
+++ b/libnd4j/include/array/NDArray.h
@@ -981,12 +981,12 @@ namespace sd {
* these methods suited for FlatBuffers use
*/
template
- std::vector getBufferAsVector();
+ std::vector getBufferAsVector() const;
std::vector getShapeAsVector() const;
std::vector getShapeAsVectorInt() const;
- std::vector getShapeInfoAsVector();
- std::vector getShapeInfoAsFlatVector();
- std::vector getShapeAsFlatVector();
+ std::vector getShapeInfoAsVector() const;
+ std::vector getShapeInfoAsFlatVector() const;
+ std::vector getShapeAsFlatVector() const;
/**
* set new order and shape in case of suitable array length (in-place operation)
diff --git a/libnd4j/include/array/NDArray.hXX b/libnd4j/include/array/NDArray.hXX
index 42f5f47f3..786333eec 100644
--- a/libnd4j/include/array/NDArray.hXX
+++ b/libnd4j/include/array/NDArray.hXX
@@ -982,16 +982,16 @@ std::string NDArray::asString(Nd4jLong limit) {
////////////////////////////////////////////////////////////////////////
template
-std::vector NDArray::getBufferAsVector() {
+std::vector NDArray::getBufferAsVector() const {
std::vector vector(lengthOf());
for (Nd4jLong e = 0; e < lengthOf(); e++)
vector[e] = this->e(e);
return vector;
}
-BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::getBufferAsVector(), LIBND4J_TYPES);
+BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT std::vector, NDArray::getBufferAsVector() const, LIBND4J_TYPES);
////////////////////////////////////////////////////////////////////////
-std::vector NDArray::getShapeAsFlatVector() {
+std::vector NDArray::getShapeAsFlatVector() const {
std::vector vector(this->rankOf());
for (int e = 0; e < this->rankOf(); e++)
vector[e] = static_cast(this->sizeAt(e));
@@ -1019,7 +1019,7 @@ std::vector NDArray::getShapeAsVectorInt() const {
}
////////////////////////////////////////////////////////////////////////
-std::vector NDArray::getShapeInfoAsFlatVector() {
+std::vector NDArray::getShapeInfoAsFlatVector() const {
int magicNumber = shape::shapeInfoLength(this->rankOf());
std::vector vector(magicNumber);
@@ -1030,7 +1030,7 @@ std::vector NDArray::getShapeInfoAsFlatVector() {
}
////////////////////////////////////////////////////////////////////////
-std::vector NDArray::getShapeInfoAsVector() {
+std::vector NDArray::getShapeInfoAsVector() const {
int magicNumber = shape::shapeInfoLength(this->rankOf());
std::vector vector(magicNumber);
for (int e = 0; e < magicNumber; e++)
diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp
index 958a90410..a7340bf21 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp
@@ -15,7 +15,8 @@
******************************************************************************/
//
-// @author raver119@gmail.com
+// @author raver119@gmail.com
+// @author Yurii Shyrma (iuriish@yahoo.com)
//
#include
@@ -27,24 +28,58 @@
namespace sd {
namespace ops {
+//////////////////////////////////////////////////////////////////////////
CONFIGURABLE_OP_IMPL(clipbyavgnorm, 1, 1, true, 1, 0) {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);
const bool isInplace = block.isInplace();
- auto ts = NDArrayFactory::create(T_ARG(0), block.launchContext());
+ auto clipNorm = NDArrayFactory::create(T_ARG(0), block.launchContext());
- helpers::clipByAveraged(block.launchContext(), *input, *output, *block.getIArguments(), ts, isInplace);
+ helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace, true);
return Status::OK();
}
- DECLARE_TYPES(clipbyavgnorm) {
- getOpDescriptor()
- ->setAllowedInputTypes(sd::DataType::ANY)
- ->setAllowedOutputTypes({ALL_FLOATS});
- }
+DECLARE_TYPES(clipbyavgnorm) {
+ getOpDescriptor()
+ ->setAllowedInputTypes(sd::DataType::ANY)
+ ->setAllowedOutputTypes({ALL_FLOATS});
+}
+
+//////////////////////////////////////////////////////////////////////////
+CUSTOM_OP_IMPL(clipbyavgnorm_bp, 2, 1, false, 1, 0) {
+
+ auto input = INPUT_VARIABLE(0);
+ auto gradO = INPUT_VARIABLE(1);
+
+ auto gradI = OUTPUT_VARIABLE(0);
+
+ const auto clipNorm = NDArrayFactory::create(gradI->dataType(), T_ARG(0), block.launchContext());
+
+ helpers::clipByNormBp(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm, true);
+
+ return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(clipbyavgnorm_bp) {
+
+ Nd4jLong *newShape = nullptr;
+ COPY_SHAPE(inputShape->at(1), newShape);
+
+ return SHAPELIST(CONSTANT(newShape));
+}
+
+
+DECLARE_TYPES(clipbyavgnorm_bp) {
+ getOpDescriptor()
+ ->setAllowedInputTypes(0, DataType::ANY)
+ ->setAllowedInputTypes(1, {ALL_FLOATS})
+ ->setAllowedOutputTypes(0, {ALL_FLOATS});
+}
+
}
}
diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp
index 43b23ba18..75145f7cc 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp
@@ -31,10 +31,10 @@ namespace ops {
auto input = INPUT_VARIABLE(0);
auto output = OUTPUT_VARIABLE(0);
- const auto clipNorm = NDArrayFactory::create(input->dataType(), T_ARG(0), block.launchContext());
+ const auto clipNorm = NDArrayFactory::create(output->dataType(), T_ARG(0), block.launchContext());
const bool isInplace = block.isInplace();
- helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace);
+ helpers::clipByNorm(block.launchContext(), *input, *output, *block.getIArguments(), clipNorm, isInplace, false);
return Status::OK();
}
@@ -45,15 +45,15 @@ namespace ops {
auto gradO = INPUT_VARIABLE(1);
auto gradI = OUTPUT_VARIABLE(0);
- const auto clipNorm = NDArrayFactory::create(T_ARG(0));
+ const auto clipNorm = NDArrayFactory::create(gradI->dataType(), T_ARG(0), block.launchContext());
- helpers::clipByNormBP(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm);
+ helpers::clipByNormBp(block.launchContext(), *input, *gradO, *gradI, *block.getIArguments(), clipNorm, false);
return Status::OK();
}
DECLARE_SHAPE_FN(clipbynorm_bp) {
- auto inShapeInfo = inputShape->at(0);
+ auto inShapeInfo = inputShape->at(1);
Nd4jLong *newShape = nullptr;
COPY_SHAPE(inShapeInfo, newShape);
diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp
index fb1fd2e87..1cf750e00 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp
@@ -23,8 +23,8 @@
#include
#include
-namespace sd {
-namespace ops {
+namespace sd {
+namespace ops {
//////////////////////////////////////////////////////////////////////////
@@ -85,6 +85,7 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
// ******** input validation ******** //
REQUIRE_TRUE(allOfSameType, 0, "CONCAT op: all of input arrays must have same type !");
+ REQUIRE_TRUE(nonEmptyArrs[0]->dataType() == OUTPUT_VARIABLE(0)->dataType(), 0, "CONCAT op: output array should have the same type as inputs arrays !");
REQUIRE_TRUE(0 <= axis && (axis < rank || (axis == 0 && rank == 0)), 0, "CONCAT op: input axis must be in range [0, %i], but got %i instead!", rank-1, axis);
for(int i = 1; i < numOfNonEmptyArrs; ++i)
diff --git a/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp b/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp
index 1ffe42f4b..3c76450aa 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp
@@ -33,7 +33,7 @@ CUSTOM_OP_IMPL(mergemaxindex, -1, 1, false, 0, 0) {
auto output = OUTPUT_VARIABLE(0);
std::vector inArrs(block.width());
-
+
for(int i = 0; i < block.width(); ++i)
inArrs[i] = INPUT_VARIABLE(i);
@@ -46,7 +46,8 @@ DECLARE_SYN(MergeMaxIndex, mergemaxindex);
DECLARE_TYPES(mergemaxindex) {
getOpDescriptor()
- ->setAllowedInputTypes({ALL_INTS, ALL_FLOATS});
+ ->setAllowedInputTypes({ALL_INTS, ALL_FLOATS})
+ ->setAllowedOutputTypes({ALL_INDICES});
}
}
DECLARE_SHAPE_FN(mergemaxindex) {
diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp
index 401b68d00..e8f659c5d 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp
@@ -52,7 +52,7 @@ namespace ops {
else {
// check the consistency of input dimensions to reverse along
shape::checkDimensions(input->rankOf(), axis);
- helpers::reverse(block.launchContext(), input, output, &axis, false);
+ helpers::reverse(block.launchContext(), input, output, &axis);
}
return Status::OK();
@@ -85,7 +85,7 @@ namespace ops {
// check the consistency of input dimensions to reverse along
shape::checkDimensions(input->rankOf(), axis);
// we just reverse back original array
- helpers::reverse(block.launchContext(), eps, output, &axis, false);
+ helpers::reverse(block.launchContext(), eps, output, &axis);
}
return Status::OK();
diff --git a/libnd4j/include/ops/declarable/headers/transforms.h b/libnd4j/include/ops/declarable/headers/transforms.h
index 29efc4a73..3fe2f1223 100644
--- a/libnd4j/include/ops/declarable/headers/transforms.h
+++ b/libnd4j/include/ops/declarable/headers/transforms.h
@@ -36,6 +36,7 @@ namespace sd {
#if NOT_EXCLUDED(OP_clipbyavgnorm)
DECLARE_CONFIGURABLE_OP(clipbyavgnorm, 1, 1, true, 1, 0);
+ DECLARE_CUSTOM_OP(clipbyavgnorm_bp, 2, 1, false, 1, 0);
#endif
#if NOT_EXCLUDED(OP_cumsum)
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/clip.cpp b/libnd4j/include/ops/declarable/helpers/cpu/clip.cpp
index d4240d780..2c2d9a111 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/clip.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/clip.cpp
@@ -15,83 +15,134 @@
******************************************************************************/
//
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author sgazeos@gmail.com
+// @author raver119@gmail.com
//
#include
-#include
+#include
namespace sd {
namespace ops {
namespace helpers {
//////////////////////////////////////////////////////////////////////////
-template
-static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) {
+void clipByNorm(sd::LaunchContext* context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace, const bool useAverage) {
- const int rank = input.rankOf();
- const auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
+ NDArray* z = nullptr;
- const T normActual = norm2.e(0);
- const T normClip = clipNorm.e(0);
+ if(isInplace) {
+ z = &input;
+ }
+ else {
+ output.assign(input);
+ z = &output;
+ }
- if (isInplace) {
+ if(dimensions.empty()) {
- if(norm2.lengthOf() == 1) {
+ const NDArray actualNorm = useAverage ? z->reduceAlongDimension(reduce::Norm2, {}) / z->lengthOf() : z->reduceAlongDimension(reduce::Norm2, {});
- if(normActual > normClip)
- input *= (normClip / normActual);
- }
- else {
-
- auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
-
- auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i++) {
- const T iNormActual = norm2.e(i);
- if (iNormActual > normClip)
- *listOfInSubArrs.at(i) *= normClip / iNormActual;
- }
- };
- samediff::Threads::parallel_tad(func, 0, listOfInSubArrs.size());
- }
+ if(actualNorm.e(0) > clipNorm.e(0))
+ *z *= clipNorm / actualNorm;
}
else {
- if(norm2.lengthOf() == 1) {
+ auto listOfSubArrs = z->allTensorsAlongDimension(dimensions);
- if(normActual > normClip)
- output.assign(input * (normClip / normActual));
- else
- output.assign(input);
- }
- else {
-
- auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);
- auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);
-
- auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i++) {
- auto inputSubArr = listOfInSubArrs.at(i);
- auto outputSubArr = listOfOutSubArrs.at(i);
- outputSubArr->assign(inputSubArr);
-
- const T iNormActual = norm2.e(i);
-
- if (iNormActual > clipNorm.e(0))
- *outputSubArr *= clipNorm / iNormActual;
- }
- };
- samediff::Threads::parallel_tad(func, 0, listOfInSubArrs.size());
- }
+ auto func = PRAGMA_THREADS_FOR {
+ for (auto i = start; i < stop; i++) {
+ const NDArray actualNorm = useAverage ? listOfSubArrs.at(i)->reduceAlongDimension(reduce::Norm2, {}) / listOfSubArrs.at(i)->lengthOf() : listOfSubArrs.at(i)->reduceAlongDimension(reduce::Norm2, {});
+ if(actualNorm.e(0) > clipNorm.e(0))
+ *listOfSubArrs.at(i) *= clipNorm / actualNorm;
+ }
+ };
+ samediff::Threads::parallel_tad(func, 0, listOfSubArrs.size());
}
}
+
//////////////////////////////////////////////////////////////////////////
-void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) {
- BUILD_SINGLE_SELECTOR(output.dataType(), clipByNorm_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
+template
+static void clipByNormBp_(const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dimensions, const NDArray& clipNorm, const bool useAverage) {
+
+ const int rank = input.rankOf();
+
+ auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
+ auto sums = input.reduceAlongDimension(reduce::Sum, dimensions);
+
+ if(norm2.lengthOf() == 1) {
+
+ const T norm = useAverage ? norm2.e(0) / input.lengthOf() : norm2.e(0);
+
+ auto clipVal = clipNorm.e(0);
+
+ if(norm > clipVal) {
+
+ const T sum = sums.e(0); // reduce to scalar
+ const T factor1 = clipVal / norm;
+            const T factor2 = static_cast(1.f) / (norm * norm); // 1 / (norm*norm); combined with factor1 this gives clipVal / norm^3
+
+ auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
+ return factor1 * y * (static_cast(1.f) - factor2 * x * sum);
+ };
+
+ const_cast(input).applyPairwiseLambda(const_cast(gradO), lambda, gradI);
+ }
+ else
+ gradI.assign(gradO);
+ }
+ else {
+
+ auto gradISubArrs = gradI.allTensorsAlongDimension({dimensions});
+ auto gradOSubArrs = gradO.allTensorsAlongDimension({dimensions});
+ auto inputSubArrs = input.allTensorsAlongDimension({dimensions});
+
+ auto clipVal = clipNorm.e(0);
+
+ auto func = PRAGMA_THREADS_FOR {
+
+ for (auto i = start; i < stop; i++) {
+
+ auto gradOSubArr = gradOSubArrs.at(i);
+ auto gradISubArr = gradISubArrs.at(i);
+
+ const T norm = useAverage ? norm2.e(i) / gradISubArr->lengthOf() : norm2.e(i);
+
+ if (norm > clipVal) {
+
+ auto inputSubArr = inputSubArrs.at(i);
+
+ const T sum = sums.e(i); // reduce to scalar
+ const T factor1 = clipVal / norm;
+                    const T factor2 = static_cast(1.f) / (norm * norm); // 1 / (norm*norm); combined with factor1 this gives clipVal / norm^3
+
+ auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
+ return factor1 * y * (static_cast(1.f) - factor2 * x * sum);
+ };
+
+ inputSubArr->applyPairwiseLambda(*gradOSubArr, lambda, *gradISubArr);
+ }
+ else
+ gradISubArr->assign(gradOSubArr);
+ }
+ };
+ samediff::Threads::parallel_tad(func, 0, gradISubArrs.size());
+ }
}
+BUILD_SINGLE_TEMPLATE(template void clipByNormBp_, (const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dimensions, const NDArray& clipNorm, const bool useAverage), FLOAT_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void clipByNormBp(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dimensions, const NDArray& clipNorm, const bool useAverage) {
+
+ const NDArray& castedInput = gradI.dataType() == input.dataType() ? input : input.cast(gradI.dataType());
+
+ BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBp_, (castedInput, gradO, gradI, dimensions, clipNorm, useAverage), FLOAT_TYPES);
+}
+
+
template
@@ -132,125 +183,6 @@ void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, co
BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES);
-//////////////////////////////////////////////////////////////////////////
-template
-static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) {
-
- const int rank = input.rankOf();
-
- auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions);
-
- if(norm2.lengthOf() == 1) {
-
- const T N = norm2.e(0);
-
- auto cn = clipNorm.e(0);
-
- if(N > cn) {
-
- const T sumOfProd = (input * gradO).reduceNumber(reduce::Sum).e(0); // reduce to scalar
- const T factor1 = static_cast(1.f) / N;
- const T factor3 = factor1 / (N * N); // 1 / (N*N*N)
-
- auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) {
- return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd);
- };
-
- (const_cast(input)).applyPairwiseLambda(const_cast(gradO), lambda, gradI);
- }
- else
- gradI.assign(gradO);
- }
- else {
-
- auto gradISubArrs = gradI.allTensorsAlongDimension({dimensions});
- auto gradOSubArrs = gradO.allTensorsAlongDimension({dimensions});
- auto inputSubArrs = input.allTensorsAlongDimension({dimensions});
-
- auto cn = clipNorm.e(0);
-
- auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i++) {
- T N = norm2.e(i);
-
- auto gradOSubArr = gradOSubArrs.at(i);
- auto gradISubArr = gradISubArrs.at(i);
-
- if (N > cn) {
- auto inputSubArr = inputSubArrs.at(i);
- const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar
- const T factor1 = static_cast(1.f) / N;
- const T factor3 = factor1 / (N * N); // 1 / (N*N*N)
-
- auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) {
- return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd);
- };
-
- inputSubArr->applyPairwiseLambda(*gradOSubArr, lambda, *gradISubArr);
- } else
- gradISubArr->assign(gradOSubArr);
- }
- };
- samediff::Threads::parallel_tad(func, 0, gradISubArrs.size());
- }
-}
-
- void clipByNormBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) {
- BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBP_, (input, gradO, gradI, dimensions, clipNorm), FLOAT_TYPES);
- }
-
- BUILD_SINGLE_TEMPLATE(template void clipByNormBP_, (const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm), FLOAT_TYPES);
-
-
-//////////////////////////////////////////////////////////////////////////
-template
-static void clipByAveraged_(NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) {
-
- auto cn = clipNorm.e(0);
- if (dimensions.size() == 0) {
- // all-reduce
- T n2 = input.reduceNumber(reduce::Norm2).e(0) / input.lengthOf();
- if (n2 <= cn) {
- if (!isInplace)
- output.assign(input);
- }
- else {
- const T factor = cn / n2;
- auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
- input.applyLambda(lambda, output);
- }
- }
- else {
- // along dimension
- auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions, false);
- if (!isInplace)
- output.assign(input);
- auto tads = output.allTensorsAlongDimension(dimensions);
- // TODO: make this CUDA-compliant somehow
- for (int e = 0; e < tads.size(); e++) {
- T n2 = norm2.e(e) / tads.at(e)->lengthOf();
- const T factor = cn / n2;
- if (n2 > cn) {
- auto lambda = LAMBDA_T(_x, factor) {return _x * factor;};
- tads.at(e)->applyLambda(lambda, output);
- }
- }
- }
-}
-
- void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) {
- BUILD_SINGLE_SELECTOR(input.dataType(), clipByAveraged_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES);
- }
-
- BUILD_SINGLE_TEMPLATE(template void clipByAveraged_, (NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES);
-
-/*
- if (d1 > params[1])
- return params[1];
- else if (d1 < params[0])
- return params[0];
- else return d1;
-*/
template
static void clipByValue_(NDArray& input, double leftBound, double rightBound, NDArray& output) {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/merge.cpp b/libnd4j/include/ops/declarable/helpers/cpu/merge.cpp
index 7874d6d67..d748aa6b0 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/merge.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/merge.cpp
@@ -29,7 +29,7 @@ namespace helpers {
//////////////////////////////////////////////////////////////////////////
-template
+template
static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) {
const Nd4jLong numArgs = inArrs.size();
@@ -37,17 +37,18 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& o
auto func = PRAGMA_THREADS_FOR {
for (auto e = start; e < stop; e++) {
- T max = -DataTypeUtils::max();
- Nd4jLong idx = 0;
+ X max = -DataTypeUtils::max();
+ Z idx = static_cast(0);
for (Nd4jLong i = 0; i < numArgs; i++) {
- T v = inArrs[i]->e(e);
+ X v = inArrs[i]->t(e);
if (v > max) {
max = v;
- idx = i;
+ idx = static_cast(i);
}
}
- output.p(e, idx);
+ // FIXME, use .r(e)
+ output.t(e) = static_cast(idx);
}
};
@@ -55,14 +56,14 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& o
}
void mergeMaxIndex(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) {
- BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES);
+ BUILD_DOUBLE_SELECTOR(inArrs[0]->dataType(), output.dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES, INDEXING_TYPES);
}
//////////////////////////////////////////////////////////////////////////
template
static void mergeMax_(const std::vector& inArrs, NDArray& output) {
-
+
const Nd4jLong numArgs = inArrs.size();
auto x = inArrs[0];
@@ -89,15 +90,15 @@ void mergeMax(sd::LaunchContext * context, const std::vector& in
//////////////////////////////////////////////////////////////////////////
template
static void mergeMaxBp_(const std::vector& inArrs, std::vector& outArrs) {
-
+
// outArrs.size() == inArrs.size() - 1
const Nd4jLong numArgs = outArrs.size();
// last array is gradient
const auto gradient = inArrs[numArgs]->bufferAsT();
auto length = inArrs[numArgs]->lengthOf();
-
+
bool bSameOrderAndEws1 = (1 == inArrs[numArgs]->ews());
-
+
if (bSameOrderAndEws1) {
auto gradOrdering = inArrs[numArgs]->ordering();
@@ -108,8 +109,8 @@ static void mergeMaxBp_(const std::vector& inArrs, std::vectorews());
}
}
-
-
+
+
if(bSameOrderAndEws1){
auto func = PRAGMA_THREADS_FOR{
for (auto e = start; e < stop; e++) {
@@ -130,7 +131,7 @@ static void mergeMaxBp_(const std::vector& inArrs, std::vectorshapeInfo();
std::vector vbSameShaepeAndStrides(numArgs);
for (int i = 0; i < numArgs; ++i) {
@@ -145,12 +146,12 @@ static void mergeMaxBp_(const std::vector& inArrs, std::vector();
Nd4jLong nMaxIndex = 0;
for (Nd4jLong i = 0; i < numArgs; i++) {
-
+
const auto xOffset = vbSameShaepeAndStrides[i] ? gradOffset : shape::getOffset(inArrs[i]->shapeInfo(), coords);
const T* v = inArrs[i]->bufferAsT();
if (v[xOffset] > max) {
@@ -160,7 +161,7 @@ static void mergeMaxBp_(const std::vector& inArrs, std::vectorshapeInfo(), coords);
-
+
T* z = outArrs[nMaxIndex]->bufferAsT();
z[zOffset] = gradient[gradOffset];
}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp
index 95417dade..bc072682a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp
@@ -193,13 +193,10 @@ static void reverseSequence_(sd::LaunchContext * context, const NDArray* input,
}
//////////////////////////////////////////////////////////////////////////
-void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) {
+void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs) {
- // we need to reverse axis only if that's new op
- std::vector dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs;
-
- auto listOut = output->allTensorsAlongDimension(dimensions);
- auto listIn = input->allTensorsAlongDimension(dimensions);
+ auto listOut = output->allTensorsAlongDimension(*intArgs);
+ auto listIn = input->allTensorsAlongDimension(*intArgs);
NDArray *subArrIn, *subArrOut;
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/clip.cu b/libnd4j/include/ops/declarable/helpers/cuda/clip.cu
new file mode 100644
index 000000000..8f1be21e4
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/clip.cu
@@ -0,0 +1,334 @@
+/*******************************************************************************
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author sgazeos@gmail.com
+// @author raver119@gmail.com
+//
+
+
+#include
+#include
+#include
+#include
+
+namespace sd {
+namespace ops {
+namespace helpers {
+
+//////////////////////////////////////////////////////////////////////////
+template
+__global__ static void clipByNormCuda(const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int* dimensions, const int dimsLen, const bool useAverage) {
+
+ const T clipNorm = *reinterpret_cast(vClipNorm);
+ const T* norm = reinterpret_cast(vNorm);
+ T* z = reinterpret_cast(vz);
+
+ __shared__ Nd4jLong zLen, tadLen, totalThreads;
+
+ if (threadIdx.x == 0) {
+
+ zLen = shape::length(zShapeInfo);
+ tadLen = zLen / shape::length(normShapeInfo);
+ totalThreads = gridDim.x * blockDim.x;
+ }
+
+ __syncthreads();
+
+ int zCoords[MAX_RANK], normCoords[MAX_RANK];
+
+ const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
+
+ shape::index2coords(i, zShapeInfo, zCoords);
+
+ // deduce norm coords
+ for (int j = 0; j < dimsLen; ++j)
+ normCoords[j] = zCoords[dimensions[j]];
+
+ const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];
+
+ if(actualNorm > clipNorm)
+ z[shape::getOffset(zShapeInfo, zCoords)] *= clipNorm / actualNorm;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+__host__ static void clipByNormCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream,
+ const void* vClipNorm, const void* vNorm, const Nd4jLong* normShapeInfo, void* vz, const Nd4jLong* zShapeInfo,
+ const int* dimensions, const int dimsLen, const bool useAverage) {
+
+ clipByNormCuda<<>>(vClipNorm, vNorm, normShapeInfo, vz, zShapeInfo, dimensions, dimsLen, useAverage);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void clipByNorm(sd::LaunchContext* context, NDArray& input, NDArray& output, const std::vector& dims, const NDArray& clipNorm, const bool isInplace, const bool useAverage) {
+
+ NDArray* z = nullptr;
+
+ if(isInplace) {
+ z = &input;
+ }
+ else {
+ output.assign(input);
+ z = &output;
+ }
+
+ if(dims.empty()) {
+
+ const NDArray actualNorm = useAverage ? z->reduceAlongDimension(reduce::Norm2, {}) / z->lengthOf() : z->reduceAlongDimension(reduce::Norm2, {});
+
+ if(actualNorm.e(0) > clipNorm.e(0))
+ *z *= clipNorm / actualNorm;
+ }
+ else {
+
+ const NDArray actualNorms = z->reduceAlongDimension(reduce::Norm2, dims);
+
+ std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(z->rankOf(), dims);
+
+ const int threadsPerBlock = MAX_NUM_THREADS / 2;
+ const int blocksPerGrid = (z->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+
+ PointersManager manager(context, "clipByNorm");
+
+ const int* dimensions = reinterpret_cast(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));
+
+ NDArray::prepareSpecialUse({z}, {z, &actualNorms, &clipNorm});
+ BUILD_SINGLE_SELECTOR(z->dataType(), clipByNormCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), clipNorm.specialBuffer(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage), FLOAT_TYPES);
+ NDArray::registerSpecialUse({z}, {z, &actualNorms, &clipNorm});
+
+ manager.synchronize();
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+__global__ static void clipByNormBpCuda(const void* vClipNorm,
+ const void* vx, const Nd4jLong* xShapeInfo, // input
+ const void* vy, const Nd4jLong* yShapeInfo, // gradO
+ const void* vNorm, const Nd4jLong* normShapeInfo,
+ const void* vSum, const Nd4jLong* sumShapeInfo,
+ void* vz, const Nd4jLong* zShapeInfo, // gradI
+ const int* dimensions, const int dimsLen, const bool useAverage) {
+
+ const T clipNorm = *reinterpret_cast(vClipNorm);
+ const T* norm = reinterpret_cast(vNorm);
+ const T* sum = reinterpret_cast(vSum);
+ const T* x = reinterpret_cast(vx);
+ const T* y = reinterpret_cast(vy);
+ T* z = reinterpret_cast(vz);
+
+ __shared__ Nd4jLong zLen, tadLen, totalThreads;
+ __shared__ bool sameOffsets;
+
+ if (threadIdx.x == 0) {
+
+ zLen = shape::length(zShapeInfo);
+ tadLen = zLen / shape::length(normShapeInfo);
+ totalThreads = gridDim.x * blockDim.x;
+
+ sameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo, zShapeInfo);
+ }
+
+ __syncthreads();
+
+ int zCoords[MAX_RANK], normCoords[MAX_RANK];
+
+ const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ for (Nd4jLong i = tid; i < zLen; i += totalThreads) {
+
+ shape::index2coords(i, zShapeInfo, zCoords);
+
+ const auto zOffset = shape::getOffset(zShapeInfo, zCoords);
+ const auto yOffset = sameOffsets ? zOffset : shape::getOffset(yShapeInfo, zCoords);
+
+ // deduce norm coords
+ for (int j = 0; j < dimsLen; ++j)
+ normCoords[j] = zCoords[dimensions[j]];
+
+ const T actualNorm = useAverage ? norm[shape::getOffset(normShapeInfo, normCoords)] / tadLen : norm[shape::getOffset(normShapeInfo, normCoords)];
+
+ if(actualNorm > clipNorm) {
+
+ const T sumVal = sum[shape::getOffset(sumShapeInfo, normCoords)];
+ const auto xOffset = sameOffsets ? zOffset : shape::getOffset(xShapeInfo, zCoords);
+
+ z[zOffset] = (clipNorm / actualNorm) * y[yOffset] * (static_cast(1.f) - (x[xOffset] * sumVal) / (actualNorm * actualNorm));
+ }
+ else
+ z[zOffset] = y[yOffset];
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template
+void clipByNormBp_(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dims, const NDArray& clipNorm, const bool useAverage) {
+
+ const int rank = input.rankOf();
+
+ auto actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);
+
+ if(actualNorms.lengthOf() == 1) {
+
+ const T norm = useAverage ? actualNorms.e(0) / static_cast(input.lengthOf()) : actualNorms.e(0);
+
+ auto clipVal = clipNorm.e(0);
+
+ if(norm > clipVal) {
+
+ const T sum = input.reduceNumber(reduce::Sum).e(0); // reduce to scalar
+ const T factor1 = clipVal / norm;
+            const T factor2 = static_cast(1.f) / (norm * norm); // 1 / (norm*norm); combined with factor1 this gives clipVal / norm^3
+
+ auto lambda = LAMBDA_TT(x, y, sum, factor1, factor2) {
+ return factor1 * y * (static_cast(1.f) - factor2 * x * sum);
+ };
+
+ const_cast(input).applyPairwiseLambda(const_cast(gradO), lambda, gradI);
+ }
+ else
+ gradI.assign(gradO);
+ }
+ else {
+
+ const NDArray actualNorms = input.reduceAlongDimension(reduce::Norm2, dims);
+ const NDArray sums = input.reduceAlongDimension(reduce::Sum, dims);
+
+ std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(gradI.rankOf(), dims);
+
+ const int threadsPerBlock = MAX_NUM_THREADS / 2;
+ const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+
+ PointersManager manager(context, "clipByNormBp");
+
+ const int* dimensions = reinterpret_cast(manager.replicatePointer(dimsToExclude.data(), dimsToExclude.size() * sizeof(int)));
+
+ NDArray::prepareSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});
+ clipByNormBpCuda<<getCudaStream()>>>(clipNorm.specialBuffer(), input.specialBuffer(), input.specialShapeInfo(), gradO.specialBuffer(), gradO.specialShapeInfo(), actualNorms.specialBuffer(), actualNorms.specialShapeInfo(), sums.specialBuffer(), sums.specialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), dimensions, (int)dimsToExclude.size(), useAverage);
+ NDArray::registerSpecialUse({&gradI}, {&actualNorms, &sums, &clipNorm, &input, &gradO});
+
+ manager.synchronize();
+ }
+}
+BUILD_SINGLE_TEMPLATE(template void clipByNormBp_, (sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dimensions, const NDArray& clipNorm, const bool useAverage), FLOAT_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void clipByNormBp(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const std::vector& dimensions, const NDArray& clipNorm, const bool useAverage) {
+
+ const NDArray& castedInput = gradI.dataType() == input.dataType() ? input : input.cast(gradI.dataType());
+ BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBp_, (context, castedInput, gradO, gradI, dimensions, clipNorm, useAverage), FLOAT_TYPES);
+}
+
+
+
+
+
+
+ template
+ void clipByGlobalNorm_(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) {
+ NDArray globalNorm = NDArrayFactory::create(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list]))
+
+ for (auto i = 0; i < inputs.size(); i++) {
+ auto input = inputs[i];
+ auto l2norm = input->reduceNumber(reduce::Norm2);
+ globalNorm += l2norm * l2norm;
+ }
+
+ globalNorm.applyTransform(transform::Sqrt, globalNorm); // = sd::math::nd4j_sqrt(globalNorm);
+ outputs[inputs.size()]->p(0, globalNorm);
+ globalNorm.syncToHost();
+ const T factor = static_cast(clipNorm) / globalNorm.e(0);
+
+ for (size_t e = 0; e < inputs.size(); e++) {
+ // all-reduce
+ auto input = inputs[e];
+ auto output = outputs[e];
+
+ if (globalNorm.e(0) <= clipNorm) {
+ output->assign(input);
+ }
+ else {
+
+ auto lambda = LAMBDA_T(_x, factor) { return _x * factor; };
+ input->applyLambda(lambda, *output);
+ }
+ }
+ }
+
+ void clipByGlobalNorm(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) {
+ BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (context, inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES);
+ }
+
+ BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES);
+
+
+ template
+ static void __global__ clipByValueKernel(void* input, const Nd4jLong* inputShape, void* output, const Nd4jLong* outputShape, double leftBound, double rightBound) {
+ __shared__ T* outputBuf;
+ __shared__ T* inputBuf;
+ __shared__ Nd4jLong length;
+ __shared__ bool linearBuffers;
+ if (threadIdx.x == 0) {
+ outputBuf = reinterpret_cast(output);
+ inputBuf = reinterpret_cast(input);
+ length = shape::length(inputShape);
+ linearBuffers = shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape) && shape::elementWiseStride(inputShape) == 1;
+ }
+ __syncthreads();
+ const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+ const auto step = gridDim.x * blockDim.x;
+
+ for (Nd4jLong e = tid; e < length; e += step) {
+ if (linearBuffers) {
+ if (inputBuf[e] > rightBound) outputBuf[e] = (T) rightBound;
+ else if (inputBuf[e] < leftBound) outputBuf[e] = (T) leftBound;
+ else outputBuf[e] = inputBuf[e];
+ }
+ else {
+ auto inputOffset = shape::getIndexOffset(e, inputShape);
+ auto outputOffset = shape::getIndexOffset(e, outputShape);
+ if (inputBuf[inputOffset] > rightBound) outputBuf[outputOffset] = (T) rightBound;
+ else if (inputBuf[inputOffset] < leftBound) outputBuf[outputOffset] = (T) leftBound;
+ else outputBuf[outputOffset] = inputBuf[outputOffset];
+ }
+ }
+ }
+
+ template
+ static void clipByValue_(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
+ auto stream = context->getCudaStream();
+ if (!input.isActualOnDeviceSide())
+ input.syncToDevice();
+ NDArray::prepareSpecialUse({&output}, {&input});
+ clipByValueKernel<<<256, 512, 8192, *stream>>>(input.specialBuffer(), input.specialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftBound, rightBound);
+ NDArray::registerSpecialUse({&output}, {&input});
+ }
+
+ void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) {
+ BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (context, input, leftBound, rightBound, output), FLOAT_TYPES);
+ }
+
+ BUILD_SINGLE_TEMPLATE(template void clipByValue_, (sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output), FLOAT_TYPES);
+
+}
+}
+}
+
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu
index b6bbeea4c..2ed45356e 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu
@@ -210,14 +210,10 @@ namespace helpers {
}
//////////////////////////////////////////////////////////////////////////
- void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) {
- // we need to reverse axis only if that's new op
- std::vector dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs;
- std::vector axis = ShapeUtils::evalDimsToExclude(input->rankOf(), dimensions);
- auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->shapeInfo(), dimensions);
- auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), dimensions);
-
+ void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs) {
+ auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->shapeInfo(), *intArgs);
+ auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), *intArgs);
NDArray::prepareSpecialUse({output}, {input});
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
index f016491a6..f14b12e35 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@@ -300,269 +300,6 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
manager.synchronize();
}
-//////////////////////////////////////////////////////////////////////////
-// x - input, y - gradO, z - gradI
-template
-__global__ static void clipByNormBPWholeArrCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, void* vreducBuff, const Z clipNormVal) {
-
- const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-
- if(tid >= shape::length(zShapeInfo))
- return;
-
- const auto x = reinterpret_cast(vx);
- const auto y = reinterpret_cast(vy);
- auto z = reinterpret_cast(vz);
-
- auto reducBuff = reinterpret_cast(vreducBuff);
- uint* count = reinterpret_cast(vreducBuff) + 16384;
-
- __shared__ Z* shMem;
- __shared__ Nd4jLong len;
- __shared__ bool amIinLastBlock;
-
- if (threadIdx.x == 0) {
- extern __shared__ unsigned char shmem[];
- shMem = reinterpret_cast(shmem);
-
- len = shape::length(zShapeInfo); // xLen = yLen = zLen
- }
- __syncthreads();
-
- // fill shared memory with array elements
- const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo)];
- const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo)];
-
- shMem[2*threadIdx.x] = static_cast(xVal * xVal); // for norm
- shMem[2*threadIdx.x + 1] = static_cast(xVal * yVal); // for input * gradO
-
- __syncthreads();
-
- // accumulate sum per block
- for (int activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
-
- if (threadIdx.x < activeThreads && tid + activeThreads < len) {
-
- shMem[2*threadIdx.x] += shMem[2*(threadIdx.x + activeThreads)];
- shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
- }
- __syncthreads();
- }
-
- // store accumulated sums in reduction buffer (reducBuff)
- if (threadIdx.x == 0) {
-
- reducBuff[2*blockIdx.x] = shMem[0];
- reducBuff[2*blockIdx.x + 1] = shMem[1];
-
- __threadfence();
-
- amIinLastBlock = gridDim.x == 1 || (atomicInc(count, gridDim.x) == gridDim.x - 1);
- }
- __syncthreads();
-
- // shared memory of last block is used for final summation of values stored in reduction buffer
- if (amIinLastBlock) {
-
- for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x) {
-
- shMem[2*threadIdx.x] = (i == threadIdx.x ) ? reducBuff[2*i] : reducBuff[2*i] + shMem[2*threadIdx.x];
- shMem[2*threadIdx.x + 1] = (i == threadIdx.x ) ? reducBuff[2*i + 1] : reducBuff[2*i + 1] + shMem[2*threadIdx.x + 1];
- }
- __syncthreads();
-
- // accumulate sum
- for (int activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
-
- if (threadIdx.x < activeThreads && threadIdx.x + activeThreads < gridDim.x) {
- shMem[2*threadIdx.x] += shMem[2*(threadIdx.x + activeThreads)];
- shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
- }
- __syncthreads();
- }
-
- if (threadIdx.x == 0) {
-
- reducBuff[0] = math::nd4j_sqrt(shMem[0]);
- reducBuff[1] = shMem[1];
- count = 0;
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// x - input, y - gradO, z - gradI
-template
-__global__ static void clipByNormBPCalcGradCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, void* vreducBuff, const Z clipNormVal) {
-
- const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-
- const Nd4jLong len = shape::length(zShapeInfo); // xLen = yLen = zLen
-
- if(tid >= len)
- return;
-
- const auto x = reinterpret_cast(vx);
- const auto y = reinterpret_cast(vy);
- auto z = reinterpret_cast(vz);
-
- __shared__ Z norm, sumOfProd;
-
- if (threadIdx.x == 0) {
-
- norm = reinterpret_cast(vreducBuff)[0];
- sumOfProd = reinterpret_cast(vreducBuff)[1];
- }
- __syncthreads();
-
- const auto yOffset = shape::getIndexOffset(tid, yShapeInfo);
- const auto zOffset = shape::getIndexOffset(tid, zShapeInfo);
-
- if(norm > clipNormVal) {
-
- const auto xOffset = shape::getIndexOffset(tid, xShapeInfo);
-
- const Z factor1 = static_cast(1) / norm; // 1 / norm
- const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm)
-
- z[zOffset] = clipNormVal * (factor1 * y[yOffset] - factor2 * sumOfProd * x[xOffset]);
- }
- else {
- z[zOffset] = y[yOffset];
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// x - input, y - gradO, z - gradI
-template
-__global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTadShapeInfo, const Nd4jLong* xTadOffsets, const void* vy, const Nd4jLong* yTadShapeInfo, const Nd4jLong* yTadOffsets, void* vz, const Nd4jLong* zTadShapeInfo, const Nd4jLong* zTadOffsets, const Z clipNormVal) {
-
- const auto x = reinterpret_cast(vx);
- const auto y = reinterpret_cast(vy);
- auto z = reinterpret_cast(vz);
-
- __shared__ Z* shMem;
- __shared__ Nd4jLong tadLen;
-
- if (threadIdx.x == 0) {
-
- extern __shared__ unsigned char shmem[];
- shMem = reinterpret_cast(shmem);
- tadLen = shape::length(zTadShapeInfo); // xTadLen = yTadLen = zTadLen
- }
- __syncthreads();
-
- const auto* xTad = x + xTadOffsets[blockIdx.x];
- const auto* yTad = y + yTadOffsets[blockIdx.x];
- auto* zTad = z + zTadOffsets[blockIdx.x];
-
- // *** FIRST STAGE - ACCUMULATE REQUIRED SUMS *** //
-
- Z norm = 0;
- Z sumOfProd = 0;
-
- for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) {
-
- const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo);
- const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo);
-
- shMem[2*threadIdx.x] = static_cast(xTad[xOffset] * xTad[xOffset]); // for norm
- shMem[2*threadIdx.x + 1] = static_cast(xTad[xOffset] * yTad[yOffset]); // for input * gradO
-
- __syncthreads();
-
- // accumulate sum per block
- for (uint activeThreads = blockDim.x / 2; activeThreads > 0; activeThreads /= 2) {
-
- if (threadIdx.x < activeThreads && i + activeThreads < tadLen) {
-
- shMem[2*threadIdx.x] += shMem[2*(threadIdx.x + activeThreads)];
- shMem[2*threadIdx.x + 1] += shMem[2*(threadIdx.x + activeThreads) + 1];
- }
- __syncthreads();
- }
-
- norm += shMem[0];
- sumOfProd += shMem[1];
- }
-
- // *** SECOND STAGE - GRADIENT CALCULATION *** //
-
- norm = math::nd4j_sqrt(norm);
-
- for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) {
-
- const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo);
- const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo);
-
- if(norm > clipNormVal) {
-
- const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo);
-
- const Z factor1 = static_cast(1) / norm; // 1 / norm
- const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm)
-
- zTad[zOffset] = clipNormVal * (factor1 * yTad[yOffset] - factor2 * sumOfProd * xTad[xOffset]);
- }
- else {
- zTad[zOffset] = yTad[yOffset];
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template
-static void clipByNormBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
- const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets,
- const void* vy, const Nd4jLong* yShapeInfo, const Nd4jLong* yTadOffsets,
- void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets,
- void* vreducBuff, const double clipNormVal) {
-
- if(xTadOffsets == nullptr) { // means whole array
- clipByNormBPWholeArrCuda<<>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vreducBuff, static_cast(clipNormVal));
- clipByNormBPCalcGradCuda<<>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vreducBuff, static_cast(clipNormVal));
- }
- else // means tads using
- clipByNormBPTadsCuda<<>>(vx, xShapeInfo, xTadOffsets, vy, yShapeInfo, yTadOffsets, vz, zShapeInfo, zTadOffsets, static_cast(clipNormVal));
-}
-BUILD_DOUBLE_TEMPLATE(template void clipByNormBPCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong* xTadOffsets, const void *vy, const Nd4jLong *yShapeInfo, const Nd4jLong* yTadOffsets, void *vz, const Nd4jLong *zShapeInfo, const Nd4jLong* zTadOffsets, void* vreducBuff, const double clipNormVal), FLOAT_TYPES, FLOAT_TYPES);
-
-//////////////////////////////////////////////////////////////////////////
-void clipByNormBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) {
-
- PointersManager manager(context, "clipByNormBP");
-
- const double clipNormVal = clipNorm.e(0);
-
- const auto xType = input.dataType();
- const auto zType = gradI.dataType();
-
- const int threadsPerBlock = MAX_NUM_THREADS / 2;
- const int sharedMem = threadsPerBlock * 2 * input.sizeOfT() + 128;
-
- NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
-
-
- if(dimensions.empty() || dimensions.size() == input.rankOf()) { // means whole array
-
- const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
- BUILD_DOUBLE_SELECTOR(xType, zType, clipByNormBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.specialBuffer(), input.specialShapeInfo(), nullptr, gradO.specialBuffer(), gradO.specialShapeInfo(), nullptr, gradI.specialBuffer(), gradI.specialShapeInfo(), nullptr, context->getReductionPointer(), clipNormVal), FLOAT_TYPES, FLOAT_TYPES);
- }
- else { // means tads using
-
- auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.shapeInfo(), dimensions);
- auto packY = ConstantTadHelper::getInstance()->tadForDimensions(gradO.shapeInfo(), dimensions);
- auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(gradI.shapeInfo(), dimensions);
-
- const int blocksPerGrid = packX.numberOfTads();
- BUILD_DOUBLE_SELECTOR(xType, zType, clipByNormBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), gradO.specialBuffer(), packY.platformShapeInfo(), packY.platformOffsets(), gradI.specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), nullptr, clipNormVal), FLOAT_TYPES, FLOAT_TYPES);
- }
-
- NDArray::registerSpecialUse({&gradI}, {&input, &gradO});
-
- manager.synchronize();
-}
-
template
static __global__ void swapShuffleKernel(T* input, Nd4jLong const* shape, Nd4jLong firstDim, sd::graph::RandomGenerator* rng) {
auto tid = blockIdx.x * blockDim.x;
@@ -692,252 +429,6 @@ void clipByNormBP(sd::LaunchContext* context, const NDArray& input, const NDArra
output.setIdentity();
}
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- template
- static __global__ void clipByNormInplaceKernel(Nd4jLong numOfSubArrs, T* inputBuffer, Nd4jLong const* shape, Nd4jLong const* inputOffsets, T* norm2Buf, Nd4jLong const* norm2shape, T clipNorm) {
- for (int arr = blockIdx.x; arr < numOfSubArrs; arr += gridDim.x) {
- __shared__ T* z;
- __shared__ Nd4jLong len;
- if (threadIdx.x == 0) {
- len = shape::length(shape);
- z = inputBuffer + inputOffsets[arr];
- }
- __syncthreads();
- for (int j = threadIdx.x; j < len; j+= blockDim.x) {
- auto xIndex = shape::getIndexOffset(j, shape);
-
- if(norm2Buf[arr] > clipNorm)
- z[xIndex] *= clipNorm / norm2Buf[arr]; // case with ews = 1 and ordering is 'c'
- }
- }
- }
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- template