
Commit 0f63659

apeforest authored and eric-haibin-lin committed
add a compiler flag to use int64 as tensor size (#14570)
* use a compile flag to use int64 tensor size
* use personal mshadow repo
* update data type
* update make config
* change size_t to index_t and add documentation
* update mshadow submodule to master
* fix compilation warning
* fix compiler warning
* fix compiler warning
* fix compiler warning
* fix compiler warning
* fix compiler error
* change nnvm::Tuple to mxnet::Tuple
* fix compiler warning
* fix compiler warning
* fix compiler warning
* fix compiler warning
* fix compiler warning
* fix lint
* update CI runtime_functions
* update runtime function
* correct runtime_functions
* update runtime functions
* add nightly test for large tensor
* update Jenkins files to test new compiler flag
* fix CI
* add runtime feature detect for the compiler flag
* change build from make to cmake
* fix CI
* move tests to nightly
1 parent a1b0a3a commit 0f63659


41 files changed: +282 −97 lines
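Context for the change (a sketch, not part of the commit): with a 32-bit index_t, the total element count of a tensor is capped at 2^31 − 1 ≈ 2.1 billion. A minimal C++ illustration of a shape that overflows that limit:

#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  // Element count of a 50000 x 50000 tensor (shape chosen for illustration).
  int64_t n = 50000LL * 50000LL;  // 2,500,000,000 elements
  bool fits = n <= std::numeric_limits<int32_t>::max();  // max is 2147483647
  std::cout << "elements: " << n << ", fits in int32: "
            << std::boolalpha << fits << "\n";  // fits = false
  return 0;
}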

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -50,6 +50,7 @@ mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF)
 mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT." OFF)
 mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
 mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF)
+mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)

 message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}")
 message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}")

@@ -295,6 +296,13 @@ else()
   add_definitions(-DMXNET_USE_NCCL=0)
 endif()

+if (USE_INT64_TENSOR_SIZE)
+  message(STATUS "Using 64-bit integer for tensor size")
+  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1)
+else()
+  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=0)
+endif()
+
 include(cmake/ChooseBlas.cmake)
 if(USE_CUDA AND FIRST_CUDA)
   include(3rdparty/mshadow/cmake/Utils.cmake)
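For orientation (an assumption about mshadow, not shown in this diff): MSHADOW_INT64_TENSOR_SIZE gates the width of mshadow's index_t, roughly as sketched below; check 3rdparty/mshadow for the authoritative definition. Configuring with cmake -DUSE_INT64_TENSOR_SIZE=ON then selects the 64-bit branch.

// Paraphrase of the typedef the macro controls in mshadow (sketch only).
#include <cstdint>

#if MSHADOW_INT64_TENSOR_SIZE == 1
typedef int64_t index_t;   // large-tensor builds: 64-bit element indexing
#else
typedef int32_t index_t;   // default builds: 32-bit element indexing
#endif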

Makefile

Lines changed: 5 additions & 0 deletions
@@ -189,6 +189,11 @@ ifeq ($(USE_OPERATOR_TUNING), 1)
 	CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
 endif

+ifeq ($(USE_INT64_TENSOR_SIZE), 1)
+	CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=1
+else
+	CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=0
+endif
 # verify existence of separate lapack library when using blas/openblas/atlas
 # switch off lapack support in case it can't be found
 # issue covered with this
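The Make path mirrors the CMake logic above: both branches define MSHADOW_INT64_TENSOR_SIZE explicitly, so downstream #if checks never see an undefined macro. Judging from the ifeq test (the invocation itself is not shown in the diff), the flag would be enabled with make USE_INT64_TENSOR_SIZE=1.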

ci/docker/runtime_functions.sh

Lines changed: 54 additions & 0 deletions
@@ -755,6 +755,53 @@ build_ubuntu_gpu_cmake() {
     ninja -v
 }

+build_ubuntu_cpu_large_tensor() {
+    set -ex
+    cd /work/build
+    build_ccache_wrappers
+    cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DENABLE_TESTCOVERAGE=ON \
+        -DUSE_CUDA=OFF \
+        -DUSE_CUDNN=OFF \
+        -DUSE_MKLDNN=OFF \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_INT64_TENSOR_SIZE=ON \
+        -G Ninja \
+        /work/mxnet
+
+    ninja -v
+}
+
+build_ubuntu_gpu_large_tensor() {
+    set -ex
+    cd /work/build
+    build_ccache_wrappers
+    cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DENABLE_TESTCOVERAGE=ON \
+        -DUSE_CUDA=ON \
+        -DUSE_CUDNN=ON \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_MKLML_MKL=OFF \
+        -DUSE_MKLDNN=OFF \
+        -DUSE_DIST_KVSTORE=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCUDA_ARCH_NAME=Manual \
+        -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
+        -DUSE_INT64_TENSOR_SIZE=ON \
+        -G Ninja \
+        /work/mxnet
+
+    ninja -v
+}
+
 build_ubuntu_blc() {
     echo "pass"
 }

@@ -1183,6 +1230,13 @@ nightly_test_KVStore_singleNode() {
     python tests/nightly/test_kvstore.py
 }

+#Test Large Tensor Size
+nightly_test_large_tensor() {
+    set -ex
+    export PYTHONPATH=./python/
+    nosetests-3.4 tests/nightly/test_large_array.py
+}
+
 #Tests Amalgamation Build with 5 different sets of flags
 nightly_test_amalgamation() {
     set -ex
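Taken together, the two new build functions compile MXNet with -DUSE_INT64_TENSOR_SIZE=ON on CPU and GPU, and nightly_test_large_tensor drives tests/nightly/test_large_array.py, which per the commit message exercises tensors beyond the 32-bit element limit. The test sits in the nightly suite rather than the per-PR pipeline, presumably because allocating multi-gigabyte arrays is too slow and memory-hungry for regular CI.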

ci/jenkins/Jenkins_steps.groovy

Lines changed: 28 additions & 0 deletions
@@ -119,6 +119,34 @@ def compile_unix_openblas_debug_cpu() {
   }]
 }

+def compile_unix_int64_cpu() {
+  return ['CPU: USE_INT64_TENSOR_SIZE': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/build-cpu-int64') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false)
+          utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true)
+        }
+      }
+    }
+  }]
+}
+
+def compile_unix_int64_gpu() {
+  return ['GPU: USE_INT64_TENSOR_SIZE': {
+    node(NODE_LINUX_GPU) {
+      ws('workspace/build-gpu-int64') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false)
+          utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true)
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_mkl_cpu() {
   return ['CPU: MKL': {
     node(NODE_LINUX_CPU) {

ci/jenkins/Jenkinsfile_unix_cpu

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ core_logic: {
     custom_steps.compile_unix_openblas_debug_cpu(),
     custom_steps.compile_unix_mkl_cpu(),
     custom_steps.compile_unix_mkldnn_cpu(),
-    custom_steps.compile_unix_mkldnn_mkl_cpu()
+    custom_steps.compile_unix_mkldnn_mkl_cpu(),
+    custom_steps.compile_unix_int64_cpu()
   ])

   utils.parallel_stage('Tests', [

ci/jenkins/Jenkinsfile_unix_gpu

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ core_logic: {
     custom_steps.compile_unix_cmake_mkldnn_gpu(),
     custom_steps.compile_unix_cmake_gpu(),
     custom_steps.compile_unix_tensorrt_gpu(),
+    custom_steps.compile_unix_int64_gpu()
   ])

   utils.parallel_stage('Tests', [

include/mxnet/libinfo.h

Lines changed: 5 additions & 1 deletion
@@ -123,7 +123,9 @@
 #define MXNET_USE_SIGNAL_HANDLER 0
 #endif

-
+#ifndef MXNET_USE_INT64_TENSOR_SIZE
+#define MXNET_USE_INT64_TENSOR_SIZE MSHADOW_INT64_TENSOR_SIZE
+#endif

 namespace mxnet {
 namespace features {

@@ -177,6 +179,8 @@ enum : unsigned {
   PROFILER,
   DIST_KVSTORE,
   CXX14,
+  INT64_TENSOR_SIZE,
+
   // Signal handler to print stack traces on exceptions
   SIGNAL_HANDLER,
   DEBUG,
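A hedged sketch of how the new enum value could be queried at run time — this assumes libinfo.h exposes a feature-check helper along the lines of mxnet::features::is_enabled; verify the exact API against the header before relying on it:

#include <mxnet/libinfo.h>
#include <iostream>

int main() {
  // Assumption: is_enabled() takes a value from the features enum above.
  bool has_int64 =
      mxnet::features::is_enabled(mxnet::features::INT64_TENSOR_SIZE);
  std::cout << "INT64_TENSOR_SIZE: "
            << (has_int64 ? "enabled" : "disabled") << "\n";
  return 0;
}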

include/mxnet/tensor_blob.h

Lines changed: 7 additions & 6 deletions
@@ -218,15 +218,16 @@ class TBlob {
     return shape_.ndim();
   }
   /*!
-   * \brief return size of i-th dimension, start counting from highest dimension
+   * \brief return size of i-th dimension, start counting from highest dimension.
+   * return type needs to be a signed integer.
    * \param idx the dimension count from the highest dimensin
-   * \return the size
+   * \return the size. -1 means unknown size to support zero-size tensor.
    */
   inline index_t size(index_t idx) const {
     return shape_[idx];
   }
   /*! \brief total number of elements in the tensor */
-  inline index_t Size(void) const {
+  inline size_t Size(void) const {
     return shape_.Size();
   }
   /*! \brief get pointer in dtype */

@@ -443,7 +444,7 @@ class FieldEntry<mxnet::TShape>
       throw dmlc::ParamError(os.str());
     }
     if (enforce_nonzero_) {
-      for (mxnet::index_t i = 0; i < v.ndim(); ++i) {
+      for (int i = 0; i < v.ndim(); ++i) {
         if (v[i] == 0U) {
           std::ostringstream os;
           os << "value " << v << "for Parameter " << this->key_

@@ -457,7 +458,7 @@ class FieldEntry<mxnet::TShape>
     this->enforce_nonzero_ = true;
     return this->self();
   }
-  inline FieldEntry<mxnet::TShape> &set_expect_ndim(mxnet::index_t ndim) {
+  inline FieldEntry<mxnet::TShape> &set_expect_ndim(int ndim) {
     expect_ndim_ = ndim;
     return this->self();
   }

@@ -466,7 +467,7 @@ class FieldEntry<mxnet::TShape>
   // whether all the entries need to be nonzero
   bool enforce_nonzero_;
   // expected number of dimension, default = 0 means no restriction.
-  mxnet::index_t expect_ndim_;
+  int expect_ndim_;
 };

 }  // namespace parameter
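The convention this diff documents, restated as a standalone sketch (TotalSize and the vector-based shape are illustrative, not MXNet API): per-axis sizes stay signed so −1 can mark an unknown dimension, while the total element count is an unsigned size_t.

#include <cstddef>
#include <cstdint>
#include <vector>

using index_t = int64_t;  // assumption: a large-tensor (int64) build

// Illustrative helper: total element count from a shape whose per-axis
// sizes are signed (-1 = unknown dimension).
size_t TotalSize(const std::vector<index_t>& shape) {
  size_t total = 1;
  for (index_t d : shape) {
    if (d < 0) return 0;                 // unknown dim: no meaningful total
    total *= static_cast<size_t>(d);
  }
  return total;
}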

include/mxnet/tuple.h

Lines changed: 8 additions & 8 deletions
@@ -569,7 +569,7 @@ class TShape : public Tuple<dim_t> {
    * \param axis_end The ending axis specified.
    * \return the flat 3d shape
    */
-  inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const {
+  inline mshadow::Shape<3> FlatTo3D(int axis_begin, int axis_end) const {
     CHECK(axis_end >= axis_begin);
     mshadow::Shape<3> s;
     CHECK(ndim_is_known(ndim())) << "shape must have a valid ndim";

@@ -579,10 +579,10 @@ class TShape : public Tuple<dim_t> {
     s.shape_[1] = 1;
     s.shape_[2] = 1;

-    for (size_t i = 0; i < axis_begin; ++i) {
+    for (int i = 0; i < axis_begin; ++i) {
       s.shape_[0] *= d[i];
     }
-    for (size_t i = axis_begin; i <= axis_end; ++i) {
+    for (int i = axis_begin; i <= axis_end; ++i) {
       s.shape_[1] *= d[i];
     }
     for (int i = axis_end + 1; i < ndim(); ++i) {

@@ -595,7 +595,7 @@ class TShape : public Tuple<dim_t> {
    * \param axis The axis specified.
    * \return the flat 3d shape
    */
-  inline mshadow::Shape<3> FlatTo3D(size_t axis) const {
+  inline mshadow::Shape<3> FlatTo3D(int axis) const {
     return FlatTo3D(axis, axis);
   }
   inline bool operator==(const TShape &s) const {

@@ -712,8 +712,8 @@ template<typename T>
 struct hash<mxnet::Tuple<T> > {
   /*! \brief hash a Tuple into unsigned int */
   size_t operator()(const mxnet::Tuple<T>& val) const {
-    std::hash<uint32_t> hash_uint;
-    size_t res = hash_uint(val.ndim());
+    std::hash<int> hash_int;
+    size_t res = hash_int(val.ndim());
     for (int i = 0; i < val.ndim(); ++i) {
       res = dmlc::HashCombine(res, val[i]);
     }

@@ -726,8 +726,8 @@ template<>
 struct hash<mxnet::TShape> {
   /*! \brief hash a TShape into unsigned int */
   size_t operator()(const mxnet::TShape& val) const {
-    std::hash<uint32_t> hash_uint;
-    size_t res = hash_uint(val.ndim());
+    std::hash<int> hash_int;
+    size_t res = hash_int(val.ndim());
     for (int i = 0; i < val.ndim(); ++i) {
       res = dmlc::HashCombine(res, val[i]);
     }
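Why the hash change: ndim() now returns a signed int rather than uint32_t, so the seed hash must match the new type. A self-contained sketch of the same pattern (HashShape is illustrative, and the boost-style mixer stands in for dmlc::HashCombine):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Illustrative stand-in for dmlc::HashCombine (boost-style mixing).
inline size_t HashCombine(size_t seed, int64_t v) {
  return seed ^ (std::hash<int64_t>()(v) + 0x9e3779b9 +
                 (seed << 6) + (seed >> 2));
}

// Hash a shape the way the diff does: seed from the signed ndim,
// then mix in each dimension.
inline size_t HashShape(const std::vector<int64_t>& shape) {
  std::hash<int> hash_int;
  size_t res = hash_int(static_cast<int>(shape.size()));
  for (int64_t d : shape) res = HashCombine(res, d);
  return res;
}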
