This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit da6d472
Author: Fan
Commit message: use (m, m) temp space
Parent: 279e1ce

File tree: 6 files changed, +59 -62 lines

3rdparty/tvm

Submodule tvm updated from afd4b3e to 8bd9d4d

src/operator/c_lapack_api.h

Lines changed: 6 additions & 0 deletions

@@ -249,6 +249,10 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
  #define MXNET_LAPACK_sgetrf LAPACKE_sgetrf
  #define MXNET_LAPACK_dgetrf LAPACKE_dgetrf

+ // Internally A is factorized as U * L * VT, and (according to the tech report)
+ // we want to factorize it as UT * L * V, so we pass ut as u and v as vt.
+ // We also have to allocate at least m - 1 DType elements of workspace, as the
+ // internal LAPACKE function needs it to store `superb` (see the MKL documentation).
  #define MXNET_LAPACK_CWRAP_GESVD(prefix, dtype) \
    inline int MXNET_LAPACK_##prefix##gesvd(int matrix_layout, int m, int n, dtype* ut, \
                                            int ldut, dtype* s, dtype* v, int ldv, \

@@ -382,6 +386,8 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
  MXNET_LAPACK_CWRAP_SYEVD(ssyevd, float)
  MXNET_LAPACK_CWRAP_SYEVD(dsyevd, double)

+ // Note: Supports row-major format only. Internally, column-major is used, so all
+ // inputs/outputs are flipped and transposed; m and n are flipped as well.
  #define MXNET_LAPACK_CWRAP_GESVD(func, dtype) \
    inline int MXNET_LAPACK_##func(int matrix_layout, int m, int n, dtype* ut, \
                                   int ldut, dtype* s, dtype* v, int ldv, \
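To make the flip/transpose convention above concrete, here is a minimal, self-contained sketch (assuming a LAPACKE installation; illustrative, not the MXNet wrapper itself): the buffer of a row-major m x n matrix A, reinterpreted as column-major, is A^T (n x m), so running gesvd on that view swaps the roles of the u and vt output buffers, and `superb` needs min(m, n) - 1 elements, matching the comment's workspace requirement.

    // Sketch: SVD of a row-major A via LAPACKE's column-major gesvd.
    // A^T = u * diag(s) * vt  implies  A = vt^T * diag(s) * u^T, so the
    // buffer passed as `u` ends up holding the V-side factor of A and
    // vice versa -- the same role swap the wrapper's comment describes.
    #include <cstdio>
    #include <lapacke.h>

    int main() {
      const int m = 2, n = 3;                // A is m x n, row-major
      double a[m * n] = {1, 2, 3, 4, 5, 6};  // column-major n x m view = A^T
      double s[2];                           // min(m, n) singular values
      double u[n * 2];                       // n x min(m, n), for jobu = 'S'
      double vt[2 * m];                      // min(m, n) x m, for jobvt = 'S'
      double superb[1];                      // min(m, n) - 1 workspace elements
      int ret = LAPACKE_dgesvd(LAPACK_COL_MAJOR, 'S', 'S', n, m, a, n,
                               s, u, n, vt, 2, superb);
      if (ret != 0) return ret;
      std::printf("singular values: %g %g\n", s[0], s[1]);
      return 0;
    }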

src/operator/linalg.h

Lines changed: 1 addition & 2 deletions

@@ -195,8 +195,7 @@ int linalg_syevd_workspace_query(const Tensor<xpu, 2, DType>& A,

  // CPU/GPU-versions of LAPACK function "gesvd". Please refer to the
  // LAPACK documentation for further details.
- // Note:
- // - V is input and output parameter (overwritten by A)
+ // Note: V is input and output parameter (it overwrites A)

  template<typename xpu, typename DType>
  void linalg_gesvd(const Tensor<xpu, 2, DType>& UT,

src/operator/linalg_impl.h

Lines changed: 2 additions & 11 deletions

@@ -1262,25 +1262,16 @@ void linalg_gesvd<cpu, DType>(const Tensor<cpu, 2, DType>& UT, \
                               const Tensor<cpu, 1, DType>& work, \
                               Stream<cpu> *s) { \
    check_gesvd(UT, L, V); \
-   DType lwork(0); \
-   MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, V.size(0), V.size(1), \
-                        UT.dptr_, UT.stride_, L.dptr_, V.dptr_, V.stride_, \
-                        &lwork, -1); \
+   int lwork(work.size(0)); \
    int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, V.size(0), V.size(1), \
                                 UT.dptr_, UT.stride_, L.dptr_, V.dptr_, V.stride_, \
-                                work.dptr_, static_cast<int>(lwork))); \
+                                work.dptr_, lwork)); \
    CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \
  }

  LINALG_CPU_GESVD(sgesvd, float)
  LINALG_CPU_GESVD(dgesvd, double)

- // Mangle temp storage requirements for DType and int into a single
- // request as we can only allocate one temp space per operator. We
- // partition this temp space into two chunks again when calling sseyvd.
- // Returned is the number of elements of type DType that the temp space
- // needs to accomodate. This also makes this function signature equivalent
- // to the work space query on GPU.
  #define LINALG_CPU_GESVD_WORKSPACE_QUERY(func, DType) \
    template<> inline \
    int linalg_gesvd_workspace_query<cpu, DType>(const Tensor<cpu, 2, DType>& UT, \
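The deleted lines above were the classic two-call LAPACK workspace idiom performed redundantly inside the solver; after this commit the query lives only in linalg_gesvd_workspace_query, and the solver simply trusts work.size(0). For reference, a self-contained sketch of that idiom (plain LAPACKE, not MXNet code), using LAPACKE_dgesvd_work:

    // Two-call lwork pattern: lwork = -1 performs a size query that writes
    // the optimal workspace length into wkopt; the second call does the SVD.
    #include <vector>
    #include <lapacke.h>

    int main() {
      const lapack_int m = 3, n = 3;
      std::vector<double> a = {4, 0, 0, 0, 3, 0, 0, 0, 2};  // column-major diag(4, 3, 2)
      std::vector<double> s(3), u(9), vt(9);
      double wkopt = 0.0;
      // 1) Workspace query: no factorization happens, only wkopt is written.
      LAPACKE_dgesvd_work(LAPACK_COL_MAJOR, 'A', 'A', m, n, a.data(), m,
                          s.data(), u.data(), m, vt.data(), n, &wkopt, -1);
      std::vector<double> work(static_cast<int>(wkopt));
      // 2) Real call with the buffer sized by the query.
      return LAPACKE_dgesvd_work(LAPACK_COL_MAJOR, 'A', 'A', m, n, a.data(), m,
                                 s.data(), u.data(), m, vt.data(), n,
                                 work.data(), static_cast<lapack_int>(work.size()));
    }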

src/operator/numpy/linalg/np_gesvd-inl.h

Lines changed: 48 additions & 46 deletions

@@ -61,8 +61,7 @@ struct GesvdVecSign {

  // (UT, L, V) = gesvd(A) [singular value decomposition]
  // - V can overwrite A
- // - Needs workspace (both DType and int), size of which is determined by a
- //   workspace query
+ // - Needs workspace (DType), size of which is determined by a workspace query
  struct gesvd {
    template<typename xpu, typename DType>
    static void op(const Tensor<xpu, 3, DType>& A,
@@ -126,6 +125,7 @@ MSHADOW_XINLINE double gesvd_back_helper_eps(double* X) {
    return 1e-100;
  }

+ // dA overwritten by L^-1 dA
  struct GesvdBackHelper_dV {
    template<typename DType>
    MSHADOW_XINLINE static void Map(int k, int m, int n, DType* L, int ldl,
@@ -144,14 +144,18 @@ struct GesvdBackHelper_dV {
    }
  };

+ // X (square) overwritten by X L
+ // Y overwritten by the diagonal of X
  struct GesvdBackHelper_G1 {
    template<typename DType>
    MSHADOW_XINLINE static void Map(int k, int m, int n, DType* X, int ldx,
-                                   DType* L, int ldl) {
+                                   DType* L, int ldl, DType* Y, int ldy) {
      const int offl(k * ldl);
+     const int offy(k * ldy);
      const int offx(k * m * ldx);
      DType numer(0.0);
      for (int i = 0; i < m; ++i) {
+       Y[offy + i] = X[offx + i * ldx + i];
        for (int j = 0; j < m; ++j) {
          numer = L[offl + j];
          X[offx + i * ldx + j] *= numer;
@@ -164,16 +168,15 @@ struct GesvdBackHelper_G2 {
    template<typename DType>
    MSHADOW_XINLINE static void Map(int k, int m, int n, DType* X, int ldx,
                                    DType* L, int ldl, DType* dL, int lddl,
-                                   DType* dA, int ldda, DType* V, int ldv) {
+                                   DType* Y, int ldy) {
      const int offx(k * m * ldx);
      const int offl(k * ldl);
      const int offdl(k * lddl);
-     const int offda(k * m * ldda);
-     const int offv(k * m * ldv);
+     const int offy(k * ldy);
      const DType eps(gesvd_back_helper_eps(X));
      DType denom1(0.0), denom2(0.0), elem(0.0);

-     for (int i = 0; i < m - 1; ++i) {
+     for (int i = 0; i < m; ++i) {
        for (int j = i + 1; j < m; ++j) {
          denom1 = L[offl + i] - L[offl + j];
          denom2 = L[offl + i] + L[offl + j];
@@ -183,14 +186,7 @@
          X[offx + i * ldx + j] = elem * L[offl + j];
          X[offx + j * ldx + i] = elem * L[offl + i];
        }
-     }
-     for (int i = 0; i < m; ++i) {
-       elem = DType(0.0);
-       for (int j = 0; j < n; ++j) {
-         elem += dA[offda + i * ldda + j] * V[offv + i * ldv + j];
-       }
-       elem = -elem + dL[offdl + i];
-       X[offx + i * ldx + i] = elem;
+       X[offx + i * ldx + i] = -Y[offy + i] + dL[offdl + i];
      }
    }
  };
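Why G2 no longer needs dA and V: the deleted loop recomputed each diagonal entry as

    X_{ii} = \mathrm{d}L_i - \sum_{j=0}^{n-1} \mathrm{d}A_{ij} V_{ij}

and the sum is exactly the i-th diagonal entry of dA V^T, a product the G1 step already forms via gemm::op(dA, V, tempM, ..., false, true, ...). GesvdBackHelper_G1 therefore stashes that diagonal in Y (before scaling X by L column-wise), so G2 just reads Y[i] instead of re-reading the full (m, n) arrays dA and V. The outer loop bound also changes from m - 1 to m because the diagonal assignment now lives inside that loop, and the i = m - 1 iteration (whose inner j-loop is empty) must still set the last diagonal entry.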
@@ -204,41 +200,49 @@ struct gesvd_backward {
                  const Tensor<xpu, 2, DType>& L,
                  const Tensor<xpu, 3, DType>& V,
                  const Tensor<xpu, 3, DType>& dA,
-                 const Tensor<xpu, 3, DType>& tempMs,
-                 const Tensor<xpu, 3, DType>& tempMr,
+                 const Tensor<xpu, 3, DType>& tempM,
+                 const Tensor<xpu, 2, DType>& tempMd,
                  Stream<xpu>* s, const nnvm::NodeAttrs& attrs) {
    // Backward of (UT, L, V) = gesvd(A)
    using namespace mxnet_op;
    if (dA.dptr_ != dV.dptr_) {
      Copy(dA, dV, s);
    }
    // From here on, we work on dA only
+   int k = dA.size(0), m = dA.size(1), n = dA.size(2);

    // Need temporal space, same shape as dUT
    // invdV:
    Kernel<GesvdBackHelper_dV, xpu>::Launch
-     (s, dA.size(0), dA.size(1), dA.size(2), L.dptr_, L.stride_, dA.dptr_, dA.stride_);
+     (s, k, m, n, L.dptr_, L.stride_, dA.dptr_, dA.stride_);

    // G1:
-   // This copy is just to make sure there are no invalid values (NaN, infinity) in tempM
-   Copy(tempMs, dUT, s);
-   Copy(tempMr, dA, s);
-   gemm::op(dA, V, tempMs, DType(1.0), DType(0.0), false, true, s);
+   // This is just to make sure there are no invalid values (NaN, infinity) in tempM and tempMd
+   tempM.FlatTo1D() = 0;
+   tempMd.FlatTo1D() = 0;
+   gemm::op(dA, V, tempM, DType(1.0), DType(0.0), false, true, s);
    Kernel<GesvdBackHelper_G1, xpu>::Launch
-     (s, dA.size(0), dA.size(1), dA.size(2), tempMs.dptr_, tempMs.stride_,
-      L.dptr_, L.stride_);
-   gemm::op(dUT, UT, tempMs, DType(1.0), DType(1.0), true, false, s);
+     (s, k, m, n, tempM.dptr_, tempM.stride_,
+      L.dptr_, L.stride_, tempMd.dptr_, tempMd.stride_);
+   gemm::op(dUT, UT, tempM, DType(1.0), DType(1.0), true, false, s);

    // G2:
    Kernel<GesvdBackHelper_G2, xpu>::Launch
-     (s, dA.size(0), dA.size(1), dA.size(2), tempMs.dptr_, tempMs.stride_,
-      L.dptr_, L.stride_, dL.dptr_, dL.stride_, dA.dptr_, dA.stride_,
-      V.dptr_, V.stride_);
+     (s, k, m, n, tempM.dptr_, tempM.stride_,
+      L.dptr_, L.stride_, dL.dptr_, dL.stride_,
+      tempMd.dptr_, tempMd.stride_);

    // G3:
-   gemm::op(tempMs, V, dA, DType(1.0), DType(1.0), false, false, s);
-   gemm::op(UT, dA, tempMr, DType(1.0), DType(0.0), false, false, s);
-   Copy(dA, tempMr, s);
+   gemm::op(tempM, V, dA, DType(1.0), DType(1.0), false, false, s);
+   for (int i = 0; i < n; i += m) {
+     int ncols = n - i < m ? n - i : m;
+     Tensor<xpu, 3, DType> t = Tensor<xpu, 3, DType>(dA.dptr_ + i,
+       Shape3(k, m, ncols), dA.stride_, dA.stream_);
+     Tensor<xpu, 3, DType> out = Tensor<xpu, 3, DType>(tempM.dptr_,
+       Shape3(k, m, ncols), tempM.stride_, tempM.stream_);
+     gemm::op(UT, t, out, DType(1.0), DType(0.0), false, false, s);
+     Copy(t, out, s);
+   }
  }
};
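The loop above is the heart of the commit: instead of a second full (k, m, n) temporary (the old tempMr) holding UT * dA, dA is updated in place, at most m columns at a time, through the (k, m, m) buffer tempM. Column blocks are independent under left-multiplication, so each block can be computed into scratch and copied back before moving on. A standalone sketch of the same technique on a single matrix (plain loops standing in for gemm::op; all names are illustrative, not MXNet APIs):

    // Compute B <- U * B for an m x m U and an m x n B, in place,
    // using only an m x m scratch buffer instead of an m x n temporary.
    #include <algorithm>
    #include <vector>

    void left_mul_inplace(const std::vector<double>& U,  // m x m, row-major
                          std::vector<double>& B,        // m x n, row-major
                          int m, int n) {
      std::vector<double> scratch(m * m);                // the (m, m) temp space
      for (int col = 0; col < n; col += m) {
        int ncols = std::min(m, n - col);                // width of this block
        // scratch(:, 0:ncols) = U * B(:, col:col+ncols)
        for (int i = 0; i < m; ++i) {
          for (int j = 0; j < ncols; ++j) {
            double acc = 0.0;
            for (int p = 0; p < m; ++p) acc += U[i * m + p] * B[p * n + col + j];
            scratch[i * m + j] = acc;
          }
        }
        // The block is finished; copy it back into B.
        for (int i = 0; i < m; ++i)
          for (int j = 0; j < ncols; ++j)
            B[i * n + col + j] = scratch[i * m + j];
      }
    }

The trade-off is one gemm call per ceil(n / m) blocks instead of a single large gemm, in exchange for O(m^2) rather than O(m * n) scratch.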

@@ -258,23 +262,21 @@ void NumpyLaGesvdBackward(const nnvm::NodeAttrs& attrs,
  }
  MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
    TBlob tspace(outputs[0]);
-   TBlob tempMs, tempMr;
+   TBlob tempM, tempMd;
+   int kmn = outputs[0].shape_.Size();
+   int kmm = inputs[0].shape_.Size();
+   int km = inputs[1].shape_.Size();
    if (req[0] == kAddTo) {
      Tensor<xpu, 1, OType> tempspace = ctx.requested[0]
-       .get_space_typed<xpu, 1, OType>(Shape1(2 * outputs[0].shape_.Size()), s);
-     tspace = TBlob(tempspace.Slice(0, outputs[0].shape_.Size()))
-       .reshape(outputs[0].shape_);
-     tempMs = TBlob(tempspace.Slice(outputs[0].shape_.Size(),
-                                    outputs[0].shape_.Size() + inputs[0].shape_.Size()))
-       .reshape(inputs[0].shape_);
-     tempMr = TBlob(tempspace.Slice(outputs[0].shape_.Size(),
-                                    2 * outputs[0].shape_.Size()))
-       .reshape(outputs[0].shape_);
+       .get_space_typed<xpu, 1, OType>(Shape1(kmn + kmm + km), s);
+     tspace = TBlob(tempspace.Slice(0, kmn)).reshape(outputs[0].shape_);
+     tempM = TBlob(tempspace.Slice(kmn, kmn + kmm)).reshape(inputs[0].shape_);
+     tempMd = TBlob(tempspace.Slice(kmn + kmm, kmn + kmm + km)).reshape(inputs[1].shape_);
    } else {
      Tensor<xpu, 1, OType> tempspace = ctx.requested[0]
-       .get_space_typed<xpu, 1, OType>(Shape1(outputs[0].shape_.Size()), s);
-     tempMs = TBlob(tempspace.Slice(0, inputs[0].shape_.Size())).reshape(inputs[0].shape_);
-     tempMr = TBlob(tempspace.Slice(0, outputs[0].shape_.Size())).reshape(outputs[0].shape_);
+       .get_space_typed<xpu, 1, OType>(Shape1(kmm + km), s);
+     tempM = TBlob(tempspace.Slice(0, kmm)).reshape(inputs[0].shape_);
+     tempMd = TBlob(tempspace.Slice(kmm, kmm + km)).reshape(inputs[1].shape_);
    }
    laop::op(inputs[0].FlatToKD<xpu, 3, OType>(s),  // dUT
             inputs[1].FlatToKD<xpu, 2, OType>(s),  // dL

@@ -283,8 +285,8 @@ void NumpyLaGesvdBackward(const nnvm::NodeAttrs& attrs,
             inputs[4].FlatToKD<xpu, 2, OType>(s),  // L
             inputs[5].FlatToKD<xpu, 3, OType>(s),  // V
             tspace.FlatToKD<xpu, 3, OType>(s),     // dA
-            tempMs.FlatToKD<xpu, 3, OType>(s),     // tempMs
-            tempMr.FlatToKD<xpu, 3, OType>(s),     // tempMr
+            tempM.FlatToKD<xpu, 3, OType>(s),      // tempM
+            tempMd.FlatToKD<xpu, 2, OType>(s),     // tempMd
             s, attrs);
    if (req[0] == kAddTo) {
      Tensor<xpu, 1, OType> out = outputs[0].FlatTo1D<xpu, OType>(s);
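Net effect on scratch memory, reading the Shape1 requests directly: for k batched m x n matrices, kmn = k*m*n, kmm = k*m*m, and km = k*m (here UT is k x m x m and V is k x m x n, so n >= m is the interesting case). The kAddTo path previously requested 2*kmn elements and the other path kmn; they now request kmn + kmm + km and kmm + km respectively, replacing the (m, n)-shaped temporary tempMr with the (m, m)-shaped tempM plus the length-m diagonal buffer tempMd. For example, at k = 1, m = 4, n = 1000 the kAddTo request drops from 8000 to 4000 + 16 + 4 = 4020 elements, and the non-kAddTo request from 4000 to 20.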

tests/python/unittest/test_numpy_op.py

Lines changed: 1 addition & 2 deletions

@@ -878,8 +878,7 @@ def get_grad(UT, L, V):
         for hybridize in [True, False]:
             for dtype in dtypes:
                 for shape in shapes:
-                    rtol = 1e-3
-                    atol = 1e-3
+                    rtol = atol = 0.01
                     test_svd = TestSVD()
                     if hybridize:
                         test_svd.hybridize()
