Add trsm CPU support for Tensor LAPACK

Cstandardlib · Cstandardlib · commit f62f4382fe9e · 2025-02-19T11:13:15.000+08:00
diff --git a/source/module_base/module_container/ATen/kernels/lapack.cpp b/source/module_base/module_container/ATen/kernels/lapack.cpp
@@ -199,6 +199,24 @@ struct lapack_geqrf<T, DEVICE_CPU> {
     }
 };
 
+template <typename T>
+struct lapack_trsm<T, DEVICE_CPU> { // CPU specialization: thin wrapper around lapackConnector::trsm
+    void operator()(
+        char side,   // 'L'/'R': side of X on which op(A) appears
+        char uplo,   // 'U'/'L': A is upper or lower triangular
+        char transA, // 'N'/'T'/'C': selects op(A)
+        char diag,   // 'U'/'N': A is unit or non-unit triangular
+        int m,       // rows of B
+        int n,       // columns of B
+        T alpha,     // scalar applied to B
+        T* A,        // triangular matrix A
+        int lda,     // leading dimension of A
+        T* B,        // matrix B; NOTE(review): presumably overwritten with X by the callee -- confirm
+        int ldb)     // leading dimension of B
+    {
+        lapackConnector::trsm(side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb); // every argument is forwarded unchanged; nothing is returned to the caller
+    }
+};
 
 template struct set_matrix<float,  DEVICE_CPU>;
 template struct set_matrix<double, DEVICE_CPU>;
diff --git a/source/module_base/module_container/ATen/kernels/lapack.h b/source/module_base/module_container/ATen/kernels/lapack.h
@@ -285,6 +285,56 @@ struct lapack_geqrf {
         int lwork);
 };
 
+/**
+ * @brief Functor for solving a system of linear equations with a triangular matrix using the level-3 BLAS TRSM routine.
+ *
+ * TRSM: triangular solve with a matrix right-hand side.
+ * Solves one of the following matrix equations for X:
+ * - op(A) * X = alpha * B   (side = 'L')
+ * - X * op(A) = alpha * B   (side = 'R')
+ * where op(A) is one of A, A^T, or A^H, as selected by transA.
+ *
+ */
+template <typename T, typename Device>
+struct lapack_trsm {
+    /**
+     * @brief Solve a system of linear equations with a triangular matrix.
+     *
+     * Solves one of the following matrix equations for X:
+     * - op(A) * X = alpha * B   (side = 'L')
+     * - X * op(A) = alpha * B   (side = 'R')
+     * where op(A) is one of A, A^T, or A^H, as selected by transA.
+     *
+     * @param side Specifies whether op(A) appears on the left or on the right of X.
+     *             'L' or 'l' for left, 'R' or 'r' for right.
+     * @param uplo Specifies whether the matrix A is an upper or lower triangular matrix.
+     *             'U' or 'u' for upper, 'L' or 'l' for lower.
+     * @param transA Specifies the form of op(A) to be used in the equation.
+     *               'N' or 'n' for no transpose, 'T' or 't' for transpose, 'C' or 'c' for conjugate transpose.
+     * @param diag Specifies whether or not A is unit triangular.
+     *             'U' or 'u' for unit triangular, 'N' or 'n' for non-unit triangular.
+     * @param m The number of rows of the matrix B. m >= 0.
+     * @param n The number of columns of the matrix B. n >= 0.
+     * @param alpha Scalar multiplier applied to B (the right-hand side is alpha * B).
+     * @param A Pointer to the triangular matrix A.
+     * @param lda Leading dimension of A. lda >= max(1, m) if side == 'L' or lda >= max(1, n) if side == 'R'.
+     * @param B Pointer to the matrix B. NOTE(review): standard TRSM overwrites B with the solution X -- confirm for every backend.
+     * @param ldb Leading dimension of B. ldb >= max(1, m).
+     */
+    void operator()(
+        char side,
+        char uplo,
+        char transA,
+        char diag,
+        int m,
+        int n,
+        T alpha,
+        T* A,
+        int lda,
+        T* B,
+        int ldb);
+};
+
 
 #if defined(__CUDA) || defined(__ROCM)
 // TODO: Use C++ singleton to manage the GPU handles
diff --git a/source/module_base/module_container/base/third_party/lapack.h b/source/module_base/module_container/base/third_party/lapack.h
@@ -124,6 +124,11 @@ void sgeqrf_(const int* m, const int* n, float* a, const int* lda, float* tau, f
 void dgeqrf_(const int* m, const int* n, double* a, const int* lda, double* tau, double* work, const int* lwork, int* info);
 void cgeqrf_(const int* m, const int* n, std::complex<float>* a, const int* lda, std::complex<float>* tau, std::complex<float>* work, const int* lwork, int* info);
 void zgeqrf_(const int* m, const int* n, std::complex<double>* a, const int* lda, std::complex<double>* tau, std::complex<double>* work, const int* lwork, int* info);
+// Triangular solve (?trsm_), one prototype per scalar type: s = float, d = double, c = std::complex<float>, z = std::complex<double>. All arguments are passed by pointer; only b is non-const.
+void strsm_(const char* side, const char* uplo, const char* transa, const char* diag, const int* m, const int* n, const float* alpha, const float* a, const int* lda, float* b, const int* ldb);
+void dtrsm_(const char* side, const char* uplo, const char* transa, const char* diag, const int* m, const int* n, const double* alpha, const double* a, const int* lda, double* b, const int* ldb);
+void ctrsm_(const char* side, const char* uplo, const char* transa, const char* diag, const int* m, const int* n, const std::complex<float>* alpha, const std::complex<float>* a, const int* lda, std::complex<float>* b, const int* ldb);
+void ztrsm_(const char* side, const char* uplo, const char* transa, const char* diag, const int* m, const int* n, const std::complex<double>* alpha, const std::complex<double>* a, const int* lda, std::complex<double>* b, const int* ldb);
 }
 
 // Class LapackConnector provide the connector to fortran lapack routine.