Skip to content

Commit 8973c25

Browse files
authored
Feature: Make KG-KSDFT support GPU (deepmodeling#6013)
* make KG support GPU * fix bug add tests for KG-GPU * add tests * modify reference Onsager.txt
1 parent 98f7a55 commit 8973c25

File tree

18 files changed

+556
-226
lines changed

18 files changed

+556
-226
lines changed

source/module_base/kernels/cuda/math_kernel_op.cu

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
325325
}
326326

327327
template <typename T>
328-
__global__ void vector_mul_vector_kernel(
329-
const int size,
330-
T* result,
331-
const T* vector1,
332-
const typename GetTypeReal<T>::type* vector2)
328+
__global__ void vector_mul_vector_kernel(const int size,
329+
T* result,
330+
const T* vector1,
331+
const typename GetTypeReal<T>::type* vector2,
332+
const bool add)
333333
{
334334
int i = blockIdx.x * blockDim.x + threadIdx.x;
335335
if (i < size)
336336
{
337-
result[i] = vector1[i] * vector2[i];
337+
if (add)
338+
{
339+
result[i] += vector1[i] * vector2[i];
340+
}
341+
else
342+
{
343+
result[i] = vector1[i] * vector2[i];
344+
}
338345
}
339346
}
340347

@@ -548,11 +555,12 @@ template <>
548555
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
549556
double* result,
550557
const double* vector1,
551-
const double* vector2)
558+
const double* vector2,
559+
const bool& add)
552560
{
553561
int thread = thread_per_block;
554562
int block = (dim + thread - 1) / thread;
555-
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
563+
vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);
556564

557565
cudaCheckOnDebug();
558566
}
@@ -561,32 +569,35 @@ template <typename FPTYPE>
561569
inline void vector_mul_vector_complex_wrapper(const int& dim,
562570
std::complex<FPTYPE>* result,
563571
const std::complex<FPTYPE>* vector1,
564-
const FPTYPE* vector2)
572+
const FPTYPE* vector2,
573+
const bool& add)
565574
{
566575
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
567576
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
568577
int thread = thread_per_block;
569578
int block = (dim + thread - 1) / thread;
570-
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
579+
vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);
571580

572581
cudaCheckOnDebug();
573582
}
574583
template <>
575584
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
576585
std::complex<float>* result,
577586
const std::complex<float>* vector1,
578-
const float* vector2)
587+
const float* vector2,
588+
const bool& add)
579589
{
580-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
590+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
581591
}
582592
template <>
583593
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
584594
const int& dim,
585595
std::complex<double>* result,
586596
const std::complex<double>* vector1,
587-
const double* vector2)
597+
const double* vector2,
598+
const bool& add)
588599
{
589-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
600+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
590601
}
591602

592603
// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
@@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
10191030
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
10201031
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
10211032
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
1033+
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
10221034
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10231035
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10241036
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
@@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
10291041
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10301042
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10311043
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
1044+
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10321045
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10331046
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10341047
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
@@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
10391052
#ifdef __LCAO
10401053
template struct dot_real_op<double, base_device::DEVICE_GPU>;
10411054
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
1042-
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10431055
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
10441056
#endif
10451057
} // namespace ModuleBase

source/module_base/kernels/math_kernel_op.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,27 @@ template <typename T>
167167
struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
168168
{
169169
using Real = typename GetTypeReal<T>::type;
170-
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
170+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
171171
{
172+
if (add)
173+
{
172174
#ifdef _OPENMP
173175
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
174176
#endif
175-
for (int i = 0; i < dim; i++)
177+
for (int i = 0; i < dim; i++)
178+
{
179+
result[i] += vector1[i] * vector2[i];
180+
}
181+
}
182+
else
176183
{
177-
result[i] = vector1[i] * vector2[i];
184+
#ifdef _OPENMP
185+
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
186+
#endif
187+
for (int i = 0; i < dim; i++)
188+
{
189+
result[i] = vector1[i] * vector2[i];
190+
}
178191
}
179192
}
180193
};

source/module_base/kernels/math_kernel_op.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
143143
/// \param dim : array size
144144
/// \param vector1 : input array A
145145
/// \param vector2 : input array B
146+
/// \param add : flag to control whether to add the result to the output array
146147
///
147148
/// Output Parameters
148149
/// \param result : output array
149-
void operator()(const int &dim, T *result, const T *vector1,
150-
const Real *vector2);
150+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
151151
};
152152

153153
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
350350
// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
351351
template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
352352
using Real = typename GetTypeReal<T>::type;
353-
void operator()(const int &dim, T *result,
354-
const T *vector1, const Real *vector2);
353+
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
355354
};
356355

357356
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)

source/module_base/kernels/rocm/math_kernel_op.hip.cu

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
248248
const int size,
249249
T* result,
250250
const T* vector1,
251-
const typename GetTypeReal<T>::type* vector2)
251+
const typename GetTypeReal<T>::type* vector2,
252+
const bool add)
252253
{
253254
int i = blockIdx.x * blockDim.x + threadIdx.x;
254255
if (i < size)
255256
{
256-
result[i] = vector1[i] * vector2[i];
257+
if (add)
258+
{
259+
result[i] += vector1[i] * vector2[i];
260+
}
261+
else
262+
{
263+
result[i] = vector1[i] * vector2[i];
264+
}
257265
}
258266
}
259267

@@ -471,11 +479,12 @@ template <>
471479
void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
472480
double* result,
473481
const double* vector1,
474-
const double* vector2)
482+
const double* vector2,
483+
const bool& add)
475484
{
476485
int thread = 1024;
477486
int block = (dim + thread - 1) / thread;
478-
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
487+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);
479488

480489
hipCheckOnDebug();
481490
}
@@ -485,32 +494,35 @@ template <typename FPTYPE>
485494
inline void vector_mul_vector_complex_wrapper(const int& dim,
486495
std::complex<FPTYPE>* result,
487496
const std::complex<FPTYPE>* vector1,
488-
const FPTYPE* vector2)
497+
const FPTYPE* vector2,
498+
const bool& add)
489499
{
490500
thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
491501
const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
492502
int thread = 1024;
493503
int block = (dim + thread - 1) / thread;
494-
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
504+
hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);
495505

496506
hipCheckOnDebug();
497507
}
498508
template <>
499509
void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
500510
std::complex<float>* result,
501511
const std::complex<float>* vector1,
502-
const float* vector2)
512+
const float* vector2,
513+
const bool& add)
503514
{
504-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
515+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
505516
}
506517
template <>
507518
void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
508519
const int& dim,
509520
std::complex<double>* result,
510521
const std::complex<double>* vector1,
511-
const double* vector2)
522+
const double* vector2,
523+
const bool& add)
512524
{
513-
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
525+
vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
514526
}
515527
// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
516528
template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
931943
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
932944
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
933945
template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
946+
template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
934947
template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
935948
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
936949
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
940953
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
941954
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
942955
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
956+
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
943957
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
944958
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
945959
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
948962
#ifdef __LCAO
949963
template struct dot_real_op<double, base_device::DEVICE_GPU>;
950964
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
951-
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
952965
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
953966
template struct matrixCopy<double, base_device::DEVICE_GPU>;
954967
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
943943
//! 7) Use Kubo-Greenwood method to compute conductivities
944944
if (PARAM.inp.cal_cond)
945945
{
946-
EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
946+
EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
947947
elec_cond.KG(PARAM.inp.cond_smear,
948948
PARAM.inp.cond_fwhm,
949949
PARAM.inp.cond_wcut,

0 commit comments

Comments
 (0)