Skip to content

Commit 33c13c3

Browse files
committed
sve - initial SVE backend framework
1 parent 60ef3fe commit 33c13c3

File tree

8 files changed

+425
-0
lines changed

8 files changed

+425
-0
lines changed

Makefile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
261261
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
262262
opt.c := $(sort $(wildcard backends/opt/*.c))
263263
avx.c := $(sort $(wildcard backends/avx/*.c))
264+
sve.c := $(sort $(wildcard backends/sve/*.c))
264265
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
265266
cuda.c := $(sort $(wildcard backends/cuda/*.c))
266267
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
@@ -332,6 +333,7 @@ info:
332333
$(info ------------------------------------)
333334
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
334335
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
336+
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
335337
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
336338
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
337339
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
@@ -384,6 +386,17 @@ ifneq ($(AVX),)
384386
BACKENDS_MAKE += $(AVX_BACKENDS)
385387
endif
386388

389+
# SVE Backends
390+
SVE_STATUS = Disabled
391+
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
392+
SVE := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
393+
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
394+
ifneq ($(SVE),)
395+
SVE_STATUS = Enabled
396+
libceed.c += $(sve.c)
397+
BACKENDS_MAKE += $(SVE_BACKENDS)
398+
endif
399+
387400
# Collect list of libraries and paths for use in linking and pkg-config
388401
PKG_LIBS =
389402
# Stubs that will not be RPATH'd

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ There are multiple supported backends, which can be selected at runtime in the e
155155
| `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
156156
| `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
157157
| `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
158+
| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
159+
| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
158160
||
159161
| **CPU Valgrind** |
160162
| `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
@@ -200,6 +202,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t
200202

201203
The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.
202204

205+
The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.
206+
203207
The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
204208
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
205209
A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.

backends/ceed-backend-list.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
3131
CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
3232
CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
3333
CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
34+
CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
35+
CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
3436
CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
3537
CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")

backends/sve/ceed-sve-blocked.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#include <stdbool.h>
11+
#include <string.h>
12+
13+
#include "ceed-sve.h"
14+
15+
//------------------------------------------------------------------------------
16+
// Backend Init
17+
//------------------------------------------------------------------------------
18+
static int CeedInit_Sve(const char *resource, Ceed ceed) {
19+
Ceed ceed_ref;
20+
21+
CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked"), ceed,
22+
CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
23+
CeedCallBackend(CeedSetDeterministic(ceed, true));
24+
25+
// Create reference CEED that implementation will be dispatched through unless overridden
26+
CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
27+
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
28+
29+
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
30+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
31+
} else {
32+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
33+
}
34+
return CEED_ERROR_SUCCESS;
35+
}
36+
37+
//------------------------------------------------------------------------------
38+
// Backend Register
39+
//------------------------------------------------------------------------------
40+
CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
41+
//------------------------------------------------------------------------------

backends/sve/ceed-sve-serial.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#include <stdbool.h>
11+
#include <string.h>
12+
13+
#include "ceed-sve.h"
14+
15+
//------------------------------------------------------------------------------
16+
// Backend Init
17+
//------------------------------------------------------------------------------
18+
static int CeedInit_Sve(const char *resource, Ceed ceed) {
19+
Ceed ceed_ref;
20+
21+
CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
22+
"SVE backend cannot use resource: %s", resource);
23+
CeedCallBackend(CeedSetDeterministic(ceed, true));
24+
25+
// Create reference CEED that implementation will be dispatched through unless overridden
26+
CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
27+
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
28+
29+
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
30+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
31+
} else {
32+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
33+
}
34+
return CEED_ERROR_SUCCESS;
35+
}
36+
37+
//------------------------------------------------------------------------------
38+
// Backend Register
39+
//------------------------------------------------------------------------------
40+
CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }
41+
42+
//------------------------------------------------------------------------------

backends/sve/ceed-sve-tensor-f32.c

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#ifdef __ARM_FEATURE_SVE
11+
#include <arm_sve.h>
12+
#endif
13+
#include <stdbool.h>
14+
15+
#include "ceed-sve.h"
16+
17+
//------------------------------------------------------------------------------
18+
// Blocked Tensor Contract
19+
//------------------------------------------------------------------------------
20+
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
21+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
22+
const CeedInt JJ) {
23+
CeedInt t_stride_0 = B, t_stride_1 = 1;
24+
25+
if (t_mode == CEED_TRANSPOSE) {
26+
t_stride_0 = 1;
27+
t_stride_1 = J;
28+
}
29+
30+
for (CeedInt a = 0; a < A; a++) {
31+
for (CeedInt b = 0; b < B; b++) {
32+
// Blocks of JJ rows
33+
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
34+
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
35+
// C vectorization by compiler
36+
for (int32_t c = 0; c < C; c += svcntd()) {
37+
svbool_t pg = svwhilelt_b32(c, C);
38+
// Load u, v into vectors
39+
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
40+
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
41+
// Basis matrix value
42+
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
43+
44+
// fmadd
45+
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
46+
}
47+
}
48+
}
49+
}
50+
}
51+
52+
// Remainder of rows
53+
const CeedInt j = (J / JJ) * JJ;
54+
55+
if (j < J) {
56+
for (CeedInt a = 0; a < A; a++) {
57+
for (CeedInt b = 0; b < B; b++) {
58+
// Blocks of JJ rows
59+
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
60+
// C vectorization by compiler
61+
for (int32_t c = 0; c < C; c += svcntd()) {
62+
svbool_t pg = svwhilelt_b32(c, C);
63+
// Load u, v into vectors
64+
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
65+
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
66+
// Basis matrix value
67+
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
68+
69+
// fmadd
70+
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
71+
}
72+
}
73+
}
74+
}
75+
}
76+
return CEED_ERROR_SUCCESS;
77+
}
78+
79+
//------------------------------------------------------------------------------
80+
// Blocked Tensor Contract
81+
//------------------------------------------------------------------------------
82+
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
83+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
84+
const CeedInt JJ) {
85+
CeedInt t_stride_0 = B, t_stride_1 = 1;
86+
87+
if (t_mode == CEED_TRANSPOSE) {
88+
t_stride_0 = 1;
89+
t_stride_1 = J;
90+
}
91+
92+
for (CeedInt a = 0; a < A; a++) {
93+
for (CeedInt b = 0; b < B; b++) {
94+
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
95+
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
96+
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
97+
}
98+
}
99+
}
100+
}
101+
102+
const CeedInt j = (J / JJ) * JJ;
103+
104+
if (j < J) {
105+
for (CeedInt a = 0; a < A; a++) {
106+
for (CeedInt b = 0; b < B; b++) {
107+
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
108+
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
109+
}
110+
}
111+
}
112+
}
113+
return CEED_ERROR_SUCCESS;
114+
}
115+
116+
//------------------------------------------------------------------------------
117+
// Tensor Contract - Common Sizes
118+
//------------------------------------------------------------------------------
119+
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
120+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
121+
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
122+
}
123+
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
124+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
125+
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
126+
}
127+
128+
//------------------------------------------------------------------------------
129+
// Tensor Contract Apply
130+
//------------------------------------------------------------------------------
131+
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
132+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
133+
if (!add) {
134+
for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
135+
}
136+
137+
if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
138+
else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
139+
return CEED_ERROR_SUCCESS;
140+
}
141+
142+
//------------------------------------------------------------------------------
143+
// Tensor Contract Create
144+
//------------------------------------------------------------------------------
145+
int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
146+
Ceed ceed;
147+
148+
CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
149+
CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
150+
return CEED_ERROR_SUCCESS;
151+
}
152+
153+
//------------------------------------------------------------------------------

0 commit comments

Comments
 (0)