Skip to content

Commit 9a2c4fe

Browse files
[SYCL] Implement per-aspect device code split (#7302)
This is a part of the implementation of [optional kernel features](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features) from the SYCL 2020 spec, in accordance with our [design doc](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OptionalDeviceFeatures.md#changes-to-the-device-code-split-algorithm).

Note: this PR doesn't fully implement the feature, but rather lays a foundation for subsequent incremental updates. The main TODO items for future PRs:
- refactor handling of `largeGRF` to use the new splitter
- add handling for other kernel properties like `reqd_sub_group_size`, `reqd_work_group_size`
- add handling of global variables of optional types
1 parent 83d3448 commit 9a2c4fe

File tree

7 files changed

+523
-6
lines changed

7 files changed

+523
-6
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
; This test emulates two translation units with 3 kernels:
2+
; TU0_kernel0 - 1st translation unit, no aspects used
3+
; TU0_kernel1 - 1st translation unit, aspect 1 is used
4+
; TU1_kernel2 - 2nd translation unit, no aspects used
5+
6+
; The test is intended to check that sycl-post-link correctly separates kernels
7+
; that use aspects from kernels which don't use aspects regardless of device
8+
; code split mode
9+
10+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
11+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
12+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
13+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
14+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
15+
; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
16+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
17+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
18+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
19+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
20+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
21+
; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
22+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
23+
24+
; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
25+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
26+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
27+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
28+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
29+
; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
30+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
31+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
32+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
33+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
34+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
35+
; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
36+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
37+
38+
; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
39+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
40+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
41+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \
42+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
43+
; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \
44+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
45+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \
46+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
47+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \
48+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
49+
; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
50+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
51+
52+
; Regardless of device code split mode, each kernel should go into a separate
53+
; device image
54+
55+
; CHECK-M2-IR: define {{.*}} @TU0_kernel0
56+
; CHECK-M2-SYMS: TU0_kernel0
57+
58+
; CHECK-M1-IR: define {{.*}} @TU0_kernel1
59+
; CHECK-M1-SYMS: TU0_kernel1
60+
61+
; CHECK-M0-IR: define {{.*}} @TU1_kernel2
62+
; CHECK-M0-SYMS: TU1_kernel2
63+
64+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
65+
target triple = "spir64-unknown-linux"
66+
67+
; FIXME: device globals should also be properly distributed across device images
68+
; if they are of optional type
69+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
70+
71+
define dso_local spir_kernel void @TU0_kernel0() #0 {
72+
entry:
73+
call spir_func void @foo()
74+
ret void
75+
}
76+
77+
define dso_local spir_func void @foo() {
78+
entry:
79+
%a = alloca i32, align 4
80+
%call = call spir_func i32 @bar(i32 1)
81+
%add = add nsw i32 2, %call
82+
store i32 %add, i32* %a, align 4
83+
ret void
84+
}
85+
86+
; Function Attrs: nounwind
87+
define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) {
88+
entry:
89+
%arg.addr = alloca i32, align 4
90+
store i32 %arg, i32* %arg.addr, align 4
91+
%0 = load i32, i32* %arg.addr, align 4
92+
ret i32 %0
93+
}
94+
95+
define dso_local spir_kernel void @TU0_kernel1() #0 !sycl_used_aspects !2 {
96+
entry:
97+
call spir_func void @foo1()
98+
ret void
99+
}
100+
101+
; Function Attrs: nounwind
102+
define dso_local spir_func void @foo1() {
103+
entry:
104+
%a = alloca i32, align 4
105+
store i32 2, i32* %a, align 4
106+
ret void
107+
}
108+
109+
define dso_local spir_kernel void @TU1_kernel2() #1 {
110+
entry:
111+
call spir_func void @foo2()
112+
ret void
113+
}
114+
115+
; Function Attrs: nounwind
116+
define dso_local spir_func void @foo2() {
117+
entry:
118+
%a = alloca i32, align 4
119+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
120+
%add = add nsw i32 4, %0
121+
store i32 %add, i32* %a, align 4
122+
ret void
123+
}
124+
125+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
126+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
127+
128+
!opencl.spir.version = !{!0, !0}
129+
!spirv.Source = !{!1, !1}
130+
131+
!0 = !{i32 1, i32 2}
132+
!1 = !{i32 4, i32 100000}
133+
!2 = !{i32 1}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; The test is intended to check that sycl-post-link correctly groups kernels
2+
; by unique sets of aspects used in them
3+
4+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
5+
; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
6+
;
7+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \
8+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \
9+
; RUN: --implicit-check-not kernel2
10+
;
11+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \
12+
; RUN: --implicit-check-not kernel3 --implicit-check-not kernel1 \
13+
; RUN: --implicit-check-not kernel2
14+
;
15+
; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \
16+
; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3
17+
18+
; CHECK-TABLE: Code
19+
; CHECK-TABLE-NEXT: _0.sym
20+
; CHECK-TABLE-NEXT: _1.sym
21+
; CHECK-TABLE-NEXT: _2.sym
22+
; CHECK-TABLE-EMPTY:
23+
24+
; CHECK-M0-SYMS: kernel3
25+
26+
; CHECK-M1-SYMS: kernel0
27+
28+
; CHECK-M2-SYMS: kernel1
29+
; CHECK-M2-SYMS: kernel2
30+
31+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
32+
target triple = "spir64-unknown-linux"
33+
34+
define dso_local spir_kernel void @kernel0() #0 !sycl_used_aspects !1 {
35+
entry:
36+
ret void
37+
}
38+
39+
define dso_local spir_kernel void @kernel1() #0 !sycl_used_aspects !2 {
40+
entry:
41+
ret void
42+
}
43+
44+
define dso_local spir_kernel void @kernel2() #0 !sycl_used_aspects !3 {
45+
entry:
46+
ret void
47+
}
48+
49+
define dso_local spir_kernel void @kernel3() #0 !sycl_used_aspects !4 {
50+
entry:
51+
ret void
52+
}
53+
54+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
55+
56+
!1 = !{i32 1}
57+
!2 = !{i32 1, i32 2}
58+
!3 = !{i32 2, i32 1}
59+
!4 = !{i32 2, i32 3, i32 4}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
; This test is intended to check that per-aspect device code split works as
2+
; expected with SYCL_EXTERNAL functions
3+
4+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
5+
; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
6+
;
7+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \
8+
; RUN: --implicit-check-not foo --implicit-check-not kernel1
9+
;
10+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \
11+
; RUN: --implicit-check-not foo --implicit-check-not kernel0
12+
;
13+
; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \
14+
; RUN: --implicit-check-not kernel0 --implicit-check-not foo \
15+
; RUN: --implicit-check-not bar
16+
;
17+
; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK-M2-IR \
18+
; RUN: --implicit-check-not kernel0 --implicit-check-not bar
19+
20+
; We expect to see 3 modules generated:
21+
;
22+
; CHECK-TABLE: Code
23+
; CHECK-TABLE-NEXT: _0.sym
24+
; CHECK-TABLE-NEXT: _1.sym
25+
; CHECK-TABLE-NEXT: _2.sym
26+
; CHECK-TABLE-EMPTY:
27+
28+
; sycl-post-link aims to achieve two goals while doing splitting:
29+
; - each kernel must be self-contained, i.e. all functions called from a
30+
; kernel must reside in the same device image
31+
; - each entry point should be assigned to a correct device image in
32+
; accordance with selected device code split mode
33+
;
34+
; In this test @bar and @foo are SYCL_EXTERNAL functions and they are treated
35+
; as entry points.
36+
;
37+
; @bar uses the same list of aspects as @kernel0 which calls it and therefore
38+
; they can be put into the same device image. @baz goes there as well, because
39+
; it uses the same list of aspects.
40+
;
41+
; CHECK-M0-SYMS: bar
42+
; CHECK-M0-SYMS: baz
43+
; CHECK-M0-SYMS: kernel0
44+
;
45+
; List of aspects used by @foo is different from the one attached to @kernel1
46+
; which calls @foo (for example, @kernel1 uses an extra optional feature besides
47+
; ones used in @foo). As a result, @foo should be both included into the same
48+
; device image as @kernel1 to make it self contained, but at the same time it
49+
; should also appear in a separate device image, because it is an entry point
50+
; with a unique set of used aspects.
51+
;
52+
; CHECK-M1-SYMS: foo
53+
;
54+
; CHECK-M2-SYMS: kernel1
55+
;
56+
; @kernel1 uses @foo and therefore @foo should be present in the same module as
57+
; @kernel1 as well
58+
; CHECK-M2-IR-DAG: define spir_func void @foo
59+
; CHECK-M2-IR-DAG: define spir_kernel void @kernel1
60+
61+
62+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
63+
target triple = "spir64-unknown-linux"
64+
65+
define spir_func void @foo() #0 !sycl_used_aspects !1 {
66+
ret void
67+
}
68+
69+
define spir_func void @bar() #1 !sycl_used_aspects !2 {
70+
ret void
71+
}
72+
73+
define spir_func void @baz() #1 !sycl_used_aspects !2 {
74+
ret void
75+
}
76+
77+
define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 {
78+
entry:
79+
call void @bar()
80+
ret void
81+
}
82+
83+
define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 {
84+
entry:
85+
call void @foo()
86+
ret void
87+
}
88+
89+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
90+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
91+
92+
!1 = !{i32 1}
93+
!2 = !{i32 2}
94+
!3 = !{i32 3, i32 1}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
; This test is intended to check that we do not perform per-aspect split if
2+
; it was disabled through one or another sycl-post-link option
3+
4+
; RUN: sycl-post-link -symbols -S %s -o %t.table
5+
; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE
6+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK-IR
7+
;
8+
; -lower-esimd is needed so sycl-post-link does not complain about no actions
9+
; specified
10+
; RUN: sycl-post-link -lower-esimd -ir-output-only -S %s -o %t.ll
11+
; RUN: FileCheck %s -input-file=%t.ll --check-prefix CHECK-IR
12+
13+
; We expect to see only one module generated:
14+
;
15+
; CHECK-TABLE: Code
16+
; CHECK-TABLE-NEXT: _0.ll
17+
; CHECK-TABLE-EMPTY:
18+
19+
; Regardless of used aspects and sycl-module-id metadata, all kernels and
20+
; functions should still be present.
21+
22+
; CHECK-IR-DAG: define spir_func void @foo
23+
; CHECK-IR-DAG: define spir_func void @bar
24+
; CHECK-IR-DAG: define spir_kernel void @kernel0
25+
; CHECK-IR-DAG: define spir_kernel void @kernel1
26+
27+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
28+
target triple = "spir64-unknown-linux"
29+
30+
define spir_func void @foo() #0 !sycl_used_aspects !1 {
31+
ret void
32+
}
33+
34+
define spir_func void @bar() #1 !sycl_used_aspects !2 {
35+
ret void
36+
}
37+
38+
define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 {
39+
entry:
40+
ret void
41+
}
42+
43+
define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 {
44+
entry:
45+
call void @foo()
46+
ret void
47+
}
48+
49+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
50+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
51+
52+
!1 = !{i32 1}
53+
!2 = !{i32 2}
54+
!3 = !{i32 3, i32 1}
55+

0 commit comments

Comments
 (0)