llvm-project-source: update SPIRV-LLVM-Translator 8.0.0 -> 9.0.0

Remove all the backported patches which are available in 9.0.0 release.

Few patches were recommended from llvm-patches repo:
https://github.com/intel/intel-graphics-compiler/blob/master/documentation/build_ubuntu.md
3906cc086f

Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
This commit is contained in:
Naveen Saini 2019-09-12 22:21:43 +08:00 committed by Anuj Mittal
parent 420247ef2d
commit 1b076fd8db
10 changed files with 285 additions and 1552 deletions

View File

@ -1,156 +0,0 @@
From 39a3ac0065c23d1e2d55dfd8792cc28a146a4307 Mon Sep 17 00:00:00 2001
From: Alexey Bader <alexey.bader@intel.com>
Date: Tue, 19 Feb 2019 15:19:06 +0000
Subject: [PATCH 1/2] [OpenCL] Change type of block pointer for OpenCL
Summary:
For some reason OpenCL blocks in LLVM IR are represented as function pointers.
These pointers do not point to any real function and never get called. Actually
they point to some structure, which in turn contains pointer to the real block
invoke function.
This patch changes represntation of OpenCL blocks in LLVM IR from function
pointers to pointers to `%struct.__block_literal_generic`.
Such representation allows to avoid unnecessary bitcasts and simplifies
further processing (e.g. translation to SPIR-V ) of the module for targets
which do not support function pointers.
Patch by: Alexey Sotkin.
Reviewers: Anastasia, yaxunl, svenvh
Reviewed By: Anastasia
Subscribers: alexbatashev, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D58277
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@354337 91177308-0d34-0410-b5e6-96231b3b80d8
Upstream-Status: Backport
[https://github.com/llvm-mirror/clang/commit/283f308bdb5893bab1f36791711346e746045f94]
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
---
lib/CodeGen/CodeGenTypes.cpp | 4 +++-
test/CodeGenOpenCL/blocks.cl | 18 ++++++++----------
test/CodeGenOpenCL/cl20-device-side-enqueue.cl | 18 +++++++++---------
3 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/lib/CodeGen/CodeGenTypes.cpp b/lib/CodeGen/CodeGenTypes.cpp
index 2acf1ac..93b3ebf 100644
--- a/lib/CodeGen/CodeGenTypes.cpp
+++ b/lib/CodeGen/CodeGenTypes.cpp
@@ -637,7 +637,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
case Type::BlockPointer: {
const QualType FTy = cast<BlockPointerType>(Ty)->getPointeeType();
- llvm::Type *PointeeType = ConvertTypeForMem(FTy);
+ llvm::Type *PointeeType = CGM.getLangOpts().OpenCL
+ ? CGM.getGenericBlockLiteralType()
+ : ConvertTypeForMem(FTy);
unsigned AS = Context.getTargetAddressSpace(FTy);
ResultType = llvm::PointerType::get(PointeeType, AS);
break;
diff --git a/test/CodeGenOpenCL/blocks.cl b/test/CodeGenOpenCL/blocks.cl
index 675240c..19aacc3 100644
--- a/test/CodeGenOpenCL/blocks.cl
+++ b/test/CodeGenOpenCL/blocks.cl
@@ -35,11 +35,10 @@ void foo(){
// SPIR: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]], i32 0, i32 3
// SPIR: %[[i_value:.*]] = load i32, i32* %i
// SPIR: store i32 %[[i_value]], i32* %[[block_captured]],
- // SPIR: %[[blk_ptr:.*]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]] to i32 ()*
- // SPIR: %[[blk_gen_ptr:.*]] = addrspacecast i32 ()* %[[blk_ptr]] to i32 () addrspace(4)*
- // SPIR: store i32 () addrspace(4)* %[[blk_gen_ptr]], i32 () addrspace(4)** %[[block_B:.*]],
- // SPIR: %[[blk_gen_ptr:.*]] = load i32 () addrspace(4)*, i32 () addrspace(4)** %[[block_B]]
- // SPIR: %[[block_literal:.*]] = bitcast i32 () addrspace(4)* %[[blk_gen_ptr]] to %struct.__opencl_block_literal_generic addrspace(4)*
+ // SPIR: %[[blk_ptr:.*]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]] to %struct.__opencl_block_literal_generic*
+ // SPIR: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic* %[[blk_ptr]] to %struct.__opencl_block_literal_generic addrspace(4)*
+ // SPIR: store %struct.__opencl_block_literal_generic addrspace(4)* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B:.*]],
+ // SPIR: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic addrspace(4)*, %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B]]
// SPIR: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]], i32 0, i32 2
// SPIR: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]] to i8 addrspace(4)*
// SPIR: %[[invoke_func_ptr:.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %[[invoke_addr]]
@@ -50,11 +49,10 @@ void foo(){
// AMDGCN: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8*, i32 }>, <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block]], i32 0, i32 3
// AMDGCN: %[[i_value:.*]] = load i32, i32 addrspace(5)* %i
// AMDGCN: store i32 %[[i_value]], i32 addrspace(5)* %[[block_captured]],
- // AMDGCN: %[[blk_ptr:.*]] = bitcast <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block]] to i32 () addrspace(5)*
- // AMDGCN: %[[blk_gen_ptr:.*]] = addrspacecast i32 () addrspace(5)* %[[blk_ptr]] to i32 ()*
- // AMDGCN: store i32 ()* %[[blk_gen_ptr]], i32 ()* addrspace(5)* %[[block_B:.*]],
- // AMDGCN: %[[blk_gen_ptr:.*]] = load i32 ()*, i32 ()* addrspace(5)* %[[block_B]]
- // AMDGCN: %[[block_literal:.*]] = bitcast i32 ()* %[[blk_gen_ptr]] to %struct.__opencl_block_literal_generic*
+ // AMDGCN: %[[blk_ptr:.*]] = bitcast <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block]] to %struct.__opencl_block_literal_generic addrspace(5)*
+ // AMDGCN: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic addrspace(5)* %[[blk_ptr]] to %struct.__opencl_block_literal_generic*
+ // AMDGCN: store %struct.__opencl_block_literal_generic* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B:.*]],
+ // AMDGCN: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic*, %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B]]
// AMDGCN: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic* %[[block_literal]], i32 0, i32 2
// AMDGCN: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic* %[[block_literal]] to i8*
// AMDGCN: %[[invoke_func_ptr:.*]] = load i8*, i8** %[[invoke_addr]]
diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index 4732194..8445016 100644
--- a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -11,7 +11,7 @@ typedef struct {int a;} ndrange_t;
// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
-// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G = addrspace(1) constant %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*)
// For anonymous blocks without captures, emit block literals as global variable.
// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) }
@@ -77,9 +77,9 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
- // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
- // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
- // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
+ // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to %struct.__opencl_block_literal_generic*
+ // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to %struct.__opencl_block_literal_generic*
+ // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[BL]] to i8 addrspace(4)*
// COMMON-LABEL: call i32 @__enqueue_kernel_basic(
// COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}},
// COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK1:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
@@ -95,8 +95,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
// COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
// COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL2:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
- // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
- // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
+ // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to %struct.__opencl_block_literal_generic*
+ // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[BL]] to i8 addrspace(4)*
// COMMON-LABEL: call i32 @__enqueue_kernel_basic_events
// COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]],
// COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
@@ -300,13 +300,13 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// Emits global block literal [[BLG8]] and invoke function [[INVG8]].
// The full type of these expressions are long (and repeated elsewhere), so we
// capture it as part of the regex for convenience and clarity.
- // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A
+ // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %block_A
void (^const block_A)(void) = ^{
return;
};
// Emits global block literal [[BLG9]] and invoke function [[INVG9]].
- // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B
+ // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %block_B
void (^const block_B)(local void *) = ^(local void *a) {
return;
};
@@ -346,7 +346,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL3:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
- // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* {{.*}} to i8 addrspace(4)*
+ // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* {{.*}} to i8 addrspace(4)*
// COMMON-LABEL: call i32 @__enqueue_kernel_basic(
// COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}},
// COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK3:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
--
1.8.3.1

View File

@ -1,986 +0,0 @@
From 177cce531fd3665bb964a03db51890e0241e3e72 Mon Sep 17 00:00:00 2001
From: Alexey Sotkin <alexey.sotkin@intel.com>
Date: Thu, 21 Feb 2019 17:14:36 +0300
Subject: [PATCH] Update LowerOpenCL pass to handle new blocks represntation in
LLVM IR
Upstream-Status: Backport [https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/bd6ddfaf7232cd81c7f2fe9877e66f286731bd8e]
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
---
lib/SPIRV/SPIRVLowerOCLBlocks.cpp | 249 ++++--------------------------
test/global_block.ll | 71 ++++-----
test/literal-struct.ll | 31 ++--
test/transcoding/block_w_struct_return.ll | 47 +++---
test/transcoding/enqueue_kernel.ll | 237 ++++++++++++++++------------
5 files changed, 235 insertions(+), 400 deletions(-)
diff --git a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
index c80bf04..b42a4ec 100644
--- a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
+++ b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
@@ -40,207 +40,34 @@
// In both cases values with function type used as intermediate representation
// for block literal structure.
//
-// This pass is designed to find such cases and simplify them to avoid any
-// function pointer types occurrences in LLVM IR in 4 steps.
-//
-// 1. Find all function pointer allocas, like
-// %block = alloca void () *
-//
-// Then find a single store to that alloca:
-// %blockLit = alloca <{ i32, i32, ...}>, align 4
-// %0 = bitcast <{ i32, i32, ... }>* %blockLit to void ()*
-// > store void ()* %0, void ()** %block, align 4
-//
-// And replace the alloca users by new instructions which used stored value
-// %blockLit itself instead of function pointer alloca %block.
-//
-// 2. Find consecutive casts from block literal type to i8 addrspace(4)*
-// used function pointers as an intermediate type:
-// %0 = bitcast <{ i32, i32 }> %block to void() *
-// %1 = addrspacecast void() * %0 to i8 addrspace(4)*
-// And simplify them:
-// %2 = addrspacecast <{ i32, i32 }> %block to i8 addrspace(4)*
-//
-// 3. Find all unused instructions with function pointer type occured after
-// pp.1-2 and remove them.
-//
-// 4. Find unused globals with function pointer type, like
-// @block = constant void ()*
-// bitcast ({ i32, i32 }* @__block_literal_global to void ()*
-//
-// And remove them.
+// In LLVM IR produced by clang, blocks are represented with the following
+// structure:
+// %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+// Pointers to block invoke functions are stored in the third field. Clang
+// replaces inderect function calls in all cases except if block is passed as a
+// function argument. Note that it is somewhat unclear if the OpenCL C spec
+// should allow passing blocks as function argumernts. This pass is not supposed
+// to work correctly with such functions.
+// Clang though has to store function pointers to this structure. Purpose of
+// this pass is to replace store of function pointers(not allowed in SPIR-V)
+// with null pointers.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "spv-lower-ocl-blocks"
-#include "OCLUtil.h"
#include "SPIRVInternal.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/PassSupport.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/Support/Regex.h"
using namespace llvm;
namespace {
-static void
-removeUnusedFunctionPtrInst(Instruction *I,
- SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
- for (unsigned OpIdx = 0, Ops = I->getNumOperands(); OpIdx != Ops; ++OpIdx) {
- Instruction *OpI = dyn_cast<Instruction>(I->getOperand(OpIdx));
- I->setOperand(OpIdx, nullptr);
- if (OpI && OpI != I && OpI->user_empty())
- FuncPtrInsts.insert(OpI);
- }
- I->eraseFromParent();
-}
-
-static bool isFuncPtrAlloca(const AllocaInst *AI) {
- auto *ET = dyn_cast<PointerType>(AI->getAllocatedType());
- return ET && ET->getElementType()->isFunctionTy();
-}
-
-static bool hasFuncPtrType(const Value *V) {
- auto *PT = dyn_cast<PointerType>(V->getType());
- return PT && PT->getElementType()->isFunctionTy();
-}
-
-static bool isFuncPtrInst(const Instruction *I) {
- if (auto *AI = dyn_cast<AllocaInst>(I))
- return isFuncPtrAlloca(AI);
-
- for (auto &Op : I->operands()) {
- if (auto *AI = dyn_cast<AllocaInst>(Op))
- return isFuncPtrAlloca(AI);
-
- auto *OpI = dyn_cast<Instruction>(&Op);
- if (OpI && OpI != I && hasFuncPtrType(OpI))
- return true;
- }
- return false;
-}
-
-static StoreInst *findSingleStore(AllocaInst *AI) {
- StoreInst *Store = nullptr;
- for (auto *U : AI->users()) {
- if (!isa<StoreInst>(U))
- continue; // not a store
- if (Store)
- return nullptr; // there are more than one stores
- Store = dyn_cast<StoreInst>(U);
- }
- return Store;
-}
-
-static void fixFunctionPtrAllocaUsers(AllocaInst *AI) {
- // Find and remove a single store to alloca
- auto *SingleStore = findSingleStore(AI);
- assert(SingleStore && "More than one store to the function pointer alloca");
- auto *StoredVal = SingleStore->getValueOperand();
- SingleStore->eraseFromParent();
-
- // Find loads from the alloca and replace thier users
- for (auto *U : AI->users()) {
- auto *LI = dyn_cast<LoadInst>(U);
- if (!LI)
- continue;
-
- for (auto *U : LI->users()) {
- auto *UInst = cast<Instruction>(U);
- auto *Cast = CastInst::CreatePointerBitCastOrAddrSpaceCast(
- StoredVal, UInst->getType(), "", UInst);
- UInst->replaceAllUsesWith(Cast);
- }
- }
-}
-
-static int getBlockLiteralIdx(const Function &F) {
- StringRef FName = F.getName();
- if (isEnqueueKernelBI(FName))
- return FName.contains("events") ? 7 : 4;
- if (isKernelQueryBI(FName))
- return FName.contains("for_ndrange") ? 2 : 1;
- if (FName.startswith("__") && FName.contains("_block_invoke"))
- return F.hasStructRetAttr() ? 1 : 0;
-
- return -1; // No block literal argument
-}
-
-static bool hasBlockLiteralArg(const Function &F) {
- return getBlockLiteralIdx(F) != -1;
-}
-
-static bool simplifyFunctionPtrCasts(Function &F) {
- bool Changed = false;
- int BlockLiteralIdx = getBlockLiteralIdx(F);
- for (auto *U : F.users()) {
- auto *Call = dyn_cast<CallInst>(U);
- if (!Call)
- continue;
- if (Call->getFunction()->getName() == F.getName().str() + "_kernel")
- continue; // Skip block invoke function calls inside block invoke kernels
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- auto *BlockLiteral = Call->getOperand(BlockLiteralIdx);
- auto *BlockLiteralVal = GetUnderlyingObject(BlockLiteral, DL);
- if (isa<GlobalVariable>(BlockLiteralVal))
- continue; // nothing to do with globals
-
- auto *BlockLiteralAlloca = cast<AllocaInst>(BlockLiteralVal);
- assert(!BlockLiteralAlloca->getAllocatedType()->isFunctionTy() &&
- "Function type shouldn't be there");
-
- auto *NewBlockLiteral = CastInst::CreatePointerBitCastOrAddrSpaceCast(
- BlockLiteralAlloca, BlockLiteral->getType(), "", Call);
- BlockLiteral->replaceAllUsesWith(NewBlockLiteral);
- Changed |= true;
- }
- return Changed;
-}
-
-static void
-findFunctionPtrAllocas(Module &M,
- SmallVectorImpl<AllocaInst *> &FuncPtrAllocas) {
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
- for (auto &I : instructions(F)) {
- auto *AI = dyn_cast<AllocaInst>(&I);
- if (!AI || !isFuncPtrAlloca(AI))
- continue;
- FuncPtrAllocas.push_back(AI);
- }
- }
-}
-
-static void
-findUnusedFunctionPtrInsts(Module &M,
- SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
- for (auto &I : instructions(F))
- if (I.user_empty() && isFuncPtrInst(&I))
- FuncPtrInsts.insert(&I);
- }
-}
-
-static void
-findUnusedFunctionPtrGlbs(Module &M,
- SmallVectorImpl<GlobalVariable *> &FuncPtrGlbs) {
- for (auto &GV : M.globals()) {
- if (!GV.user_empty())
- continue;
- auto *GVType = dyn_cast<PointerType>(GV.getType()->getElementType());
- if (GVType && GVType->getElementType()->isFunctionTy())
- FuncPtrGlbs.push_back(&GV);
- }
+static bool isBlockInvoke(Function &F) {
+ static Regex BlockInvokeRegex("_block_invoke_?[0-9]*$");
+ return BlockInvokeRegex.match(F.getName());
}
class SPIRVLowerOCLBlocks : public ModulePass {
@@ -250,44 +77,24 @@ public:
bool runOnModule(Module &M) {
bool Changed = false;
-
- // 1. Find function pointer allocas and fix their users
- SmallVector<AllocaInst *, 16> FuncPtrAllocas;
- findFunctionPtrAllocas(M, FuncPtrAllocas);
-
- Changed |= !FuncPtrAllocas.empty();
- for (auto *AI : FuncPtrAllocas)
- fixFunctionPtrAllocaUsers(AI);
-
- // 2. Simplify consecutive casts which use function pointer types
- for (auto &F : M)
- if (hasBlockLiteralArg(F))
- Changed |= simplifyFunctionPtrCasts(F);
-
- // 3. Cleanup unused instructions with function pointer type
- // which are occured after pp. 1-2
- SmallSetVector<Instruction *, 16> FuncPtrInsts;
- findUnusedFunctionPtrInsts(M, FuncPtrInsts);
-
- Changed |= !FuncPtrInsts.empty();
- while (!FuncPtrInsts.empty()) {
- Instruction *I = FuncPtrInsts.pop_back_val();
- removeUnusedFunctionPtrInst(I, FuncPtrInsts);
+ for (Function &F : M) {
+ if (!isBlockInvoke(F))
+ continue;
+ for (User *U : F.users()) {
+ if (!isa<Constant>(U))
+ continue;
+ Constant *Null = Constant::getNullValue(U->getType());
+ if (U != Null) {
+ U->replaceAllUsesWith(Null);
+ Changed = true;
+ }
+ }
}
-
- // 4. Find and remove unused global variables with function pointer type
- SmallVector<GlobalVariable *, 16> FuncPtrGlbs;
- findUnusedFunctionPtrGlbs(M, FuncPtrGlbs);
-
- Changed |= !FuncPtrGlbs.empty();
- for (auto *GV : FuncPtrGlbs)
- GV->eraseFromParent();
-
return Changed;
}
static char ID;
-}; // class SPIRVLowerOCLBlocks
+};
char SPIRVLowerOCLBlocks::ID = 0;
diff --git a/test/global_block.ll b/test/global_block.ll
index 4fc453b..b558213 100644
--- a/test/global_block.ll
+++ b/test/global_block.ll
@@ -17,7 +17,7 @@
; RUN: spirv-val %t.spv
; RUN: llvm-spirv -r %t.spv -o - | llvm-dis | FileCheck %s --check-prefix=CHECK-LLVM
-target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"
; CHECK-SPIRV: Name [[block_invoke:[0-9]+]] "_block_invoke"
@@ -27,71 +27,56 @@ target triple = "spir-unknown-unknown"
; CHECK-SPIRV: TypePointer [[int8Ptr:[0-9]+]] 8 [[int8]]
; CHECK-SPIRV: TypeFunction [[block_invoke_type:[0-9]+]] [[int]] [[int8Ptr]] [[int]]
-;; This variable is not needed in SPIRV
-; CHECK-SPIRV-NOT: Name {{[0-9]+}} block_kernel.b1
-; CHECK-LLVM-NOT: @block_kernel.b1
-@block_kernel.b1 = internal addrspace(2) constant i32 (i32) addrspace(4)* addrspacecast (i32 (i32) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i32 (i32) addrspace(1)*) to i32 (i32) addrspace(4)*), align 8
+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@block_kernel.b1 = internal addrspace(2) constant %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), align 4
+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (i32 (i8 addrspace(4)*, i32)* @_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
-; Function Attrs: convergent nounwind
-define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+; Function Attrs: convergent noinline nounwind optnone
+define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
- %res.addr = alloca i32 addrspace(1)*, align 8
- store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
-
+ %res.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 4
; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] {{[0-9]+}} [[five]]
; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 5)
- %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
-
- %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
- store i32 %call, i32 addrspace(1)* %0, align 4, !tbaa !14
+ %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 4
+ store i32 %call, i32 addrspace(1)* %0, align 4
ret void
}
-; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 0 [[block_invoke_type]]
+; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 2 [[block_invoke_type]]
; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int8Ptr]] {{[0-9]+}}
; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int]] {{[0-9]+}}
; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 %{{.*}})
-; Function Attrs: convergent nounwind
+; Function Attrs: convergent noinline nounwind optnone
define internal spir_func i32 @_block_invoke(i8 addrspace(4)* %.block_descriptor, i32 %i) #1 {
entry:
- %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
+ %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
%i.addr = alloca i32, align 4
- store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
- store i32 %i, i32* %i.addr, align 4, !tbaa !14
- %0 = load i32, i32* %i.addr, align 4, !tbaa !14
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
+ store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
+ store i32 %i, i32* %i.addr, align 4
+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
+ %0 = load i32, i32* %i.addr, align 4
%add = add nsw i32 %0, 1
ret i32 %add
}
-attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent }
!llvm.module.flags = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
!opencl.ocl.version = !{!1}
!opencl.spir.version = !{!1}
-!opencl.used.extensions = !{!2}
-!opencl.used.optional.core.features = !{!2}
-!opencl.compiler.options = !{!2}
-!llvm.ident = !{!3}
+!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 2, i32 0}
-!2 = !{}
-!3 = !{!"clang version 7.0.0"}
-!4 = !{i32 1}
-!5 = !{!"none"}
-!6 = !{!"int*"}
-!7 = !{!""}
-!8 = !{i1 false}
-!9 = !{i32 0}
-!10 = !{!11, !11, i64 0}
-!11 = !{!"any pointer", !12, i64 0}
-!12 = !{!"omnipotent char", !13, i64 0}
-!13 = !{!"Simple C/C++ TBAA"}
-!14 = !{!15, !15, i64 0}
-!15 = !{!"int", !12, i64 0}
+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"int*"}
+!6 = !{!""}
diff --git a/test/literal-struct.ll b/test/literal-struct.ll
index b88187f..dec957a 100644
--- a/test/literal-struct.ll
+++ b/test/literal-struct.ll
@@ -2,7 +2,7 @@
; structs, i.e. structs whose type has no name. Typicaly clang generate such
; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with
; the following command:
-; clang -cc1 -triple spir -cl-std=cl2.0 -O0 -finclude-default-header literal-struct.cl -emit-llvm -o test/literal-struct.ll
+; clang -cc1 -triple spir -cl-std=cl2.0 -O0 literal-struct.cl -emit-llvm -o test/literal-struct.ll
; literal-struct.cl:
; void foo()
@@ -17,25 +17,28 @@
; RUN: llvm-spirv %t.bc -o %t.spv
; RUN: spirv-val %t.spv
-; CHECK-DAG: TypeInt [[Int:[0-9]+]] 32 0
-; CHECK-DAG: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] {{$}}
+; CHECK: TypeInt [[Int:[0-9]+]] 32 0
+; CHECK: TypeInt [[Int8:[0-9]+]] 8 0
+; CHECK: TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8]]
+; CHECK: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] [[Int8Ptr]]
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir"
-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__foo_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
; CHECK: ConstantComposite [[StructType]]
-; This is artificial case is added to cover ConstantNull instrucitions with TypeStruct.
-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } zeroinitializer, align 4
+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } zeroinitializer, align 4
; CHECK: ConstantNull [[StructType]]
; Function Attrs: convergent noinline nounwind optnone
define spir_func void @foo() #0 {
entry:
- %myBlock = alloca void () addrspace(4)*, align 4
- store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %myBlock, align 4
- call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
+ %myBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 4
+ store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %myBlock, align 4
+ call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
ret void
}
@@ -43,14 +46,14 @@ entry:
define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) #0 {
entry:
%.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
ret void
}
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { convergent }
!llvm.module.flags = !{!0}
@@ -60,4 +63,4 @@ attributes #1 = { convergent }
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 2, i32 0}
-!2 = !{!"clang version 8.0.0 "}
+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
diff --git a/test/transcoding/block_w_struct_return.ll b/test/transcoding/block_w_struct_return.ll
index a68820f..ebd2c5f 100644
--- a/test/transcoding/block_w_struct_return.ll
+++ b/test/transcoding/block_w_struct_return.ll
@@ -16,6 +16,8 @@
; res[tid] = kernelBlock(aa).a - 6;
; }
+; clang -cc1 -triple spir -cl-std=cl2.0 -disable-llvm-passes -finclude-default-header block_w_struct_return.cl -emit-llvm -o test/transcoding/block_w_struct_return.ll
+
; RUN: llvm-as %s -o %t.bc
; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt
; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV
@@ -28,12 +30,14 @@
; CHECK-SPIRV: Name [[BlockInv:[0-9]+]] "__block_ret_struct_block_invoke"
; CHECK-SPIRV: 4 TypeInt [[IntTy:[0-9]+]] 32
+; CHECK-SPIRV: 4 TypeInt [[Int8Ty:[0-9]+]] 8
+; CHECK-SPIRV: 4 TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8Ty]]
; CHECK-SPIRV: 3 TypeStruct [[StructTy:[0-9]+]] [[IntTy]]
; CHECK-SPIRV: 4 TypePointer [[StructPtrTy:[0-9]+]] 7 [[StructTy]]
; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructArg:[0-9]+]] 7
; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructRet:[0-9]+]] 7
-; CHECK-SPIRV: 4 PtrCastToGeneric {{[0-9]+}} [[BlockLit:[0-9]+]] {{[0-9]+}}
+; CHECK-SPIRV: 4 PtrCastToGeneric [[Int8Ptr]] [[BlockLit:[0-9]+]] {{[0-9]+}}
; CHECK-SPIRV: 7 FunctionCall {{[0-9]+}} {{[0-9]+}} [[BlockInv]] [[StructRet]] [[BlockLit]] [[StructArg]]
; CHECK-LLVM: %[[StructA:.*]] = type { i32 }
@@ -42,20 +46,21 @@
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir64-unknown-unknown"
+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
%struct.A = type { i32 }
-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 16, i32 8, i8 addrspace(4)* addrspacecast (i8* bitcast (void (%struct.A*, i8 addrspace(4)*, %struct.A*)* @__block_ret_struct_block_invoke to i8*) to i8 addrspace(4)*) }, align 8
; Function Attrs: convergent noinline nounwind optnone
-define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 !kernel_arg_host_accessible !8 !kernel_arg_pipe_depth !9 !kernel_arg_pipe_io !7 !kernel_arg_buffer_location !7 {
+define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%res.addr = alloca i32 addrspace(1)*, align 8
- %kernelBlock = alloca void (%struct.A*, %struct.A*) addrspace(4)*, align 8
+ %kernelBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 8
%tid = alloca i64, align 8
%aa = alloca %struct.A, align 4
%tmp = alloca %struct.A, align 4
store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8
- store void (%struct.A*, %struct.A*) addrspace(4)* addrspacecast (void (%struct.A*, %struct.A*) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void (%struct.A*, %struct.A*) addrspace(1)*) to void (%struct.A*, %struct.A*) addrspace(4)*), void (%struct.A*, %struct.A*) addrspace(4)** %kernelBlock, align 8
+ store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %kernelBlock, align 8
%call = call spir_func i64 @_Z13get_global_idj(i32 0) #4
store i64 %call, i64* %tid, align 8
%0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8
@@ -64,7 +69,7 @@ entry:
store i32 -1, i32 addrspace(1)* %arrayidx, align 4
%a = getelementptr inbounds %struct.A, %struct.A* %aa, i32 0, i32 0
store i32 5, i32* %a, align 4
- call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
+ call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
%a1 = getelementptr inbounds %struct.A, %struct.A* %tmp, i32 0, i32 0
%2 = load i32, i32* %a1, align 4
%sub = sub nsw i32 %2, 6
@@ -79,10 +84,10 @@ entry:
define internal spir_func void @__block_ret_struct_block_invoke(%struct.A* noalias sret %agg.result, i8 addrspace(4)* %.block_descriptor, %struct.A* byval align 4 %a) #1 {
entry:
%.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 8
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 8
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 8
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 8
%a1 = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 0
store i32 6, i32* %a1, align 4
%0 = bitcast %struct.A* %agg.result to i8*
@@ -97,30 +102,22 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture r
; Function Attrs: convergent nounwind readnone
declare spir_func i64 @_Z13get_global_idj(i32) #3
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { convergent nounwind readnone }
attributes #5 = { convergent }
!llvm.module.flags = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
!opencl.ocl.version = !{!1}
!opencl.spir.version = !{!1}
-!opencl.used.extensions = !{!2}
-!opencl.used.optional.core.features = !{!2}
-!opencl.compiler.options = !{!2}
-!llvm.ident = !{!3}
+!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 2, i32 0}
-!2 = !{}
-!3 = !{!"clang version 7.0.0"}
-!4 = !{i32 1}
-!5 = !{!"none"}
-!6 = !{!"int*"}
-!7 = !{!""}
-!8 = !{i1 false}
-!9 = !{i32 0}
-
+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"int*"}
+!6 = !{!""}
diff --git a/test/transcoding/enqueue_kernel.ll b/test/transcoding/enqueue_kernel.ll
index 1f0b360..761043e 100644
--- a/test/transcoding/enqueue_kernel.ll
+++ b/test/transcoding/enqueue_kernel.ll
@@ -51,11 +51,12 @@
; ModuleID = 'enqueue_kernel.cl'
source_filename = "enqueue_kernel.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir-unknown-unknown"
+target triple = "spir"
%opencl.queue_t = type opaque
%struct.ndrange_t = type { i32 }
%opencl.clk_event_t = type opaque
+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
@@ -66,89 +67,123 @@ target triple = "spir-unknown-unknown"
; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt0:[0-9]+]] 0
-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 17
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 21
; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt2:[0-9]+]] 2
-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 20
-; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 24
; CHECK-SPIRV: TypePointer {{[0-9]+}} 7 {{[0-9]+}}
+; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
+; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
; CHECK-SPIRV: TypePointer [[Int32LocPtrTy:[0-9]+]] 7 [[Int32Ty]]
; CHECK-SPIRV: TypeDeviceEvent [[EventTy:[0-9]+]]
-; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
; CHECK-SPIRV: TypePointer [[EventPtrTy:[0-9]+]] 8 [[EventTy]]
; CHECK-SPIRV: TypeFunction [[BlockTy1:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
; CHECK-SPIRV: TypeFunction [[BlockTy2:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
; CHECK-SPIRV: TypeFunction [[BlockTy3:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
; CHECK-SPIRV: ConstantNull [[EventPtrTy]] [[EventNull:[0-9]+]]
-; CHECK-LLVM: [[BlockTy1:%[0-9a-z\.]+]] = type { i32, i32 }
-; CHECK-LLVM: [[BlockTy2:%[0-9a-z\.]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i8 }>
-; CHECK-LLVM: [[BlockTy3:%[0-9a-z\.]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
-; CHECK-LLVM: [[BlockTy4:%[0-9a-z\.]+]] = type <{ i32, i32 }>
+; CHECK-LLVM: [[BlockTy1:%[0-9a-z\.]+]] = type { i32, i32, i8 addrspace(4)* }
+; CHECK-LLVM: [[BlockTy2:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>
+; CHECK-LLVM: [[BlockTy3:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
+; CHECK-LLVM: [[BlockTy4:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)* }>
-; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
-; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
+; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
+; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3 to i8*) to i8 addrspace(4)*) }, align 4
+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4 to i8*) to i8 addrspace(4)*) }, align 4
; Function Attrs: convergent noinline nounwind optnone
-define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
+ %a.addr = alloca i32 addrspace(1)*, align 4
+ %b.addr = alloca i32 addrspace(1)*, align 4
+ %i.addr = alloca i32, align 4
+ %c0.addr = alloca i8, align 1
%default_queue = alloca %opencl.queue_t*, align 4
%flags = alloca i32, align 4
%ndrange = alloca %struct.ndrange_t, align 4
%clk_event = alloca %opencl.clk_event_t*, align 4
%event_wait_list = alloca %opencl.clk_event_t*, align 4
%event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4
- %block = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, align 4
- %block3 = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
+ %block = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, align 4
+ %tmp = alloca %struct.ndrange_t, align 4
+ %block3 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
+ %tmp4 = alloca %struct.ndrange_t, align 4
%c = alloca i8, align 1
+ %tmp11 = alloca %struct.ndrange_t, align 4
+ %block_sizes = alloca [1 x i32], align 4
+ %tmp12 = alloca %struct.ndrange_t, align 4
+ %block_sizes13 = alloca [3 x i32], align 4
+ store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4
+ store i32 addrspace(1)* %b, i32 addrspace(1)** %b.addr, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i8 %c0, i8* %c0.addr, align 1
store i32 0, i32* %flags, align 4
%arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
%0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4
store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4
%1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
%2 = load i32, i32* %flags, align 4
- %block.size = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
- store i32 17, i32* %block.size, align 4
- %block.align = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
+ %3 = bitcast %struct.ndrange_t* %tmp to i8*
+ %4 = bitcast %struct.ndrange_t* %ndrange to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %3, i8* align 4 %4, i32 4, i1 false)
+ %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
+ store i32 21, i32* %block.size, align 4
+ %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
store i32 4, i32* %block.align, align 4
- %block.captured = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
- store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured, align 4
- %block.captured1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
- store i32 %i, i32* %block.captured1, align 4
- %block.captured2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
- store i8 %c0, i8* %block.captured2, align 4
- %3 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block to void ()*
- %4 = addrspacecast void ()* %3 to i8 addrspace(4)*
+ %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
+ store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 4
+ %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
+ %5 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
+ store i32 addrspace(1)* %5, i32 addrspace(1)** %block.captured, align 4
+ %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
+ %6 = load i32, i32* %i.addr, align 4
+ store i32 %6, i32* %block.captured1, align 4
+ %block.captured2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 5
+ %7 = load i8, i8* %c0.addr, align 1
+ store i8 %7, i8* %block.captured2, align 4
+ %8 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block to %struct.__opencl_block_literal_generic*
+ %9 = addrspacecast %struct.__opencl_block_literal_generic* %8 to i8 addrspace(4)*
; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]]
; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
; [[ConstInt0]] [[EventNull]] [[EventNull]]
; [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
-; CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast [[BlockTy2]]* %block to i8 addrspace(4)*
+; CHECK-LLVM: [[Block2:%[0-9]+]] = bitcast [[BlockTy2]]* %block to %struct.__opencl_block_literal_generic*
+; CHECK-LLVM: [[Block2Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block2]] to i8 addrspace(4)*
; CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)*
-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2]])
-
- %5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %4)
- %6 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
- %7 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
- %block.size5 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
- store i32 20, i32* %block.size5, align 4
- %block.align6 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2Ptr]])
+
+ %10 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %tmp, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
+ %11 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
+ %12 = load i32, i32* %flags, align 4
+ %13 = bitcast %struct.ndrange_t* %tmp4 to i8*
+ %14 = bitcast %struct.ndrange_t* %ndrange to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %13, i8* align 4 %14, i32 4, i1 false)
+ %15 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
+ %16 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
+ %block.size5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
+ store i32 24, i32* %block.size5, align 4
+ %block.align6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
store i32 4, i32* %block.align6, align 4
- %block.captured7 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
- store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured7, align 4
- %block.captured8 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
- store i32 %i, i32* %block.captured8, align 4
- %block.captured9 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
- store i32 addrspace(1)* %b, i32 addrspace(1)** %block.captured9, align 4
- %8 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to void ()*
- %9 = addrspacecast void ()* %8 to i8 addrspace(4)*
+ %block.invoke7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
+ store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke7, align 4
+ %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
+ %17 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
+ store i32 addrspace(1)* %17, i32 addrspace(1)** %block.captured8, align 4
+ %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
+ %18 = load i32, i32* %i.addr, align 4
+ store i32 %18, i32* %block.captured9, align 4
+ %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 5
+ %19 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4
+ store i32 addrspace(1)* %19, i32 addrspace(1)** %block.captured10, align 4
+ %20 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to %struct.__opencl_block_literal_generic*
+ %21 = addrspacecast %struct.__opencl_block_literal_generic* %20 to i8 addrspace(4)*
+
; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]]
; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event2:[0-9]+]]
@@ -158,16 +193,24 @@ entry:
; [[ConstInt2]] [[Event1]] [[Event2]]
; [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
-; CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast [[BlockTy3]]* %block3 to i8 addrspace(4)*
+; CHECK-LLVM: [[Block3:%[0-9]+]] = bitcast [[BlockTy3]]* %block3 to %struct.__opencl_block_literal_generic*
+; CHECK-LLVM: [[Block3Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block3]] to i8 addrspace(4)
; CHECK-LLVM: [[BlockInv3:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8 addrspace(4)*
-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3]])
-
- %10 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
- %11 = alloca [1 x i32]
- %12 = getelementptr [1 x i32], [1 x i32]* %11, i32 0, i32 0
- %13 = load i8, i8* %c, align 1
- %14 = zext i8 %13 to i32
- store i32 %14, i32* %12, align 4
+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3Ptr]])
+
+ %22 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %11, i32 %12, %struct.ndrange_t* %tmp4, i32 2, %opencl.clk_event_t* addrspace(4)* %15, %opencl.clk_event_t* addrspace(4)* %16, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %21)
+ %23 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
+ %24 = load i32, i32* %flags, align 4
+ %25 = bitcast %struct.ndrange_t* %tmp11 to i8*
+ %26 = bitcast %struct.ndrange_t* %ndrange to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %25, i8* align 4 %26, i32 4, i1 false)
+ %arraydecay = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+ %27 = addrspacecast %opencl.clk_event_t** %arraydecay to %opencl.clk_event_t* addrspace(4)*
+ %28 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
+ %29 = getelementptr [1 x i32], [1 x i32]* %block_sizes, i32 0, i32 0
+ %30 = load i8, i8* %c, align 1
+ %31 = zext i8 %30 to i32
+ store i32 %31, i32* %29, align 4
; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf31:[0-9]+]]
; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb1:[0-9]+]]
@@ -182,14 +225,18 @@ entry:
; CHECK-LLVM: [[BlockInv0:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8 addrspace(4)*
; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv0]], i8 addrspace(4)* [[Block0]], i32 1, i32* {{.*}})
- %15 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %12)
- %16 = alloca [3 x i32]
- %17 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 0
- store i32 1, i32* %17, align 4
- %18 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 1
- store i32 2, i32* %18, align 4
- %19 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 2
- store i32 4, i32* %19, align 4
+ %32 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %23, i32 %24, %struct.ndrange_t* %tmp11, i32 2, %opencl.clk_event_t* addrspace(4)* %27, %opencl.clk_event_t* addrspace(4)* %28, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %29)
+ %33 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
+ %34 = load i32, i32* %flags, align 4
+ %35 = bitcast %struct.ndrange_t* %tmp12 to i8*
+ %36 = bitcast %struct.ndrange_t* %ndrange to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %35, i8* align 4 %36, i32 4, i1 false)
+ %37 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 0
+ store i32 1, i32* %37, align 4
+ %38 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 1
+ store i32 2, i32* %38, align 4
+ %39 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 2
+ store i32 4, i32* %39, align 4
; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf41:[0-9]+]]
; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf42:[0-9]+]]
@@ -206,24 +253,27 @@ entry:
; CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)*
; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
- %20 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %17)
+ %40 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %33, i32 %34, %struct.ndrange_t* %tmp12, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %37)
ret void
}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1) #1
+
; Function Attrs: convergent noinline nounwind optnone
define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 {
entry:
%.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
- %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
- store <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
- %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
+ store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
+ %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 5
%0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4
%conv = sext i8 %0 to i32
- %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 2
+ %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
%1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4
- %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
+ %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
%2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2
store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
@@ -243,19 +293,19 @@ declare i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i
define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #2 {
entry:
%.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
- %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
- store <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
- %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
+ store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
+ %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
- %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
+ %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
%1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
%2 = load i32, i32 addrspace(1)* %arrayidx, align 4
- %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 2
+ %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
%3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
- %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
+ %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
%4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
%arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
@@ -276,11 +326,11 @@ define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspac
entry:
%.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
%p.addr = alloca i8 addrspace(3)*, align 4
- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4
- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
ret void
}
@@ -300,13 +350,13 @@ entry:
%p1.addr = alloca i8 addrspace(3)*, align 4
%p2.addr = alloca i8 addrspace(3)*, align 4
%p3.addr = alloca i8 addrspace(3)*, align 4
- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4
store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4
store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4
- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
ret void
}
@@ -329,27 +379,20 @@ declare i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*,
; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*)
; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)
-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
-attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
!opencl.ocl.version = !{!1}
!opencl.spir.version = !{!1}
-!opencl.used.extensions = !{!2}
-!opencl.used.optional.core.features = !{!2}
-!opencl.compiler.options = !{!2}
-!llvm.ident = !{!3}
+!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 2, i32 0}
-!2 = !{}
-!3 = !{!"clang version 7.0.0"}
-!4 = !{i32 1, i32 1, i32 0, i32 0}
-!5 = !{!"none", !"none", !"none", !"none"}
-!6 = !{!"int*", !"int*", !"int", !"char"}
-!7 = !{!"", !"", !"", !""}
-!8 = !{i1 false, i1 false, i1 false, i1 false}
-!9 = !{i32 0, i32 0, i32 0, i32 0}
+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
+!3 = !{i32 1, i32 1, i32 0, i32 0}
+!4 = !{!"none", !"none", !"none", !"none"}
+!5 = !{!"int*", !"int*", !"int", !"char"}
+!6 = !{!"", !"", !"", !""}
--
2.7.4

View File

@ -1,38 +1,39 @@
From 7bbd0058362ac3bb5edd7a82d43e1785810776b3 Mon Sep 17 00:00:00 2001
From 559fb8f82295ec4dc64a132b6566939b85c1b6fe Mon Sep 17 00:00:00 2001
From: Anuj Mittal <anuj.mittal@intel.com>
Date: Fri, 29 Mar 2019 08:56:53 +0800
Date: Thu, 15 Aug 2019 22:34:31 +0800
Subject: [PATCH] dont export targets for binaries
The projects using LLVM cmake modules look for target binaries in
sysroot as a result which isn't desirable in this case and isn't needed
either.
Upstream-Status: Inappropriate [cross-compile specific]
Upstream-Status: Inappropriate [cross-compile specific]
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
---
llvm/cmake/modules/AddLLVM.cmake | 9 ---------
llvm/cmake/modules/TableGen.cmake | 6 ------
2 files changed, 15 deletions(-)
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 0df6845..b79f4fa 100644
index 619e986b8aa..d2bc1a25dd9 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -866,12 +866,6 @@ macro(add_llvm_tool name)
@@ -898,12 +898,6 @@ macro(add_llvm_tool name)
if ( ${name} IN_LIST LLVM_TOOLCHAIN_TOOLS OR NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
if( LLVM_BUILD_TOOLS )
- set(export_to_llvmexports)
- if(${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS OR
- NOT LLVM_DISTRIBUTION_COMPONENTS)
- set(export_to_llvmexports EXPORT LLVMExports)
- set_property(GLOBAL PROPERTY LLVM_HAS_EXPORTS True)
- endif()
-
install(TARGETS ${name}
${export_to_llvmexports}
RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR}
@@ -884,9 +878,6 @@ macro(add_llvm_tool name)
@@ -917,9 +911,6 @@ macro(add_llvm_tool name)
endif()
endif()
endif()
@ -43,18 +44,19 @@ index 0df6845..b79f4fa 100644
endmacro(add_llvm_tool name)
diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index 3c84ae7..141219f 100644
index 36c026b5c0f..537acd696d8 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -164,14 +164,8 @@ macro(add_tablegen target project)
@@ -148,15 +148,9 @@ macro(add_tablegen target project)
endif()
if (${project} STREQUAL LLVM AND NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
if (${project} STREQUAL LLVM AND NOT LLVM_INSTALL_TOOLCHAIN_ONLY AND LLVM_BUILD_UTILS)
- set(export_to_llvmexports)
- if(${target} IN_LIST LLVM_DISTRIBUTION_COMPONENTS OR
- NOT LLVM_DISTRIBUTION_COMPONENTS)
- set(export_to_llvmexports EXPORT LLVMExports)
- endif()
-
install(TARGETS ${target}
${export_to_llvmexports}
RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR})
@ -62,5 +64,5 @@ index 3c84ae7..141219f 100644
- set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${target})
endmacro()
--
2.7.4
2.17.1

View File

@ -1,9 +1,9 @@
From 91db4c3cf7f290a3cab5caa316fc25a60dd409f1 Mon Sep 17 00:00:00 2001
From: Anuj Mittal <anuj.mittal@intel.com>
Date: Fri, 16 Aug 2019 20:25:16 +0800
Subject: [PATCH] llvm-spirv: skip including tests
From 48e50f06b1bbed94cdf5207587161d4bfce7366e Mon Sep 17 00:00:00 2001
From: Naveen Saini <naveen.kumar.saini@intel.com>
Date: Wed, 21 Aug 2019 14:35:31 +0800
Subject: [PATCH] llvm-spirv: skip building tests
Some of these need clang to be built and since we're building this in-tree,
Some of these need clang to be built and since we're building this in-tree,
that leads to problems when compiling libcxx, compiler-rt which aren't built
in-tree.
@ -13,12 +13,13 @@ all components, disable the building of tests altogether.
Upstream-Status: Inappropriate
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
---
CMakeLists.txt | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d632a50..81ddf62 100644
index 1208741..20ca3e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,13 +15,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
@ -32,7 +33,7 @@ index d632a50..81ddf62 100644
- )
- endif(LLVM_INCLUDE_TESTS)
-
find_package(LLVM 8.0.0 REQUIRED
find_package(LLVM 9.0.0 REQUIRED
COMPONENTS
Analysis
@@ -56,9 +49,6 @@ set(LLVM_SPIRV_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include)
@ -46,5 +47,5 @@ index d632a50..81ddf62 100644
install(
FILES
--
2.7.4
2.17.1

View File

@ -1,294 +0,0 @@
From c94ec28600255098ffb9d73d1b386a7c8a535590 Mon Sep 17 00:00:00 2001
From: Andrew Savonichev <andrew.savonichev@intel.com>
Date: Thu, 21 Feb 2019 11:02:10 +0000
Subject: [PATCH 2/2] [OpenCL] Simplify LLVM IR generated for OpenCL blocks
Summary:
Emit direct call of block invoke functions when possible, i.e. in case the
block is not passed as a function argument.
Also doing some refactoring of `CodeGenFunction::EmitBlockCallExpr()`
Reviewers: Anastasia, yaxunl, svenvh
Reviewed By: Anastasia
Subscribers: cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D58388
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@354568 91177308-0d34-0410-b5e6-96231b3b80d8
Upstream-Status: Backport
[https://github.com/llvm-mirror/clang/commit/eae71f8d05ce550c4e2595c9b7082cc2c7882c58]
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
---
lib/CodeGen/CGBlocks.cpp | 77 +++++++++++++-------------
lib/CodeGen/CGOpenCLRuntime.cpp | 30 +++++++---
lib/CodeGen/CGOpenCLRuntime.h | 4 ++
test/CodeGenOpenCL/blocks.cl | 10 +---
test/CodeGenOpenCL/cl20-device-side-enqueue.cl | 34 +++++++++---
5 files changed, 91 insertions(+), 64 deletions(-)
diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp
index fa3c3ee..10a0238 100644
--- a/lib/CodeGen/CGBlocks.cpp
+++ b/lib/CodeGen/CGBlocks.cpp
@@ -1261,52 +1261,49 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
ReturnValueSlot ReturnValue) {
const BlockPointerType *BPT =
E->getCallee()->getType()->getAs<BlockPointerType>();
-
llvm::Value *BlockPtr = EmitScalarExpr(E->getCallee());
-
- // Get a pointer to the generic block literal.
- // For OpenCL we generate generic AS void ptr to be able to reuse the same
- // block definition for blocks with captures generated as private AS local
- // variables and without captures generated as global AS program scope
- // variables.
- unsigned AddrSpace = 0;
- if (getLangOpts().OpenCL)
- AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_generic);
-
- llvm::Type *BlockLiteralTy =
- llvm::PointerType::get(CGM.getGenericBlockLiteralType(), AddrSpace);
-
- // Bitcast the callee to a block literal.
- BlockPtr =
- Builder.CreatePointerCast(BlockPtr, BlockLiteralTy, "block.literal");
-
- // Get the function pointer from the literal.
- llvm::Value *FuncPtr =
- Builder.CreateStructGEP(CGM.getGenericBlockLiteralType(), BlockPtr,
- CGM.getLangOpts().OpenCL ? 2 : 3);
-
- // Add the block literal.
+ llvm::Type *GenBlockTy = CGM.getGenericBlockLiteralType();
+ llvm::Value *Func = nullptr;
+ QualType FnType = BPT->getPointeeType();
+ ASTContext &Ctx = getContext();
CallArgList Args;
- QualType VoidPtrQualTy = getContext().VoidPtrTy;
- llvm::Type *GenericVoidPtrTy = VoidPtrTy;
if (getLangOpts().OpenCL) {
- GenericVoidPtrTy = CGM.getOpenCLRuntime().getGenericVoidPointerType();
- VoidPtrQualTy =
- getContext().getPointerType(getContext().getAddrSpaceQualType(
- getContext().VoidTy, LangAS::opencl_generic));
- }
-
- BlockPtr = Builder.CreatePointerCast(BlockPtr, GenericVoidPtrTy);
- Args.add(RValue::get(BlockPtr), VoidPtrQualTy);
-
- QualType FnType = BPT->getPointeeType();
+ // For OpenCL, BlockPtr is already casted to generic block literal.
+
+ // First argument of a block call is a generic block literal casted to
+ // generic void pointer, i.e. i8 addrspace(4)*
+ llvm::Value *BlockDescriptor = Builder.CreatePointerCast(
+ BlockPtr, CGM.getOpenCLRuntime().getGenericVoidPointerType());
+ QualType VoidPtrQualTy = Ctx.getPointerType(
+ Ctx.getAddrSpaceQualType(Ctx.VoidTy, LangAS::opencl_generic));
+ Args.add(RValue::get(BlockDescriptor), VoidPtrQualTy);
+ // And the rest of the arguments.
+ EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
+
+ // We *can* call the block directly unless it is a function argument.
+ if (!isa<ParmVarDecl>(E->getCalleeDecl()))
+ Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
+ else {
+ llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
+ Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
+ }
+ } else {
+ // Bitcast the block literal to a generic block literal.
+ BlockPtr = Builder.CreatePointerCast(
+ BlockPtr, llvm::PointerType::get(GenBlockTy, 0), "block.literal");
+ // Get pointer to the block invoke function
+ llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
- // And the rest of the arguments.
- EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
+ // First argument is a block literal casted to a void pointer
+ BlockPtr = Builder.CreatePointerCast(BlockPtr, VoidPtrTy);
+ Args.add(RValue::get(BlockPtr), Ctx.VoidPtrTy);
+ // And the rest of the arguments.
+ EmitCallArgs(Args, FnType->getAs<FunctionProtoType>(), E->arguments());
- // Load the function.
- llvm::Value *Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
+ // Load the function.
+ Func = Builder.CreateAlignedLoad(FuncPtr, getPointerAlign());
+ }
const FunctionType *FuncTy = FnType->castAs<FunctionType>();
const CGFunctionInfo &FnInfo =
diff --git a/lib/CodeGen/CGOpenCLRuntime.cpp b/lib/CodeGen/CGOpenCLRuntime.cpp
index 7f6f595..75003e5 100644
--- a/lib/CodeGen/CGOpenCLRuntime.cpp
+++ b/lib/CodeGen/CGOpenCLRuntime.cpp
@@ -123,6 +123,23 @@ llvm::PointerType *CGOpenCLRuntime::getGenericVoidPointerType() {
CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
}
+// Get the block literal from an expression derived from the block expression.
+// OpenCL v2.0 s6.12.5:
+// Block variable declarations are implicitly qualified with const. Therefore
+// all block variables must be initialized at declaration time and may not be
+// reassigned.
+static const BlockExpr *getBlockExpr(const Expr *E) {
+ const Expr *Prev = nullptr; // to make sure we do not stuck in infinite loop.
+ while(!isa<BlockExpr>(E) && E != Prev) {
+ Prev = E;
+ E = E->IgnoreCasts();
+ if (auto DR = dyn_cast<DeclRefExpr>(E)) {
+ E = cast<VarDecl>(DR->getDecl())->getInit();
+ }
+ }
+ return cast<BlockExpr>(E);
+}
+
/// Record emitted llvm invoke function and llvm block literal for the
/// corresponding block expression.
void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
@@ -137,20 +154,17 @@ void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E,
EnqueuedBlockMap[E].Kernel = nullptr;
}
+llvm::Function *CGOpenCLRuntime::getInvokeFunction(const Expr *E) {
+ return EnqueuedBlockMap[getBlockExpr(E)].InvokeFunc;
+}
+
CGOpenCLRuntime::EnqueuedBlockInfo
CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E) {
CGF.EmitScalarExpr(E);
// The block literal may be assigned to a const variable. Chasing down
// to get the block literal.
- if (auto DR = dyn_cast<DeclRefExpr>(E)) {
- E = cast<VarDecl>(DR->getDecl())->getInit();
- }
- E = E->IgnoreImplicit();
- if (auto Cast = dyn_cast<CastExpr>(E)) {
- E = Cast->getSubExpr();
- }
- auto *Block = cast<BlockExpr>(E);
+ const BlockExpr *Block = getBlockExpr(E);
assert(EnqueuedBlockMap.find(Block) != EnqueuedBlockMap.end() &&
"Block expression not emitted");
diff --git a/lib/CodeGen/CGOpenCLRuntime.h b/lib/CodeGen/CGOpenCLRuntime.h
index 750721f..4effc7e 100644
--- a/lib/CodeGen/CGOpenCLRuntime.h
+++ b/lib/CodeGen/CGOpenCLRuntime.h
@@ -92,6 +92,10 @@ public:
/// \param Block block literal emitted for the block expression.
void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF,
llvm::Value *Block);
+
+ /// \return LLVM block invoke function emitted for an expression derived from
+ /// the block expression.
+ llvm::Function *getInvokeFunction(const Expr *E);
};
}
diff --git a/test/CodeGenOpenCL/blocks.cl b/test/CodeGenOpenCL/blocks.cl
index 19aacc3..ab5a2c6 100644
--- a/test/CodeGenOpenCL/blocks.cl
+++ b/test/CodeGenOpenCL/blocks.cl
@@ -39,11 +39,8 @@ void foo(){
// SPIR: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic* %[[blk_ptr]] to %struct.__opencl_block_literal_generic addrspace(4)*
// SPIR: store %struct.__opencl_block_literal_generic addrspace(4)* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B:.*]],
// SPIR: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic addrspace(4)*, %struct.__opencl_block_literal_generic addrspace(4)** %[[block_B]]
- // SPIR: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]], i32 0, i32 2
// SPIR: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic addrspace(4)* %[[block_literal]] to i8 addrspace(4)*
- // SPIR: %[[invoke_func_ptr:.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %[[invoke_addr]]
- // SPIR: %[[invoke_func:.*]] = addrspacecast i8 addrspace(4)* %[[invoke_func_ptr]] to i32 (i8 addrspace(4)*)*
- // SPIR: call {{.*}}i32 %[[invoke_func]](i8 addrspace(4)* %[[blk_gen_ptr]])
+ // SPIR: call {{.*}}i32 @__foo_block_invoke(i8 addrspace(4)* %[[blk_gen_ptr]])
// AMDGCN: %[[block_invoke:.*]] = getelementptr inbounds <{ i32, i32, i8*, i32 }>, <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block:.*]], i32 0, i32 2
// AMDGCN: store i8* bitcast (i32 (i8*)* @__foo_block_invoke to i8*), i8* addrspace(5)* %[[block_invoke]]
// AMDGCN: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8*, i32 }>, <{ i32, i32, i8*, i32 }> addrspace(5)* %[[block]], i32 0, i32 3
@@ -53,11 +50,8 @@ void foo(){
// AMDGCN: %[[blk_gen_ptr:.*]] = addrspacecast %struct.__opencl_block_literal_generic addrspace(5)* %[[blk_ptr]] to %struct.__opencl_block_literal_generic*
// AMDGCN: store %struct.__opencl_block_literal_generic* %[[blk_gen_ptr]], %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B:.*]],
// AMDGCN: %[[block_literal:.*]] = load %struct.__opencl_block_literal_generic*, %struct.__opencl_block_literal_generic* addrspace(5)* %[[block_B]]
- // AMDGCN: %[[invoke_addr:.*]] = getelementptr inbounds %struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic* %[[block_literal]], i32 0, i32 2
// AMDGCN: %[[blk_gen_ptr:.*]] = bitcast %struct.__opencl_block_literal_generic* %[[block_literal]] to i8*
- // AMDGCN: %[[invoke_func_ptr:.*]] = load i8*, i8** %[[invoke_addr]]
- // AMDGCN: %[[invoke_func:.*]] = bitcast i8* %[[invoke_func_ptr]] to i32 (i8*)*
- // AMDGCN: call {{.*}}i32 %[[invoke_func]](i8* %[[blk_gen_ptr]])
+ // AMDGCN: call {{.*}}i32 @__foo_block_invoke(i8* %[[blk_gen_ptr]])
int (^ block_B)(void) = ^{
return i;
diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
index 8445016..1566912 100644
--- a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -312,9 +312,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
};
// Uses global block literal [[BLG8]] and invoke function [[INVG8]].
- // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
- // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
- // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ // COMMON: call spir_func void @__device_side_enqueue_block_invoke_11(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
block_A();
// Emits global block literal [[BLG8]] and block kernel [[INVGK8]]. [[INVGK8]] calls [[INVG8]].
@@ -333,15 +331,35 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
unsigned size = get_kernel_work_group_size(block_A);
// Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted.
- // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
- // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
- // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ // COMMON: call spir_func void @__device_side_enqueue_block_invoke_11(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
block_A();
+ // Make sure that block invoke function is resolved correctly after sequence of assignements.
+ // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)*
+ // COMMON-SAME: addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)*
+ // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to %struct.__opencl_block_literal_generic addrspace(1)*)
+ // COMMON-SAME: to %struct.__opencl_block_literal_generic addrspace(4)*),
+ // COMMON-SAME: %struct.__opencl_block_literal_generic addrspace(4)** %b1,
+ bl_t b1 = block_G;
+ // COMMON: store %struct.__opencl_block_literal_generic addrspace(4)*
+ // COMMON-SAME: addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)*
+ // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to %struct.__opencl_block_literal_generic addrspace(1)*)
+ // COMMON-SAME: to %struct.__opencl_block_literal_generic addrspace(4)*),
+ // COMMON-SAME: %struct.__opencl_block_literal_generic addrspace(4)** %b2,
+ bl_t b2 = b1;
+ // COMMON: call spir_func void @block_G_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)*
+ // COMMON-SAME: bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*)
+ // COOMON-SAME: to i8 addrspace(4)*), i8 addrspace(3)* null)
+ b2(0);
+ // Uses global block literal [[BL_GLOBAL]] and block kernel [[INV_G_K]]. [[INV_G_K]] calls [[INV_G]].
+ // COMMON: call i32 @__get_kernel_preferred_work_group_size_multiple_impl(
+ // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INV_G_K:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
+ // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ size = get_kernel_preferred_work_group_size_multiple(b2);
+
void (^block_C)(void) = ^{
callee(i, a);
};
-
// Emits block literal on stack and block kernel [[INVLK3]].
// COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL3:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
@@ -404,8 +422,8 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}})
// COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}})
// COMMON: define internal spir_kernel void [[INVGK8]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
// COMMON: define internal spir_kernel void [[INVLK3]](i8 addrspace(4)*{{.*}})
// COMMON: define internal spir_kernel void [[INVGK9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
-// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
// COMMON: define internal spir_kernel void [[INVGK10]](i8 addrspace(4)*{{.*}})
// COMMON: define internal spir_kernel void [[INVGK11]](i8 addrspace(4)*{{.*}})
--
1.8.3.1

View File

@ -1,27 +0,0 @@
From a2c093c8db7b4e3a5612d0fcce9e4fd1756d6e4b Mon Sep 17 00:00:00 2001
From: Alexey Sotkin <alexey.sotkin@intel.com>
Date: Mon, 5 Aug 2019 18:18:01 +0300
Subject: [PATCH] Remove extra semicolon
Upstream-Status: Backport
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
---
lib/SPIRV/libSPIRV/SPIRVEnum.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/SPIRV/libSPIRV/SPIRVEnum.h b/lib/SPIRV/libSPIRV/SPIRVEnum.h
index c93a484..3a071e7 100644
--- a/lib/SPIRV/libSPIRV/SPIRVEnum.h
+++ b/lib/SPIRV/libSPIRV/SPIRVEnum.h
@@ -124,7 +124,7 @@ template <> inline void SPIRVMap<SPIRVExtensionKind, std::string>::init() {
add(SPV_INTEL_device_side_avc_motion_estimation,
"SPV_INTEL_device_side_avc_motion_estimation");
add(SPV_KHR_no_integer_wrap_decoration, "SPV_KHR_no_integer_wrap_decoration");
-};
+}
template <> inline void SPIRVMap<SPIRVExtInstSetKind, std::string>::init() {
add(SPIRVEIS_OpenCL, "OpenCL.std");
--
2.7.4

View File

@ -1,61 +0,0 @@
From 29e2813a2ab7d5569860bb07892dfef7b5374d96 Mon Sep 17 00:00:00 2001
From: Yaxun Liu <Yaxun.Liu@amd.com>
Date: Tue, 26 Feb 2019 16:20:41 +0000
Subject: [PATCH] [OpenCL] Fix assertion due to blocks
A recent change caused assertion in CodeGenFunction::EmitBlockCallExpr when a block is called.
There is code
Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
getCalleeDecl calls Expr::getReferencedDeclOfCallee, which does not handle
BlockExpr and returns nullptr, which causes isa to assert.
This patch fixes that.
Differential Revision: https://reviews.llvm.org/D58658
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@354893 91177308-0d34-0410-b5e6-96231b3b80d8
Upstream-Status: Backport
[https://github.com/llvm-mirror/clang/commit/29e2813a2ab7d5569860bb07892dfef7b5374d96]
Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
---
lib/AST/Expr.cpp | 2 ++
test/CodeGenOpenCL/blocks.cl | 6 ++++++
2 files changed, 8 insertions(+)
diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp
index aef1eab..85690c7 100644
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp
@@ -1358,6 +1358,8 @@ Decl *Expr::getReferencedDeclOfCallee() {
return DRE->getDecl();
if (MemberExpr *ME = dyn_cast<MemberExpr>(CEE))
return ME->getMemberDecl();
+ if (auto *BE = dyn_cast<BlockExpr>(CEE))
+ return BE->getBlockDecl();
return nullptr;
}
diff --git a/test/CodeGenOpenCL/blocks.cl b/test/CodeGenOpenCL/blocks.cl
index ab5a2c6..c3e2685 100644
--- a/test/CodeGenOpenCL/blocks.cl
+++ b/test/CodeGenOpenCL/blocks.cl
@@ -90,6 +90,12 @@ int get42() {
return blockArgFunc(^{return 42;});
}
+// COMMON-LABEL: define {{.*}}@call_block
+// call {{.*}}@__call_block_block_invoke
+int call_block() {
+ return ^int(int num) { return num; } (11);
+}
+
// CHECK-DEBUG: !DIDerivedType(tag: DW_TAG_member, name: "__size"
// CHECK-DEBUG: !DIDerivedType(tag: DW_TAG_member, name: "__align"
--
1.8.3.1

View File

@ -0,0 +1,111 @@
From eeb816d95f0910bd246e37bb2bb3923acf0edf6b Mon Sep 17 00:00:00 2001
From: Aleksander Us <aleksander.us@intel.com>
Date: Mon, 26 Aug 2019 15:47:41 +0300
Subject: [PATCH] [BasicBlockUtils] Add metadata fixing in
SplitBlockPredecessors.
In case when BB is header of some loop and predecessor is latch of
this loop, metadata was not attached to newly created basic block.
This led to loss of loop metadata for other passes.
Upstream-Status: Submitted [https://reviews.llvm.org/D66892]
https://github.com/intel/llvm-patches/commit/8af4449e2d201707f7f2f832b473a0439e255f32
Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
---
lib/Transforms/Utils/BasicBlockUtils.cpp | 23 ++++++++----
test/Transforms/LoopSimplify/loop_metadata.ll | 36 +++++++++++++++++++
2 files changed, 52 insertions(+), 7 deletions(-)
create mode 100644 test/Transforms/LoopSimplify/loop_metadata.ll
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 5fa371377c8..3a90ae061fb 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -579,24 +579,33 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// The new block unconditionally branches to the old block.
BranchInst *BI = BranchInst::Create(BB, NewBB);
+ bool IsBBHeader = LI && LI->isLoopHeader(BB);
+ Loop *BBLoop = LI ? LI->getLoopFor(BB) : nullptr;
// Splitting the predecessors of a loop header creates a preheader block.
- if (LI && LI->isLoopHeader(BB))
+ if (IsBBHeader)
// Using the loop start line number prevents debuggers stepping into the
// loop body for this instruction.
- BI->setDebugLoc(LI->getLoopFor(BB)->getStartLoc());
+ BI->setDebugLoc(BBLoop->getStartLoc());
else
BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
// Move the edges from Preds to point to NewBB instead of BB.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ for (BasicBlock *Pred : Preds) {
+ Instruction *PI = Pred->getTerminator();
// This is slightly more strict than necessary; the minimum requirement
// is that there be no more than one indirectbr branching to BB. And
// all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ assert(!isa<IndirectBrInst>(PI) &&
"Cannot split an edge from an IndirectBrInst");
- assert(!isa<CallBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from a CallBrInst");
- Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+ assert(!isa<CallBrInst>(PI) && "Cannot split an edge from a CallBrInst");
+ if (IsBBHeader && BBLoop->contains(Pred) && BBLoop->isLoopLatch(Pred)) {
+ // Update loop metadata if it exists.
+ if (MDNode *LoopMD = PI->getMetadata(LLVMContext::MD_loop)) {
+ BI->setMetadata(LLVMContext::MD_loop, LoopMD);
+ PI->setMetadata(LLVMContext::MD_loop, nullptr);
+ }
+ }
+ PI->replaceUsesOfWith(BB, NewBB);
}
// Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
diff --git a/test/Transforms/LoopSimplify/loop_metadata.ll b/test/Transforms/LoopSimplify/loop_metadata.ll
new file mode 100644
index 00000000000..c15c92fe3ae
--- /dev/null
+++ b/test/Transforms/LoopSimplify/loop_metadata.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -loop-simplify < %s | FileCheck %s
+
+; CHECK: for.cond.loopexit:
+; CHECK: br label %for.cond, !llvm.loop !0
+; CHECK: br i1 %cmp1, label %for.body1, label %for.cond.loopexit
+
+define void @foo() {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond1, %entry
+ %j = phi i32 [ 0, %entry ], [ %add, %for.cond1 ]
+ %cmp = icmp ult i32 %j, 8
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %dummy1 = add i32 1, 1
+ %add = add nuw nsw i32 %j, 1
+ br label %for.cond1
+
+for.cond1: ; preds = %for.body1, %for.body
+ %i.0 = phi i32 [ 1, %for.body ], [ %inc, %for.body1 ]
+ %cmp1 = icmp ult i32 %i.0, 8
+ br i1 %cmp1, label %for.body1, label %for.cond, !llvm.loop !0
+
+for.body1: ; preds = %for.cond1
+ %dummy2 = add i32 1, 1
+ %inc = add nuw nsw i32 %i.0, 1
+ br label %for.cond1
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.full"}
--
2.18.0

View File

@ -0,0 +1,146 @@
From 35e218a886f4c066eabd18685240d55270bd5a6d Mon Sep 17 00:00:00 2001
From: Aleksander Us <aleksander.us@intel.com>
Date: Mon, 26 Aug 2019 15:45:47 +0300
Subject: [PATCH] [IndVarSimplify] Do not use SCEV expander for IVCount in
LFTR when possible.
SCEV analysis cannot properly cache instruction with poison flags
(for example, add nsw outside of loop will not be reused by expander).
This can lead to generating of additional instructions by SCEV expander.
Example IR:
...
%maxval = add nuw nsw i32 %a1, %a2
...
for.body:
...
%cmp22 = icmp ult i32 %ivadd, %maxval
br i1 %cmp22, label %for.body, label %for.end
...
SCEV expander will generate copy of %maxval in preheader but without
nuw/nsw flags. This can be avoided by explicit check that iv count
value gives the same SCEV expressions as calculated by LFTR.
Upstream-Status: Submitted [https://reviews.llvm.org/D66890]
https://github.com/intel/llvm-patches/commit/fd6a6c97341a56fd21bc32bc940afea751312e8f
Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
---
lib/Transforms/Scalar/IndVarSimplify.cpp | 12 +++++++++-
test/Transforms/IndVarSimplify/add_nsw.ll | 23 ++++++++++++++++++++
test/Transforms/IndVarSimplify/lftr-reuse.ll | 9 +++-----
test/Transforms/IndVarSimplify/udiv.ll | 1 +
4 files changed, 38 insertions(+), 7 deletions(-)
create mode 100644 test/Transforms/IndVarSimplify/add_nsw.ll
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index f9fc698a4a9..5e04dac8aa6 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -2375,6 +2375,17 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
if (UsePostInc)
IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType()));
+ // If computed limit is equal to old limit then do not use SCEV expander
+ // because it can lost NUW/NSW flags and create extra instructions.
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(BI->getOperand(0))) {
+ Value *Limit = Cmp->getOperand(0);
+ if (!L->isLoopInvariant(Limit))
+ Limit = Cmp->getOperand(1);
+ if (SE->getSCEV(Limit) == IVLimit)
+ return Limit;
+ }
+
// Expand the code for the iteration count.
assert(SE->isLoopInvariant(IVLimit, L) &&
"Computed iteration count is not loop invariant!");
@@ -2383,7 +2394,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
// SCEV expression (IVInit) for a pointer type IV value (IndVar).
Type *LimitTy = ExitCount->getType()->isPointerTy() ?
IndVar->getType() : ExitCount->getType();
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
}
}
diff --git a/test/Transforms/IndVarSimplify/add_nsw.ll b/test/Transforms/IndVarSimplify/add_nsw.ll
new file mode 100644
index 00000000000..abd1cbb6c51
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/add_nsw.ll
@@ -0,0 +1,23 @@
+; RUN: opt -indvars -S %s | FileCheck %s
+
+target datalayout = "e-p:32:32-i64:64-n8:16:32"
+
+; CHECK: for.body.preheader:
+; CHECK-NOT: add
+; CHECK: for.body:
+
+define void @foo(i32 %a1, i32 %a2) {
+entry:
+ %maxval = add nuw nsw i32 %a1, %a2
+ %cmp = icmp slt i32 %maxval, 1
+ br i1 %cmp, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %j.02 = phi i32 [ 0, %entry ], [ %add31, %for.body ]
+ %add31 = add nuw nsw i32 %j.02, 1
+ %cmp22 = icmp slt i32 %add31, %maxval
+ br i1 %cmp22, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Transforms/IndVarSimplify/lftr-reuse.ll b/test/Transforms/IndVarSimplify/lftr-reuse.ll
index 14ae9738696..509d662b767 100644
--- a/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/test/Transforms/IndVarSimplify/lftr-reuse.ll
@@ -67,11 +67,9 @@ define void @expandOuterRecurrence(i32 %arg) nounwind {
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[SUB1]]
; CHECK-NEXT: br i1 [[CMP1]], label [[OUTER_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: outer.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG]], -1
; CHECK-NEXT: br label [[OUTER:%.*]]
; CHECK: outer:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[TMP0]], [[OUTER_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[OUTER_INC:%.*]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], [[OUTER_INC]] ], [ 0, [[OUTER_PREHEADER]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], [[OUTER_INC:%.*]] ], [ 0, [[OUTER_PREHEADER]] ]
; CHECK-NEXT: [[SUB2:%.*]] = sub nsw i32 [[ARG]], [[I]]
; CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[SUB2]], 1
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[SUB3]]
@@ -81,14 +79,13 @@ define void @expandOuterRecurrence(i32 %arg) nounwind {
; CHECK: inner:
; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[INNER_PH]] ], [ [[J_INC:%.*]], [[INNER]] ]
; CHECK-NEXT: [[J_INC]] = add nuw nsw i32 [[J]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[J_INC]], [[INDVARS_IV]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[J_INC]], [[SUB3]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER]], label [[OUTER_INC_LOOPEXIT:%.*]]
; CHECK: outer.inc.loopexit:
; CHECK-NEXT: br label [[OUTER_INC]]
; CHECK: outer.inc:
; CHECK-NEXT: [[I_INC]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], -1
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[I_INC]], [[TMP0]]
+; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp ne i32 [[I_INC]], [[SUB1]]
; CHECK-NEXT: br i1 [[EXITCOND1]], label [[OUTER]], label [[EXIT_LOOPEXIT:%.*]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
diff --git a/test/Transforms/IndVarSimplify/udiv.ll b/test/Transforms/IndVarSimplify/udiv.ll
index b3f2c2a6a66..3530343ef4a 100644
--- a/test/Transforms/IndVarSimplify/udiv.ll
+++ b/test/Transforms/IndVarSimplify/udiv.ll
@@ -133,6 +133,7 @@ declare i32 @printf(i8* nocapture, ...) nounwind
; CHECK-LABEL: @foo(
; CHECK: for.body.preheader:
; CHECK-NOT: udiv
+; CHECK: for.body:
define void @foo(double* %p, i64 %n) nounwind {
entry:
--
2.18.0

View File

@ -1,14 +1,11 @@
FILESEXTRAPATHS_prepend_intel-x86-common := "${THISDIR}/files:"
SRC_URI_append_intel-x86-common = " \
file://0001-OpenCL-Change-type-of-block-pointer-for-OpenCL.patch;patchdir=clang \
file://0002-OpenCL-Simplify-LLVM-IR-generated-for-OpenCL-blocks.patch;patchdir=clang \
file://0003-OpenCL-Fix-assertion-due-to-blocks.patch;patchdir=clang \
file://0001-dont-export-targets-for-binaries.patch \
git://github.com/KhronosGroup/SPIRV-LLVM-Translator.git;protocol=https;branch=llvm_release_80;destsuffix=git/llvm/projects/llvm-spirv;name=spirv \
file://0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch;patchdir=llvm/projects/llvm-spirv \
file://0002-Remove-extra-semicolon.patch;patchdir=llvm/projects/llvm-spirv \
file://BasicBlockUtils-Add-metadata-fixing-in-SplitBlockPre.patch;patchdir=llvm \
file://IndVarSimplify-Do-not-use-SCEV-expander-for-IVCount-.patch;patchdir=llvm \
git://github.com/KhronosGroup/SPIRV-LLVM-Translator.git;protocol=https;branch=llvm_release_90;destsuffix=git/llvm/projects/llvm-spirv;name=spirv \
file://0001-skip-building-tests.patch;patchdir=llvm/projects/llvm-spirv \
"
SRCREV_spirv = "1d48cd84d04a2f60b43ea3f66eb7c86f4e5973a9"
SRCREV_spirv = "70420631144a6a25613ae37178f2cc1d3607b65b"