#include "polly/CodeGen/LoopGeneratorsKMP.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace polly;
/// Emit a call to `__kmpc_fork_call`, handing it \p SubFn together with the
/// loop bounds, the stride and the shared-data pointer.
///
/// The callee is looked up in the module by name; when it is missing, a
/// variadic, void-returning declaration with external linkage is created.
///
/// \param SubFn      Function that is cast to the micro-task pointer type.
/// \param SubFnParam Trailing variadic argument of the emitted call.
/// \param LB         Forwarded as variadic argument.
/// \param UB         Forwarded as variadic argument.
/// \param Stride     Forwarded as variadic argument.
void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn,
                                                      Value *SubFnParam,
                                                      Value *LB, Value *UB,
                                                      Value *Stride) {
  LLVMContext &Ctx = M->getContext();
  const std::string ForkCallName = "__kmpc_fork_call";

  // Type the sub-function pointer is cast to. Prefer a type registered under
  // the name "kmpc_micro"; otherwise fall back to a variadic
  // `void (i32 *, i32 *, ...)` function type.
  Type *MicroTaskTy = StructType::getTypeByName(Ctx, "kmpc_micro");
  if (MicroTaskTy == nullptr) {
    Type *Int32PtrTy = Builder.getInt32Ty()->getPointerTo();
    Type *MicroTaskParams[] = {Int32PtrTy, Int32PtrTy};
    MicroTaskTy = FunctionType::get(Builder.getVoidTy(), MicroTaskParams,
                                    /*isVarArg=*/true);
  }

  // Declare the callee on first use in this module.
  Function *ForkCall = M->getFunction(ForkCallName);
  if (ForkCall == nullptr) {
    StructType *IdentTy = StructType::getTypeByName(Ctx, "struct.ident_t");
    Type *ForkCallParams[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
                              MicroTaskTy->getPointerTo()};
    FunctionType *ForkCallTy = FunctionType::get(
        Builder.getVoidTy(), ForkCallParams, /*isVarArg=*/true);
    ForkCall = Function::Create(ForkCallTy, Function::ExternalLinkage,
                                ForkCallName, M);
  }

  Value *MicroTask = Builder.CreatePointerBitCastOrAddrSpaceCast(
      SubFn, MicroTaskTy->getPointerTo());

  // The constant 4 matches the number of arguments that follow the task
  // pointer (LB, UB, Stride, SubFnParam).
  Value *ForkArgs[] = {SourceLocationInfo,
                       Builder.getInt32(4),
                       MicroTask,
                       LB,
                       UB,
                       Stride,
                       SubFnParam};

  CallInst *ForkCallInst = Builder.CreateCall(ForkCall, ForkArgs);
  ForkCallInst->setDebugLoc(DLGenerated);
}
/// Emit the calls that launch \p SubFn in parallel.
///
/// When PollyNumThreads is positive, a thread-number query followed by a
/// push-num-threads call is emitted first; the spawn call is emitted in
/// every case.
void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn,
                                                       Value *SubFnParam,
                                                       Value *LB, Value *UB,
                                                       Value *Stride) {
  const bool HasExplicitThreadCount = PollyNumThreads > 0;

  if (HasExplicitThreadCount) {
    Value *ThreadID = createCallGlobalThreadNum();
    Value *RequestedThreads = Builder.getInt32(PollyNumThreads);
    createCallPushNumThreads(ThreadID, RequestedThreads);
  }

  createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
}
/// Create the (still empty) sub-function for \p F.
///
/// The new function has internal linkage, returns void, is named after \p F
/// with the suffix "_polly_subfn" and takes six parameters: two `i32 *`,
/// three values of LongType and one pointer. Each parameter receives a
/// descriptive "polly.kmpc.*" name.
///
/// \returns the newly created function.
Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const {
  Type *Int32PtrTy = Builder.getInt32Ty()->getPointerTo();
  std::vector<Type *> ParamTypes = {Int32PtrTy, Int32PtrTy, LongType,
                                    LongType,   LongType,   Builder.getPtrTy()};

  FunctionType *SubFnTy =
      FunctionType::get(Builder.getVoidTy(), ParamTypes, /*isVarArg=*/false);
  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
                                     F->getName() + "_polly_subfn", M);

  // Names are listed in parameter order; the table has exactly as many
  // entries as ParamTypes.
  const char *const ParamNames[] = {
      "polly.kmpc.global_tid", "polly.kmpc.bound_tid", "polly.kmpc.lb",
      "polly.kmpc.ub",         "polly.kmpc.inc",       "polly.kmpc.shared"};

  unsigned NameIdx = 0;
  for (Argument &Param : SubFn->args())
    Param.setName(ParamNames[NameIdx++]);

  return SubFn;
}
/// Build the control-flow skeleton of the outlined sub-function and create the
/// sequential loop inside it.
///
/// Four blocks are added to the sub-function:
///   - polly.par.setup        (HeaderBB):    allocas, argument unpacking and
///                                           the scheduling initialisation.
///   - polly.par.loadIVBounds (PreHeaderBB): provides the bounds for one chunk
///                                           and hosts the loop created by
///                                           createLoop().
///   - polly.par.checkNext    (CheckNextBB): decides whether another chunk is
///                                           executed.
///   - polly.par.exit         (ExitBB):      optional static-fini call and
///                                           `ret void`.
///
/// On return the builder's insertion point is where createLoop() left it
/// (captured as LoopBody below).
///
/// \param SequentialLoopStride Stride passed on to createLoop().
/// \param StructData           Alloca whose allocated type describes the
///                             struct the shared values are extracted from.
/// \param Data                 Values to extract from the shared struct.
/// \param Map                  Value map filled by extractValuesFromStruct().
/// \returns the value returned by createLoop() and the sub-function.
std::tuple<Value *, Function *>
ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride,
                                      AllocaInst *StructData,
                                      SetVector<Value *> Data, ValueMapT &Map) {
  Function *SubFn = createSubFnDefinition();
  LLVMContext &Context = SubFn->getContext();

  // Remember where the caller was emitting code; it is used as the dominator
  // of the new header block.
  BasicBlock *PrevBB = Builder.GetInsertBlock();
  BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
  BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
  BasicBlock *CheckNextBB =
      BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
  BasicBlock *PreHeaderBB =
      BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);

  DT.addNewBlock(HeaderBB, PrevBB);
  DT.addNewBlock(ExitBB, HeaderBB);
  DT.addNewBlock(CheckNextBB, HeaderBB);
  DT.addNewBlock(PreHeaderBB, HeaderBB);

  // Stack slots whose addresses are handed to the scheduling calls below.
  Builder.SetInsertPoint(HeaderBB);
  Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
  Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
  Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
                                          "polly.par.lastIterPtr");
  Value *StridePtr =
      Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr");

  // Unpack the sub-function arguments. Argument 1 is skipped; it is not used
  // in this function.
  Function::arg_iterator AI = SubFn->arg_begin();
  Value *IDPtr = &*AI;
  std::advance(AI, 2);
  Value *LB = &*AI;
  std::advance(AI, 1);
  Value *UB = &*AI;
  std::advance(AI, 1);
  Value *Stride = &*AI;
  std::advance(AI, 1);
  Value *Shared = &*AI;

  extractValuesFromStruct(Data, StructData->getAllocatedType(), Shared, Map);

  // Alignment used for every LongType access.
  const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4);
  // The thread id and the last-iteration flag are i32 objects. Annotating
  // their accesses with the LongType alignment would claim 8-byte alignment
  // on 64-bit targets, which is more than an i32 slot guarantees; an
  // over-stated alignment is undefined behaviour in LLVM IR. Use the ABI
  // alignment of i32 for those accesses instead.
  const llvm::Align Int32Alignment =
      M->getDataLayout().getABITypeAlign(Builder.getInt32Ty());

  Value *ID = Builder.CreateAlignedLoad(Builder.getInt32Ty(), IDPtr,
                                        Int32Alignment, "polly.par.global_tid");

  Builder.CreateAlignedStore(LB, LBPtr, Alignment);
  Builder.CreateAlignedStore(UB, UBPtr, Alignment);
  Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Int32Alignment);
  Builder.CreateAlignedStore(Stride, StridePtr, Alignment);

  // UB - 1: the loop created below compares with ICMP_SLE, i.e. it treats its
  // upper bound as inclusive. NOTE(review): presumably the incoming UB is an
  // exclusive bound - confirm against the caller.
  Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1),
                                        "polly.indvar.UBAdjusted");

  // The chunk size passed to the scheduling calls is never smaller than 1.
  Value *ChunkSize =
      ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1));

  OMPGeneralSchedulingType Scheduling =
      getSchedType(PollyChunkSize, PollyScheduling);

  switch (Scheduling) {
  case OMPGeneralSchedulingType::Dynamic:
  case OMPGeneralSchedulingType::Guided:
  case OMPGeneralSchedulingType::Runtime:
    // Dispatch-based scheduling: bounds for each chunk are obtained through
    // createCallDispatchNext(), which writes them to LBPtr/UBPtr.
    {
      UB = AdjustedUB;
      createCallDispatchInit(ID, LB, UB, Stride, ChunkSize);

      // HeaderBB: ask for the first chunk; a result of 1 enters the loop.
      Value *HasWork =
          createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
      Value *HasIteration =
          Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
                             Builder.getInt32(1), "polly.hasIteration");
      Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);

      // CheckNextBB: ask for a further chunk after one has been processed.
      Builder.SetInsertPoint(CheckNextBB);
      HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
      HasIteration =
          Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
                             Builder.getInt32(1), "polly.hasWork");
      Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);

      // PreHeaderBB: reload the bounds of the current chunk.
      Builder.SetInsertPoint(PreHeaderBB);
      LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
                                     "polly.indvar.LB");
      UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
                                     "polly.indvar.UB");
    }
    break;
  case OMPGeneralSchedulingType::StaticChunked:
  case OMPGeneralSchedulingType::StaticNonChunked:
    // Static scheduling: one init call, then the bounds are advanced by hand
    // (chunked) or the loop is executed exactly once (non-chunked).
    {
      Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment);
      createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize);

      // Values the init call left in the stride and bound slots.
      Value *ChunkedStride = Builder.CreateAlignedLoad(
          LongType, StridePtr, Alignment, "polly.kmpc.stride");

      LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
                                     "polly.indvar.LB");
      UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
                                     "polly.indvar.UB.temp");

      // Clamp the chunk's upper bound to the adjusted overall upper bound.
      Value *UBInRange =
          Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB,
                             "polly.indvar.UB.inRange");
      UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB");
      Builder.CreateAlignedStore(UB, UBPtr, Alignment);

      Value *HasIteration = Builder.CreateICmp(
          llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration");
      Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);

      if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
        // PreHeaderBB is entered repeatedly, so the bounds must be reloaded
        // from the slots that CheckNextBB updates.
        Builder.SetInsertPoint(PreHeaderBB);
        LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
                                       "polly.indvar.LB.entry");
        UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
                                       "polly.indvar.UB.entry");
      }

      Builder.SetInsertPoint(CheckNextBB);

      if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
        // Advance both bounds by the stride, clamp the new upper bound and
        // continue while the new lower bound is still within range.
        Value *NextLB =
            Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB");
        Value *NextUB = Builder.CreateAdd(UB, ChunkedStride);
        Value *NextUBOutOfBounds =
            Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB,
                               AdjustedUB, "polly.indvar.nextUB.outOfBounds");
        NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB,
                                      "polly.indvar.nextUB");
        Builder.CreateAlignedStore(NextLB, LBPtr, Alignment);
        Builder.CreateAlignedStore(NextUB, UBPtr, Alignment);

        Value *HasWork =
            Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB,
                               AdjustedUB, "polly.hasWork");
        Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB);
      } else {
        // Non-chunked: nothing more to do after the single pass.
        Builder.CreateBr(ExitBB);
      }

      Builder.SetInsertPoint(PreHeaderBB);
    }
    break;
  }

  // Terminate PreHeaderBB with a branch to CheckNextBB, then step the
  // insertion point back in front of that branch so that the loop is emitted
  // before it.
  Builder.CreateBr(CheckNextBB);
  Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
  BasicBlock *AfterBB;
  Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, LI, DT, AfterBB,
                         ICmpInst::ICMP_SLE, nullptr, true,
                         false);

  // Remember where createLoop() left the builder; it is restored below.
  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

  // Fill in the exit block.
  Builder.SetInsertPoint(ExitBB);
  if (Scheduling == OMPGeneralSchedulingType::StaticChunked ||
      Scheduling == OMPGeneralSchedulingType::StaticNonChunked) {
    createCallStaticFini(ID);
  }
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(&*LoopBody);

  return std::make_tuple(IV, SubFn);
}
/// Emit a call to `__kmpc_global_thread_num` with the source-location object
/// as its only argument.
///
/// \returns the emitted call instruction.
Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() {
  const std::string CalleeName = "__kmpc_global_thread_num";

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    // Not declared in this module yet: add an external, non-variadic
    // declaration that returns i32 and takes a pointer to struct.ident_t.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getInt32Ty(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo};
  CallInst *ThreadNum = Builder.CreateCall(Callee, CallArgs);
  ThreadNum->setDebugLoc(DLGenerated);
  return ThreadNum;
}
/// Emit a call to `__kmpc_push_num_threads`.
///
/// \param GlobalThreadID Second call argument (after the source location).
/// \param NumThreads     Third call argument.
void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID,
                                                        Value *NumThreads) {
  const std::string CalleeName = "__kmpc_push_num_threads";

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    // Declare `void (ident_t *, i32, i32)` with external linkage.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *ParamTys[] = {IdentTy->getPointerTo(), Int32Ty, Int32Ty};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, NumThreads};
  CallInst *PushCall = Builder.CreateCall(Callee, CallArgs);
  PushCall->setDebugLoc(DLGenerated);
}
/// Emit a call to `__kmpc_for_static_init_8` (when LongType is 64 bits wide)
/// or `__kmpc_for_static_init_4` (otherwise).
///
/// Besides the parameters below, the call receives the source location, the
/// integer value of the effective scheduling type and a constant 1 of
/// LongType directly before \p ChunkSize.
///
/// \param GlobalThreadID Thread id value.
/// \param IsLastPtr      Pointer to an i32 slot.
/// \param LBPtr          Pointer to the lower-bound slot.
/// \param UBPtr          Pointer to the upper-bound slot.
/// \param StridePtr      Pointer to the stride slot.
/// \param ChunkSize      Chunk size value of LongType.
void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID,
                                                    Value *IsLastPtr,
                                                    Value *LBPtr, Value *UBPtr,
                                                    Value *StridePtr,
                                                    Value *ChunkSize) {
  const std::string CalleeName =
      std::string("__kmpc_for_static_init_") + (is64BitArch() ? "8" : "4");

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *LongPtrTy = LongType->getPointerTo();

    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty,
                        Int32Ty->getPointerTo(),
                        LongPtrTy,
                        LongPtrTy,
                        LongPtrTy,
                        LongType,
                        LongType};

    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  // The scheduling enum is passed as its plain integer value.
  const int SchedKind =
      static_cast<int>(getSchedType(PollyChunkSize, PollyScheduling));

  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       Builder.getInt32(SchedKind),
                       IsLastPtr,
                       LBPtr,
                       UBPtr,
                       StridePtr,
                       ConstantInt::get(LongType, 1),
                       ChunkSize};

  CallInst *InitCall = Builder.CreateCall(Callee, CallArgs);
  InitCall->setDebugLoc(DLGenerated);
}
/// Emit a call to `__kmpc_for_static_fini` for the given thread id.
///
/// \param GlobalThreadID Second call argument (after the source location).
void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) {
  const std::string CalleeName = "__kmpc_for_static_fini";

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    // Declare `void (ident_t *, i32)` with external linkage.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID};
  CallInst *FiniCall = Builder.CreateCall(Callee, CallArgs);
  FiniCall->setDebugLoc(DLGenerated);
}
/// Emit a call to `__kmpc_dispatch_init_8` (when LongType is 64 bits wide) or
/// `__kmpc_dispatch_init_4` (otherwise).
///
/// The call receives the source location, the thread id, the integer value
/// of the effective scheduling type and then the four LongType values below.
///
/// \param GlobalThreadID Thread id value.
/// \param LB             Lower bound value.
/// \param UB             Upper bound value.
/// \param Inc            Increment value.
/// \param ChunkSize      Chunk size value.
void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
                                                      Value *LB, Value *UB,
                                                      Value *Inc,
                                                      Value *ChunkSize) {
  const std::string CalleeName =
      std::string("__kmpc_dispatch_init_") + (is64BitArch() ? "8" : "4");

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();

    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty,
                        LongType,
                        LongType,
                        LongType,
                        LongType};

    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  // The scheduling enum is passed as its plain integer value.
  const int SchedKind =
      static_cast<int>(getSchedType(PollyChunkSize, PollyScheduling));

  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       Builder.getInt32(SchedKind),
                       LB,
                       UB,
                       Inc,
                       ChunkSize};

  CallInst *InitCall = Builder.CreateCall(Callee, CallArgs);
  InitCall->setDebugLoc(DLGenerated);
}
/// Emit a call to `__kmpc_dispatch_next_8` (when LongType is 64 bits wide) or
/// `__kmpc_dispatch_next_4` (otherwise).
///
/// \param GlobalThreadID Thread id value.
/// \param IsLastPtr      Pointer to an i32 slot.
/// \param LBPtr          Pointer to the lower-bound slot.
/// \param UBPtr          Pointer to the upper-bound slot.
/// \param StridePtr      Pointer to the stride slot.
/// \returns the emitted call instruction (declared here as returning i32).
Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
                                                        Value *IsLastPtr,
                                                        Value *LBPtr,
                                                        Value *UBPtr,
                                                        Value *StridePtr) {
  const std::string CalleeName =
      std::string("__kmpc_dispatch_next_") + (is64BitArch() ? "8" : "4");

  Function *Callee = M->getFunction(CalleeName);
  if (Callee == nullptr) {
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *LongPtrTy = LongType->getPointerTo();

    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty->getPointerTo(),
                        LongPtrTy,
                        LongPtrTy,
                        LongPtrTy};

    FunctionType *CalleeTy =
        FunctionType::get(Int32Ty, ParamTys, /*isVarArg=*/false);
    Callee =
        Function::Create(CalleeTy, Function::ExternalLinkage, CalleeName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr,
                       LBPtr,              UBPtr,          StridePtr};

  CallInst *NextCall = Builder.CreateCall(Callee, CallArgs);
  NextCall->setDebugLoc(DLGenerated);
  return NextCall;
}
/// Return the module's dummy source-location global `.loc.dummy`, creating it
/// (and the string global `.str.ident` it points to) when it does not exist.
///
/// The global is a constant, private `struct.ident_t` value whose four i32
/// fields are zero and whose pointer field refers to the string
/// "Source location dummy.". The struct type itself is created on demand as
/// `{ i32, i32, i32, i32, ptr }`.
///
/// \returns the existing or newly created `.loc.dummy` global.
GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() {
  const std::string LocName = ".loc.dummy";

  // The global is created with private linkage below, so the lookup must be
  // allowed to return local-linkage symbols; the single-argument overload
  // skips them and would make this function emit a duplicate on every call.
  if (GlobalVariable *Existing =
          M->getGlobalVariable(LocName, /*AllowInternal=*/true))
    return Existing;

  const std::string StructName = "struct.ident_t";
  StructType *IdentTy = StructType::getTypeByName(M->getContext(), StructName);

  // If the ident_t StructType is not available, declare it.
  if (!IdentTy) {
    Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getPtrTy()};

    IdentTy =
        StructType::create(M->getContext(), LocMembers, StructName, false);
  }

  // Build the string first and take the global's type from it, so the array
  // length (text plus terminating NUL) cannot drift from the literal.
  Constant *InitStr = ConstantDataArray::getString(
      M->getContext(), "Source location dummy.", true);
  Type *StrTy = InitStr->getType();

  // Global variable holding the "ident" string.
  GlobalVariable *StrVar =
      new GlobalVariable(*M, StrTy, true, GlobalValue::PrivateLinkage, nullptr,
                         ".str.ident");
  StrVar->setAlignment(llvm::Align(1));

  GlobalVariable *SourceLocDummy = new GlobalVariable(
      *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName);
  SourceLocDummy->setAlignment(llvm::Align(8));

  // Address of the first character. All GEP operands are constants, so the
  // builder is expected to fold this to a constant; cast<> verifies that in
  // assertion-enabled builds instead of blindly reinterpreting the result.
  Constant *StrPtr = cast<Constant>(Builder.CreateInBoundsGEP(
      StrTy, StrVar, {Builder.getInt32(0), Builder.getInt32(0)}));

  Constant *LocInitStruct = ConstantStruct::get(
      IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0),
                Builder.getInt32(0), StrPtr});

  // Initialize both globals.
  StrVar->setInitializer(InitStr);
  SourceLocDummy->setInitializer(LocInitStruct);

  return SourceLocDummy;
}
/// \returns true when LongType is an integer type that is exactly 64 bits
/// wide.
bool ParallelLoopGeneratorKMP::is64BitArch() {
  const unsigned LongBits = LongType->getIntegerBitWidth();
  return LongBits == 64;
}
/// Map the requested scheduling type to the one that is actually used.
///
/// StaticChunked combined with a chunk size of 0 becomes StaticNonChunked;
/// every other combination is returned unchanged.
///
/// \param ChunkSize  Requested chunk size.
/// \param Scheduling Requested scheduling type.
/// \returns the effective scheduling type.
OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType(
    int ChunkSize, OMPGeneralSchedulingType Scheduling) const {
  const bool IsStaticWithoutChunk =
      Scheduling == OMPGeneralSchedulingType::StaticChunked && ChunkSize == 0;

  return IsStaticWithoutChunk ? OMPGeneralSchedulingType::StaticNonChunked
                              : Scheduling;
}