//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func ops into a sequence
// of GPU runtime calls. As most GPU runtimes do not have a stable published
// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//
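//
// For illustration, a launch roughly of the form (schematic, names and types
// are not taken from a real test):
//
//   gpu.launch_func @kernels::@kernel
//       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
//       args(%arg0 : f32, %arg1 : memref<?xf32>)
//
// becomes a sequence of runtime calls along the lines of
//
//   %module = llvm.call @mgpuModuleLoad(%binary_global)
//   %kernel = llvm.call @mgpuModuleGetFunction(%module, %kernel_name)
//   %stream = llvm.call @mgpuStreamCreate()
//   llvm.call @mgpuLaunchKernel(%kernel, %gx, ..., %stream, %params, ...)
//   llvm.call @mgpuStreamSynchronize(%stream)
//   llvm.call @mgpuStreamDestroy(%stream)
//   llvm.call @mgpuModuleUnload(%module)
//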

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

namespace mlir {
#define GEN_PASS_DEF_GPUTOLLVMCONVERSIONPASS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {
class GpuToLLVMConversionPass
    : public impl::GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  using Base::Base;

  // Run the dialect converter on the module.
  void runOnOperation() override;
};

struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};
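
// For example (illustrative only, @mgpuFoo is a hypothetical runtime entry
// point), a builder declared as
//
//   FunctionCallBuilder fooCallBuilder = {"mgpuFoo", llvmVoidType,
//                                         {llvmPointerType}};
//
// emits `llvm.call @mgpuFoo(%ptr)` from fooCallBuilder.create(loc, rewriter,
// {ptr}), declaring @mgpuFoo at the end of the module if it is not already
// present (see FunctionCallBuilder::create below).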

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  Value getNumElements(ConversionPatternRewriter &rewriter, Location loc,
                       MemRefType type, MemRefDescriptor desc) const {
    return type.hasStaticShape()
               ? ConvertToLLVMPattern::createIndexConstant(
                     rewriter, loc, type.getNumElements())
               // For identity maps (verified by caller), the number of
               // elements is stride[0] * size[0] (e.g. for memref<?x4xf32>,
               // stride[0] is 4 and size[0] is the dynamic extent).
               : rewriter.create<LLVM::MulOp>(loc,
                                              desc.stride(rewriter, loc, 0),
                                              desc.size(rewriter, loc, 0));
  }

  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  LLVM::LLVMPointerType llvmPointerType =
      this->getTypeConverter()->getPointerType(IntegerType::get(context, 8));
  Type llvmPointerPointerType =
      this->getTypeConverter()->getPointerType(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void* f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder hostUnregisterCallBuilder = {
      "mgpuMemHostUnregisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder memsetCallBuilder = {
      "mgpuMemset32",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder setDefaultDeviceCallBuilder = {
      "mgpuSetDefaultDevice",
      llvmVoidType,
      {llvmInt32Type /* uint32_t devIndex */}};
  FunctionCallBuilder createSparseEnvCallBuilder = {
      "mgpuCreateSparseEnv",
      llvmPointerType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder destroySparseEnvCallBuilder = {
      "mgpuDestroySparseEnv",
      llvmVoidType,
      {llvmPointerType, llvmPointerType /* void *stream */}};
  FunctionCallBuilder createDnVecCallBuilder = {
      "mgpuCreateDnVec",
      llvmPointerType,
      {llvmIntPtrType, llvmPointerType, llvmInt32Type,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder destroyDnVecCallBuilder = {
      "mgpuDestroyDnVec",
      llvmVoidType,
      {llvmPointerType, llvmPointerType /* void *stream */}};
  FunctionCallBuilder createCooCallBuilder = {
      "mgpuCreateCoo",
      llvmPointerType,
      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
       llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder createCsrCallBuilder = {
      "mgpuCreateCsr",
      llvmPointerType,
      {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
       llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
       llvmInt32Type, llvmPointerType /* void *stream */}};
  FunctionCallBuilder destroySpMatCallBuilder = {
      "mgpuDestroySpMat",
      llvmVoidType,
      {llvmPointerType, llvmPointerType /* void *stream */}};
  FunctionCallBuilder spMVBufferSizeCallBuilder = {
      "mgpuSpMVBufferSize",
      llvmIntPtrType,
      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder spMVCallBuilder = {
      "mgpuSpMV",
      llvmVoidType,
      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
       llvmPointerType, llvmPointerType /* void *stream */}};
};

/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.host_unregister operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostUnregisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> {
public:
  ConvertHostUnregisterOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(
            typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert async.yield operations with !gpu.async.token
/// operands into GPU runtime calls.
class ConvertAsyncYieldToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
public:
  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(async::YieldOp yieldOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * getStreamHelper   -- initializes a new compute stream on GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation,
                                             bool kernelBarePtrCallConv)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation),
        kernelBarePtrCallConv(kernelBarePtrCallConv) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
                            OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
  bool kernelBarePtrCallConv;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN, or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.memset operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemsetOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp> {
public:
  ConvertMemsetOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemsetOp memsetOp, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.set_default_device to a GPU runtime call.
/// Currently supports CUDA and ROCm (HIP).
class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp> {
public:
  ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp>(
            typeConverter) {}

  LogicalResult
  matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp> {
public:
  ConvertCreateSparseEnvOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp>(
            typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp> {
public:
  ConvertDestroySparseEnvOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp>(
            typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertCreateDnVecOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
public:
  ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp> {
public:
  ConvertDestroyDnVecOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertCreateCooOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp> {
public:
  ConvertCreateCooOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::CreateCooOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertCreateCsrOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp> {
public:
  ConvertCreateCsrOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::CreateCsrOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertDestroySpMatOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
public:
  ConvertDestroySpMatOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DestroySpMatOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp> {
public:
  ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern(
      LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp>(
            typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertSpMVOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp> {
public:
  ConvertSpMVOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::SpMVOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LowerToLLVMOptions options(&getContext());
  options.useOpaquePointers = useOpaquePointers;

  LLVMTypeConverter converter(&getContext(), options);
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();

  mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns);
  mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns);
  populateFuncToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation,
                                      kernelBarePtrCallConv);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(loc, function, arguments);
}

// Returns whether all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

// Returns whether `op` is the async form with exactly one async dependency,
// which is the only form these patterns convert.
static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}
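
// For illustration, the accepted async form looks roughly like (schematic,
// not taken from a real test):
//
//   %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32>, memref<?xf32>
//
// where %dep has already been lowered to the stream on which the runtime call
// is enqueued, and %token is replaced by that same stream.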

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.getValue().getType();
  auto elementType = cast<UnrankedMemRefType>(memRefType).getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(
      loc, op->getOperands(), adaptor.getOperands(), rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Operation *op = hostUnregisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostUnregisterOp.getValue().getType();
  auto elementType = cast<UnrankedMemRefType>(memRefType).getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(
      loc, op->getOperands(), adaptor.getOperands(), rewriter);
  arguments.push_back(elementSize);
  hostUnregisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {

  if (adaptor.getHostShared())
    return rewriter.notifyMatchFailure(
        allocOp, "host_shared allocation is not supported");

  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.getDynamicSizes(),
                           rewriter, shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.getAsyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
  if (!getTypeConverter()->useOpaquePointers())
    allocatedPtr =
        rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  Value pointer =
      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
  if (!getTypeConverter()->useOpaquePointers())
    pointer = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.getAsyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {pointer, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return isa<gpu::AsyncTokenType>(value.getType());
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls.
// !gpu.async.token values are lowered to streams within an async.execute
// region, but are passed as events between regions. For each !gpu.async.token
// operand, we create an event and record it on the stream.
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.getOperands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value> newOperands(adaptor.getOperands());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = adaptor.getOperands()[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}
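
// Schematically (illustrative, not a real test case), a token yielded from an
// async.execute region:
//
//   async.execute {
//     ...
//     async.yield %token : !gpu.async.token
//   }
//
// becomes an event that other regions can wait on:
//
//   %event = llvm.call @mgpuEventCreate()
//   llvm.call @mgpuEventRecord(%event, %stream)
//   llvm.call @mgpuStreamDestroy(%stream)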

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(isa<LLVM::LLVMPointerType>(value.getType()));
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.getCallee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the
// host with the stream/event operands. The operands are destroyed. That is,
// it assumes that they are not used afterwards or elsewhere. Otherwise we
// will get a runtime error. Eventually, we should guarantee this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.getAsyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : adaptor.getOperands()) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with stream/event operands. The operands are
// destroyed. That is, it assumes that they are not used afterwards or
// elsewhere. Otherwise we will get a runtime error. Eventually, we should
// guarantee this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.getAsyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value> events;
  for (auto pair :
       llvm::zip(waitOp.getAsyncDependencies(), adaptor.getOperands())) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token
      // operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}
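
// As a rough sketch (illustrative SSA names), `%t = gpu.wait async [%t0, %t1]`
// with %t0 and %t1 lowered to streams becomes:
//
//   %e0 = llvm.call @mgpuEventCreate()
//   llvm.call @mgpuEventRecord(%e0, %stream0)   // after %t0's definition
//   %e1 = llvm.call @mgpuEventCreate()
//   llvm.call @mgpuEventRecord(%e1, %stream1)   // after %t1's definition
//   %stream = llvm.call @mgpuStreamCreate()
//   llvm.call @mgpuStreamWaitEvent(%stream, %e0)
//   llvm.call @mgpuStreamWaitEvent(%stream, %e1)
//   llvm.call @mgpuEventDestroy(%e0)
//   llvm.call @mgpuEventDestroy(%e1)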

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  SmallVector<Value, 4> arguments;
  if (kernelBarePtrCallConv) {
    // Hack: temporarily switch the bare pointer calling convention on, just
    // for the argument promotion.
    LLVMTypeConverter *converter = getTypeConverter();
    LowerToLLVMOptions options = converter->getOptions();
    LowerToLLVMOptions overrideToMatchKernelOpts = options;
    overrideToMatchKernelOpts.useBarePtrCallConv = true;
    converter->dangerousSetOptions(overrideToMatchKernelOpts);
    arguments = converter->promoteOperands(
        loc, launchOp.getOperands().take_back(numKernelOperands),
        adaptor.getOperands().take_back(numKernelOperands), builder);
    converter->dangerousSetOptions(options);
  } else {
    arguments = getTypeConverter()->promoteOperands(
        loc, launchOp.getOperands().take_back(numKernelOperands),
        adaptor.getOperands().take_back(numKernelOperands), builder);
  }

  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType =
      LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                             argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, 1);
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, getTypeConverter()->getPointerType(structType), structType, one,
      /*alignment=*/0);
  auto arraySize =
      builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, numArguments);
  auto arrayPtr = builder.create<LLVM::AllocaOp>(
      loc, llvmPointerPointerType, llvmPointerType, arraySize,
      /*alignment=*/0);
  for (const auto &en : llvm::enumerate(arguments)) {
    const auto index = static_cast<int32_t>(en.index());
    Value fieldPtr = builder.create<LLVM::GEPOp>(
        loc, getTypeConverter()->getPointerType(argumentTypes[index]),
        structType, structPtr, ArrayRef<LLVM::GEPArg>{0, index});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(
        loc, llvmPointerPointerType, llvmPointerType, arrayPtr,
        ArrayRef<LLVM::GEPArg>{index});
    if (!getTypeConverter()->useOpaquePointers())
      fieldPtr =
          builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, fieldPtr, elementPtr);
  }
  return arrayPtr;
}
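
// For a kernel taking, say, an f32 and a lowered memref descriptor, the
// generated allocas would look roughly like (illustrative types only; the
// memref expands to its descriptor fields via promoteOperands):
//
//   %one    = llvm.mlir.constant(1 : i32)
//   %struct = llvm.alloca %one x !llvm.struct<(f32, ptr, ptr, i64, ...)>
//   %array  = llvm.alloca %n x !llvm.ptr
//
// followed by one field store and one pointer store per argument.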

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName,
      StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
    return failure();

  if (launchOp.getAsyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is
  // no use of the stream after this op.
  if (!launchOp.getAsyncToken() && !launchOp.getAsyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data = LLVM::createGlobalString(
      loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
      LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName().getValue(),
      launchOp.getKernelName().getValue(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(), kernelName});
  Value zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type, 0);
  Value stream =
      adaptor.getAsyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult()
          : adaptor.getAsyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, adaptor, rewriter);
  auto nullpointer =
      rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  Value dynamicSharedMemorySize = launchOp.getDynamicSharedMemorySize()
                                      ? launchOp.getDynamicSharedMemorySize()
                                      : zero;
  launchKernelCallBuilder.create(
      loc, rewriter,
      {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
       adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
       adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
       dynamicSharedMemorySize, stream, kernelParams,
       /*extra=*/nullpointer});

  if (launchOp.getAsyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the
    // synchronous version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult());

  return success();
}

static Value bitAndAddrspaceCast(Location loc,
                                 ConversionPatternRewriter &rewriter,
                                 LLVM::LLVMPointerType destinationType,
                                 Value sourcePtr,
                                 LLVMTypeConverter &typeConverter) {
  auto sourceTy = cast<LLVM::LLVMPointerType>(sourcePtr.getType());
  if (destinationType.getAddressSpace() != sourceTy.getAddressSpace())
    sourcePtr = rewriter.create<LLVM::AddrSpaceCastOp>(
        loc,
        typeConverter.getPointerType(sourceTy.getElementType(),
                                     destinationType.getAddressSpace()),
        sourcePtr);
  if (typeConverter.useOpaquePointers())
    return sourcePtr;
  return rewriter.create<LLVM::BitcastOp>(loc, destinationType, sourcePtr);
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = cast<MemRefType>(memcpyOp.getSrc().getType());

  if (failed(areAllLLVMTypes(memcpyOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();

  MemRefDescriptor srcDesc(adaptor.getSrc());
  Value numElements = getNumElements(rewriter, loc, memRefType, srcDesc);

  // Compute the size in bytes by "indexing" a null pointer, i.e.
  // sizeBytes = (T*)nullptr + numElements.
  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType,
      typeConverter->convertType(memRefType.getElementType()), nullPtr,
      numElements);
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = bitAndAddrspaceCast(loc, rewriter, llvmPointerType,
                                 srcDesc.alignedPtr(rewriter, loc),
                                 *getTypeConverter());
  auto dst = bitAndAddrspaceCast(
      loc, rewriter, llvmPointerType,
      MemRefDescriptor(adaptor.getDst()).alignedPtr(rewriter, loc),
      *getTypeConverter());

  auto stream = adaptor.getAsyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemsetOp memsetOp, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = cast<MemRefType>(memsetOp.getDst().getType());

  if (failed(areAllLLVMTypes(memsetOp, adaptor.getOperands(), rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memsetOp)))
    return failure();

  auto loc = memsetOp.getLoc();

  Type valueType = adaptor.getValue().getType();
  if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 32) {
    return rewriter.notifyMatchFailure(memsetOp,
                                       "value must be a 32 bit scalar");
  }

  MemRefDescriptor dstDesc(adaptor.getDst());
  Value numElements = getNumElements(rewriter, loc, memRefType, dstDesc);

  auto value = rewriter.create<LLVM::BitcastOp>(loc, llvmInt32Type,
                                                adaptor.getValue());
  auto dst = bitAndAddrspaceCast(loc, rewriter, llvmPointerType,
                                 dstDesc.alignedPtr(rewriter, loc),
                                 *getTypeConverter());

  auto stream = adaptor.getAsyncDependencies().front();
  memsetCallBuilder.create(loc, rewriter, {dst, value, numElements, stream});

  rewriter.replaceOp(memsetOp, {stream});
  return success();
}

LogicalResult
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  setDefaultDeviceCallBuilder.create(loc, rewriter, {adaptor.getDevIndex()});
  rewriter.replaceOp(op, {});
  return success();
}

LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  auto handle =
      createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult();
  rewriter.replaceOp(op, {handle, stream});
  return success();
}

LogicalResult
ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  destroySparseEnvCallBuilder.create(loc, rewriter,
                                     {adaptor.getEnv(), stream});
  rewriter.replaceOp(op, {stream});
  return success();
}

LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::CreateDnVecOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  Value pVec =
      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
  if (!getTypeConverter()->useOpaquePointers())
    pVec = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVec);
  Type dType = op.getMemref().getType().cast<MemRefType>().getElementType();
  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              dType.getIntOrFloatBitWidth());
  auto handle =
      createDnVecCallBuilder
          .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream})
          .getResult();
  rewriter.replaceOp(op, {handle, stream});
  return success();
}

LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DestroyDnVecOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream});
  rewriter.replaceOp(op, {stream});
  return success();
}

LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::CreateCooOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  Value pRowIdxs =
      MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
  Value pColIdxs =
      MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
  Value pValues =
      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
  if (!getTypeConverter()->useOpaquePointers()) {
    pRowIdxs =
        rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
    pColIdxs =
        rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
  }
  Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
  Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
  auto iw = rewriter.create<LLVM::ConstantOp>(
      loc, llvmInt32Type,
      iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              dType.getIntOrFloatBitWidth());
  auto handle =
      createCooCallBuilder
          .create(loc, rewriter,
                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
                   pRowIdxs, pColIdxs, pValues, iw, dw, stream})
          .getResult();
  rewriter.replaceOp(op, {handle, stream});
  return success();
}

LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::CreateCsrOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  Value pRowPos =
      MemRefDescriptor(adaptor.getRowPos()).allocatedPtr(rewriter, loc);
  Value pColIdxs =
      MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
  Value pValues =
      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
  if (!getTypeConverter()->useOpaquePointers()) {
    pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
    pColIdxs =
        rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
  }
  Type pType = op.getRowPos().getType().cast<MemRefType>().getElementType();
  Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
  Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
  auto pw = rewriter.create<LLVM::ConstantOp>(
      loc, llvmInt32Type,
      pType.isIndex() ? 64 : pType.getIntOrFloatBitWidth());
  auto iw = rewriter.create<LLVM::ConstantOp>(
      loc, llvmInt32Type,
      iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              dType.getIntOrFloatBitWidth());
  auto handle =
      createCsrCallBuilder
          .create(loc, rewriter,
                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
                   pRowPos, pColIdxs, pValues, pw, iw, dw, stream})
          .getResult();
  rewriter.replaceOp(op, {handle, stream});
  return success();
}

LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DestroySpMatOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream});
  rewriter.replaceOp(op, {stream});
  return success();
}

LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  auto bufferSize = spMVBufferSizeCallBuilder
                        .create(loc, rewriter,
                                {adaptor.getEnv(), adaptor.getSpmatA(),
                                 adaptor.getDnX(), adaptor.getDnY(), stream})
                        .getResult();
  rewriter.replaceOp(op, {bufferSize, stream});
  return success();
}

LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::SpMVOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, op)))
    return failure();
  Location loc = op.getLoc();
  auto stream = adaptor.getAsyncDependencies().front();
  Value pBuf =
      MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
  if (!getTypeConverter()->useOpaquePointers())
    pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
  spMVCallBuilder.create(loc, rewriter,
                         {adaptor.getEnv(), adaptor.getSpmatA(),
                          adaptor.getDnX(), adaptor.getDnY(), pBuf, stream});
  rewriter.replaceOp(op, {stream});
  return success();
}
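
// Taken together, a typical async SpMV sequence lowers to runtime calls
// roughly as follows (sketch with illustrative SSA names):
//
//   %env   = llvm.call @mgpuCreateSparseEnv(%stream)
//   %spmat = llvm.call @mgpuCreateCsr(...)    // or @mgpuCreateCoo
//   %dnx   = llvm.call @mgpuCreateDnVec(...)
//   %dny   = llvm.call @mgpuCreateDnVec(...)
//   %bufsz = llvm.call @mgpuSpMVBufferSize(%env, %spmat, %dnx, %dny, %stream)
//   ... allocate a buffer %buf of %bufsz bytes ...
//   llvm.call @mgpuSpMV(%env, %spmat, %dnx, %dny, %buf, %stream)
//   ... destroy the handles with the matching mgpuDestroy* calls ...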

void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                               RewritePatternSet &patterns,
                                               StringRef gpuBinaryAnnotation,
                                               bool kernelBarePtrCallConv) {
  converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type {
    return converter.getPointerType(
        IntegerType::get(&converter.getContext(), 8));
  });
  converter.addConversion([&converter](gpu::SparseHandleType type) -> Type {
    return converter.getPointerType(
        IntegerType::get(&converter.getContext(), 8));
  });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertHostUnregisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertMemsetOpToGpuRuntimeCallPattern,
               ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern,
               ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
               ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
               ConvertCreateDnVecOpToGpuRuntimeCallPattern,
               ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
               ConvertCreateCooOpToGpuRuntimeCallPattern,
               ConvertCreateCsrOpToGpuRuntimeCallPattern,
               ConvertDestroySpMatOpToGpuRuntimeCallPattern,
               ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
               ConvertSpMVOpToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
      converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}
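
// Usage sketch (not part of this file): the conversion is typically scheduled
// on the host module after kernel outlining and serialization, e.g. via
// `mlir-opt --gpu-to-llvm`, or programmatically:
//
//   pm.addPass(createGpuToLLVMConversionPass());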