diff --git a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp
index 8dc910552cd9..37f9f97333fa 100644
--- a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp
+++ b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp
@@ -76,7 +76,7 @@ struct MLIRContextOptions {
 static llvm::ManagedStatic<MLIRContextOptions> clOptions;
 
 static bool isThreadingGloballyDisabled() {
-#if MLIR_ENABLE_THREADS != 0
+#if LLVM_ENABLE_THREADS != 0
   return clOptions.isConstructed() && clOptions->disableThreading;
 #else
   return true;
diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
index 2d41c13a6038..f3291694c4fd 100644
--- a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
+++ b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
@@ -47,6 +47,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ThreadPool.h"
 
 #include <tuple>
 #include <vector>
@@ -88,6 +89,24 @@ void pArgs(const std::tuple<Ts...> &formals, void **_vargs) {
 
 using namespace mlir;
 using namespace rocmlir::tuningdriver;
 
+//===----------------------------------------------------------------------===//
+// Shared Resources for Multi-threaded Compilation
+//===----------------------------------------------------------------------===//
+
+/// Returns a shared dialect registry, initialized exactly once.
+static DialectRegistry &getSharedDialectRegistry() {
+  static std::once_flag initFlag;
+  static DialectRegistry registry;
+  std::call_once(initFlag, []() { registerRocMLIRDialects(registry); });
+  return registry;
+}
+
+/// Returns a shared LLVM ThreadPool for all MLIR contexts.
+static llvm::DefaultThreadPool &getSharedThreadPool() {
+  static llvm::DefaultThreadPool pool;
+  return pool;
+}
+
 static llvm::cl::opt<std::string> inputFilename{
     llvm::cl::Positional, llvm::cl::desc("<input file>"), llvm::cl::init("-")};
@@ -316,9 +335,13 @@ struct ThreadResources {
                    const rock::KernelOptions &applicabilityOpts,
                    const rock::KernelOptions &compilationKernOpts,
                    const rock::BackendOptions &backendOpts) {
-    DialectRegistry registry;
-    registerRocMLIRDialects(registry);
-    ctx = std::make_unique<MLIRContext>(registry);
+    // Use the shared dialect registry (initialized exactly once)
+    DialectRegistry &registry = getSharedDialectRegistry();
+    // Create context with threading disabled internally, attach shared pool
+    ctx = std::make_unique<MLIRContext>(registry,
+                                        MLIRContext::Threading::DISABLED);
+    ctx->setThreadPool(getSharedThreadPool());
+    ctx->loadAllAvailableDialects();
     ctx->getDiagEngine().registerHandler([](Diagnostic &) {});
 
     // Pre-build pipelines once per thread
@@ -423,12 +446,12 @@ measureLargeKernel(unsigned iterations, hipStream_t stream,
 }
 
 // In order to match rocprof, returns time in nanoseconds
-static FailureOr<float>
-benchmarkKernels(ArrayRef<hipModule_t> binaries,
-                 ArrayRef<std::string> funcNames, ArrayRef<uint32_t> blockSizes,
-                 ArrayRef<uint32_t> gridSizes, ArrayRef<void *> hostBuffers,
-                 MutableArrayRef<void *> gpuBuffers,
-                 ArrayRef<size_t> bufferSizes, const BenchmarkParams &params) {
+static FailureOr<float> benchmarkKernels(ArrayRef<hipModule_t> binaries,
+                                         ArrayRef<std::string> funcNames,
+                                         ArrayRef<uint32_t> blockSizes,
+                                         ArrayRef<uint32_t> gridSizes,
+                                         MutableArrayRef<void *> gpuBuffers,
+                                         const BenchmarkParams &params) {
   bool benchmarkMode = !params.benchmarkConfig.empty();
   hipStream_t stream;
   HIPCHECK(hipStreamCreate(&stream));
@@ -440,12 +463,6 @@ benchmarkKernels(ArrayRef<hipModule_t> binaries,
     }
   });
 
-  // Initialize device buffers
-  for (size_t i = 0; i < bufferSizes.size(); i++) {
-    HIPCHECK(hipMemcpyAsync(gpuBuffers[i], hostBuffers[i], bufferSizes[i],
-                            hipMemcpyHostToDevice, stream));
-  }
-
   // HIP wants an array of pointers to each argument
   std::vector<void *> argPointers;
   for (void *&item : gpuBuffers) {
@@ -731,6 +748,12 @@ static LogicalResult runTuningLoop(ModuleOp source) {
     gpuBuffers.push_back(gpuBuffer);
   }
 
+  // Copy host buffers to GPU once (reused across all config benchmarks)
+  for (size_t i = 0; i < bufferLengths.size(); i++) {
+    HIPCHECK(hipMemcpy(gpuBuffers[i], hostBuffers[i], bufferLengths[i],
+                       hipMemcpyHostToDevice));
+  }
+
   // 4. Multi-iteration tuning loop
   SmallString<64> bestConfigOverall;
   float bestTimeOverall = std::numeric_limits<float>::max();
@@ -975,10 +998,9 @@ static LogicalResult runTuningLoop(ModuleOp source) {
       assert(result.status == CompilationStatus::Success &&
             "Unexpected compilation status in benchmarking phase");
 
-      FailureOr<float> timing =
-          benchmarkKernels(result.hipModules, kernelFuncNames,
-                           result.blockSizes, result.gridSizes, hostBuffers,
-                           gpuBuffers, bufferLengths, benchmarkParams);
+      FailureOr<float> timing = benchmarkKernels(
+          result.hipModules, kernelFuncNames, result.blockSizes,
+          result.gridSizes, gpuBuffers, benchmarkParams);
       if (failed(timing)) {
         llvm::errs() << "Kernel execution failed\n";