diff options
author | Ruiling Song <ruiling.song@intel.com> | 2016-04-28 16:39:21 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-05-25 12:52:14 +0800 |
commit | df196c97b2fd770d9038c520205553d3741590ba (patch) | |
tree | fce5f314bcf079ee4c88593bef227b0dd50aa7b9 /backend | |
parent | ee4e471c9f5030869e61467b40be456f09b74d0c (diff) | |
download | beignet-df196c97b2fd770d9038c520205553d3741590ba.tar.gz |
GBE: Do more aggressive load/store merging.
Previously, we kept the load/store order strictly the same as before the transformation.
But for load/store from/into different address spaces, the instruction
order can be changed. And this gave us a chance to merge more
load/store instructions.
Also enlarge the processing window to (maxVecSize * 5).
This change would make significant performance benefit for SKL platform:
A typical case is:
./opencv_perf_core --gtest_filter=OCL_LUTFixture_LUT.LUT/15
v2:
should not move BBI backward if it points to the first instruction
in the basic block.
v3:
increase the scanning window a little. Change the factor from 5 to 8.
thus this patch would not bring too much regression to gemm opencl
sample.
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
Diffstat (limited to 'backend')
-rw-r--r-- | backend/src/llvm/llvm_loadstore_optimization.cpp | 89 |
1 files changed, 72 insertions, 17 deletions
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp index c0ac6ed7..95c69e02 100644 --- a/backend/src/llvm/llvm_loadstore_optimization.cpp +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp @@ -70,11 +70,11 @@ namespace gbe { bool isLoadStoreCompatible(Value *A, Value *B); void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged); void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged); - BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB, - SmallVector<Instruction*, 16> &merged, - BasicBlock::iterator &start, - unsigned maxVecSize, - bool isLoad); + bool findConsecutiveAccess(BasicBlock &BB, + SmallVector<Instruction*, 16> &merged, + const BasicBlock::iterator &start, + unsigned maxVecSize, + bool isLoad); virtual const char *getPassName() const { return "Merge compatible Load/stores for Gen"; @@ -159,38 +159,58 @@ namespace gbe { values[i]->replaceAllUsesWith(S); } } - - BasicBlock::iterator + // When searching for consecutive memory access, we do it in a small window, + // if the window is too large, it would take up too much compiling time. + // An important rule we have followed is don't try to change load/store order. + // But an exception is load & store that are from different address spaces. The + // return value will indicate whether such kind of reorder happens. 
+ bool GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB, SmallVector<Instruction*, 16> &merged, - BasicBlock::iterator &start, + const BasicBlock::iterator &start, unsigned maxVecSize, bool isLoad) { - BasicBlock::iterator stepForward = start; - if(!isSimpleLoadStore(&*start)) return stepForward; + if(!isSimpleLoadStore(&*start)) return false; merged.push_back(&*start); + unsigned targetAddrSpace = getAddressSpace(&*start); BasicBlock::iterator E = BB.end(); - BasicBlock::iterator J = ++start; + BasicBlock::iterator J = start; + ++J; - unsigned maxLimit = maxVecSize * 3; + unsigned maxLimit = maxVecSize * 8; + bool reordered = false; for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) { if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) { if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) { merged.push_back(&*J); - stepForward = ++J; } - } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) { + } else if((isLoad && isa<StoreInst>(*J))) { // simple stop to keep read/write order - break; + StoreInst *st = cast<StoreInst>(&*J); + unsigned addrSpace = st->getPointerAddressSpace(); + if (addrSpace != targetAddrSpace) { + reordered = true; + } else { + break; + } + } else if ((!isLoad && isa<LoadInst>(*J))) { + LoadInst *ld = cast<LoadInst>(&*J); + unsigned addrSpace = ld->getPointerAddressSpace(); + if (addrSpace != targetAddrSpace) { + reordered = true; + } else { + break; + } } if(merged.size() >= maxVecSize) break; } - return stepForward; + + return reordered; } void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) { @@ -226,6 +246,28 @@ namespace gbe { newST->setAlignment(align); } + // Find the safe iterator we can point to. If reorder happens, we need to + // point to the instruction after the first of toBeDeleted. 
If no reorder, + // we are safe to point to the instruction after the last of toBeDeleted + static BasicBlock::iterator + findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted, + const BasicBlock::iterator ¤t, + bool reorder) { + BasicBlock::iterator safe = current; + unsigned size = toBeDeleted.size(); + if (reorder) { + unsigned i = 0; + while (toBeDeleted[i] == &*safe && i < size) { + ++i; + ++safe; + } + } else { + safe = BasicBlock::iterator(toBeDeleted[size - 1]); + ++safe; + } + return safe; + } + bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) { bool changed = false; SmallVector<Instruction*, 16> merged; @@ -239,11 +281,18 @@ namespace gbe { if (!(ty->isFloatTy() || ty->isIntegerTy(32) || ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad))) continue; + unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 : (ty->isIntegerTy(16) ? 8 : 16); - BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad); + bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad); uint32_t size = merged.size(); uint32_t pos = 0; + bool doDeleting = size > 1; + if (doDeleting) { + // choose next undeleted instruction + BBI = findSafeInstruction(merged, BBI, reorder); + } + while(size > 1) { unsigned vecSize = (size >= 16) ? 16 : (size >= 8 ? 8 : @@ -260,6 +309,12 @@ namespace gbe { pos += vecSize; size -= vecSize; } + if (doDeleting) { + //adjust the BBI back by one, as we would increase it in for loop + //don't do this if BBI points to the very first instruction. + if (BBI != BB.begin()) + --BBI; + } merged.clear(); } } |