summaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
authorRuiling Song <ruiling.song@intel.com>2016-04-28 16:39:21 +0800
committerYang Rong <rong.r.yang@intel.com>2016-05-25 12:52:14 +0800
commitdf196c97b2fd770d9038c520205553d3741590ba (patch)
treefce5f314bcf079ee4c88593bef227b0dd50aa7b9 /backend
parentee4e471c9f5030869e61467b40be456f09b74d0c (diff)
downloadbeignet-df196c97b2fd770d9038c520205553d3741590ba.tar.gz
GBE: Do more aggressive load/store merging.
Previously, we keep strict load/store order as before transformation. But for load/store from/into different address spaces, the instruction order can be changed. And this gave us a chance to merge more load/store instructions. Also change the processing window a little larger to (maxVecSize * 5). This change would make significant performance benefit for SKL platform: A typical case is: ./opencv_perf_core --gtest_filter=OCL_LUTFixture_LUT.LUT/15 v2: should not move BBI backward if it points to the first instruction in the basic block. v3: increase the scanning window a little larger. change factor from 5 to 8. thus this patch would not bring too much regression to gemm opencl sample. Signed-off-by: Ruiling Song <ruiling.song@intel.com> Reviewed-by: Yang Rong <rong.r.yang@intel.com>
Diffstat (limited to 'backend')
-rw-r--r--backend/src/llvm/llvm_loadstore_optimization.cpp89
1 files changed, 72 insertions, 17 deletions
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index c0ac6ed7..95c69e02 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -70,11 +70,11 @@ namespace gbe {
bool isLoadStoreCompatible(Value *A, Value *B);
void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
- BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
- SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
- unsigned maxVecSize,
- bool isLoad);
+ bool findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 16> &merged,
+ const BasicBlock::iterator &start,
+ unsigned maxVecSize,
+ bool isLoad);
virtual const char *getPassName() const {
return "Merge compatible Load/stores for Gen";
@@ -159,38 +159,58 @@ namespace gbe {
values[i]->replaceAllUsesWith(S);
}
}
-
- BasicBlock::iterator
+ // When searching for consecutive memory access, we do it in a small window;
+ // if the window is too large, it would take up too much compiling time.
+ // An important rule we have followed is don't try to change load/store order.
+ // But an exception is load & store that are from different address spaces. The
+ // return value will indicate whether such kind of reorder happens.
+ bool
GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
+ const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad) {
- BasicBlock::iterator stepForward = start;
- if(!isSimpleLoadStore(&*start)) return stepForward;
+ if(!isSimpleLoadStore(&*start)) return false;
merged.push_back(&*start);
+ unsigned targetAddrSpace = getAddressSpace(&*start);
BasicBlock::iterator E = BB.end();
- BasicBlock::iterator J = ++start;
+ BasicBlock::iterator J = start;
+ ++J;
- unsigned maxLimit = maxVecSize * 3;
+ unsigned maxLimit = maxVecSize * 8;
+ bool reordered = false;
for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) {
merged.push_back(&*J);
- stepForward = ++J;
}
- } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+ } else if((isLoad && isa<StoreInst>(*J))) {
// simple stop to keep read/write order
- break;
+ StoreInst *st = cast<StoreInst>(&*J);
+ unsigned addrSpace = st->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
+ } else if ((!isLoad && isa<LoadInst>(*J))) {
+ LoadInst *ld = cast<LoadInst>(&*J);
+ unsigned addrSpace = ld->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
}
if(merged.size() >= maxVecSize) break;
}
- return stepForward;
+
+ return reordered;
}
void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
@@ -226,6 +246,28 @@ namespace gbe {
newST->setAlignment(align);
}
+ // Find the safe iterator we can point to. If reorder happens, we need to
+ // point to the instruction after the first of toBeDeleted. If no reorder,
+ // we are safe to point to the instruction after the last of toBeDeleted
+ static BasicBlock::iterator
+ findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted,
+ const BasicBlock::iterator &current,
+ bool reorder) {
+ BasicBlock::iterator safe = current;
+ unsigned size = toBeDeleted.size();
+ if (reorder) {
+ unsigned i = 0;
+ while (toBeDeleted[i] == &*safe && i < size) {
+ ++i;
+ ++safe;
+ }
+ } else {
+ safe = BasicBlock::iterator(toBeDeleted[size - 1]);
+ ++safe;
+ }
+ return safe;
+ }
+
bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
bool changed = false;
SmallVector<Instruction*, 16> merged;
@@ -239,11 +281,18 @@ namespace gbe {
if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
continue;
+
unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
(ty->isIntegerTy(16) ? 8 : 16);
- BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+ bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
uint32_t size = merged.size();
uint32_t pos = 0;
+ bool doDeleting = size > 1;
+ if (doDeleting) {
+ // choose next undeleted instruction
+ BBI = findSafeInstruction(merged, BBI, reorder);
+ }
+
while(size > 1) {
unsigned vecSize = (size >= 16) ? 16 :
(size >= 8 ? 8 :
@@ -260,6 +309,12 @@ namespace gbe {
pos += vecSize;
size -= vecSize;
}
+ if (doDeleting) {
+ // Adjust the BBI back by one, as we would increase it in the for loop.
+ // Don't do this if BBI points to the very first instruction.
+ if (BBI != BB.begin())
+ --BBI;
+ }
merged.clear();
}
}