diff options
29 files changed, 1137 insertions, 316 deletions
diff --git a/mysql-test/lib/mtr_timer.pl b/mysql-test/lib/mtr_timer.pl index 709cebd6407..a85ab8c6122 100644 --- a/mysql-test/lib/mtr_timer.pl +++ b/mysql-test/lib/mtr_timer.pl @@ -78,6 +78,12 @@ sub mtr_timer_start($$$) { { # Child, redirect output and exec # FIXME do we need to redirect streams? + + # Don't do the ^C cleanup in the timeout child processes! + # There is actually a race here, if we get ^C after fork(), but before + # clearing the signal handler. + $SIG{INT}= 'DEFAULT'; + $0= "mtr_timer(timers,$name,$duration)"; sleep($duration); exit(0); diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 4dd22cf5092..b42b930711c 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -126,7 +126,11 @@ public: DihAllAllowNodeStart = 7016, DihMinTimeBetweenLCP = 7017, DihMaxTimeBetweenLCP = 7018, + // 7019 + // 7020 + // 7021 EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP + DihSetTimeBetweenGcp = 7090, DihStartLcpImmediately = 7099, // 8000 Suma // 12000 Tux diff --git a/ndb/include/kernel/signaldata/StartPerm.hpp b/ndb/include/kernel/signaldata/StartPerm.hpp index 38be72835a3..63e01ed3868 100644 --- a/ndb/include/kernel/signaldata/StartPerm.hpp +++ b/ndb/include/kernel/signaldata/StartPerm.hpp @@ -64,5 +64,11 @@ private: Uint32 startingNodeId; Uint32 errorCode; + + enum ErrorCode + { + ZNODE_ALREADY_STARTING_ERROR = 305, + InitialStartRequired = 320 + }; }; #endif diff --git a/ndb/include/kernel/signaldata/TcContinueB.hpp b/ndb/include/kernel/signaldata/TcContinueB.hpp index 85213791b2a..b87b982e49b 100644 --- a/ndb/include/kernel/signaldata/TcContinueB.hpp +++ b/ndb/include/kernel/signaldata/TcContinueB.hpp @@ -44,7 +44,8 @@ private: CHECK_WAIT_DROP_TAB_FAILED_LQH = 16, TRIGGER_PENDING = 17, - DelayTCKEYCONF = 18 + DelayTCKEYCONF = 18, + ZNF_CHECK_TRANSACTIONS = 19 }; }; diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt 
b/ndb/src/kernel/blocks/ERROR_codes.txt index eab4a8eb623..f2e77d4e7e0 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -228,6 +228,8 @@ Delay execution of COMPLETECONF signal 2 seconds to generate time-out. 8045: (ABORTCONF only as part of take-over) Delay execution of ABORTCONF signal 2 seconds to generate time-out. +8050: Send ZABORT_TIMEOUT_BREAK delayed + ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC ------------------------------------------------- @@ -305,6 +307,8 @@ Test Crashes in handling node restarts 7131: Crash when receiving START_COPYREQ in master node 7132: Crash when receiving START_COPYCONF in starting node +7170: Crash when receiving START_PERMREF (InitialStartRequired) + DICT: 6000 Crash during NR when receiving DICTSTARTREQ 6001 Crash during NR when receiving SCHEMA_INFO diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 0c107e35603..78acf1ffd19 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -81,7 +81,6 @@ #define ZWRONG_FAILURE_NUMBER_ERROR 302 #define ZWRONG_START_NODE_ERROR 303 #define ZNO_REPLICA_FOUND_ERROR 304 -#define ZNODE_ALREADY_STARTING_ERROR 305 #define ZNODE_START_DISALLOWED_ERROR 309 // -------------------------------------- @@ -1038,7 +1037,8 @@ private: void prepareReplicas(FragmentstorePtr regFragptr); void removeNodeFromStored(Uint32 nodeId, FragmentstorePtr regFragptr, - ReplicaRecordPtr replicaPtr); + ReplicaRecordPtr replicaPtr, + bool temporary); void removeOldStoredReplica(FragmentstorePtr regFragptr, ReplicaRecordPtr replicaPtr); void removeStoredReplica(FragmentstorePtr regFragptr, diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 33736bcb4cf..6186ed3ac3c 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -1428,6 +1428,33 @@ void Dbdih::ndbStartReqLab(Signal* 
signal, BlockReference ref) return; } + NodeRecordPtr nodePtr; + Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()]; + for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) + { + jam(); + ptrAss(nodePtr, nodeRecord); + if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) + { + jam(); + /** + * Since we're starting (is master) and + * there are other nodes with higher GCI... + * their GCIs must be invalidated... + * and they _must_ do an initial start + * indicate this by setting lastCompletedGCI = 0 + */ + SYSFILE->lastCompletedGCI[nodePtr.i] = 0; + ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE); + warningEvent("Making filesystem for node %d unusable", + nodePtr.i); + } + } + /** + * This sets which GCI we will try to restart to + */ + SYSFILE->newestRestorableGCI = gci; + ndbrequire(isMaster()); copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file! }//Dbdih::ndbStartReqLab() @@ -1563,7 +1590,7 @@ void Dbdih::execSTART_PERMREF(Signal* signal) { jamEntry(); Uint32 errorCode = signal->theData[1]; - if (errorCode == ZNODE_ALREADY_STARTING_ERROR) { + if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) { jam(); /*-----------------------------------------------------------------------*/ // The master was busy adding another node. We will wait for a second and @@ -1573,6 +1600,20 @@ void Dbdih::execSTART_PERMREF(Signal* signal) sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1); return; }//if + + if (errorCode == StartPermRef::InitialStartRequired) + { + CRASH_INSERTION(7170); + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Cluster requires this node to be started " + " with --initial as partial start has been performed" + " and this filesystem is unusable"); + progError(__LINE__, + ERR_SR_RESTARTCONFLICT, + buf); + ndbrequire(false); + } /*------------------------------------------------------------------------*/ // Some node process in another node involving our node was still active. 
We // will recover from this by crashing here. @@ -1663,7 +1704,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) (c_nodeStartMaster.wait != ZFALSE)) { jam(); signal->theData[0] = nodeId; - signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR; + signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR; sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); return; }//if @@ -1673,6 +1714,16 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) ndbrequire(false); }//if + if (SYSFILE->lastCompletedGCI[nodeId] == 0 && + typeStart != NodeState::ST_INITIAL_NODE_RESTART) + { + jam(); + signal->theData[0] = nodeId; + signal->theData[1] = StartPermRef::InitialStartRequired; + sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); + return; + } + /*---------------------------------------------------------------------- * WE START THE INCLUSION PROCEDURE * ---------------------------------------------------------------------*/ @@ -3521,24 +3572,12 @@ void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr) /* ------------------------------------------------------------------------- */ void Dbdih::selectMasterCandidateAndSend(Signal* signal) { - Uint32 gci = 0; - Uint32 masterCandidateId = 0; - NodeRecordPtr nodePtr; - for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { - jam(); - ptrAss(nodePtr, nodeRecord); - if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) { - jam(); - masterCandidateId = nodePtr.i; - gci = SYSFILE->lastCompletedGCI[nodePtr.i]; - }//if - }//for - ndbrequire(masterCandidateId != 0); setNodeGroups(); - signal->theData[0] = masterCandidateId; - signal->theData[1] = gci; + signal->theData[0] = getOwnNodeId(); + signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()]; sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB); - + + NodeRecordPtr nodePtr; Uint32 node_groups[MAX_NDB_NODES]; memset(node_groups, 0, sizeof(node_groups)); for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { @@ -3556,10 +3595,10 @@ void 
Dbdih::selectMasterCandidateAndSend(Signal* signal) if(count != 0 && count != cnoReplicas){ char buf[255]; BaseString::snprintf(buf, sizeof(buf), - "Illegal configuration change." - " Initial start needs to be performed " - " when changing no of replicas (%d != %d)", - node_groups[nodePtr.i], cnoReplicas); + "Illegal configuration change." + " Initial start needs to be performed " + " when changing no of replicas (%d != %d)", + node_groups[nodePtr.i], cnoReplicas); progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf); } } @@ -5220,6 +5259,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, //const Uint32 lcpId = SYSFILE->latestLCP_ID; const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE); + const bool temporary = !tabPtr.p->storedTable; FragmentstorePtr fragPtr; for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){ @@ -5240,7 +5280,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, jam(); found = true; noOfRemovedReplicas++; - removeNodeFromStored(nodeId, fragPtr, replicaPtr); + removeNodeFromStored(nodeId, fragPtr, replicaPtr, temporary); if(replicaPtr.p->lcpOngoingFlag){ jam(); /** @@ -5950,9 +5990,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) signal->theData[0] = 7012; execDUMP_STATE_ORD(signal); - signal->theData[0] = 7015; - execDUMP_STATE_ORD(signal); - c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__); checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER); @@ -12164,9 +12201,18 @@ void Dbdih::removeDeadNode(NodeRecordPtr removeNodePtr) /*---------------------------------------------------------------*/ void Dbdih::removeNodeFromStored(Uint32 nodeId, FragmentstorePtr fragPtr, - ReplicaRecordPtr replicatePtr) + ReplicaRecordPtr replicatePtr, + bool temporary) { - newCrashedReplica(nodeId, replicatePtr); + if (!temporary) + { + jam(); + newCrashedReplica(nodeId, replicatePtr); + } + else + { + jam(); + } removeStoredReplica(fragPtr, replicatePtr); linkOldStoredReplica(fragPtr, 
replicatePtr); ndbrequire(fragPtr.p->storedReplicas != RNIL); @@ -13100,7 +13146,8 @@ void Dbdih::execDUMP_STATE_ORD(Signal* signal) { DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0]; - if (dumpState->args[0] == DumpStateOrd::DihDumpNodeRestartInfo) { + Uint32 arg = dumpState->args[0]; + if (arg == DumpStateOrd::DihDumpNodeRestartInfo) { infoEvent("c_nodeStartMaster.blockLcp = %d, c_nodeStartMaster.blockGcp = %d, c_nodeStartMaster.wait = %d", c_nodeStartMaster.blockLcp, c_nodeStartMaster.blockGcp, c_nodeStartMaster.wait); infoEvent("cstartGcpNow = %d, cgcpStatus = %d", @@ -13110,7 +13157,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("cgcpOrderBlocked = %d, cgcpStartCounter = %d", cgcpOrderBlocked, cgcpStartCounter); }//if - if (dumpState->args[0] == DumpStateOrd::DihDumpNodeStatusInfo) { + if (arg == DumpStateOrd::DihDumpNodeStatusInfo) { NodeRecordPtr localNodePtr; infoEvent("Printing nodeStatus of all nodes"); for (localNodePtr.i = 1; localNodePtr.i < MAX_NDB_NODES; localNodePtr.i++) { @@ -13122,7 +13169,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) }//for }//if - if (dumpState->args[0] == DumpStateOrd::DihPrintFragmentation){ + if (arg == DumpStateOrd::DihPrintFragmentation){ infoEvent("Printing fragmentation of all tables --"); for(Uint32 i = 0; i<ctabFileSize; i++){ TabRecordPtr tabPtr; @@ -13297,7 +13344,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) } } - if(dumpState->args[0] == 7019 && signal->getLength() == 2) + if(arg == 7019 && signal->getLength() == 2) { char buf2[8+1]; NodeRecordPtr nodePtr; @@ -13315,7 +13362,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) nodePtr.p->m_nodefailSteps.getText(buf2)); } - if(dumpState->args[0] == 7020 && signal->getLength() > 3) + if(arg == 7020 && signal->getLength() > 3) { Uint32 gsn= signal->theData[1]; Uint32 block= signal->theData[2]; @@ -13339,7 +13386,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) gsn, getBlockName(block, "UNKNOWN"), length, buf); } - if(dumpState->args[0] == 
DumpStateOrd::DihDumpLCPState){ + if(arg == DumpStateOrd::DihDumpLCPState){ infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); infoEvent("lcpStatus = %d (update place = %d) ", c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace); @@ -13355,7 +13402,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); } - if(dumpState->args[0] == DumpStateOrd::DihDumpLCPMasterTakeOver){ + if(arg == DumpStateOrd::DihDumpLCPMasterTakeOver){ infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId()); infoEvent ("c_lcpMasterTakeOverState.state = %d updatePlace = %d failedNodeId = %d", @@ -13370,52 +13417,25 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId()); } - if (signal->theData[0] == 7015){ - for(Uint32 i = 0; i<ctabFileSize; i++){ - TabRecordPtr tabPtr; - tabPtr.i = i; - ptrCheckGuard(tabPtr, ctabFileSize, tabRecord); - - if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) - continue; - - infoEvent - ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d", - tabPtr.i, - tabPtr.p->tabCopyStatus, - tabPtr.p->tabUpdateState, - tabPtr.p->tabLcpStatus); + if (signal->theData[0] == 7015) + { + if (signal->getLength() == 1) + { + signal->theData[1] = 0; + } - FragmentstorePtr fragPtr; - for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) { - jam(); - getFragstore(tabPtr.p, fid, fragPtr); - - char buf[100], buf2[100]; - BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", - fid, fragPtr.p->noLcpReplicas); - - Uint32 num=0; - ReplicaRecordPtr replicaPtr; - replicaPtr.i = fragPtr.p->storedReplicas; - do { - ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord); - BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)", - buf, num, - replicaPtr.p->procNode, - replicaPtr.p->lcpIdStarted, - replicaPtr.p->lcpOngoingFlag ? 
"Ongoing" : "Idle"); - BaseString::snprintf(buf, sizeof(buf), "%s", buf2); - - num++; - replicaPtr.i = replicaPtr.p->nextReplica; - } while (replicaPtr.i != RNIL); - infoEvent(buf); - } + Uint32 tableId = signal->theData[1]; + if (tableId < ctabFileSize) + { + signal->theData[0] = 7021; + execDUMP_STATE_ORD(signal); + signal->theData[0] = 7015; + signal->theData[1] = tableId + 1; + sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 2, JBB); } } - if(dumpState->args[0] == DumpStateOrd::EnableUndoDelayDataWrite){ + if(arg == DumpStateOrd::EnableUndoDelayDataWrite){ ndbout << "Dbdih:: delay write of datapages for table = " << dumpState->args[1]<< endl; // Send this dump to ACC and TUP @@ -13445,7 +13465,7 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) return; } - if(dumpState->args[0] == 7098){ + if(arg == 7098){ if(signal->length() == 3){ jam(); infoEvent("startLcpRoundLoopLab(tabel=%d, fragment=%d)", @@ -13458,10 +13478,73 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) } } - if(dumpState->args[0] == DumpStateOrd::DihStartLcpImmediately){ + if(arg == DumpStateOrd::DihStartLcpImmediately){ c_lcpState.ctimer += (1 << c_lcpState.clcpDelay); return; } + + if (arg == DumpStateOrd::DihSetTimeBetweenGcp) + { + if (signal->getLength() == 1) + { + const ndb_mgm_configuration_iterator * p = + theConfiguration.getOwnConfigIterator(); + ndbrequire(p != 0); + ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay); + } + else + { + cgcpDelay = signal->theData[1]; + } + ndbout_c("Setting time between gcp : %d", cgcpDelay); + } + + if (arg == 7021 && signal->getLength() == 2) + { + TabRecordPtr tabPtr; + tabPtr.i = signal->theData[1]; + if (tabPtr.i >= ctabFileSize) + return; + + ptrCheckGuard(tabPtr, ctabFileSize, tabRecord); + + if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) + return; + + infoEvent + ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d", + tabPtr.i, + tabPtr.p->tabCopyStatus, + tabPtr.p->tabUpdateState, + tabPtr.p->tabLcpStatus); + + 
FragmentstorePtr fragPtr; + for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) { + jam(); + getFragstore(tabPtr.p, fid, fragPtr); + + char buf[100], buf2[100]; + BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ", + fid, fragPtr.p->noLcpReplicas); + + Uint32 num=0; + ReplicaRecordPtr replicaPtr; + replicaPtr.i = fragPtr.p->storedReplicas; + do { + ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord); + BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)", + buf, num, + replicaPtr.p->procNode, + replicaPtr.p->lcpIdStarted, + replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle"); + BaseString::snprintf(buf, sizeof(buf), "%s", buf2); + + num++; + replicaPtr.i = replicaPtr.p->nextReplica; + } while (replicaPtr.i != RNIL); + infoEvent(buf); + } + } }//Dbdih::execDUMP_STATE_ORD() void diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index cdfc7880102..a03c4cf185a 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -18569,6 +18569,173 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal) c_error_insert_table_id = dumpState->args[1]; SET_ERROR_INSERT_VALUE(5042); } + + TcConnectionrec *regTcConnectionrec = tcConnectionrec; + Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize; + Uint32 arg = dumpState->args[0]; + if(arg == 2306) + { + for(Uint32 i = 0; i<1024; i++) + { + TcConnectionrecPtr tcRec; + tcRec.i = ctransidHash[i]; + while(tcRec.i != RNIL) + { + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + ndbout << "TcConnectionrec " << tcRec.i; + signal->theData[0] = 2307; + signal->theData[1] = tcRec.i; + execDUMP_STATE_ORD(signal); + tcRec.i = tcRec.p->nextHashRec; + } + } + } + + if(arg == 2307 || arg == 2308) + { + TcConnectionrecPtr tcRec; + tcRec.i = signal->theData[1]; + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + + ndbout << " transactionState = " << tcRec.p->transactionState<<endl; + 
ndbout << " operation = " << tcRec.p->operation<<endl; + ndbout << " tcNodeFailrec = " << tcRec.p->tcNodeFailrec + << " seqNoReplica = " << tcRec.p->seqNoReplica + << " simpleRead = " << tcRec.p->simpleRead + << endl; + ndbout << " replicaType = " << tcRec.p->replicaType + << " reclenAiLqhkey = " << tcRec.p->reclenAiLqhkey + << " opExec = " << tcRec.p->opExec + << endl; + ndbout << " opSimple = " << tcRec.p->opSimple + << " nextSeqNoReplica = " << tcRec.p->nextSeqNoReplica + << " lockType = " << tcRec.p->lockType + << endl; + ndbout << " lastReplicaNo = " << tcRec.p->lastReplicaNo + << " indTakeOver = " << tcRec.p->indTakeOver + << " dirtyOp = " << tcRec.p->dirtyOp + << endl; + ndbout << " activeCreat = " << tcRec.p->activeCreat + << " tcBlockref = " << hex << tcRec.p->tcBlockref + << " reqBlockref = " << hex << tcRec.p->reqBlockref + << " primKeyLen = " << tcRec.p->primKeyLen + << endl; + ndbout << " nextReplica = " << tcRec.p->nextReplica + << " tcBlockref = " << hex << tcRec.p->tcBlockref + << " reqBlockref = " << hex << tcRec.p->reqBlockref + << " primKeyLen = " << tcRec.p->primKeyLen + << endl; + ndbout << " logStopPageNo = " << tcRec.p->logStopPageNo + << " logStartPageNo = " << tcRec.p->logStartPageNo + << " logStartPageIndex = " << tcRec.p->logStartPageIndex + << endl; + ndbout << " errorCode = " << tcRec.p->errorCode + << " clientBlockref = " << hex << tcRec.p->clientBlockref + << " applRef = " << hex << tcRec.p->applRef + << " totSendlenAi = " << tcRec.p->totSendlenAi + << endl; + ndbout << " totReclenAi = " << tcRec.p->totReclenAi + << " tcScanRec = " << tcRec.p->tcScanRec + << " tcScanInfo = " << tcRec.p->tcScanInfo + << " tcOprec = " << hex << tcRec.p->tcOprec + << endl; + ndbout << " tableref = " << tcRec.p->tableref + << " simpleTcConnect = " << tcRec.p->simpleTcConnect + << " storedProcId = " << tcRec.p->storedProcId + << " schemaVersion = " << tcRec.p->schemaVersion + << endl; + ndbout << " reqinfo = " << tcRec.p->reqinfo + << " reqRef = " << 
tcRec.p->reqRef + << " readlenAi = " << tcRec.p->readlenAi + << " prevTc = " << tcRec.p->prevTc + << endl; + ndbout << " prevLogTcrec = " << tcRec.p->prevLogTcrec + << " prevHashRec = " << tcRec.p->prevHashRec + << " nodeAfterNext0 = " << tcRec.p->nodeAfterNext[0] + << " nodeAfterNext1 = " << tcRec.p->nodeAfterNext[1] + << endl; + ndbout << " nextTcConnectrec = " << tcRec.p->nextTcConnectrec + << " nextTc = " << tcRec.p->nextTc + << " nextTcLogQueue = " << tcRec.p->nextTcLogQueue + << " nextLogTcrec = " << tcRec.p->nextLogTcrec + << endl; + ndbout << " nextHashRec = " << tcRec.p->nextHashRec + << " logWriteState = " << tcRec.p->logWriteState + << " logStartFileNo = " << tcRec.p->logStartFileNo + << " listState = " << tcRec.p->listState + << endl; + ndbout << " lastAttrinbuf = " << tcRec.p->lastAttrinbuf + << " lastTupkeybuf = " << tcRec.p->lastTupkeybuf + << " hashValue = " << tcRec.p->hashValue + << endl; + ndbout << " gci = " << tcRec.p->gci + << " fragmentptr = " << tcRec.p->fragmentptr + << " fragmentid = " << tcRec.p->fragmentid + << " firstTupkeybuf = " << tcRec.p->firstTupkeybuf + << endl; + ndbout << " firstAttrinbuf = " << tcRec.p->firstAttrinbuf + << " currTupAiLen = " << tcRec.p->currTupAiLen + << " currReclenAi = " << tcRec.p->currReclenAi + << endl; + ndbout << " tcTimer = " << tcRec.p->tcTimer + << " clientConnectrec = " << tcRec.p->clientConnectrec + << " applOprec = " << hex << tcRec.p->applOprec + << " abortState = " << tcRec.p->abortState + << endl; + ndbout << " transid0 = " << hex << tcRec.p->transid[0] + << " transid1 = " << hex << tcRec.p->transid[1] + << " tupkeyData0 = " << tcRec.p->tupkeyData[0] + << " tupkeyData1 = " << tcRec.p->tupkeyData[1] + << endl; + ndbout << " tupkeyData2 = " << tcRec.p->tupkeyData[2] + << " tupkeyData3 = " << tcRec.p->tupkeyData[3] + << endl; + switch (tcRec.p->transactionState) { + + case TcConnectionrec::SCAN_STATE_USED: + if (tcRec.p->tcScanRec < cscanrecFileSize){ + ScanRecordPtr TscanPtr; + 
c_scanRecordPool.getPtr(TscanPtr, tcRec.p->tcScanRec); + ndbout << " scanState = " << TscanPtr.p->scanState << endl; + //TscanPtr.p->scanLocalref[2]; + ndbout << " copyPtr="<<TscanPtr.p->copyPtr + << " scanAccPtr="<<TscanPtr.p->scanAccPtr + << " scanAiLength="<<TscanPtr.p->scanAiLength + << endl; + ndbout << " m_curr_batch_size_rows="<< + TscanPtr.p->m_curr_batch_size_rows + << " m_max_batch_size_rows="<< + TscanPtr.p->m_max_batch_size_rows + << " scanErrorCounter="<<TscanPtr.p->scanErrorCounter + << endl; + ndbout << " scanSchemaVersion="<<TscanPtr.p->scanSchemaVersion + << " scanStoredProcId="<<TscanPtr.p->scanStoredProcId + << " scanTcrec="<<TscanPtr.p->scanTcrec + << endl; + ndbout << " scanType="<<TscanPtr.p->scanType + << " scanApiBlockref="<<TscanPtr.p->scanApiBlockref + << " scanNodeId="<<TscanPtr.p->scanNodeId + << " scanCompletedStatus="<<TscanPtr.p->scanCompletedStatus + << endl; + ndbout << " scanFlag="<<TscanPtr.p->scanFlag + << " scanLockHold="<<TscanPtr.p->scanLockHold + << " scanLockMode="<<TscanPtr.p->scanLockMode + << " scanNumber="<<TscanPtr.p->scanNumber + << endl; + ndbout << " scanReleaseCounter="<<TscanPtr.p->scanReleaseCounter + << " scanTcWaiting="<<TscanPtr.p->scanTcWaiting + << " scanKeyinfoFlag="<<TscanPtr.p->scanKeyinfoFlag + << endl; + } else{ + ndbout << "No connected scan record found" << endl; + } + break; + default: + break; + } + ndbrequire(arg != 2308); + } + }//Dblqh::execDUMP_STATE_ORD() void Dblqh::execSET_VAR_REQ(Signal* signal) diff --git a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp index cb4f1c6244b..a0beec732a7 100644 --- a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp +++ b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp @@ -213,14 +213,6 @@ public: LTS_ACTIVE = 1 }; - enum TakeOverState { - TOS_NOT_DEFINED = 0, - TOS_IDLE = 1, - TOS_ACTIVE = 2, - TOS_COMPLETED = 3, - TOS_NODE_FAILED = 4 - }; - enum FailState { FS_IDLE = 0, FS_LISTENING = 1, @@ -638,6 +630,7 @@ public: ConnectionState apiConnectstate; UintR 
transid[2]; UintR firstTcConnect; + NdbNodeBitmask m_transaction_nodes; //--------------------------------------------------- // Second 16 byte cache line. Hot variables. @@ -934,7 +927,6 @@ public: struct HostRecord { HostState hostStatus; LqhTransState lqhTransStatus; - TakeOverState takeOverStatus; bool inPackedList; UintR noOfPackedWordsLqh; UintR packedWordsLqh[26]; @@ -943,6 +935,17 @@ public: UintR noOfWordsTCINDXCONF; UintR packedWordsTCINDXCONF[30]; BlockReference hostLqhBlockRef; + + enum NodeFailBits + { + NF_TAKEOVER = 0x1, + NF_CHECK_SCAN = 0x2, + NF_CHECK_TRANSACTION = 0x4, + NF_CHECK_DROP_TAB = 0x8, + NF_NODE_FAIL_BITS = 0xF // All bits... + }; + Uint32 m_nf_bits; + NdbNodeBitmask m_lqh_trans_conf; }; /* p2c: size = 128 bytes */ typedef Ptr<HostRecord> HostRecordPtr; @@ -1589,7 +1592,7 @@ private: void wrongSchemaVersionErrorLab(Signal* signal); void noFreeConnectionErrorLab(Signal* signal); void tckeyreq050Lab(Signal* signal); - void timeOutFoundLab(Signal* signal, UintR anAdd); + void timeOutFoundLab(Signal* signal, UintR anAdd, Uint32 errCode); void completeTransAtTakeOverLab(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoOne(Signal* signal, UintR TtakeOverInd); @@ -1611,6 +1614,9 @@ private: void checkScanFragList(Signal*, Uint32 failedNodeId, ScanRecord * scanP, LocalDLList<ScanFragRec>::Head&); + void nodeFailCheckTransactions(Signal*,Uint32 transPtrI,Uint32 failedNodeId); + void checkNodeFailComplete(Signal* signal, Uint32 failedNodeId, Uint32 bit); + // Initialisation void initData(); void initRecords(); @@ -1637,6 +1643,7 @@ private: HostRecord *hostRecord; HostRecordPtr hostptr; UintR chostFilesize; + NdbNodeBitmask c_alive_nodes; GcpRecord *gcpRecord; GcpRecordPtr gcpPtr; diff --git a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index d7232030c41..2788d20b842 100644 --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp 
+++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -266,6 +266,10 @@ void Dbtc::execCONTINUEB(Signal* signal) jam(); checkScanActiveInFailedLqh(signal, Tdata0, Tdata1); return; + case TcContinueB::ZNF_CHECK_TRANSACTIONS: + jam(); + nodeFailCheckTransactions(signal, Tdata0, Tdata1); + return; case TcContinueB::CHECK_WAIT_DROP_TAB_FAILED_LQH: jam(); checkWaitDropTabFailedLqh(signal, Tdata0, Tdata1); @@ -303,8 +307,8 @@ void Dbtc::execINCL_NODEREQ(Signal* signal) hostptr.i = signal->theData[1]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); hostptr.p->hostStatus = HS_ALIVE; - hostptr.p->takeOverStatus = TOS_IDLE; signal->theData[0] = cownref; + c_alive_nodes.set(hostptr.i); sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB); } @@ -503,6 +507,7 @@ Dbtc::checkWaitDropTabFailedLqh(Signal* signal, Uint32 nodeId, Uint32 tableId) * Finished */ jam(); + checkNodeFailComplete(signal, nodeId, HostRecord::NF_CHECK_DROP_TAB); return; } @@ -868,8 +873,6 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) hostptr.i = i; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - hostptr.p->takeOverStatus = TOS_IDLE; - if (NodeBitmask::get(readNodes->inactiveNodes, i)) { jam(); hostptr.p->hostStatus = HS_DEAD; @@ -877,6 +880,7 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) jam(); con_lineNodes++; hostptr.p->hostStatus = HS_ALIVE; + c_alive_nodes.set(i); }//if }//if }//for @@ -2378,6 +2382,7 @@ void Dbtc::initApiConnectRec(Signal* signal, regApiPtr->commitAckMarker = RNIL; regApiPtr->buddyPtr = RNIL; regApiPtr->currSavePointId = 0; + regApiPtr->m_transaction_nodes.clear(); // Trigger data releaseFiredTriggerData(&regApiPtr->theFiredTriggers), // Index data @@ -2986,6 +2991,10 @@ void Dbtc::tckeyreq050Lab(Signal* signal) signal->theData[0] = TdihConnectptr; signal->theData[1] = Ttableref; signal->theData[2] = TdistrHashValue; + signal->theData[3] = 0; + signal->theData[4] = 0; + signal->theData[5] = 0; + signal->theData[6] = 0; 
/*-------------------------------------------------------------*/ /* FOR EFFICIENCY REASONS WE AVOID THE SIGNAL SENDING HERE AND */ @@ -3165,6 +3174,7 @@ void Dbtc::sendlqhkeyreq(Signal* signal, TcConnectRecord * const regTcPtr = tcConnectptr.p; ApiConnectRecord * const regApiPtr = apiConnectptr.p; CacheRecord * const regCachePtr = cachePtr.p; + UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; #ifdef ERROR_INSERT if (ERROR_INSERTED(8002)) { systemErrorLab(signal, __LINE__); @@ -3202,6 +3212,9 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setScanTakeOverFlag(tslrAttrLen, regCachePtr->scanTakeOverInd); Tdata10 = 0; + sig0 = regCachePtr->opSimple; + sig1 = regTcPtr->operation; + bool simpleRead = (sig1 == ZREAD && sig0 == ZTRUE); LqhKeyReq::setKeyLen(Tdata10, regCachePtr->keylen); LqhKeyReq::setLastReplicaNo(Tdata10, regTcPtr->lastReplicaNo); LqhKeyReq::setLockType(Tdata10, regCachePtr->opLock); @@ -3211,8 +3224,8 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setApplicationAddressFlag(Tdata10, 1); LqhKeyReq::setDirtyFlag(Tdata10, regTcPtr->dirtyOp); LqhKeyReq::setInterpretedFlag(Tdata10, regCachePtr->opExec); - LqhKeyReq::setSimpleFlag(Tdata10, regCachePtr->opSimple); - LqhKeyReq::setOperation(Tdata10, regTcPtr->operation); + LqhKeyReq::setSimpleFlag(Tdata10, sig0); + LqhKeyReq::setOperation(Tdata10, sig1); /* ----------------------------------------------------------------------- * Sequential Number of first LQH = 0, bit 22-23 * IF ATTRIBUTE INFORMATION IS SENT IN TCKEYREQ, @@ -3225,18 +3238,16 @@ void Dbtc::sendlqhkeyreq(Signal* signal, * ----------------------------------------------------------------------- */ //LqhKeyReq::setAPIVersion(Tdata10, regCachePtr->apiVersionNo); Uint32 commitAckMarker = regTcPtr->commitAckMarker; + const Uint32 noOfLqhs = regTcPtr->noOfNodes; if(commitAckMarker != RNIL){ jam(); - LqhKeyReq::setMarkerFlag(Tdata10, 1); - CommitAckMarker * tmp; - tmp = m_commitAckMarkerHash.getPtr(commitAckMarker); + CommitAckMarker * tmp 
= m_commitAckMarkerHash.getPtr(commitAckMarker); /** * Populate LQH array */ - const Uint32 noOfLqhs = regTcPtr->noOfNodes; tmp->noOfLqhs = noOfLqhs; for(Uint32 i = 0; i<noOfLqhs; i++){ tmp->lqhNodeId[i] = regTcPtr->tcNodedata[i]; @@ -3247,7 +3258,6 @@ void Dbtc::sendlqhkeyreq(Signal* signal, /* NO READ LENGTH SENT FROM TC. SEQUENTIAL NUMBER IS 1 AND IT */ /* IS SENT TO A PRIMARY NODE. */ /* ************************************************************> */ - UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtrSend(); @@ -3271,6 +3281,14 @@ void Dbtc::sendlqhkeyreq(Signal* signal, sig5 = regTcPtr->clientData; sig6 = regCachePtr->scanInfo; + if (! simpleRead) + { + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[0]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[1]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[2]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[3]); + } + lqhKeyReq->tableSchemaVersion = sig0; lqhKeyReq->fragmentData = sig1; lqhKeyReq->transId1 = sig2; @@ -4655,6 +4673,7 @@ void Dbtc::copyApi(Signal* signal) UintR TgcpPointer = regTmpApiPtr->gcpPointer; UintR TgcpFilesize = cgcpFilesize; UintR TcommitAckMarker = regTmpApiPtr->commitAckMarker; + NdbNodeBitmask Tnodes = regTmpApiPtr->m_transaction_nodes; GcpRecord *localGcpRecord = gcpRecord; regApiPtr->ndbapiBlockref = regTmpApiPtr->ndbapiBlockref; @@ -4665,6 +4684,7 @@ void Dbtc::copyApi(Signal* signal) regApiPtr->transid[1] = Ttransid2; regApiPtr->lqhkeyconfrec = Tlqhkeyconfrec; regApiPtr->commitAckMarker = TcommitAckMarker; + regApiPtr->m_transaction_nodes = Tnodes; gcpPtr.i = TgcpPointer; ptrCheckGuard(gcpPtr, TgcpFilesize, localGcpRecord); @@ -4675,6 +4695,7 @@ void Dbtc::copyApi(Signal* signal) regTmpApiPtr->commitAckMarker = RNIL; regTmpApiPtr->firstTcConnect = RNIL; regTmpApiPtr->lastTcConnect = RNIL; + regTmpApiPtr->m_transaction_nodes.clear(); releaseAllSeizedIndexOperations(regTmpApiPtr); 
}//Dbtc::copyApi() @@ -4933,7 +4954,7 @@ void Dbtc::releaseTransResources(Signal* signal) TcConnectRecordPtr localTcConnectptr; UintR TtcConnectFilesize = ctcConnectFilesize; TcConnectRecord *localTcConnectRecord = tcConnectRecord; - + apiConnectptr.p->m_transaction_nodes.clear(); localTcConnectptr.i = apiConnectptr.p->firstTcConnect; do { jam(); @@ -5338,7 +5359,8 @@ void Dbtc::execTC_COMMITREQ(Signal* signal) break; case CS_ABORTING: jam(); - errorCode = ZABORTINPROGRESS; + errorCode = regApiPtr->returncode ? + regApiPtr->returncode : ZABORTINPROGRESS; break; case CS_START_SCAN: jam(); @@ -5877,9 +5899,9 @@ void Dbtc::abort010Lab(Signal* signal) if (transP->firstTcConnect == RNIL) { jam(); - /*-----------------------------------------------------------------------*/ - /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. */ - /*-----------------------------------------------------------------------*/ + /*--------------------------------------------------------------------*/ + /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. 
*/ + /*--------------------------------------------------------------------*/ releaseAbortResources(signal); return; }//if @@ -6156,10 +6178,12 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) if (api_timer != 0) { time_out_value= time_out_param + (api_con_ptr & mask_value); time_passed= tc_timer - api_timer; - if (time_passed > time_out_value) { + if (time_passed > time_out_value) + { jam(); - timeOutFoundLab(signal, api_con_ptr); - return; + timeOutFoundLab(signal, api_con_ptr, ZTIME_OUT_ERROR); + api_con_ptr++; + break; } } } @@ -6179,10 +6203,8 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) return; }//Dbtc::timeOutLoopStartLab() -void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) +void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr, Uint32 errCode) { - sendContinueTimeOutControl(signal, TapiConPtr + 1); - apiConnectptr.i = TapiConPtr; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); /*------------------------------------------------------------------*/ @@ -6195,7 +6217,8 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) << "Time-out in state = " << apiConnectptr.p->apiConnectstate << " apiConnectptr.i = " << apiConnectptr.i << " - exec: " << apiConnectptr.p->m_exec_flag - << " - place: " << c_apiConTimer_line[apiConnectptr.i]); + << " - place: " << c_apiConTimer_line[apiConnectptr.i] + << " code: " << errCode); switch (apiConnectptr.p->apiConnectstate) { case CS_STARTED: if(apiConnectptr.p->lqhkeyreqrec == apiConnectptr.p->lqhkeyconfrec){ @@ -6212,7 +6235,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) }//if } apiConnectptr.p->returnsignal = RS_TCROLLBACKREP; - apiConnectptr.p->returncode = ZTIME_OUT_ERROR; + apiConnectptr.p->returncode = errCode; abort010Lab(signal); return; case CS_RECEIVING: @@ -6225,7 +6248,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) /* START ABORTING THE TRANSACTION. 
ALSO START CHECKING THE */ /* REMAINING TRANSACTIONS. */ /*------------------------------------------------------------------*/ - terrorCode = ZTIME_OUT_ERROR; + terrorCode = errCode; abortErrorLab(signal); return; case CS_COMMITTING: @@ -6432,6 +6455,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) return; } + bool found = false; OperationState tmp[16]; Uint32 TloopCount = 0; @@ -6439,7 +6463,31 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) jam(); if (tcConnectptr.i == RNIL) { jam(); - if (Tcheck == 0) { + +#ifdef VM_TRACE + ndbout_c("found: %d Tcheck: %d apiConnectptr.p->counter: %d", + found, Tcheck, apiConnectptr.p->counter); +#endif + if (found || apiConnectptr.p->counter) + { + jam(); + /** + * We sent at least one ABORT/ABORTED + * or ZABORT_TIMEOUT_BREAK is in job buffer + * wait for reception... + */ + return; + } + + if (Tcheck == 1) + { + jam(); + releaseAbortResources(signal); + return; + } + + if (Tcheck == 0) + { jam(); /*------------------------------------------------------------------ * All nodes had already reported ABORTED for all tcConnect records. 
@@ -6448,9 +6496,11 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) *------------------------------------------------------------------*/ char buf[96]; buf[0] = 0; char buf2[96]; - BaseString::snprintf(buf, sizeof(buf), "TC %d: %d ops:", - __LINE__, apiConnectptr.i); - for(Uint32 i = 0; i<TloopCount; i++){ + BaseString::snprintf(buf, sizeof(buf), "TC %d: %d counter: %d ops:", + __LINE__, apiConnectptr.i, + apiConnectptr.p->counter); + for(Uint32 i = 0; i<TloopCount; i++) + { BaseString::snprintf(buf2, sizeof(buf2), "%s %d", buf, tmp[i]); BaseString::snprintf(buf, sizeof(buf), buf2); } @@ -6458,7 +6508,9 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) ndbout_c(buf); ndbrequire(false); releaseAbortResources(signal); + return; } + return; }//if TloopCount++; @@ -6473,7 +6525,16 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) signal->theData[0] = TcContinueB::ZABORT_TIMEOUT_BREAK; signal->theData[1] = tcConnectptr.i; signal->theData[2] = apiConnectptr.i; - sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + if (ERROR_INSERTED(8050)) + { + ndbout_c("sending ZABORT_TIMEOUT_BREAK delayed (%d %d)", + Tcheck, apiConnectptr.p->counter); + sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 2000, 3); + } + else + { + sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + } return; }//if ptrCheckGuard(tcConnectptr, ctcConnectFilesize, tcConnectRecord); @@ -6496,7 +6557,7 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) jam(); if (tcConnectptr.p->tcNodedata[Ti] != 0) { TloopCount += 31; - Tcheck = 1; + found = true; hostptr.i = tcConnectptr.p->tcNodedata[Ti]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); if (hostptr.p->hostStatus == HS_ALIVE) { @@ -6869,58 +6930,44 @@ void Dbtc::execNODE_FAILREP(Signal* signal) const Uint32 tnewMasterId = nodeFail->masterNodeId; arrGuard(tnoOfNodes, MAX_NDB_NODES); + Uint32 i; int index = 0; - for (unsigned i = 1; i< MAX_NDB_NODES; i++) { - 
if(NodeBitmask::get(nodeFail->theNodes, i)){ + for (i = 1; i< MAX_NDB_NODES; i++) + { + if(NodeBitmask::get(nodeFail->theNodes, i)) + { cdata[index] = i; index++; }//if }//for + cmasterNodeId = tnewMasterId; + tcNodeFailptr.i = 0; ptrAss(tcNodeFailptr, tcFailRecord); - Uint32 tindex; - for (tindex = 0; tindex < tnoOfNodes; tindex++) { + for (i = 0; i < tnoOfNodes; i++) + { jam(); - hostptr.i = cdata[tindex]; + hostptr.i = cdata[i]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); + /*------------------------------------------------------------*/ /* SET STATUS OF THE FAILED NODE TO DEAD SINCE IT HAS */ /* FAILED. */ /*------------------------------------------------------------*/ hostptr.p->hostStatus = HS_DEAD; + hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS; + c_alive_nodes.clear(hostptr.i); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* A VERY UNUSUAL SITUATION. THE TAKE OVER WAS COMPLETED*/ - /* EVEN BEFORE WE HEARD ABOUT THE NODE FAILURE REPORT. */ - /* HOWEVER UNUSUAL THIS SITUATION IS POSSIBLE. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. 
*/ - /*------------------------------------------------------------*/ - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); - } else { - ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE); - hostptr.p->takeOverStatus = TOS_NODE_FAILED; - }//if - - if (tcNodeFailptr.p->failStatus == FS_LISTENING) { + if (tcNodeFailptr.p->failStatus == FS_LISTENING) + { jam(); /*------------------------------------------------------------*/ /* THE CURRENT TAKE OVER CAN BE AFFECTED BY THIS NODE */ /* FAILURE. */ /*------------------------------------------------------------*/ - if (hostptr.p->lqhTransStatus == LTS_ACTIVE) { + if (hostptr.p->lqhTransStatus == LTS_ACTIVE) + { jam(); /*------------------------------------------------------------*/ /* WE WERE WAITING FOR THE FAILED NODE IN THE TAKE OVER */ @@ -6932,86 +6979,46 @@ void Dbtc::execNODE_FAILREP(Signal* signal) }//if }//if - }//for - - const bool masterFailed = (cmasterNodeId != tnewMasterId); - cmasterNodeId = tnewMasterId; - - if(getOwnNodeId() == cmasterNodeId && masterFailed){ - /** - * Master has failed and I'm the new master - */ - jam(); - - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + if (getOwnNodeId() != tnewMasterId) + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* SEND TAKE OVER CONFIRMATION TO ALL ALIVE NODES IF */ - /* TAKE OVER IS COMPLETED. THIS IS PERFORMED TO ENSURE */ - /* THAT ALL NODES AGREE ON THE IDLE STATE OF THE TAKE */ - /* OVER. 
THIS MIGHT BE MISSED IN AN ERROR SITUATION IF */ - /* MASTER FAILS AFTER SENDING CONFIRMATION TO NEW */ - /* MASTER BUT FAILING BEFORE SENDING TO ANOTHER NODE */ - /* WHICH WAS NOT MASTER. IF THIS NODE LATER BECOMES */ - /* MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */ - /* CRASHED NODE HAVE ALREADY RECOVERED. */ - /*------------------------------------------------------------*/ - for(tmpHostptr.i = 1; tmpHostptr.i < MAX_NDB_NODES;tmpHostptr.i++) { - jam(); - ptrAss(tmpHostptr, hostRecord); - if (tmpHostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(tmpHostptr.i); - signal->theData[0] = hostptr.i; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for - }//if - }//if - }//for - } - - if(getOwnNodeId() == cmasterNodeId){ - jam(); - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + /** + * Only master does takeover currently + */ + hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; + } + else + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_NODE_FAILED) { - jam(); - /*------------------------------------------------------------*/ - /* CONCLUDE ALL ACTIVITIES THE FAILED TC DID CONTROL */ - /* SINCE WE ARE THE MASTER. 
THIS COULD HAVE BEEN STARTED*/ - /* BY A PREVIOUS MASTER BUT HAVE NOT BEEN CONCLUDED YET.*/ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_ACTIVE; - signal->theData[0] = hostptr.i; - sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); - }//if - }//if - }//for - }//if - for (tindex = 0; tindex < tnoOfNodes; tindex++) { - jam(); - hostptr.i = cdata[tindex]; - ptrCheckGuard(hostptr, chostFilesize, hostRecord); - /*------------------------------------------------------------*/ - /* LOOP THROUGH AND ABORT ALL SCANS THAT WHERE */ - /* CONTROLLED BY THIS TC AND ACTIVE IN THE FAILED */ - /* NODE'S LQH */ - /*------------------------------------------------------------*/ + signal->theData[0] = hostptr.i; + sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); + } + checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid - }//for - + nodeFailCheckTransactions(signal, 0, hostptr.i); + } }//Dbtc::execNODE_FAILREP() +void +Dbtc::checkNodeFailComplete(Signal* signal, + Uint32 failedNodeId, + Uint32 bit) +{ + hostptr.i = failedNodeId; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + hostptr.p->m_nf_bits &= ~bit; + if (hostptr.p->m_nf_bits == 0) + { + NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; + nfRep->blockNo = DBTC; + nfRep->nodeId = cownNodeid; + nfRep->failedNodeId = hostptr.i; + sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, + NFCompleteRep::SignalLength, JBB); + } +} + void Dbtc::checkScanActiveInFailedLqh(Signal* signal, Uint32 scanPtrI, Uint32 failedNodeId){ @@ -7053,8 +7060,44 @@ void Dbtc::checkScanActiveInFailedLqh(Signal* signal, sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); return; }//for + + checkNodeFailComplete(signal, failedNodeId, HostRecord::NF_CHECK_SCAN); +} + +void +Dbtc::nodeFailCheckTransactions(Signal* signal, + Uint32 transPtrI, + Uint32 failedNodeId) +{ + jam(); + Ptr<ApiConnectRecord> 
transPtr; + for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) + { + ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); + if (transPtr.p->m_transaction_nodes.get(failedNodeId)) + { + jam(); + // Force timeout regardless of state + Uint32 save = c_appl_timeout_value; + c_appl_timeout_value = 1; + setApiConTimer(transPtr.i, 0, __LINE__); + timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); + c_appl_timeout_value = save; + } + + // Send CONTINUEB to continue later + signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS; + signal->theData[1] = transPtr.i + 1; // Check next + signal->theData[2] = failedNodeId; + sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + return; + } + + checkNodeFailComplete(signal, failedNodeId, + HostRecord::NF_CHECK_TRANSACTION); } + void Dbtc::checkScanFragList(Signal* signal, Uint32 failedNodeId, @@ -7070,54 +7113,14 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal) tfailedNodeId = signal->theData[0]; hostptr.i = tfailedNodeId; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - switch (hostptr.p->takeOverStatus) { - case TOS_IDLE: - jam(); - /*------------------------------------------------------------*/ - /* THIS MESSAGE ARRIVED EVEN BEFORE THE NODE_FAILREP */ - /* MESSAGE. THIS IS POSSIBLE IN EXTREME SITUATIONS. */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED AND WAIT */ - /* FOR THE NODE_FAILREP MESSAGE. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - break; - case TOS_NODE_FAILED: - case TOS_ACTIVE: - jam(); - /*------------------------------------------------------------*/ - /* WE ARE NOT MASTER AND THE TAKE OVER IS ACTIVE OR WE */ - /* ARE MASTER AND THE TAKE OVER IS ACTIVE. IN BOTH */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. 
THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); - break; - case TOS_COMPLETED: - jam(); - /*------------------------------------------------------------*/ - /* WE HAVE ALREADY RECEIVED THE CONF SIGNAL. IT IS MOST */ - /* LIKELY SENT FROM A NEW MASTER WHICH WASN'T SURE IF */ - /* THIS NODE HEARD THE CONF SIGNAL FROM THE OLD MASTER. */ - /* WE SIMPLY IGNORE THE MESSAGE. */ - /*------------------------------------------------------------*/ - /*empty*/; - break; - default: + + if (signal->getSendersBlockRef() != reference()) + { jam(); - systemErrorLab(signal, __LINE__); return; - }//switch + } + + checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); }//Dbtc::execTAKE_OVERTCCONF() void Dbtc::execTAKE_OVERTCREQ(Signal* signal) @@ -7357,16 +7360,10 @@ void Dbtc::completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd) /* TO REPORT THE COMPLETION OF THE TAKE OVER TO ALL */ /* NODES THAT ARE ALIVE. 
*/ /*------------------------------------------------------------*/ - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { - jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(hostptr.i); - signal->theData[0] = tcNodeFailptr.p->takeOverNode; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for + NodeReceiverGroup rg(DBTC, c_alive_nodes); + signal->theData[0] = tcNodeFailptr.p->takeOverNode; + sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB); + if (tcNodeFailptr.p->queueIndex > 0) { jam(); /*------------------------------------------------------------*/ @@ -8048,6 +8045,7 @@ void Dbtc::initApiConnectFail(Signal* signal) apiConnectptr.p->ndbapiBlockref = 0; apiConnectptr.p->ndbapiConnect = 0; apiConnectptr.p->buddyPtr = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); setApiConTimer(apiConnectptr.i, 0, __LINE__); switch(ttransStatus){ case LqhTransConf::Committed: @@ -9875,6 +9873,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = tiacTmp - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9902,6 +9901,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (2 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9929,6 +9929,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (3 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ 
-9989,13 +9990,13 @@ void Dbtc::inithost(Signal* signal) ptrAss(hostptr, hostRecord); hostptr.p->hostStatus = HS_DEAD; hostptr.p->inPackedList = false; - hostptr.p->takeOverStatus = TOS_NOT_DEFINED; hostptr.p->lqhTransStatus = LTS_IDLE; hostptr.p->noOfWordsTCKEYCONF = 0; hostptr.p->noOfWordsTCINDXCONF = 0; hostptr.p->noOfPackedWordsLqh = 0; hostptr.p->hostLqhBlockRef = calcLqhBlockRef(hostptr.i); }//for + c_alive_nodes.clear(); }//Dbtc::inithost() void Dbtc::initialiseRecordsLab(Signal* signal, UintR Tdata0, @@ -10248,6 +10249,7 @@ void Dbtc::releaseAbortResources(Signal* signal) }//while apiConnectptr.p->firstTcConnect = RNIL; apiConnectptr.p->lastTcConnect = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); // MASV let state be CS_ABORTING until all // signals in the "air" have been received. Reset to CS_CONNECTED @@ -10321,6 +10323,7 @@ void Dbtc::releaseApiCon(Signal* signal, UintR TapiConnectPtr) cfirstfreeApiConnect = TlocalApiConnectptr.i; setApiConTimer(TlocalApiConnectptr.i, 0, __LINE__); TlocalApiConnectptr.p->apiConnectstate = CS_DISCONNECTED; + ndbassert(TlocalApiConnectptr.p->m_transaction_nodes.isclear()); ndbassert(TlocalApiConnectptr.p->apiScanRec == RNIL); TlocalApiConnectptr.p->ndbapiBlockref = 0; }//Dbtc::releaseApiCon() @@ -10856,6 +10859,34 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) c_theIndexOperationPool.getSize(), c_theIndexOperationPool.getNoOfFree()); } + + if (dumpState->args[0] == 2514) + { + if (signal->getLength() == 2) + { + dumpState->args[0] = DumpStateOrd::TcDumpOneApiConnectRec; + execDUMP_STATE_ORD(signal); + } + + NodeReceiverGroup rg(CMVMI, c_alive_nodes); + dumpState->args[0] = 15; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); + + signal->theData[0] = 2515; + sendSignalWithDelay(cownref, GSN_DUMP_STATE_ORD, signal, 1000, 1); + return; + } + + if (dumpState->args[0] == 2515) + { + NdbNodeBitmask mask = c_alive_nodes; + mask.clear(getOwnNodeId()); + NodeReceiverGroup rg(NDBCNTR, mask); + + sendSignal(rg, 
GSN_SYSTEM_ERROR, signal, 1, JBB); + sendSignalWithDelay(cownref, GSN_SYSTEM_ERROR, signal, 300, 1); + return; + } }//Dbtc::execDUMP_STATE_ORD() void Dbtc::execSET_VAR_REQ(Signal* signal) diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index ed18a4ddb8b..1cecf69aaad 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -278,6 +278,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout) void Qmgr::execCONNECT_REP(Signal* signal) { + jamEntry(); const Uint32 nodeId = signal->theData[0]; c_connectedNodes.set(nodeId); NodeRecPtr nodePtr; @@ -285,9 +286,13 @@ void Qmgr::execCONNECT_REP(Signal* signal) ptrCheckGuard(nodePtr, MAX_NODES, nodeRec); switch(nodePtr.p->phase){ case ZSTARTING: + case ZRUNNING: jam(); + if(!c_start.m_nodes.isWaitingFor(nodeId)){ + jam(); + return; + } break; - case ZRUNNING: case ZPREPARE_FAIL: case ZFAIL_CLOSING: jam(); @@ -298,21 +303,28 @@ void Qmgr::execCONNECT_REP(Signal* signal) case ZAPI_INACTIVE: return; } - - if(!c_start.m_nodes.isWaitingFor(nodeId)){ - jam(); - return; - } - + switch(c_start.m_gsn){ case GSN_CM_REGREQ: jam(); sendCmRegReq(signal, nodeId); return; - case GSN_CM_NODEINFOREQ:{ + case GSN_CM_NODEINFOREQ: jam(); sendCmNodeInfoReq(signal, nodeId, nodePtr.p); return; + case GSN_CM_ADD:{ + jam(); + + ndbrequire(getOwnNodeId() != cpresident); + c_start.m_nodes.clearWaitingFor(nodeId); + c_start.m_gsn = RNIL; + + NodeRecPtr addNodePtr; + addNodePtr.i = nodeId; + ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec); + cmAddPrepare(signal, addNodePtr, nodePtr.p); + return; } default: return; @@ -945,15 +957,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ return; case ZFAIL_CLOSING: jam(); -#ifdef VM_TRACE - ndbout_c("Enabling communication to CM_ADD node state=%d", - nodePtr.p->phase); -#endif + +#if 1 + warningEvent("Recieved request to incorperate node %u, " + "while error handling has not yet completed", 
+ nodePtr.i); + + ndbrequire(getOwnNodeId() != cpresident); + ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD); + c_start.m_nodes.clearWaitingFor(); + c_start.m_nodes.setWaitingFor(nodePtr.i); + c_start.m_gsn = GSN_CM_ADD; +#else + warningEvent("Enabling communication to CM_ADD node %u state=%d", + nodePtr.i, + nodePtr.p->phase); nodePtr.p->phase = ZSTARTING; nodePtr.p->failState = NORMAL; signal->theData[0] = 0; signal->theData[1] = nodePtr.i; sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA); +#endif return; case ZSTARTING: break; @@ -1788,11 +1812,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal) jamEntry(); failedNodePtr.i = signal->theData[0]; + + if (ERROR_INSERTED(930)) + { + CLEAR_ERROR_INSERT_VALUE; + infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i); + return; + } + ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){ failedNodePtr.p->failState = NORMAL; } else { jam(); + + char buf[100]; + BaseString::snprintf(buf, 100, + "Received NDB_FAILCONF for node %u with state: %d %d", + failedNodePtr.i, + failedNodePtr.p->phase, + failedNodePtr.p->failState); + progError(__LINE__, 0, buf); systemErrorLab(signal, __LINE__); }//if if (cpresident == getOwnNodeId()) { @@ -2112,10 +2152,42 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.i == getOwnNodeId()) { jam(); - systemErrorLab(signal, __LINE__); + + const char * msg = 0; + switch(aFailCause){ + case FailRep::ZOWN_FAILURE: + msg = "Own failure"; + break; + case FailRep::ZOTHER_NODE_WHEN_WE_START: + case FailRep::ZOTHERNODE_FAILED_DURING_START: + msg = "Other node died during start"; + break; + case FailRep::ZIN_PREP_FAIL_REQ: + msg = "Prep fail"; + break; + case FailRep::ZSTART_IN_REGREQ: + msg = "Start timeout"; + break; + case FailRep::ZHEARTBEAT_FAILURE: + msg = "Hearbeat failure"; + break; + case FailRep::ZLINK_FAILURE: + msg = "Connection 
failure"; + break; + } + + char buf[100]; + BaseString::snprintf(buf, 100, + "We(%u) have been declared dead by %u reason: %s(%u)", + getOwnNodeId(), + refToNode(signal->getSendersBlockRef()), + aFailCause, + msg ? msg : "<Unknown>"); + + progError(__LINE__, 0, buf); return; }//if - + myNodePtr.i = getOwnNodeId(); ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec); if (myNodePtr.p->phase != ZRUNNING) { @@ -2826,6 +2898,7 @@ void Qmgr::failReport(Signal* signal, cfailureNr = cprepareFailureNr; ctoFailureNr = 0; ctoStatus = Q_ACTIVE; + c_start.reset(); // Don't take over nodes being started if (cnoCommitFailedNodes > 0) { jam(); /**----------------------------------------------------------------- diff --git a/ndb/src/ndbapi/NdbTransaction.cpp b/ndb/src/ndbapi/NdbTransaction.cpp index 294012d780c..7a2d8fc71c7 100644 --- a/ndb/src/ndbapi/NdbTransaction.cpp +++ b/ndb/src/ndbapi/NdbTransaction.cpp @@ -434,12 +434,12 @@ NdbTransaction::executeNoBlobs(ExecType aTypeOfExec, //------------------------------------------------------------------------ Ndb* tNdb = theNdb; + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; m_waitForReply = false; executeAsynchPrepare(aTypeOfExec, NULL, NULL, abortOption); if (m_waitForReply){ while (1) { - int noOfComp = tNdb->sendPollNdb((3 * WAITFOR_RESPONSE_TIMEOUT), - 1, forceSend); + int noOfComp = tNdb->sendPollNdb(3 * timeout, 1, forceSend); if (noOfComp == 0) { /** * This timeout situation can occur if NDB crashes. 
diff --git a/ndb/src/ndbapi/Ndbif.cpp b/ndb/src/ndbapi/Ndbif.cpp index bfbf98d1b3a..d39b21b52f7 100644 --- a/ndb/src/ndbapi/Ndbif.cpp +++ b/ndb/src/ndbapi/Ndbif.cpp @@ -953,23 +953,25 @@ Ndb::pollCompleted(NdbTransaction** aCopyArray) void Ndb::check_send_timeout() { + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; NDB_TICKS current_time = NdbTick_CurrentMillisecond(); if (current_time - the_last_check_time > 1000) { the_last_check_time = current_time; Uint32 no_of_sent = theNoOfSentTransactions; for (Uint32 i = 0; i < no_of_sent; i++) { NdbTransaction* a_con = theSentTransactionsArray[i]; - if ((current_time - a_con->theStartTransTime) > - WAITFOR_RESPONSE_TIMEOUT) { + if ((current_time - a_con->theStartTransTime) > timeout) + { #ifdef VM_TRACE a_con->printState(); Uint32 t1 = a_con->theTransactionId; Uint32 t2 = a_con->theTransactionId >> 32; - ndbout_c("[%.8x %.8x]", t1, t2); - abort(); + ndbout_c("4012 [%.8x %.8x]", t1, t2); + //abort(); #endif + a_con->theReleaseOnClose = true; a_con->setOperationErrorCodeAbort(4012); - a_con->theCommitStatus = NdbTransaction::Aborted; + a_con->theCommitStatus = NdbTransaction::NeedAbort; a_con->theCompletionStatus = NdbTransaction::CompletedFailure; a_con->handleExecuteCompletion(); remove_sent_list(i); diff --git a/ndb/src/ndbapi/TransporterFacade.cpp b/ndb/src/ndbapi/TransporterFacade.cpp index 77750a3c3d0..f661d53487c 100644 --- a/ndb/src/ndbapi/TransporterFacade.cpp +++ b/ndb/src/ndbapi/TransporterFacade.cpp @@ -563,6 +563,19 @@ TransporterFacade::init(Uint32 nodeId, const ndb_mgm_configuration* props) m_batch_size= batch_size; } + Uint32 timeout = 120000; + iter.first(); + for (iter.first(); iter.valid(); iter.next()) + { + Uint32 tmp1 = 0, tmp2 = 0; + iter.get(CFG_DB_TRANSACTION_CHECK_INTERVAL, &tmp1); + iter.get(CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, &tmp2); + tmp1 += tmp2; + if (tmp1 > timeout) + timeout = tmp1; + } + m_waitfor_timeout = timeout; + if 
(!theTransporterRegistry->start_service(m_socket_server)){ ndbout_c("Unable to start theTransporterRegistry->start_service"); DBUG_RETURN(false); diff --git a/ndb/src/ndbapi/TransporterFacade.hpp b/ndb/src/ndbapi/TransporterFacade.hpp index fa070889dd9..7174ce5206b 100644 --- a/ndb/src/ndbapi/TransporterFacade.hpp +++ b/ndb/src/ndbapi/TransporterFacade.hpp @@ -178,6 +178,7 @@ public: * (Ndb objects should not be shared by different threads.) */ STATIC_CONST( MAX_NO_THREADS = 4711 ); + Uint32 m_waitfor_timeout; // in milli seconds... private: struct ThreadData { diff --git a/ndb/test/include/NdbRestarter.hpp b/ndb/test/include/NdbRestarter.hpp index 19a88b4f8ad..3ec92ae786e 100644 --- a/ndb/test/include/NdbRestarter.hpp +++ b/ndb/test/include/NdbRestarter.hpp @@ -62,6 +62,7 @@ public: int dumpStateAllNodes(int * _args, int _num_args); int getMasterNodeId(); + int getRandomNodeSameNodeGroup(int nodeId, int randomNumber); int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber); int getRandomNotMasterNodeId(int randomNumber); diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index 92d6c1830ef..726c575316f 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -535,6 +535,119 @@ err: return NDBT_FAILED; } +int +runBug16772(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + int aliveNodeId = restarter.getRandomNotMasterNodeId(rand()); + int deadNodeId = aliveNodeId; + while (deadNodeId == aliveNodeId) + deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes()); + + if (restarter.insertErrorInNode(aliveNodeId, 930)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(deadNodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&deadNodeId, 1)) + return NDBT_FAILED; + + if 
(restarter.startNodes(&deadNodeId, 1)) + return NDBT_FAILED; + + // It should now be hanging since we throw away NDB_FAILCONF + int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10); + // So this should fail...i.e it should not reach startphase 3 + + // Now send a NDB_FAILCONF for deadNo + int dump[] = { 7020, 323, 252, 0 }; + dump[3] = deadNodeId; + if (restarter.dumpStateOneNode(aliveNodeId, dump, 4)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&deadNodeId, 1)) + return NDBT_FAILED; + + return ret ? NDBT_OK : NDBT_FAILED; +} + +int +runBug18414(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + Ndb* pNdb = GETNDB(step); + HugoOperations hugoOps(*ctx->getTab()); + HugoTransactions hugoTrans(*ctx->getTab()); + int loop = 0; + do + { + if(hugoOps.startTransaction(pNdb) != 0) + goto err; + + if(hugoOps.pkUpdateRecord(pNdb, 0, 128, rand()) != 0) + goto err; + + if(hugoOps.execute_NoCommit(pNdb) != 0) + goto err; + + int node1 = hugoOps.getTransaction()->getConnectedNodeId(); + int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + + if (node1 == -1 || node2 == -1) + break; + + if (loop & 1) + { + if (restarter.insertErrorInNode(node1, 8050)) + goto err; + } + + if (restarter.insertErrorInNode(node2, 5003)) + goto err; + + int res= hugoOps.execute_Rollback(pNdb); + + if (restarter.waitNodesNoStart(&node2, 1) != 0) + goto err; + + if (restarter.insertErrorInAllNodes(0)) + goto err; + + if (restarter.startNodes(&node2, 1) != 0) + goto err; + + if (restarter.waitClusterStarted() != 0) + goto err; + + if (hugoTrans.scanUpdateRecords(pNdb, 128) != 0) + goto err; + + hugoOps.closeTransaction(pNdb); + + } while(++loop < 5); + + return NDBT_OK; + +err: + hugoOps.closeTransaction(pNdb); + return NDBT_FAILED; +} NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -820,6 +933,16 @@ TESTCASE("Bug15685", STEP(runBug15685); 
FINALIZER(runClearTable); } +TESTCASE("Bug16772", + "Test bug with restarting before NF handling is complete"){ + STEP(runBug16772); +} +TESTCASE("Bug18414", + "Test bug with NF during NR"){ + INITIALIZER(runLoadTable); + STEP(runBug18414); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/ndbapi/testSystemRestart.cpp b/ndb/test/ndbapi/testSystemRestart.cpp index 35016896495..30f7aca9b06 100644 --- a/ndb/test/ndbapi/testSystemRestart.cpp +++ b/ndb/test/ndbapi/testSystemRestart.cpp @@ -1051,6 +1051,52 @@ int runSystemRestart9(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int runBug18385(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + const Uint32 nodeCount = restarter.getNumDbNodes(); + if(nodeCount < 2){ + g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl; + return NDBT_OK; + } + + int node1 = restarter.getDbNodeId(rand() % nodeCount); + int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + + if (node1 == -1 || node2 == -1) + return NDBT_OK; + + int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 300 }; + + int result = NDBT_OK; + do { + CHECK(restarter.dumpStateAllNodes(dump, 2) == 0); + CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0); + NdbSleep_SecSleep(3); + CHECK(restarter.restartAll(false, true, false) == 0); + + Uint32 cnt = 0; + int nodes[128]; + for(Uint32 i = 0; i<nodeCount; i++) + if ((nodes[cnt] = restarter.getDbNodeId(i)) != node2) + cnt++; + + assert(cnt == nodeCount - 1); + + CHECK(restarter.startNodes(nodes, cnt) == 0); + CHECK(restarter.waitNodesStarted(nodes, cnt, 300) == 0); + + CHECK(restarter.insertErrorInNode(node2, 7170) == 0); + CHECK(restarter.waitNodesNoStart(&node2, 1) == 0); + CHECK(restarter.restartOneDbNode(node2, true, false, true) == 0); + CHECK(restarter.waitNodesStarted(&node2, 1) == 0); + + } while(0); + + g_info << "Bug18385 finished" << endl; + + return result; +} + int 
runWaitStarted(NDBT_Context* ctx, NDBT_Step* step){ NdbRestarter restarter; @@ -1234,6 +1280,13 @@ TESTCASE("SR9", STEP(runSystemRestart9); FINALIZER(runClearTable); } +TESTCASE("Bug18385", + "Perform partition system restart with other nodes with higher GCI"){ + INITIALIZER(runWaitStarted); + INITIALIZER(runClearTable); + STEP(runBug18385); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testSystemRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/ndbapi/testTimeout.cpp b/ndb/test/ndbapi/testTimeout.cpp index b02751ec819..36fb34a50e2 100644 --- a/ndb/test/ndbapi/testTimeout.cpp +++ b/ndb/test/ndbapi/testTimeout.cpp @@ -24,6 +24,7 @@ #define TIMEOUT (Uint32)3000 Uint32 g_org_timeout = 3000; +Uint32 g_org_deadlock = 3000; int setTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ @@ -59,6 +60,60 @@ resetTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int +setDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &g_org_deadlock)) + return NDBT_FAILED; + + g_err << "Setting timeout: " << timeout << endl; + int val[] = { DumpStateOrd::TcSetTransactionTimeout, timeout }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + +int +getDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + Uint32 val = 0; + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &val)) + return NDBT_FAILED; + + if (val < 120000) + val = 120000; + ctx->setProperty("TransactionDeadlockTimeout", 4*val); + + return NDBT_OK; +} + +int 
+resetDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + int val[] = { DumpStateOrd::TcSetTransactionTimeout, g_org_deadlock }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -333,6 +388,43 @@ int runBuddyTransNoTimeout(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int +runError4012(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int stepNo = step->getStepNo(); + + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + HugoOperations hugoOps(*ctx->getTab()); + Ndb* pNdb = GETNDB(step); + + do{ + // Commit transaction + CHECK(hugoOps.startTransaction(pNdb) == 0); + CHECK(hugoOps.pkUpdateRecord(pNdb, 0) == 0); + int ret = hugoOps.execute_NoCommit(pNdb); + if (ret == 0) + { + int sleep = timeout; + ndbout << "Sleeping for " << sleep << " milliseconds" << endl; + NdbSleep_MilliSleep(sleep); + + // Expect that transaction has NOT timed-out + CHECK(hugoOps.execute_Commit(pNdb) == 0); + } + else + { + CHECK(ret == 4012); + } + } while(false); + + hugoOps.closeTransaction(pNdb); + + return result; +} + + NDBT_TESTSUITE(testTimeout); TESTCASE("DontTimeoutTransaction", "Test that the transaction does not timeout "\ @@ -403,6 +495,15 @@ TESTCASE("BuddyTransNoTimeout5", FINALIZER(resetTransactionTimeout); FINALIZER(runClearTable); } +TESTCASE("Error4012", ""){ + TC_PROPERTY("TransactionDeadlockTimeout", 120000); + INITIALIZER(runLoadTable); + INITIALIZER(getDeadlockTimeout); + INITIALIZER(setDeadlockTimeout); + STEPS(runError4012, 2); + FINALIZER(runClearTable); +} + NDBT_TESTSUITE_END(testTimeout); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/Makefile.am b/ndb/test/run-test/Makefile.am index 60d64a7697f..2c45db50556 100644 --- a/ndb/test/run-test/Makefile.am +++ b/ndb/test/run-test/Makefile.am @@ -7,11 +7,10 @@ include 
$(top_srcdir)/ndb/config/type_mgmapiclient.mk.am test_PROGRAMS = atrt test_DATA=daily-basic-tests.txt daily-devel-tests.txt 16node-tests.txt \ - conf-daily-basic-ndbmaster.txt \ - conf-daily-basic-shark.txt \ - conf-daily-devel-ndbmaster.txt \ - conf-daily-sql-ndbmaster.txt \ - conf-daily-basic-dl145a.txt + conf-ndbmaster.txt \ + conf-shark.txt \ + conf-dl145a.txt + test_SCRIPTS=atrt-analyze-result.sh atrt-gather-result.sh atrt-setup.sh \ atrt-clear-result.sh make-config.sh make-index.sh make-html-reports.sh diff --git a/ndb/test/run-test/conf-daily-devel-ndbmaster.txt b/ndb/test/run-test/conf-daily-devel-ndbmaster.txt index 8b340e6a39d..51c171a6357 100644 --- a/ndb/test/run-test/conf-daily-devel-ndbmaster.txt +++ b/ndb/test/run-test/conf-daily-devel-ndbmaster.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 16000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-dl145a.txt b/ndb/test/run-test/conf-dl145a.txt index d8cf8d34d82..d0a240f09d1 100644 --- a/ndb/test/run-test/conf-daily-basic-dl145a.txt +++ b/ndb/test/run-test/conf-dl145a.txt @@ -17,3 +17,6 @@ FileSystemPath: /home/ndbdev/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-ndbmaster.txt b/ndb/test/run-test/conf-ndbmaster.txt index bcd809593f3..89b41850ec0 100644 --- a/ndb/test/run-test/conf-daily-basic-ndbmaster.txt +++ b/ndb/test/run-test/conf-ndbmaster.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . + +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/conf-daily-basic-shark.txt b/ndb/test/run-test/conf-shark.txt index 6d1f8b64f44..d66d0280d8a 100644 --- a/ndb/test/run-test/conf-daily-basic-shark.txt +++ b/ndb/test/run-test/conf-shark.txt @@ -17,3 +17,6 @@ FileSystemPath: /space/autotest/run PortNumber: 14000 ArbitrationRank: 1 DataDir: . 
+ +[TCP DEFAULT] +SendBufferMemory: 2M diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 59f51044b51..d331a62cc7e 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -425,14 +425,26 @@ max-time: 500 cmd: testNodeRestart args: -n Bug15685 T1 +max-time: 500 +cmd: testNodeRestart +args: -n Bug16772 T1 + +max-time: 500 +cmd: testSystemRestart +args: -n Bug18385 T1 + +max-time: 500 +cmd: testNodeRestart +args: -n Bug18414 T1 + # OLD FLEX max-time: 500 cmd: flexBench -args: -c 25 -t 10 +args: -c 25 -t 10 max-time: 500 cmd: flexHammer -args: -r 5 -t 32 +args: -r 5 -t 32 # # DICT TESTS diff --git a/ndb/test/run-test/ndb-autotest.sh b/ndb/test/run-test/ndb-autotest.sh index 4228d2354d3..544897a2aa2 100755 --- a/ndb/test/run-test/ndb-autotest.sh +++ b/ndb/test/run-test/ndb-autotest.sh @@ -13,7 +13,7 @@ save_args=$* VERSION="ndb-autotest.sh version 1.04" DATE=`date '+%Y-%m-%d'` -HOST=`hostname` +HOST=`hostname -s` export DATE HOST set -e @@ -35,6 +35,7 @@ report=yes clone=5.0-ndb RUN="daily-basic daily-devel" conf=autotest.conf +LOCK=$HOME/.autotest-lock ############################ # Read command line entries# @@ -66,7 +67,7 @@ done if [ -f $conf ] then - . ./$conf + . 
$conf else echo "Can't find config file: $conf" exit @@ -105,7 +106,6 @@ fi # Setup the clone source location # #################################### -LOCK=$HOME/.autotest-lock src_clone=$src_clone_base-$clone ####################################### @@ -299,9 +299,12 @@ choose_conf(){ elif [ -f $test_dir/conf-$1.txt ] then echo "$test_dir/conf-$1.txt" + elif [ -f $test_dir/conf-$HOST.txt ] + echo "$test_dir/conf-$HOST.txt" else echo "Unable to find conf file looked for" 1>&2 echo "$test_dir/conf-$1-$HOST.txt and" 1>&2 + echo "$test_dir/conf-$HOST.txt" 1>&2 echo "$test_dir/conf-$1.txt" 1>&2 exit fi @@ -386,7 +389,8 @@ do awk '{for(i=1;i<='$count';i++)print $i;}'` echo $run_hosts >> /tmp/filter_hosts.$$ - choose $conf $run_hosts > d.tmp + choose $conf $run_hosts > d.tmp.$$ + sed -e s,CHOOSE_dir,"$install_dir",g < d.tmp.$$ > d.tmp $mkconfig d.tmp fi diff --git a/ndb/test/src/NdbRestarter.cpp b/ndb/test/src/NdbRestarter.cpp index 91c0963feae..2c16a05240d 100644 --- a/ndb/test/src/NdbRestarter.cpp +++ b/ndb/test/src/NdbRestarter.cpp @@ -174,6 +174,39 @@ NdbRestarter::getRandomNodeOtherNodeGroup(int nodeId, int rand){ return -1; } +int +NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){ + if (!isConnected()) + return -1; + + if (getStatus() != 0) + return -1; + + int node_group = -1; + for(size_t i = 0; i < ndbNodes.size(); i++){ + if(ndbNodes[i].node_id == nodeId){ + node_group = ndbNodes[i].node_group; + break; + } + } + if(node_group == -1){ + return -1; + } + + Uint32 counter = 0; + rand = rand % ndbNodes.size(); + while(counter++ < ndbNodes.size() && + (ndbNodes[rand].node_id == nodeId || + ndbNodes[rand].node_group != node_group)) + rand = (rand + 1) % ndbNodes.size(); + + if(ndbNodes[rand].node_group == node_group && + ndbNodes[rand].node_id != nodeId) + return ndbNodes[rand].node_id; + + return -1; +} + int NdbRestarter::waitClusterStarted(unsigned int _timeout){ return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout); diff --git 
a/ndb/tools/desc.cpp b/ndb/tools/desc.cpp index be0f6942db5..408227452a7 100644 --- a/ndb/tools/desc.cpp +++ b/ndb/tools/desc.cpp @@ -23,6 +23,7 @@ NDB_STD_OPTS_VARS; static const char* _dbname = "TEST_DB"; static int _unqualified = 0; +static int _partinfo = 0; static struct my_option my_long_options[] = { NDB_STD_OPTS("ndb_desc"), @@ -32,6 +33,9 @@ static struct my_option my_long_options[] = { "unqualified", 'u', "Use unqualified table names", (gptr*) &_unqualified, (gptr*) &_unqualified, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 }, + { "extra-partition-info", 'p', "Print more info per partition", + (gptr*) &_partinfo, (gptr*) &_partinfo, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} }; static void usage() @@ -45,6 +49,8 @@ static void usage() my_print_variables(my_long_options); } +static void print_part_info(Ndb* pNdb, NDBT_Table* pTab); + int main(int argc, char** argv){ NDB_INIT(argv[0]); const char *load_default_groups[]= { "mysql_cluster",0 }; @@ -109,7 +115,11 @@ int main(int argc, char** argv){ ndbout << (*pIdx) << endl; } + ndbout << endl; + + if (_partinfo) + print_part_info(pMyNdb, pTab); } else ndbout << argv[i] << ": " << dict->getNdbError() << endl; @@ -117,3 +127,71 @@ int main(int argc, char** argv){ return NDBT_ProgramExit(NDBT_OK); } + +struct InfoInfo +{ + const char * m_title; + NdbRecAttr* m_rec_attr; + const NdbDictionary::Column* m_column; +}; + + +static +void print_part_info(Ndb* pNdb, NDBT_Table* pTab) +{ + InfoInfo g_part_info[] = { + { "Partition", 0, NdbDictionary::Column::FRAGMENT }, + { "Row count", 0, NdbDictionary::Column::ROW_COUNT }, + { "Commit count", 0, NdbDictionary::Column::COMMIT_COUNT }, + { "Frag memory", 0, NdbDictionary::Column::FRAGMENT_MEMORY }, + { 0, 0, 0 } + }; + + ndbout << "-- Per partition info -- " << endl; + + NdbConnection* pTrans = pNdb->startTransaction(); + if (pTrans == 0) + return; + + do + { + NdbScanOperation* pOp= 
pTrans->getNdbScanOperation(pTab->getName()); + if (pOp == NULL) + break; + + NdbResultSet* rs= pOp->readTuples(NdbOperation::LM_CommittedRead); + if (rs == 0) + break; + + if (pOp->interpret_exit_last_row() != 0) + break; + + Uint32 i = 0; + for(i = 0; g_part_info[i].m_title != 0; i++) + { + if ((g_part_info[i].m_rec_attr = pOp->getValue(g_part_info[i].m_column)) == 0) + break; + } + + if (g_part_info[i].m_title != 0) + break; + + if (pTrans->execute(NoCommit) != 0) + break; + + for (i = 0; g_part_info[i].m_title != 0; i++) + ndbout << g_part_info[i].m_title << "\t"; + ndbout << endl; + + while(rs->nextResult() == 0) + { + for(i = 0; g_part_info[i].m_title != 0; i++) + { + ndbout << *g_part_info[i].m_rec_attr << "\t"; + } + ndbout << endl; + } + } while(0); + + pTrans->close(); +} diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 1b1326920ad..b386439aed5 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -513,6 +513,13 @@ convert_error_code_to_mysql( return(HA_ERR_NO_SAVEPOINT); } else if (error == (int) DB_LOCK_TABLE_FULL) { + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + ha_rollback(thd); + } return(HA_ERR_LOCK_TABLE_FULL); } else { |