summaryrefslogtreecommitdiff
path: root/storage/ndb/src/common/transporter/SCI_Transporter.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'storage/ndb/src/common/transporter/SCI_Transporter.hpp')
-rw-r--r--storage/ndb/src/common/transporter/SCI_Transporter.hpp395
1 files changed, 395 insertions, 0 deletions
diff --git a/storage/ndb/src/common/transporter/SCI_Transporter.hpp b/storage/ndb/src/common/transporter/SCI_Transporter.hpp
new file mode 100644
index 00000000000..8d263f32a57
--- /dev/null
+++ b/storage/ndb/src/common/transporter/SCI_Transporter.hpp
@@ -0,0 +1,395 @@
+/* Copyright (C) 2003 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef SCI_Transporter_H
+#define SCI_Transporter_H
+#include "Transporter.hpp"
+#include "SHM_Buffer.hpp"
+
+
+#include <sisci_api.h>
+#include <sisci_error.h>
+#include <sisci_types.h>
+
+#include <ndb_types.h>
+
+/**
+ * The SCI Transporter
+ *
+ * The design goal of the SCI transporter is to deliver high performance
+ * data transfers (low latency, high bandwidth) combined with very high
+ * availability (failover support).
+ * High performance is an inherit feature of SCI and the, whereas failover
+ * support is implemented at the application level.
+ * In SCI the programming model is similar to the shared memory paradigm.
+ * A process on one node (A) allocates a memory segment and import the
+ * segment to its virtual address space. Another node (B) can connect to
+ * the segment and map this segment into its virtual address space.
+ * If A writes data to the segment, then B can read it and vice versa, through
+ * ordinary loads and stores. This is also called PIO (programmable IO), and
+ * is one thing that distinguish SCI from other interconnects such as,
+ * ethernet, Gig-e, Myrinet, and Infiniband. By using PIO, lower network
+ * latency is achieved, compared to the interconnects mentioned above.
+ * In order for NDB to utilize SCI, the SCI transporter relies on the
+ * SISCI api. The SISCI api provides a high level abstraction to the low
+ * level SCI driver called PCISCI driver.
+ * The SISCI api provides functions to setup, export, and import
+ * memory segments in a process virtual address space, and also functions to
+ * guarantee the correctness of data transfers between nodes. Basically, the
+ *
+ * In NDB Cluster, each SCI transporter creates a local segment
+ * that is mapped into the virtual address space. After the creation of the
+ * local segment, the SCI transporter connects to a segment created by another
+ * transporter at a remote node, and the maps the remote segment into its
+ * virtual address space. However, since NDB Cluster relies on redundancy
+ * at the network level, by using dual SCI adapters communica
+ *
+ *
+ */
+
+
+/**
+ * class SCITransporter
+ * @brief - main class for the SCI transporter.
+ */
+class SCI_Transporter : public Transporter {
+ friend class TransporterRegistry;
+public:
+
+ /**
+ * Init the transporter. Allocate sendbuffers and open a SCI virtual device
+ * for each adapter.
+ * @return true if successful, otherwize false
+ */
+ bool initTransporter();
+
+
+ /**
+ * Creates a sequence for error checking.
+ * @param adapterid the adapter on which to create a new sequence.
+ * @return SCI_ERR_OK if ok, otherwize something else.
+ */
+ sci_error_t createSequence(Uint32 adapterid);
+
+
+ /**
+ * starts a sequence for error checking.
+ * The actual checking that a sequence is correct is done implicitly
+ * in SCIMemCpy (in doSend).
+ * @param adapterid the adapter on which to start the sequence.
+ * @return SCI_ERR_OK if ok, otherwize something else.
+ */
+ sci_error_t startSequence(Uint32 adapterid);
+
+
+ /** Initiate Local Segment: create a memory segment,
+ * prepare a memory segment, map the local segment
+ * into memory space and make segment available.
+ * @return SCI_ERR_OK if ok, otherwize something else.
+ */
+ sci_error_t initLocalSegment();
+
+ /**
+ * Calculate the segment id for the remote segment
+ * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.)
+ * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.)
+ * @return a segment id
+ */
+ Uint32 remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
+
+ // Get local segment id (inline)
+ Uint32 hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
+
+ /**
+ * closeSCI closes the SCI virtual device
+ */
+ void closeSCI();
+
+
+ /**
+ * Check the status of the remote node,
+ * if it is connected or has disconnected
+ * @return true if connected, otherwize false.
+ */
+ bool checkConnected();
+
+ /**
+ * Check if the segment are properly connected to each other (remotely
+ * and locally).
+ * @return True if the both the local segment is mapped and the
+ * remote segment is mapped. Otherwize false.
+ */
+ bool getConnectionStatus();
+
+private:
+ SCI_Transporter(TransporterRegistry &t_reg,
+ const char *local_host,
+ const char *remote_host,
+ int port,
+ bool isMgmConnection,
+ Uint32 packetSize,
+ Uint32 bufferSize,
+ Uint32 nAdapters,
+ Uint16 remoteSciNodeId0,
+ Uint16 remoteSciNodeId1,
+ NodeId localNodeID,
+ NodeId remoteNodeID,
+ NodeId serverNodeId,
+ bool checksum,
+ bool signalId,
+ Uint32 reportFreq = 4096);
+
+ /**
+ * Destructor. Disconnects the transporter.
+ */
+ ~SCI_Transporter();
+ bool m_mapped;
+ bool m_initLocal;
+ bool m_sciinit;
+ Uint32 m_swapCounter;
+ Uint32 m_failCounter;
+ /**
+ * For statistics on transfered packets
+ */
+//#ifdef DEBUG_TRANSPORTER
+#if 1
+ Uint32 i1024;
+ Uint32 i2048;
+ Uint32 i2049;
+ Uint32 i10242048;
+ Uint32 i20484096;
+ Uint32 i4096;
+ Uint32 i4097;
+#endif
+
+ volatile Uint32 * m_localStatusFlag;
+ volatile Uint32 * m_remoteStatusFlag;
+ volatile Uint32 * m_remoteStatusFlag2;
+
+ struct {
+ Uint32 * m_buffer; // The buffer
+ Uint32 m_dataSize; // No of words in buffer
+ Uint32 m_sendBufferSize; // Buffer size
+ Uint32 m_forceSendLimit; // Send when buffer is this full
+ } m_sendBuffer;
+
+ SHM_Reader * reader;
+ SHM_Writer * writer;
+ SHM_Writer * writer2;
+
+ /**
+ * Statistics
+ */
+ Uint32 m_reportFreq;
+
+
+ Uint32 m_adapters;
+ Uint32 m_numberOfRemoteNodes;
+
+ Uint16 m_remoteNodes[2];
+
+ typedef struct SciAdapter {
+ sci_desc_t scidesc;
+ Uint32 localSciNodeId;
+ bool linkStatus;
+ } SciAdapter;
+
+ SciAdapter* sciAdapters;
+ Uint32 m_ActiveAdapterId;
+ Uint32 m_StandbyAdapterId;
+
+ typedef struct sourceSegm {
+ sci_local_segment_t localHandle; // Handle to local segment to be mapped
+ struct localHandleMap {
+ sci_map_t map; // Handle to the new mapped segment.
+ // 2 = max adapters in one node
+ } lhm[2];
+
+ volatile void *mappedMemory; // Used when reading
+ } sourceSegm;
+
+ typedef struct targetSegm {
+ struct remoteHandleMap {
+ sci_remote_segment_t remoteHandle; //Handle to local segment to be mapped
+ sci_map_t map; //Handle to the new mapped segment
+ } rhm[2];
+
+ sci_sequence_status_t m_SequenceStatus; // Used for error checking
+ sci_sequence_t sequence;
+ volatile void * mappedMemory; // Used when writing
+ SHM_Writer * writer;
+ } targetSegm;
+
+ sci_sequence_status_t m_SequenceStatus; // Used for error checking
+
+
+ // Shared between all SCI users active=(either prim or second)
+ sci_desc_t activeSCIDescriptor;
+
+ sourceSegm* m_SourceSegm; // Local segment reference
+ targetSegm* m_TargetSegm; // Remote segment reference
+
+ Uint32 m_LocalAdapterId; // Adapter Id
+ Uint16 m_LocalSciNodeId; // The SCI-node Id of this machine (adapter 0)
+ Uint16 m_LocalSciNodeId1; // The SCI-node Id of this machine (adapter 1)
+ Uint16 m_RemoteSciNodeId; // The SCI-node Id of remote machine (adapter 0)
+ Uint16 m_RemoteSciNodeId1; // The SCI-node Id of remote machine (adapter 1)
+
+ Uint32 m_PacketSize; // The size of each data packet
+ Uint32 m_BufferSize; // Mapped SCI buffer size
+
+ Uint32 * getWritePtr(Uint32 lenBytes, Uint32 prio);
+ void updateWritePtr(Uint32 lenBytes, Uint32 prio);
+
+ /**
+ * doSend. Copies the data from the source (the send buffer) to the
+ * shared mem. segment.
+ * Sequences are used for error checking.
+ * If an error occurs, the transfer is retried.
+ * If the link that we need to swap to is broken, we will disconnect.
+ * @return Returns true if datatransfer ok. If not retriable
+ * then false is returned.
+ */
+ bool doSend();
+
+ /**
+ * @param adapterNo the adapter for which to retrieve the node id.
+ * @return Returns the node id for an adapter.
+ */
+ Uint32 getLocalNodeId(Uint32 adapterNo);
+
+ bool hasDataToRead() const {
+ return reader->empty() == false;
+ }
+
+ bool hasDataToSend() const {
+ return m_sendBuffer.m_dataSize > 0;
+ }
+
+ /**
+ * Make the local segment unavailable, no new connections will be accepted.
+ * @return Returns true if the segment was successfully disconnected.
+ */
+ bool disconnectLocal();
+
+ /**
+ * Make the local segment unavailable, no new connections will be accepted.
+ * @return Returns true if the segment was successfully disconnected.
+ */
+ bool disconnectRemote();
+
+ void resetToInitialState();
+
+ /**
+ * It is always possible to send data with SCI!
+ * @return True (always)
+ */
+ bool sendIsPossible(struct timeval * timeout);
+
+ void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
+ reader->getReadPtr(* ptr, * eod);
+ }
+
+ void updateReceivePtr(Uint32 *ptr){
+ reader->updateReadPtr(ptr);
+ }
+
+ /**
+ * Corresponds to SHM_Transporter::setupBuffers()
+ * Initiates the start pointer of the buffer and read pointers.
+ * Initiate the localSegment for the SHM reader.
+ */
+ void setupLocalSegment();
+
+ /**
+ * Initiate the remoteSegment for the SHM writer
+ */
+ void setupRemoteSegment();
+
+ /**
+ * Set the connect flag in the remote memory segment (write through)
+ */
+ void setConnected();
+
+ /**
+ * Set the disconnect flag in the remote memory segment (write through)
+ */
+ void setDisconnect();
+
+ /**
+ * Check if there is a link between the adapter and the switch
+ * @param adapterNo the adapter for which to retrieve the link status.
+ * @return Returns true if there is a link between adapter and switch.
+ * Otherwize false is returned and the cables must be checked.
+ */
+ bool getLinkStatus(Uint32 adapterNo);
+
+ /**
+ * failoverShmWriter takes the state of the active writer and inserts into
+ * the standby writer.
+ */
+ void failoverShmWriter();
+
+ bool init_local();
+ bool init_remote();
+
+protected:
+
+ /** Perform a connection between segment
+ * This is a client node, trying to connect to a remote segment.
+ * @param timeout, the time the connect thread sleeps before
+ * retrying.
+ * @return Returns true on success, otherwize falser
+ */
+ bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
+ bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
+
+ /**
+ * We will disconnect if:
+ * -# the other node has disconnected from us
+ * -# unrecoverable error in transmission, on both adapters
+ * -# if we are shutdown properly
+ */
+ void disconnectImpl();
+
+ static bool initSCI();
+};
+
+
+/** The theLocalAdapterId combined with the theRemoteNodeId constructs
+ * (SCI ids)* a unique identifier for the local segment
+ */
+inline
+Uint32
+SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
+ Uint16 SciRemoteNodeId) {
+
+ return (SciLocalNodeId << 16) | SciRemoteNodeId;
+}
+
+/** The theLocalAdapterId combined with the theRemoteNodeId constructs
+ * (SCI ids)* a unique identifier for the remote segment
+ */
+inline
+Uint32
+SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
+ Uint16 SciRemoteNodeId) {
+
+ return (SciRemoteNodeId << 16) | SciLocalNodeId;
+}
+
+
+#endif