/// <summary>
/// Works through the pending status-check list one suspect at a time. For each
/// suspect an ARE_YOU_ALIVE keep-alive is sent and the thread waits on the status
/// mutex for at most the configured timeout. The outcome is translated into a
/// <c>NodeStatus</c> and passed up as a GET_NODE_STATUS_OK event. The loop ends
/// once the list is empty, at which point the checking-thread reference is cleared.
/// </summary>
private void CheckStatus()
{
    while (_statusCheckingThread != null)
    {
        // Pick the next suspect (if any) while holding the list lock.
        lock (_checkStatusList.SyncRoot)
        {
            if (_checkStatusList.Count <= 0)
            {
                _currentSuspect = null;
            }
            else
            {
                _currentSuspect = _checkStatusList[0] as Address;
                _checkStatusList.Remove(_currentSuspect);
            }

            if (_currentSuspect == null)
            {
                // Nothing left to verify: clearing the thread reference ends the loop.
                _statusCheckingThread = null;
                continue;
            }
        }

        lock (_status_mutex)
        {
            try
            {
                NodeStatus verdict = null;

                if (!_enclosingInstance.ct.ConnectionExist(_currentSuspect))
                {
                    if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                    {
                        _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "no connection exists for " + _currentSuspect);
                    }
                    verdict = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);
                }
                else
                {
                    Message request = new Message(_currentSuspect, null, new byte[0]);
                    request.putHeader(HeaderType.KEEP_ALIVE, new TCPHearBeat(TCPHearBeat.ARE_YOU_ALIVE));

                    if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                    {
                        _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "sending status request to " + _currentSuspect);
                    }

                    _enclosingInstance.sendUnicastMessage(request, false, request.Payload, Priority.High);
                    _statusReceived = null;

                    // Block until a reply is signalled on the mutex or the timeout elapses.
                    Monitor.Wait(_status_mutex, _statusTimeout);

                    if (_statusReceived == null)
                    {
                        // No reply arrived within the timeout.
                        verdict = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);
                        if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                        {
                            _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "did not receive status from " + _currentSuspect + "; consider him DEAD");
                        }
                    }
                    else
                    {
                        TCPHearBeat reply = _statusReceived as TCPHearBeat;
                        if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                        {
                            _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "received status " + reply + " from " + _currentSuspect);
                        }

                        // A node that reports it is still starting is reported as dead.
                        // Any other reply type leaves the verdict null.
                        if (reply.Type == TCPHearBeat.I_AM_NOT_DEAD)
                        {
                            verdict = new NodeStatus(_currentSuspect, NodeStatus.IS_ALIVE);
                        }
                        else if (reply.Type == TCPHearBeat.I_AM_LEAVING)
                        {
                            verdict = new NodeStatus(_currentSuspect, NodeStatus.IS_LEAVING);
                        }
                        else if (reply.Type == TCPHearBeat.I_AM_STARTING)
                        {
                            verdict = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);
                        }
                    }
                }

                _enclosingInstance.passUp(new Event(Event.GET_NODE_STATUS_OK, verdict));
            }
            catch (Exception ex)
            {
                // Failures are only logged; no status event is passed up for this suspect.
                _enclosingInstance.Stack.NCacheLog.Error("ConnectionKeepAlive.CheckStatus", ex.ToString());
            }
            finally
            {
                _currentSuspect = null;
                _statusReceived = null;
            }
        }
    }
}
/// <summary>
/// Processes an event travelling up through this layer.
/// Messages carrying a GMS header are dispatched on the header type and are never
/// passed further up. The remaining event types are either consumed here (the
/// method returns) or, after local bookkeeping, offered to
/// <c>impl.handleUpEvent</c>, which decides whether the event is passed up.
/// </summary>
/// <param name="evt">The event to process.</param>
public override void up(Event evt)
{
    object obj;
    Message msg;
    HDR hdr;
    MergeData merge_data;

    switch (evt.Type)
    {
        case Event.MSG:
            msg = (Message)evt.Arg;
            obj = msg.getHeader(HeaderType.GMS);
            // Not a GMS message: leave the switch so the event goes through handleUpEvent.
            if (obj == null || !(obj is HDR))
                break;
            hdr = (HDR)msg.removeHeader(HeaderType.GMS);

            switch (hdr.type)
            {
                case HDR.JOIN_REQ:
                    object[] args = new object[4];
                    args[0] = hdr.mbr;
                    args[1] = hdr.subGroup_name;
                    args[2] = hdr.isStartedAsMirror;
                    args[3] = hdr.GMSId;
                    ThreadPool.QueueUserWorkItem(new WaitCallback(handleJoinrequestAsync), args);
                    break;

                case HDR.SPECIAL_JOIN_REQUEST:
                    HandleSpecialJoinRequest(hdr.mbr, hdr.GMSId);
                    break;

                case HDR.JOIN_RSP:
                    MarkStateTransferInProcess();
                    impl.handleJoinResponse(hdr.join_rsp);
                    break;

                case HDR.LEAVE_REQ:
                    Stack.NCacheLog.Debug("received LEAVE_REQ " + hdr + " from " + msg.Src);

                    if (hdr.mbr == null)
                    {
                        Stack.NCacheLog.Error( "LEAVE_REQ's mbr field is null");
                        return;
                    }

                    if (isPartReplica && IsCoordinator)
                    {
                        // If the replica node on the coordinator is leaving, send a special event
                        // down to TCP to mark this node as leaving. Other nodes asking for its
                        // status through keep-alive will then get a dead status.
                        // (hdr.mbr is known to be non-null here because of the early return above.)
                        if (hdr.mbr.IpAddress.Equals(local_addr.IpAddress))
                        {
                            down(new Event(Event.I_AM_LEAVING));
                        }
                    }
                    ThreadPool.QueueUserWorkItem(new WaitCallback(handleLeaveAsync), new object[] { hdr.mbr, false });
                    break;

                case HDR.LEAVE_RSP:
                    impl.handleLeaveResponse();
                    break;

                case HDR.VIEW_RESPONSE:
                    if (_promise != null)
                        _promise.SetResult(hdr.arg);
                    break;

                case HDR.VIEW:
                    if (hdr.view == null)
                    {
                        Stack.NCacheLog.Error("[VIEW]: view == null");
                        return;
                    }
                    else
                        Stack.NCacheLog.CriticalInfo("gms.Up", "received view from :" + msg.Src + " ; view = " + hdr.view);

                    impl.handleViewChange(hdr.view, hdr.digest);
                    break;

                case HDR.MERGE_REQ:
                    impl.handleMergeRequest(msg.Src, hdr.merge_id);
                    break;

                case HDR.MERGE_RSP:
                    merge_data = new MergeData(msg.Src, hdr.view, hdr.digest);
                    merge_data.merge_rejected = hdr.merge_rejected;
                    impl.handleMergeResponse(merge_data, hdr.merge_id);
                    break;

                case HDR.INSTALL_MERGE_VIEW:
                    impl.handleMergeView(new MergeData(msg.Src, hdr.view, hdr.digest), hdr.merge_id);
                    break;

                case HDR.CANCEL_MERGE:
                    impl.handleMergeCancelled(hdr.merge_id);
                    break;

                case HDR.CAN_NOT_CONNECT_TO:
                    impl.handleCanNotConnectTo(msg.Src, hdr.nodeList);
                    break;

                case HDR.LEAVE_CLUSTER:
                    string gmsId = hdr.arg as string; // reported gms id
                    string myGmsId = GetNodeGMSId(local_addr);

                    // Only act when the reported gms id matches this node's own gms id.
                    if (gmsId != null && myGmsId != null && gmsId.Equals(myGmsId))
                    {
                        ThreadPool.QueueUserWorkItem(new WaitCallback(handleLeaveClusterRequestAsync), hdr.mbr);
                    }
                    break;

                case HDR.CONNECTION_BROKEN:
                    impl.handleConnectionBroken(msg.Src, hdr.mbr);
                    break;

                case HDR.VIEW_REJECTED:
                    impl.handleViewRejected(hdr.mbr);
                    break;

                case HDR.INFORM_NODE_REJOINING:
                    impl.handleInformNodeRejoining(msg.Src, hdr.mbr);
                    break;

                case HDR.RESET_ON_NODE_REJOINING:
                    impl.handleResetOnNodeRejoining(msg.Src, hdr.mbr, hdr.view);
                    break;

                case HDR.RE_CHECK_CLUSTER_HEALTH:
                    Thread t = new Thread(new ParameterizedThreadStart(impl.ReCheckClusterHealth));
                    t.Start(hdr.mbr);
                    break;

                case HDR.INFORM_ABOUT_NODE_DEATH:
                    // A replica is not supposed to handle this event.
                    if (isPartReplica && _startedAsMirror)
                        break;

                    impl.handleInformAboutNodeDeath(msg.Src, (Address)hdr.arg);
                    break;

                case HDR.IS_NODE_IN_STATE_TRANSFER:
                    impl.handleIsClusterInStateTransfer(msg.Src);
                    break;

                case HDR.IS_NODE_IN_STATE_TRANSFER_RSP:
                    if (_stateTransferPromise != null)
                    {
                        if (Stack.NCacheLog.IsInfoEnabled)
                            Stack.NCacheLog.Info("gms.UP", "(state transfer rsp) sender: " + msg.Src + " ->" + hdr.arg);
                        _stateTransferPromise.SetResult(hdr.arg);
                    }
                    break;

                default:
                    Stack.NCacheLog.Error( "HDR with type=" + hdr.type + " not known");
                    break;
            }

            return; // GMS messages are never passed up

        case Event.CONNECT_OK:    // sent by someone else, but this layer is responsible for sending it
        case Event.DISCONNECT_OK: // ditto (e.g. sent by the UDP layer); don't send up the stack
            return;

        case Event.GET_NODE_STATUS_OK:
            lock (suspect_verify_mutex)
            {
                // The event may carry no NodeStatus (null or a different type), so the
                // result of the 'as' cast is checked before it is dereferenced.
                NodeStatus status = evt.Arg as NodeStatus;
                if (status != null && status.Node != null && status.Node.Equals(nodeTobeSuspect))
                {
                    nodeStatus = status;
                    // Wake the thread(s) waiting on suspect_verify_mutex for this result.
                    Monitor.PulseAll(suspect_verify_mutex);
                }
            }
            break;

        case Event.SET_LOCAL_ADDRESS:
            local_addr = (Address)evt.Arg;
            break; // pass up

        case Event.SUSPECT:
            ThreadPool.QueueUserWorkItem(new WaitCallback(handleSuspectAsync), evt.Arg);
            break; // pass up

        case Event.UNSUSPECT:
            impl.unsuspect((Address)evt.Arg);
            return; // discard

        case Event.MERGE:
            impl.merge((System.Collections.ArrayList)evt.Arg);
            return; // don't pass up

        case Event.CONNECTION_FAILURE:
            impl.handleConnectionFailure(evt.Arg as ArrayList);
            return; // don't pass up

        case Event.NODE_REJOINING:
            impl.handleNodeRejoining(evt.Arg as Address);
            return;

        case Event.CONNECTION_BREAKAGE:
            Address node = evt.Arg as Address;
            if (!disconnected_nodes.Contains(node))
                disconnected_nodes.Add(node);
            break;

        case Event.CONNECTION_RE_ESTABLISHED:
            node = evt.Arg as Address;
            if (disconnected_nodes.Contains(node))
                disconnected_nodes.Remove(node);
            break;
    }

    // Everything that was not consumed above is offered to the current implementation,
    // which decides whether the event continues up the stack.
    if (impl.handleUpEvent(evt))
        passUp(evt);
}
/// <summary>
/// Verifies, one at a time, every node queued in the status-check list. A
/// keep-alive ARE_YOU_ALIVE request is sent to the node and the thread waits on
/// the status mutex until a reply is signalled or the status timeout expires.
/// The resulting <c>NodeStatus</c> is passed up in a GET_NODE_STATUS_OK event.
/// When the list runs empty the checking-thread reference is cleared and the
/// loop terminates.
/// </summary>
private void CheckStatus()
{
    while (_statusCheckingThread != null)
    {
        // Dequeue the next suspect under the list lock.
        lock (_checkStatusList.SyncRoot)
        {
            if (_checkStatusList.Count > 0)
            {
                _currentSuspect = _checkStatusList[0] as Address;
                _checkStatusList.Remove(_currentSuspect);
            }
            else
            {
                _currentSuspect = null;
            }

            if (_currentSuspect == null)
            {
                // No work left; the while condition fails on the next evaluation.
                _statusCheckingThread = null;
                continue;
            }
        }

        lock (_status_mutex)
        {
            try
            {
                NodeStatus result = null;

                if (_enclosingInstance.ct.ConnectionExist(_currentSuspect))
                {
                    Message probe = new Message(_currentSuspect, null, new byte[0]);
                    probe.putHeader(HeaderType.KEEP_ALIVE, new HearBeat(HearBeat.ARE_YOU_ALIVE));

                    if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                    {
                        _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "sending status request to " + _currentSuspect);
                    }

                    _enclosingInstance.sendUnicastMessage(probe, false, probe.Payload, Priority.Critical);
                    _statusReceived = null;

                    // Wait for the answer, or for the timeout, whichever comes first.
                    Monitor.Wait(_status_mutex, _statusTimeout);

                    if (_statusReceived != null)
                    {
                        HearBeat answer = _statusReceived as HearBeat;

                        if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                        {
                            _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "received status " + answer + " from " + _currentSuspect);
                        }

                        // A node that is still starting is reported as dead; an answer of
                        // any other type leaves the result null.
                        if (answer.Type == HearBeat.I_AM_NOT_DEAD)
                        {
                            result = new NodeStatus(_currentSuspect, NodeStatus.IS_ALIVE);
                        }
                        else if (answer.Type == HearBeat.I_AM_LEAVING)
                        {
                            result = new NodeStatus(_currentSuspect, NodeStatus.IS_LEAVING);
                        }
                        else if (answer.Type == HearBeat.I_AM_STARTING)
                        {
                            result = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);
                        }
                    }
                    else
                    {
                        // Timed out without an answer.
                        result = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);

                        if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                        {
                            _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "did not receive status from " + _currentSuspect + "; consider him DEAD");
                        }
                    }
                }
                else
                {
                    if (_enclosingInstance.Stack.NCacheLog.IsInfoEnabled)
                    {
                        _enclosingInstance.Stack.NCacheLog.Info("ConnectionKeepAlive.CheckStatus", "no connection exists for " + _currentSuspect);
                    }

                    result = new NodeStatus(_currentSuspect, NodeStatus.IS_DEAD);
                }

                Event resultEvent = new Event(Event.GET_NODE_STATUS_OK, result);
                _enclosingInstance.passUp(resultEvent);
            }
            catch (Exception error)
            {
                // Errors are logged only; in that case nothing is passed up for this suspect.
                _enclosingInstance.Stack.NCacheLog.Error("ConnectionKeepAlive.CheckStatus", error.ToString());
            }
            finally
            {
                _currentSuspect = null;
                _statusReceived = null;
            }
        }
    }
}
/// <summary>
/// Verifies whether the given node is dead or not. A GET_NODE_STATUS event is
/// passed down for the suspect and the calling thread then waits on
/// <c>suspect_verify_mutex</c> until it is pulsed, after which the recorded
/// node status (if any) is evaluated.
/// </summary>
/// <param name="suspect">suspected node</param>
/// <param name="matchGmsId">
/// When true, a node found dead is only reported as dead if its gms id after the
/// verification is non-null and equal to the gms id read before the verification.
/// </param>
/// <returns>
/// true if the node is considered dead (and, when <paramref name="matchGmsId"/>
/// is set, its gms id is unchanged); otherwise false.
/// </returns>
public bool VerifySuspect(Address suspect, bool matchGmsId)
{
    bool isDead = true;
    string gmsId = null;

    if (suspect != null)
    {
        Stack.NCacheLog.CriticalInfo("GMS.VerifySuspect", " verifying the death of node " + suspect);
        if (Stack.NCacheLog.IsInfoEnabled)
            Stack.NCacheLog.Info("GMS.VerifySuspect", " verifying the death of node " + suspect);

        // Remember the gms id the node had when it was reported as suspect.
        gmsId = GmsIds[suspect] as string;

        lock (suspect_verify_mutex)
        {
            nodeStatus = null;
            nodeTobeSuspect = suspect;
            passDown(new Event(Event.GET_NODE_STATUS, suspect, Priority.Critical));

            // Wait for the verification result.
            // NOTE(review): this wait has no timeout, so it only returns once the mutex
            // is pulsed; confirm that a status event is always delivered for the suspect.
            Monitor.Wait(suspect_verify_mutex);

            if (nodeStatus != null)
            {
                if (Stack.NCacheLog.IsInfoEnabled)
                    Stack.NCacheLog.Info("GMS.VerifySuspect", " node status is " + nodeStatus.ToString());

                switch (nodeStatus.Status)
                {
                    case NodeStatus.IS_ALIVE:
                        isDead = false;
                        break;
                    case NodeStatus.IS_DEAD:
                        isDead = true;
                        break;
                    case NodeStatus.IS_LEAVING:
                        // A leaving node is treated the same as a dead one.
                        isDead = true;
                        break;
                }
            }
        }
    }

    if (isDead && matchGmsId)
    {
        // Verify that the current gms id is the same as when the node was reported suspect.
        // A null suspect has no gms id to look up, so the lookup is skipped rather than
        // indexing the table with a null key.
        string currentGmsId = suspect != null ? GmsIds[suspect] as string : null;

        if (currentGmsId != null && gmsId != null && currentGmsId.Equals(gmsId))
            return true;

        // nodeStatus may be null here (e.g. no verification ran for a null suspect).
        // Concatenating the object itself appends nothing for null and ToString() otherwise,
        // which avoids the NullReferenceException an explicit ToString() call would raise.
        if (Stack.NCacheLog.IsErrorEnabled)
            Stack.NCacheLog.CriticalInfo("GMS.VerifySuspect", "node gms ids differ; old : " + gmsId + " new: " + currentGmsId + nodeStatus);
        return false;
    }

    return isDead;
}