/// <summary>
/// Copies the last "good" timestamp recorded for <paramref name="dbName"/> in the previous
/// successful report into <c>LastGoodDatabaseStatus</c> of this instance.
/// </summary>
/// <param name="lastSuccessfulReport">The previous successful report, or null when there is none.</param>
/// <param name="dbName">The key to look up in the previous report and to store under.</param>
private void SetLastDbGoodTime(ClusterNodeStatusReport lastSuccessfulReport, string dbName)
{
    // Stays at DateTime.MinValue when there is no previous report; a failed lookup
    // also leaves the default DateTime value in the out variable.
    var carriedOver = DateTime.MinValue;

    if (lastSuccessfulReport != null)
    {
        lastSuccessfulReport.LastGoodDatabaseStatus.TryGetValue(dbName, out carriedOver);
    }

    LastGoodDatabaseStatus[dbName] = carriedOver;
}
/// <summary>
/// Creates a status report for a node and computes, per database, the last time it was seen in a good state.
/// </summary>
/// <param name="report">Per-database status reports, keyed by database name.</param>
/// <param name="reportStatus">Overall status of this report.</param>
/// <param name="error">The error attached to this report, if any.</param>
/// <param name="updateDateTime">The time this report was produced.</param>
/// <param name="lastSuccessfulReport">The previous successful report, or null when there is none.</param>
public ClusterNodeStatusReport(
    Dictionary<string, DatabaseStatusReport> report,
    ReportStatus reportStatus,
    Exception error,
    DateTime updateDateTime,
    ClusterNodeStatusReport lastSuccessfulReport)
{
    Report = report;
    Status = reportStatus;
    Error = error;
    UpdateDateTime = updateDateTime;
    LastSuccessfulUpdateDateTime = lastSuccessfulReport?.UpdateDateTime ?? DateTime.MinValue;
    LastGoodDatabaseStatus = new Dictionary<string, DateTime>();

    var wholeReportIsOk = reportStatus == ReportStatus.Ok;

    foreach (var entry in report)
    {
        var name = entry.Key;
        var status = entry.Value.Status;

        // A database counts as good only inside an Ok report and when it is Loaded or NoChange.
        var databaseIsGood = wholeReportIsOk &&
                             (status == DatabaseStatus.Loaded || status == DatabaseStatus.NoChange);

        if (databaseIsGood == false)
        {
            // Otherwise keep whatever good time the previous successful report had for it.
            SetLastDbGoodTime(lastSuccessfulReport, name);
            continue;
        }

        LastGoodDatabaseStatus[name] = updateDateTime;
    }
}
/// <summary>
/// Timeout callback: publishes an empty report with <c>Timeout</c> status as the received report,
/// unless cancellation was already requested.
/// </summary>
private void OnTimeout()
{
    if (_token.IsCancellationRequested == false)
    {
        if (_log.IsInfoEnabled)
        {
            _log.Info("Timeout occurred while collecting info report.");
        }

        // No per-database data is available on a timeout, so the report carries an empty dictionary.
        var noDatabases = new Dictionary<string, DatabaseStatusReport>();

        ReceivedReport = new ClusterNodeStatusReport(
            noDatabases,
            ClusterNodeStatusReport.ReportStatus.Timeout,
            null,
            DateTime.UtcNow,
            _lastSuccessfulReceivedReport);
    }
}
/// <summary>
/// Creates a status report for a node, including server-level information, and computes, per database,
/// the last time it was seen in a good state.
/// </summary>
/// <param name="serverReport">Server-level report; its OutOfCpuCredits, EarlyOutOfMemory and HighDirtyMemory flags are read here.</param>
/// <param name="report">Per-database status reports, keyed by database name.</param>
/// <param name="reportStatus">Overall status of this report.</param>
/// <param name="error">The error attached to this report, if any.</param>
/// <param name="updateDateTime">The time this report was produced.</param>
/// <param name="lastSuccessfulReport">The previous successful report, or null when there is none.</param>
public ClusterNodeStatusReport(
    ServerReport serverReport,
    Dictionary<string, DatabaseStatusReport> report,
    ReportStatus reportStatus,
    Exception error,
    DateTime updateDateTime,
    ClusterNodeStatusReport lastSuccessfulReport)
{
    ServerReport = serverReport;
    Report = report;
    Status = reportStatus;
    Error = error;
    UpdateDateTime = updateDateTime;

    // Each flag is a nullable bool. They must be OR-ed: a '??' chain would stop at the first
    // non-null flag, so an explicit 'false' in OutOfCpuCredits would hide a 'true' in the others.
    var nodeIsUnderPressure =
        ServerReport.OutOfCpuCredits == true ||
        ServerReport.EarlyOutOfMemory == true ||
        ServerReport.HighDirtyMemory == true;

    if (nodeIsUnderPressure)
    {
        // we don't want to give any grace time if the node is out of credits, early out of memory or high dirty memory
        LastSuccessfulUpdateDateTime = DateTime.MinValue;
    }
    else
    {
        LastSuccessfulUpdateDateTime = lastSuccessfulReport?.UpdateDateTime ?? DateTime.MinValue;
    }

    LastGoodDatabaseStatus = new Dictionary<string, DateTime>();

    foreach (var dbReport in report)
    {
        var dbName = dbReport.Key;
        var dbStatus = dbReport.Value.Status;

        // A database counts as good only inside an Ok report and when it is Loaded or NoChange.
        if (reportStatus == ReportStatus.Ok &&
            (dbStatus == DatabaseStatus.Loaded || dbStatus == DatabaseStatus.NoChange))
        {
            LastGoodDatabaseStatus[dbName] = updateDateTime;
        }
        else
        {
            // Otherwise keep whatever good time the previous successful report had for it.
            SetLastDbGoodTime(lastSuccessfulReport, dbName);
        }
    }
}
/// <summary>
/// Replaces every database entry in <paramref name="nodeReport"/> whose status is <c>NoChange</c> with the
/// entry from the last received report, after refreshing its <c>LastSentEtag</c> and <c>UpTime</c>.
/// </summary>
/// <param name="nodeReport">The freshly built report; its <c>Report</c> dictionary is updated in place.</param>
/// <param name="unchangedReports">Caller-supplied scratch list that collects the <c>NoChange</c> entries.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when there are <c>NoChange</c> entries but the last received report is not <c>Ok</c>,
/// or when a <c>NoChange</c> database is missing from the last received report.
/// </exception>
private void UpdateNodeReportIfNeeded(ClusterNodeStatusReport nodeReport, List<DatabaseStatusReport> unchangedReports)
{
    // Collect first and replace later, so the dictionary is not modified while it is enumerated.
    foreach (var dbReport in nodeReport.Report)
    {
        if (dbReport.Value.Status == DatabaseStatus.NoChange)
        {
            _parent.ForTestingPurposes?.NoChangeFoundAction(this);
            unchangedReports.Add(dbReport.Value);
        }
    }

    if (unchangedReports.Count == 0)
    {
        return;
    }

    // we take the last received and not the last successful.
    // we don't want to reuse by mistake a successful report when we receive an 'unchanged' error.
    var lastReport = ReceivedReport;
    if (lastReport.Status != ClusterNodeStatusReport.ReportStatus.Ok)
    {
        throw new InvalidOperationException(
            $"We have databases with '{DatabaseStatus.NoChange}' status, but our last report from this node is '{lastReport.Status}'");
    }

    foreach (var dbReport in unchangedReports)
    {
        var dbName = dbReport.Name;
        if (lastReport.Report.TryGetValue(dbName, out var previous) == false)
        {
            // Report the database name; interpolating the report object itself would not identify the database.
            throw new InvalidOperationException(
                $"We got '{DatabaseStatus.NoChange}' for the database '{dbName}', but it is missing in the last good report");
        }

        // Only these two values are taken from the new entry; everything else comes from the previous one.
        previous.LastSentEtag = dbReport.LastSentEtag;
        previous.UpTime = dbReport.UpTime;

        nodeReport.Report[dbName] = previous;
    }
}
/// <summary>
/// Replaces every database entry in <paramref name="nodeReport"/> whose status is <c>NoChange</c> with the
/// matching entry from the last received report, after refreshing its <c>LastSentEtag</c> and <c>UpTime</c>.
/// Does nothing when the last received report is not <c>Ok</c>.
/// </summary>
/// <param name="nodeReport">The freshly built report; its <c>Report</c> dictionary is updated in place.</param>
/// <param name="unchangedReports">Caller-supplied scratch list that collects the <c>NoChange</c> entries.</param>
private void UpdateNodeReportIfNeeded(ClusterNodeStatusReport nodeReport, List<DatabaseStatusReport> unchangedReports)
{
    // Use the last received report rather than the last successful one, so that a successful
    // report is never reused by mistake when 'unchanged' arrives after an error.
    var previousReport = ReceivedReport;
    if (previousReport.Status != ClusterNodeStatusReport.ReportStatus.Ok)
    {
        return;
    }

    // Collect first and replace later, so the dictionary is not modified while it is enumerated.
    foreach (var candidate in nodeReport.Report.Values)
    {
        if (candidate.Status == DatabaseStatus.NoChange)
        {
            unchangedReports.Add(candidate);
        }
    }

    foreach (var unchanged in unchangedReports)
    {
        var name = unchanged.Name;

        // A database missing from the previous report is unexpected; nothing can be reused
        // for it, so its entry is left exactly as it was received.
        if (previousReport.Report.TryGetValue(name, out var reused))
        {
            reused.LastSentEtag = unchanged.LastSentEtag;
            reused.UpTime = unchanged.UpTime;
            nodeReport.Report[name] = reused;
        }
    }
}
/// <summary>
/// Connects to the node at <c>Url</c> and keeps reading status updates from it until cancellation is requested.
/// Every update, timeout or failure is published through <c>ReceivedReport</c>; after a timeout or failure
/// the TCP info is fetched again and the connection is re-established after <c>OnErrorDelayTime</c>.
/// </summary>
private async Task ListenToMaintenanceWorker()
{
    bool needToWait = false;
    var onErrorDelayTime = _parent.Config.OnErrorDelayTime.AsTimeSpan;
    var receiveFromWorkerTimeout = _parent.Config.ReceiveFromWorkerTimeout.AsTimeSpan;

    TcpConnectionInfo tcpConnection = null;
    try
    {
        tcpConnection = await ReplicationUtils.GetTcpInfoAsync(Url, null, "Supervisor",
            _parent._server.RavenServer.ClusterCertificateHolder?.Certificate);
    }
    catch (Exception e)
    {
        // Best effort: a null tcpConnection makes the loop below wait and retry.
        if (_log.IsInfoEnabled)
        {
            _log.Info($"ClusterMaintenanceSupervisor() => Failed to add to cluster node key = {ClusterTag}", e);
        }
    }

    while (_token.IsCancellationRequested == false)
    {
        // Linked source used to cancel a pending read once a timeout was detected.
        var internalTaskCancellationToken = CancellationTokenSource.CreateLinkedTokenSource(_token);
        try
        {
            if (needToWait)
            {
                needToWait = false; // avoid tight loop if there was timeout / error
                await TimeoutManager.WaitFor(onErrorDelayTime, _token);

                // Same null-conditional access as the initial fetch above: a missing certificate holder
                // must not turn every retry into a NullReferenceException.
                tcpConnection = await ReplicationUtils.GetTcpInfoAsync(Url, null, "Supervisor",
                    _parent._server.RavenServer.ClusterCertificateHolder?.Certificate);
            }

            if (tcpConnection == null)
            {
                needToWait = true;
                continue;
            }

            var (tcpClient, connection) = await ConnectToClientNodeAsync(tcpConnection, _parent._server.Engine.TcpConnectionTimeout);
            using (tcpClient)
            using (_cts.Token.Register(tcpClient.Dispose))
            using (connection)
            {
                while (_token.IsCancellationRequested == false)
                {
                    using (_contextPool.AllocateOperationContext(out JsonOperationContext context))
                    {
                        var readResponseTask = context.ReadForMemoryAsync(connection, _readStatusUpdateDebugString, internalTaskCancellationToken.Token);
                        var timeout = TimeoutManager.WaitFor(receiveFromWorkerTimeout, _token);

                        // Whichever completes first decides: the read, or the receive timeout.
                        if (await Task.WhenAny(readResponseTask.AsTask(), timeout) == timeout)
                        {
                            if (_log.IsInfoEnabled)
                            {
                                _log.Info($"Timeout occurred while collecting info from {ClusterTag}");
                            }

                            ReceivedReport = new ClusterNodeStatusReport(new Dictionary<string, DatabaseStatusReport>(),
                                ClusterNodeStatusReport.ReportStatus.Timeout,
                                null,
                                DateTime.UtcNow,
                                _lastSuccessfulReceivedReport);
                            needToWait = true;
                            internalTaskCancellationToken.Cancel();
                            break;
                        }

                        using (var statusUpdateJson = await readResponseTask)
                        {
                            // Each top-level property of the update becomes one entry of the report, keyed by property name.
                            var report = new Dictionary<string, DatabaseStatusReport>();
                            foreach (var property in statusUpdateJson.GetPropertyNames())
                            {
                                var value = (BlittableJsonReaderObject)statusUpdateJson[property];
                                report.Add(property, JsonDeserializationServer.DatabaseStatusReport(value));
                            }

                            ReceivedReport = new ClusterNodeStatusReport(
                                report,
                                ClusterNodeStatusReport.ReportStatus.Ok,
                                null,
                                DateTime.UtcNow,
                                _lastSuccessfulReceivedReport);
                            _lastSuccessfulReceivedReport = ReceivedReport;
                        }
                    }
                }
            }
        }
        catch (Exception e)
        {
            if (_log.IsInfoEnabled)
            {
                _log.Info($"Exception was thrown while collecting info from {ClusterTag}", e);
            }

            ReceivedReport = new ClusterNodeStatusReport(new Dictionary<string, DatabaseStatusReport>(),
                ClusterNodeStatusReport.ReportStatus.Error,
                e,
                DateTime.UtcNow,
                _lastSuccessfulReceivedReport);
            needToWait = true;
        }
        finally
        {
            internalTaskCancellationToken.Dispose();
        }
    }
}
/// <summary>
/// Connects to the node at <c>Url</c> and keeps reading status reports from it until cancellation is requested.
/// Every report or failure is published through <c>ReceivedReport</c>; after a failure the TCP info is fetched
/// again and the connection is re-established. Cancellation is not reported as an error.
/// </summary>
private void ListenToMaintenanceWorker()
{
    var needToWait = false;
    var firstIteration = true;
    var onErrorDelayTime = _parent.Config.OnErrorDelayTime.AsTimeSpan;
    var receiveFromWorkerTimeout = _parent.Config.ReceiveFromWorkerTimeout.AsTimeSpan;
    var tcpTimeout = _parent.Config.TcpConnectionTimeout.AsTimeSpan;

    if (tcpTimeout < receiveFromWorkerTimeout)
    {
        if (_log.IsInfoEnabled)
        {
            _log.Info(
                $"Warning: TCP timeout is lower than the receive from worker timeout ({tcpTimeout} < {receiveFromWorkerTimeout}), " +
                "this could affect the cluster observer's decisions.");
        }
    }

    // Starts as null, so the first pass only sets needToWait and the second pass performs the fetch.
    TcpConnectionInfo tcpConnection = null;

    while (_token.IsCancellationRequested == false)
    {
        try
        {
            if (needToWait)
            {
                needToWait = false; // avoid tight loop if there was timeout / error

                // The very first fetch is not delayed; only retries are.
                if (firstIteration == false)
                {
                    _token.WaitHandle.WaitOne(onErrorDelayTime);
                }

                firstIteration = false;

                // The fetch is bounded by the TCP timeout and by cancellation of the supervisor.
                using (var timeout = new CancellationTokenSource(tcpTimeout))
                using (var combined = CancellationTokenSource.CreateLinkedTokenSource(_token, timeout.Token))
                {
                    tcpConnection = ReplicationUtils.GetTcpInfo(Url, null, "Supervisor", _parent._server.Server.Certificate.Certificate, combined.Token);
                }
            }

            if (tcpConnection == null)
            {
                needToWait = true;
                continue;
            }

            var connection = ConnectToClientNode(tcpConnection, _parent._server.Engine.TcpConnectionTimeout);
            var tcpClient = connection.TcpClient;
            var stream = connection.Stream;
            using (tcpClient)
            using (_cts.Token.Register(tcpClient.Dispose))
            using (_contextPool.AllocateOperationContext(out JsonOperationContext context))
            using (var timeoutEvent = new TimeoutEvent(receiveFromWorkerTimeout, $"Timeout event for: {_name}", singleShot: false))
            {
                timeoutEvent.Start(OnTimeout);
                while (_token.IsCancellationRequested == false)
                {
                    BlittableJsonReaderObject rawReport;
                    try
                    {
                        // even if there is a timeout event, we will keep waiting on the same connection until the TCP timeout occurs.
                        rawReport = context.ReadForMemory(stream, _readStatusUpdateDebugString);
                        timeoutEvent.Defer(_parent._leaderClusterTag);
                    }
                    catch (Exception e)
                    {
                        if (_token.IsCancellationRequested)
                        {
                            return;
                        }

                        if (_log.IsInfoEnabled)
                        {
                            _log.Info("Exception occurred while reading the report from the connection", e);
                        }

                        ReceivedReport = new ClusterNodeStatusReport(new Dictionary<string, DatabaseStatusReport>(),
                            ClusterNodeStatusReport.ReportStatus.Error,
                            e,
                            DateTime.UtcNow,
                            _lastSuccessfulReceivedReport);

                        needToWait = true;
                        break;
                    }

                    var report = BuildReport(rawReport);
                    timeoutEvent.Defer(_parent._leaderClusterTag);

                    ReceivedReport = _lastSuccessfulReceivedReport = report;
                }
            }
        }
        catch (Exception e)
        {
            // Same rule as the read failure handler above: once cancellation was requested,
            // a failure is part of shutting down and is neither logged nor published as an error report.
            if (_token.IsCancellationRequested)
            {
                return;
            }

            if (_log.IsInfoEnabled)
            {
                _log.Info($"Exception was thrown while collecting info from {ClusterTag}", e);
            }

            ReceivedReport = new ClusterNodeStatusReport(new Dictionary<string, DatabaseStatusReport>(),
                ClusterNodeStatusReport.ReportStatus.Error,
                e,
                DateTime.UtcNow,
                _lastSuccessfulReceivedReport);

            needToWait = true;
        }
    }
}
/// <summary>
/// Connects to the node at <c>Url</c> and keeps reading status reports from it until cancellation is requested.
/// Every report or failure is published through <c>ReceivedReport</c>; after a failure the whole
/// connect-and-read cycle is restarted after <c>OnErrorDelayTime</c>.
/// </summary>
private void ListenToMaintenanceWorker()
{
    var isFirstAttempt = true;
    var retryDelay = _parent.Config.OnErrorDelayTime.AsTimeSpan;
    var receiveFromWorkerTimeout = _parent.Config.ReceiveFromWorkerTimeout.AsTimeSpan;
    var tcpTimeout = _parent.Config.TcpConnectionTimeout.AsTimeSpan;

    if (tcpTimeout < receiveFromWorkerTimeout && _log.IsInfoEnabled)
    {
        _log.Info(
            $"Warning: TCP timeout is lower than the receive from worker timeout ({tcpTimeout} < {receiveFromWorkerTimeout}), " +
            "this could affect the cluster observer's decisions.");
    }

    while (_token.IsCancellationRequested == false)
    {
        try
        {
            if (isFirstAttempt)
            {
                isFirstAttempt = false;
            }
            else
            {
                // Every attempt after the first one is delayed, so a failing node does not cause a tight loop.
                _token.WaitHandle.WaitOne(retryDelay);
                if (_token.IsCancellationRequested)
                {
                    return;
                }
            }

            // The fetch is bounded by the TCP timeout and by cancellation of the supervisor.
            TcpConnectionInfo nodeTcpInfo;
            using (var tcpTimeoutSource = new CancellationTokenSource(tcpTimeout))
            using (var linkedSource = CancellationTokenSource.CreateLinkedTokenSource(_token, tcpTimeoutSource.Token))
            {
                nodeTcpInfo = ReplicationUtils.GetTcpInfo(Url, null, "Supervisor", _parent._server.Server.Certificate.Certificate, linkedSource.Token);
            }

            if (nodeTcpInfo == null)
            {
                continue;
            }

            var workerConnection = ConnectToClientNode(nodeTcpInfo, _parent._server.Engine.TcpConnectionTimeout);
            var client = workerConnection.TcpClient;
            var workerStream = workerConnection.Stream;

            using (client)
            using (_cts.Token.Register(client.Dispose))
            using (_contextPool.AllocateOperationContext(out JsonOperationContext parsingContext))
            using (_contextPool.AllocateOperationContext(out JsonOperationContext bufferContext))
            using (bufferContext.GetMemoryBuffer(out var buffer))
            using (var reportTimeout = new TimeoutEvent(receiveFromWorkerTimeout, $"Timeout event for: {_name}", singleShot: false))
            {
                reportTimeout.Start(OnTimeout);

                // Scratch list handed to UpdateNodeReportIfNeeded and emptied after every report.
                var noChangeReports = new List<DatabaseStatusReport>();

                while (_token.IsCancellationRequested == false)
                {
                    // The parsing context is reset and renewed before every report; the read buffer comes from a separate context.
                    parsingContext.Reset();
                    parsingContext.Renew();

                    BlittableJsonReaderObject rawJson;
                    try
                    {
                        // A timeout event does not end the read: the same connection is used until the TCP timeout occurs.
                        rawJson = parsingContext.Sync.ParseToMemory(workerStream, _readStatusUpdateDebugString, BlittableJsonDocumentBuilder.UsageMode.None, buffer);
                        reportTimeout.Defer(_parent._leaderClusterTag);
                    }
                    catch (Exception readFailure)
                    {
                        if (_token.IsCancellationRequested)
                        {
                            return;
                        }

                        if (_log.IsInfoEnabled)
                        {
                            _log.Info("Exception occurred while reading the report from the connection", readFailure);
                        }

                        ReceivedReport = new ClusterNodeStatusReport(
                            new ServerReport(),
                            new Dictionary<string, DatabaseStatusReport>(),
                            ClusterNodeStatusReport.ReportStatus.Error,
                            readFailure,
                            DateTime.UtcNow,
                            _lastSuccessfulReceivedReport);

                        // Leave the read loop; the outer loop reconnects.
                        break;
                    }

                    _parent.ForTestingPurposes?.BeforeReportBuildAction(this);

                    var builtReport = BuildReport(rawJson, workerConnection.SupportedFeatures);
                    reportTimeout.Defer(_parent._leaderClusterTag);

                    UpdateNodeReportIfNeeded(builtReport, noChangeReports);
                    noChangeReports.Clear();

                    _lastSuccessfulReceivedReport = builtReport;
                    ReceivedReport = builtReport;

                    _parent.ForTestingPurposes?.AfterSettingReportAction(this);
                }
            }
        }
        catch (Exception failure)
        {
            if (_token.IsCancellationRequested)
            {
                return;
            }

            if (_log.IsInfoEnabled)
            {
                _log.Info($"Exception was thrown while collecting info from {ClusterTag}", failure);
            }

            ReceivedReport = new ClusterNodeStatusReport(
                new ServerReport(),
                new Dictionary<string, DatabaseStatusReport>(),
                ClusterNodeStatusReport.ReportStatus.Error,
                failure,
                DateTime.UtcNow,
                _lastSuccessfulReceivedReport);
        }
    }
}