/// <summary>
/// Parses the comma-separated seed list held in <c>_seeds</c> ("host:port,host:port,...") and,
/// for every seed, creates a node through <c>_nodeFactory</c> and registers it in
/// <c>_nodes</c> and <c>_nodesByHostPort</c>.
/// </summary>
/// <exception cref="FormatException">
/// A seed is not of the form "host:port", or its port is not a valid integer.
/// </exception>
private void BuildNodesFromSeeds()
{
    foreach (var rawSeed in _seeds.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
    {
        // Tolerate "host1:9092, host2:9092": whitespace around a seed must not end up in the host name.
        var seed = rawSeed.Trim();
        if (seed.Length == 0)
        {
            // Whitespace-only entry; treat it like the empty entries that Split already removes.
            continue;
        }

        var hostPort = seed.Split(':');
        if (hostPort.Length < 2)
        {
            // Fail with a message naming the offending seed instead of an IndexOutOfRangeException.
            throw new FormatException(string.Format("Seed '{0}' is not in 'host:port' format", seed));
        }

        var broker = new BrokerMeta
        {
            Host = hostPort[0].Trim(),
            // Port numbers are machine-readable data: parse them culture-independently.
            Port = int.Parse(hostPort[1], System.Globalization.CultureInfo.InvariantCulture)
        };

        var node = _nodeFactory(broker.Host, broker.Port);
        _nodes[node] = broker;
        _nodesByHostPort[BuildKey(broker.Host, broker.Port)] = node;
    }
}
/// <summary>
/// Marks every partition whose leader is <paramref name="broker"/> with
/// <c>ErrorCode.TransportError</c>, emits an ETW event for it and publishes a
/// <c>PartitionStateChangeEvent</c> through <c>_partitionStateChangesSubject</c>.
/// </summary>
/// <param name="e">The transport exception that triggered the handling; not read by this method.</param>
/// <param name="broker">Broker whose led partitions are flagged.</param>
private void HandleTransportError(Exception e, BrokerMeta broker)
{
    _log.Info("Handling TransportError for broker {0}", broker);

    // Pair each partition led by the failed broker with the name of the topic it belongs to.
    var affected = _metadata.Topics
        .SelectMany(topic => topic.Partitions, (topic, part) => new { topic, part })
        .Where(pair => pair.part.Leader == broker.NodeId)
        .Select(pair => new { pair.topic.TopicName, Partition = pair.part });

    affected.ForEach(item =>
    {
        var partition = item.Partition;
        var topicName = item.TopicName;

        _log.Debug("Marking topic {2} partition {1} with transport error for broker {0}", broker, partition, topicName);
        partition.ErrorCode = ErrorCode.TransportError;
        EtwTrace.Log.MetadataTransportError(topicName, _id, partition.Id, partition.Leader);

        var stateChange = new PartitionStateChangeEvent(topicName, partition.Id, ErrorCode.TransportError);
        _partitionStateChangesSubject.OnNext(stateChange);
    });
}
/// <summary>
/// Replaces the stored document that has the same <c>BrokerName</c> and <c>ApplicationId</c> as
/// <paramref name="brokerMeta"/>; the call is issued with <c>IsUpsert = true</c>.
/// </summary>
/// <param name="brokerMeta">Document used both to build the match filter and as the replacement.</param>
/// <returns>The task returned by <c>FindOneAndReplaceAsync</c> on <c>_brokerColl</c>.</returns>
private Task UpsertApplicationBrokerMetaAsync(BrokerMeta brokerMeta)
{
    var upsertOptions = new FindOneAndReplaceOptions<BrokerMeta, BrokerMeta>
    {
        IsUpsert = true
    };

    // Match on the (broker name, application id) pair.
    var sameBrokerAndApplication = Builders<BrokerMeta>.Filter.Where(
        b => b.BrokerName == brokerMeta.BrokerName && b.ApplicationId == brokerMeta.ApplicationId);

    return _brokerColl.FindOneAndReplaceAsync(sameBrokerAndApplication, brokerMeta, upsertOptions);
}
/// <summary>
/// Test setup: creates the <c>INode</c>, <c>IProduceRouter</c> and <c>IConsumeRouter</c> mocks and a
/// <c>ClusterClient</c> over a <c>Cluster</c> whose factories hand out those mocks.
/// </summary>
/// <param name="configuration">Configuration passed to both the cluster and the client.</param>
private void Init(Configuration configuration)
{
    var logger = new Mock<ILogger>();
    _producer = new Mock<IProduceRouter>();
    _consumer = new Mock<IConsumeRouter>();
    _node = new Mock<INode>();

    // FetchMetadata answers with one default broker and no topics.
    var brokerMeta = new BrokerMeta();
    var metadata = new MetadataResponse
    {
        BrokersMeta = new[] { brokerMeta },
        TopicsMeta = new TopicMeta[0]
    };
    _node.Setup(n => n.FetchMetadata()).Returns(Task.FromResult(metadata));

    var cluster = new Cluster(
        configuration,
        logger.Object,
        (h, p) => _node.Object,
        () => _producer.Object,
        () => _consumer.Object);

    _client = new ClusterClient(configuration, logger.Object, cluster);
}
/// <summary>
/// Creates a fetcher for <paramref name="broker"/>: stores the collaborators, sets up the
/// fetch-response stream from <c>FetchLoop()</c>, builds the received-messages pipeline and
/// arranges for <c>_wakeupSignal</c> to be signalled when <paramref name="cancel"/> fires.
/// </summary>
/// <param name="cluster">Cluster this fetcher belongs to.</param>
/// <param name="broker">Broker this fetcher is created for.</param>
/// <param name="protocol">Protocol instance stored for later use.</param>
/// <param name="consumerConfig">Consumer configuration; its <c>Topic</c> is reported to ETW.</param>
/// <param name="cancel">Token whose cancellation pushes <c>true</c> into <c>_wakeupSignal</c>.</param>
public Fetcher(Cluster cluster, BrokerMeta broker, Protocol protocol, ConsumerConfiguration consumerConfig, CancellationToken cancel)
{
    _consumerConfig = consumerConfig;
    _cancel = cancel;
    _protocol = protocol;
    _broker = broker;
    _cluster = cluster;

    // All fields above must be set before the pipelines are built.
    var fetchLoop = FetchLoop();
    _fetchResponses = fetchLoop.Publish().RefCount();
    BuildReceivedMessages();

    _cancel.Register(() => { _wakeupSignal.OnNext(true); });

    if (_log.IsDebugEnabled)
    {
        _log.Debug("Created new fetcher #{0} for broker: {1}", _id, _broker);
    }

    EtwTrace.Log.FetcherStart(_id, consumerConfig.Topic);
}
/// <summary>
/// Initializes a fetcher bound to <paramref name="broker"/>. After capturing its dependencies it
/// wires the response stream produced by <c>FetchLoop()</c>, calls <c>BuildReceivedMessages()</c>
/// and registers a cancellation callback that pushes <c>true</c> into <c>_wakeupSignal</c>.
/// </summary>
/// <param name="cluster">Owning cluster.</param>
/// <param name="broker">Broker the fetcher is created for.</param>
/// <param name="protocol">Protocol instance kept for later requests.</param>
/// <param name="consumerConfig">Consumer settings; <c>Topic</c> is included in the ETW start event.</param>
/// <param name="cancel">Cancellation token observed by the fetcher.</param>
public Fetcher(Cluster cluster, BrokerMeta broker, Protocol protocol, ConsumerConfiguration consumerConfig, CancellationToken cancel)
{
    // Dependencies first; the pipelines built below may read them.
    _cluster = cluster;
    _protocol = protocol;
    _broker = broker;
    _consumerConfig = consumerConfig;
    _cancel = cancel;

    var published = FetchLoop().Publish();
    _fetchResponses = published.RefCount();

    BuildReceivedMessages();

    Action wakeUp = () => _wakeupSignal.OnNext(true);
    _cancel.Register(wakeUp);

    var debugEnabled = _log.IsDebugEnabled;
    if (debugEnabled)
    {
        _log.Debug("Created new fetcher #{0} for broker: {1}", _id, _broker);
    }

    EtwTrace.Log.FetcherStart(_id, consumerConfig.Topic);
}
/// <summary>
/// Sends a metadata request and returns the correlated response.
/// </summary>
/// <param name="request">Topics to request metadata for.</param>
/// <param name="broker">
/// Broker to send the request to. When <c>null</c>, a connection is obtained from
/// <c>_cluster.GetAnyClientAsync()</c>.
/// </param>
/// <param name="noTransportErrors">Forwarded to <c>GetClientAsync</c> when a broker is given.</param>
/// <returns>The deserialized metadata response.</returns>
internal async Task<MetadataResponse> MetadataRequest(TopicRequest request, BrokerMeta broker = null, bool noTransportErrors = false)
{
    Connection conn;
    TcpClient tcp;

    if (broker == null)
    {
        // No specific broker requested: use whichever connection the cluster hands out.
        var anyClient = await _cluster.GetAnyClientAsync();
        conn = anyClient.Item1;
        tcp = anyClient.Item2;
    }
    else
    {
        conn = broker.Conn;
        tcp = await conn.GetClientAsync(noTransportErrors);
    }

    _log.Debug("Sending MetadataRequest to {0}", tcp.Client.RemoteEndPoint);

    if (_etw.IsEnabled())
    {
        _etw.ProtocolMetadataRequest(request.ToString());
    }

    var response = await conn.Correlation.SendAndCorrelateAsync(
        id => Serializer.Serialize(request, id),
        Serializer.DeserializeMetadataResponse,
        tcp,
        CancellationToken.None);

    if (_etw.IsEnabled())
    {
        // Without a known broker the event carries placeholder host/port/node id values.
        if (broker != null)
        {
            _etw.ProtocolMetadataResponse(response.ToString(), broker.Host, broker.Port, broker.NodeId);
        }
        else
        {
            _etw.ProtocolMetadataResponse(response.ToString(), "", -1, -1);
        }
    }

    return response;
}
/// <summary>
/// Issues a metadata request over the connection of <paramref name="broker"/>, or over a
/// connection supplied by <c>_cluster.GetAnyClientAsync()</c> when no broker is given, and
/// returns the correlated response.
/// </summary>
/// <param name="request">Request listing the topics of interest.</param>
/// <param name="broker">Target broker, or <c>null</c> to let the cluster choose a connection.</param>
/// <param name="noTransportErrors">Passed through to <c>GetClientAsync</c> for the broker's connection.</param>
/// <returns>The deserialized metadata response.</returns>
internal async Task<MetadataResponse> MetadataRequest(TopicRequest request, BrokerMeta broker = null, bool noTransportErrors = false)
{
    var hasBroker = broker != null;

    TcpClient tcp;
    Connection conn;
    if (hasBroker)
    {
        conn = broker.Conn;
        tcp = await conn.GetClientAsync(noTransportErrors);
    }
    else
    {
        var pair = await _cluster.GetAnyClientAsync();
        conn = pair.Item1;
        tcp = pair.Item2;
    }

    _log.Debug("Sending MetadataRequest to {0}", tcp.Client.RemoteEndPoint);

    if (_etw.IsEnabled())
    {
        var requestText = request.ToString();
        _etw.ProtocolMetadataRequest(requestText);
    }

    var response = await conn.Correlation.SendAndCorrelateAsync(
        id => Serializer.Serialize(request, id),
        Serializer.DeserializeMetadataResponse,
        tcp,
        CancellationToken.None);

    if (_etw.IsEnabled())
    {
        // Placeholder values are traced when the request was not addressed to a specific broker.
        var responseText = response.ToString();
        var host = hasBroker ? broker.Host : "";
        var port = hasBroker ? broker.Port : -1;
        var nodeId = hasBroker ? broker.NodeId : -1;
        _etw.ProtocolMetadataResponse(responseText, host, port, nodeId);
    }

    return response;
}
/// <summary>
/// Until <c>_cancel</c> is signalled, repeatedly asks <paramref name="broker"/> for metadata of the
/// topics that have entries in <c>_failedList</c>. Partitions the broker reports without errors are
/// treated as "maybe healed"; each reported leader is then queried itself, and only partitions that
/// the leader confirms are broadcast through <c>_newMetadataEvent</c>.
/// </summary>
/// <param name="broker">Broker that is pinged for metadata on every iteration.</param>
/// <returns>A task that completes when the loop exits.</returns>
private async Task RecoveryLoop(BrokerMeta broker)
{
    _log.Debug("{0} Starting recovery loop on broker: {1}", this, broker);
    EtwTrace.Log.RecoveryMonitor_RecoveryLoopStarted(_id, broker.Host, broker.Port, broker.NodeId);

    while (!_cancel.IsCancellationRequested)
    {
        //
        // Check whether there is any job for the given broker
        //
        if (_failedList.Count == 0)
        {
            // TODO: await for the list to receive 1st item instead of looping
            await Task.Delay(1000, _cancel);
            continue;
        }

        //
        // Query metadata from given broker for any failed topics.
        //
        MetadataResponse response;
        try
        {
            EtwTrace.Log.RecoveryMonitor_SendingPing(_id, broker.Host, broker.Port);
            response = await _protocol.MetadataRequest(
                new TopicRequest { Topics = _failedList.Keys.Select(t => t.Item1).Distinct().ToArray() },
                broker,
                noTransportErrors: true);
            EtwTrace.Log.RecoveryMonitor_PingResponse(_id, broker.Host, broker.Port);
        }
        catch (Exception ex)
        {
            _log.Debug("PartitionRecoveryMonitor error. Broker: {0}, error: {1}", broker, ex.Message);
            EtwTrace.Log.RecoveryMonitor_PingFailed(_id, broker.Host, broker.Port, ex.Message);
            response = null;
        }

        if (response == null)
        {
            // Ping failed: back off and retry.
            await Task.Delay(1000, _cancel);
            continue;
        }

        //
        // Join failed partitions with successful responses to find out recovered ones.
        // Tuple layout: (topic name, partition id, leader node id).
        //
        Tuple<string, int, int>[] maybeHealedPartitions = (
            from responseTopic in response.Topics
            from responsePart in responseTopic.Partitions
            let key = new Tuple<string, int>(responseTopic.TopicName, responsePart.Id)
            where
                responseTopic.ErrorCode.IsSuccess() &&
                responsePart.ErrorCode.IsSuccess() &&
                _failedList.ContainsKey(key)
            select Tuple.Create(responseTopic.TopicName, responsePart.Id, responsePart.Leader)
        ).ToArray();

        if (_log.IsDebugEnabled)
        {
            if (maybeHealedPartitions.Length == 0)
            {
                _log.Debug("Out of {0} partitions returned from broker {2}, none of the {3} errored partitions are healed. Current partition states for errored partitions: [{1}]",
                    response.Topics.SelectMany(t => t.Partitions).Count(),
                    string.Join(",", response.Topics
                        .SelectMany(t => t.Partitions.Select(p => new { t.TopicName, TopicErrorCode = t.ErrorCode, PartitionId = p.Id, PartitionErrorCode = p.ErrorCode }))
                        .Where(p => _failedList.ContainsKey(new Tuple<string, int>(p.TopicName, p.PartitionId)))
                        .Select(p => string.Format("{0}:{1}:{2}:{3}", p.TopicName, p.TopicErrorCode, p.PartitionId, p.PartitionErrorCode))),
                    broker,
                    _failedList.Count
                );
            }
            else
            {
                // Render the candidates grouped by leader, then by topic.
                var str = new StringBuilder();
                foreach (var leader in maybeHealedPartitions.GroupBy(p => p.Item3, (i, tuples) => new { Leader = i, Topics = tuples.GroupBy(t => t.Item1) }))
                {
                    str.AppendFormat(" Leader: {0}\n", leader.Leader);
                    foreach (var topic1 in leader.Topics)
                    {
                        str.AppendFormat(" Topic: {0} ", topic1.Key);
                        str.AppendFormat("[{0}]\n", string.Join(",", topic1.Select(t => t.Item2)));
                    }
                }
                _log.Debug("Healed partitions found by broker {0} (will check broker availability):\n{1}", broker, str.ToString());
            }
        }

        if (EtwTrace.Log.IsEnabled())
        {
            if (maybeHealedPartitions.Length != 0)
            {
                EtwTrace.Log.RecoveryMonitor_PossiblyHealedPartitions(_id, maybeHealedPartitions.Length);
            }
            else
            {
                EtwTrace.Log.RecoveryMonitor_NoHealedPartitions(_id);
            }
        }

        //
        // Make sure that brokers for healed partitions are accessible, because it is possible that
        // broker B1 said that partition belongs to B2 and B2 can not be reached.
        // It is checked only that said broker responds to metadata request without exceptions.
        //
        // The checks are projected into tasks (instead of being started fire-and-forget from a
        // ForEach callback) so that they can be awaited below.
        //
        var aliveChecks = maybeHealedPartitions.
            GroupBy(p => p.Item3).
            Select(async brokerGrp =>
            {
                BrokerMeta newBroker;
                _brokers.TryGetValue(brokerGrp.Key, out newBroker);
                if (newBroker == null)
                {
                    newBroker = response.Brokers.SingleOrDefault(b => b.NodeId == brokerGrp.Key);

                    // If Cluster started when one of the brokers was down, and later it comes alive,
                    // it will be missing from our list of brokers. See issue #14.
                    _log.Debug("received MetadataResponse for broker that is not yet in our list: {0}", newBroker);

                    if (newBroker == null)
                    {
                        _log.Error("Got metadata response with partition refering to a broker which is not part of the response: {0}", response.ToString());
                        return;
                    }

                    // Broadcast only newly discovered broker and strip everything else, because this is the only
                    // confirmed data.
                    var filteredMeta = new MetadataResponse
                    {
                        Brokers = new[] { newBroker },
                        Topics = new TopicMeta[] { }
                    };

                    _newMetadataEvent.OnNext(filteredMeta);
                }

                try
                {
                    EtwTrace.Log.RecoveryMonitor_CheckingBrokerAccessibility(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);
                    MetadataResponse response2 = await _protocol.MetadataRequest(
                        new TopicRequest { Topics = brokerGrp.Select(g => g.Item1).Distinct().ToArray() },
                        newBroker,
                        noTransportErrors: true);
                    EtwTrace.Log.RecoveryMonitor_BrokerIsAccessible(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);

                    // success!
                    // raise new metadata event
                    _log.Info("Alive brokers detected: {0} which responded with: {1}", newBroker, response2);

                    // Join maybe healed partitions with partitions which belong to alive broker
                    var confirmedHealedTopics =
                        (from maybeHealedPartition in brokerGrp
                         from healedTopic in response2.Topics
                         where healedTopic.TopicName == maybeHealedPartition.Item1
                         from healedPart in healedTopic.Partitions
                         where healedPart.Id == maybeHealedPartition.Item2 && healedPart.Leader == brokerGrp.Key
                         group healedPart by new { healedTopic.TopicName, healedTopic.ErrorCode } into healedTopicGrp
                         select healedTopicGrp
                        );

                    // broadcast only truly healed partitions which belong to alive broker
                    var filteredResponse = new MetadataResponse
                    {
                        // we may broadcast more than 1 broker, but it should be ok because discovery of
                        // new broker metadata does not cause any actions
                        Brokers = response2.Brokers,
                        Topics = confirmedHealedTopics.
                            Where(t => t.Any()). // broadcast only topics which have healed partitions
                            Select(t => new TopicMeta
                            {
                                ErrorCode = t.Key.ErrorCode,
                                TopicName = t.Key.TopicName,
                                Partitions = t.ToArray()
                            }).ToArray()
                    };

                    _log.Debug("Broadcasting filtered response {0}", filteredResponse);
                    if (EtwTrace.Log.IsEnabled())
                    {
                        foreach (var topic in filteredResponse.Topics)
                        {
                            EtwTrace.Log.RecoveryMonitor_HealedPartitions(_id, newBroker.Host, newBroker.Port, newBroker.NodeId, topic.TopicName, string.Join(",", topic.Partitions.Select(p => p.Id)));
                        }
                    }
                    _newMetadataEvent.OnNext(filteredResponse);
                }
                catch (Exception e)
                {
                    _log.Warn("Metadata points to broker but it is not accessible. Error: {0}", e.Message);
                }
            }).
            ToArray(); // materialize once so every check is started exactly one time

        // Wait for all checks to complete before the next iteration. Otherwise, if a broker does not
        // respond and holds the connection open until the tcp timeout, pending checks would keep
        // accumulating in memory faster than they time out.
        await Task.WhenAll(aliveChecks);

        await Task.Delay(3000, _cancel);
    }

    _log.Debug("RecoveryLoop exiting. Setting completion");
    EtwTrace.Log.RecoveryMonitor_RecoveryLoopStop(_id);
}
/// <summary>
/// Until <c>_cancel</c> is signalled, repeatedly asks <paramref name="broker"/> for metadata of the
/// topics that have entries in <c>_failedList</c>. Partitions the broker reports without errors are
/// treated as "maybe healed"; each reported leader is then queried itself, and only partitions that
/// the leader confirms are broadcast through <c>_newMetadataEvent</c>.
/// </summary>
/// <param name="broker">Broker that is pinged for metadata on every iteration.</param>
/// <returns>A task that completes when the loop exits.</returns>
private async Task RecoveryLoop(BrokerMeta broker)
{
    _log.Debug("{0} Starting recovery loop on broker: {1}", this, broker);
    EtwTrace.Log.RecoveryMonitor_RecoveryLoopStarted(_id, broker.Host, broker.Port, broker.NodeId);

    while (!_cancel.IsCancellationRequested)
    {
        //_log.Debug("RecoveryLoop iterating {0}", this);

        //
        // Check whether there is any job for the given broker
        //
        if (_failedList.Count == 0)
        {
            // TODO: await for the list to receive 1st item instead of looping
            await Task.Delay(1000, _cancel);
            continue;
        }

        //
        // Query metadata from given broker for any failed topics.
        //
        MetadataResponse response;
        try
        {
            EtwTrace.Log.RecoveryMonitor_SendingPing(_id, broker.Host, broker.Port);
            response = await _protocol.MetadataRequest(new TopicRequest { Topics = _failedList.Keys.Select(t => t.Item1).Distinct().ToArray() }, broker, noTransportErrors : true);
            EtwTrace.Log.RecoveryMonitor_PingResponse(_id, broker.Host, broker.Port);
        }
        catch (Exception ex)
        {
            _log.Debug("PartitionRecoveryMonitor error. Broker: {0}, error: {1}", broker, ex.Message);
            EtwTrace.Log.RecoveryMonitor_PingFailed(_id, broker.Host, broker.Port, ex.Message);
            response = null;
        }

        // Ping failed: wait a second and try again.
        if (response == null)
        {
            await Task.Delay(1000, _cancel);
            continue;
        }

        //
        // Join failed partitions with successful responses to find out recovered ones.
        // Tuple layout: (topic name, partition id, leader node id).
        //
        Tuple <string, int, int>[] maybeHealedPartitions = (
            from responseTopic in response.Topics
            from responsePart in responseTopic.Partitions
            let key = new Tuple <string, int>(responseTopic.TopicName, responsePart.Id)
            where
            responseTopic.ErrorCode.IsSuccess() &&
            responsePart.ErrorCode.IsSuccess() &&
            _failedList.ContainsKey(key)
            select Tuple.Create(responseTopic.TopicName, responsePart.Id, responsePart.Leader)
            ).ToArray();

        if (_log.IsDebugEnabled)
        {
            if (maybeHealedPartitions.Length == 0)
            {
                // Nothing healed: log the current state of every partition that is still in the failed list.
                _log.Debug("Out of {0} partitions returned from broker {2}, none of the {3} errored partitions are healed. Current partition states for errored partitions: [{1}]",
                    response.Topics.SelectMany(t => t.Partitions).Count(),
                    string.Join(",", response.Topics
                        .SelectMany(t => t.Partitions.Select(p => new { t.TopicName, TopicErrorCode = t.ErrorCode, PartitionId = p.Id, PartitionErrorCode = p.ErrorCode }))
                        .Where(p => _failedList.ContainsKey(new Tuple <string, int>(p.TopicName, p.PartitionId)))
                        .Select(p => string.Format("{0}:{1}:{2}:{3}", p.TopicName, p.TopicErrorCode, p.PartitionId, p.PartitionErrorCode))),
                    broker,
                    _failedList.Count
                    );
            }
            else
            {
                // Render the candidates grouped by leader, then by topic.
                var str = new StringBuilder();
                foreach (var leader in maybeHealedPartitions.GroupBy(p => p.Item3, (i, tuples) => new { Leader = i, Topics = tuples.GroupBy(t => t.Item1) }))
                {
                    str.AppendFormat(" Leader: {0}\n", leader.Leader);
                    foreach (var topic1 in leader.Topics)
                    {
                        str.AppendFormat(" Topic: {0} ", topic1.Key);
                        str.AppendFormat("[{0}]\n", string.Join(",", topic1.Select(t => t.Item2)));
                    }
                }
                _log.Debug("Healed partitions found by broker {0} (will check broker availability):\n{1}", broker, str.ToString());
            }
        }

        if (EtwTrace.Log.IsEnabled())
        {
            if (maybeHealedPartitions.Length != 0)
            {
                EtwTrace.Log.RecoveryMonitor_PossiblyHealedPartitions(_id, maybeHealedPartitions.Length);
            }
            else
            {
                EtwTrace.Log.RecoveryMonitor_NoHealedPartitions(_id);
            }
        }

        //
        // Make sure that brokers for healed partitions are accessible, because it is possible that
        // broker B1 said that partition belongs to B2 and B2 can not be reached.
        // It is checked only that said broker responds to metadata request without exceptions.
        //
        var aliveChecks = maybeHealedPartitions.
            GroupBy(p => p.Item3).
            Select(async brokerGrp =>
            {
                BrokerMeta newBroker;
                _brokers.TryGetValue(brokerGrp.Key, out newBroker);
                if (newBroker == null)
                {
                    // Leader is not in _brokers; look it up in the response that mentioned it.
                    newBroker = response.Brokers.SingleOrDefault(b => b.NodeId == brokerGrp.Key);

                    // If Cluster started when one of the brokers was down, and later it comes alive,
                    // it will be missing from our list of brokers. See issue #14.
                    _log.Debug("received MetadataResponse for broker that is not yet in our list: {0}", newBroker);

                    if (newBroker == null)
                    {
                        _log.Error("Got metadata response with partition refering to a broker which is not part of the response: {0}", response.ToString());
                        return;
                    }

                    // Broadcast only newly discovered broker and strip everything else, because this is the only
                    // confirmed data.
                    var filteredMeta = new MetadataResponse
                    {
                        Brokers = new[] { newBroker },
                        Topics = new TopicMeta[] { }
                    };

                    _newMetadataEvent.OnNext(filteredMeta);
                }

                try
                {
                    EtwTrace.Log.RecoveryMonitor_CheckingBrokerAccessibility(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);
                    MetadataResponse response2 = await _protocol.MetadataRequest(new TopicRequest { Topics = brokerGrp.Select(g => g.Item1).Distinct().ToArray() }, newBroker, noTransportErrors: true);
                    EtwTrace.Log.RecoveryMonitor_BrokerIsAccessible(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);

                    // success!
                    // raise new metadata event
                    _log.Info("Alive brokers detected: {0} which responded with: {1}", newBroker, response2);

                    // Join maybe healed partitions with partitions which belong to alive broker
                    var confirmedHealedTopics =
                        (from maybeHealedPartition in brokerGrp
                         from healedTopic in response2.Topics
                         where healedTopic.TopicName == maybeHealedPartition.Item1
                         from healedPart in healedTopic.Partitions
                         where healedPart.Id == maybeHealedPartition.Item2 && healedPart.Leader == brokerGrp.Key
                         group healedPart by new { healedTopic.TopicName, healedTopic.ErrorCode } into healedTopicGrp
                         select healedTopicGrp
                         );

                    // broadcast only truly healed partitions which belong to alive broker
                    var filteredResponse = new MetadataResponse
                    {
                        Brokers = response2.Brokers, // we may broadcast more than 1 broker, but it should be ok because discovery of new broker metadata does not cause any actions
                        Topics = confirmedHealedTopics.
                            Where(t => t.Any()). // broadcast only topics which have healed partitions
                            Select(t => new TopicMeta
                            {
                                ErrorCode = t.Key.ErrorCode,
                                TopicName = t.Key.TopicName,
                                Partitions = t.ToArray()
                            }).ToArray()
                    };

                    _log.Debug("Broadcasting filtered response {0}", filteredResponse);
                    if (EtwTrace.Log.IsEnabled())
                    {
                        foreach (var topic in filteredResponse.Topics)
                        {
                            EtwTrace.Log.RecoveryMonitor_HealedPartitions(_id, newBroker.Host, newBroker.Port, newBroker.NodeId, topic.TopicName, string.Join(",", topic.Partitions.Select(p => p.Id)));
                        }
                    }
                    _newMetadataEvent.OnNext(filteredResponse);
                }
                catch (Exception e)
                {
                    // The reported leader did not answer; its partitions stay in the failed list for the next round.
                    _log.Warn("Metadata points to broker but it is not accessible. Error: {0}", e.Message);
                }
            });

        // Wait for all checks to complete, otherwise, if a broker does not respond and hold connection open until tcp timeout,
        // we will keep accumulating responses in memory faster than they time out. See https://github.com/ntent-ad/kafka4net/issues/30
        await Task.WhenAll(aliveChecks.ToArray());

        // Pause between recovery rounds.
        await Task.Delay(3000, _cancel);
    }

    _log.Debug("RecoveryLoop exiting. Setting completion");
    EtwTrace.Log.RecoveryMonitor_RecoveryLoopStop(_id);
}