Exemple #1
0
 /// <summary>
 /// Populates <c>_nodes</c> and <c>_nodesByHostPort</c> from the comma-separated
 /// <c>host:port</c> entries held in <c>_seeds</c>.
 /// </summary>
 /// <exception cref="System.FormatException">
 /// A seed entry is not in <c>host:port</c> form, or its port is not a valid integer.
 /// </exception>
 private void BuildNodesFromSeeds()
 {
     foreach (var rawSeed in _seeds.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
     {
         // Tolerate "host1:9092, host2:9092": whitespace around an entry must not leak into the host name.
         var seed = rawSeed.Trim();
         if (seed.Length == 0)
         {
             continue;
         }

         var hostPort = seed.Split(':');
         if (hostPort.Length < 2)
         {
             // Report the offending entry instead of failing with an opaque IndexOutOfRangeException.
             throw new FormatException(string.Format("Invalid seed '{0}': expected 'host:port'", seed));
         }

         var broker = new BrokerMeta
         {
             Host = hostPort[0].Trim(),
             // The port is machine-readable data, so parse it independently of the current culture.
             Port = int.Parse(hostPort[1], System.Globalization.CultureInfo.InvariantCulture)
         };
         var node = _nodeFactory(broker.Host, broker.Port);
         _nodes[node] = broker;
         _nodesByHostPort[BuildKey(broker.Host, broker.Port)] = node;
     }
 }
Exemple #2
0
 /// <summary>
 /// Flags every partition whose leader is <paramref name="broker"/> with
 /// <c>ErrorCode.TransportError</c> and publishes a state change event for each of them.
 /// </summary>
 /// <param name="e">The transport exception; not used by this method.</param>
 /// <param name="broker">Broker whose <c>NodeId</c> selects the affected partitions.</param>
 private void HandleTransportError(Exception e, BrokerMeta broker)
 {
     _log.Info("Handling TransportError for broker {0}", broker);

     // Pair each partition led by the broker with the name of its topic.
     var affected = _metadata.Topics
         .SelectMany(topic => topic.Partitions, (topic, partition) => new { topic, partition })
         .Where(pair => pair.partition.Leader == broker.NodeId)
         .Select(pair => new { Topic = pair.topic.TopicName, Partition = pair.partition });

     affected.ForEach(hit =>
     {
         _log.Debug("Marking topic {2} partition {1} with transport error for broker {0}", broker, hit.Partition, hit.Topic);
         hit.Partition.ErrorCode = ErrorCode.TransportError;
         EtwTrace.Log.MetadataTransportError(hit.Topic, _id, hit.Partition.Id, hit.Partition.Leader);
         _partitionStateChangesSubject.OnNext(new PartitionStateChangeEvent(hit.Topic, hit.Partition.Id, ErrorCode.TransportError));
     });
 }
        /// <summary>
        /// Replaces the stored <c>BrokerMeta</c> that has the same broker name and application id
        /// as <paramref name="brokerMeta"/>, with the <c>IsUpsert</c> option enabled.
        /// </summary>
        /// <param name="brokerMeta">Document to store; it also supplies the match criteria.</param>
        /// <returns>The task returned by <c>FindOneAndReplaceAsync</c>.</returns>
        private Task UpsertApplicationBrokerMetaAsync(BrokerMeta brokerMeta)
        {
            var upsert = new FindOneAndReplaceOptions<BrokerMeta, BrokerMeta> { IsUpsert = true };

            // Match on the (broker name, application id) pair of the incoming document.
            var sameBrokerAndApp = Builders<BrokerMeta>.Filter.Where(candidate =>
                candidate.BrokerName == brokerMeta.BrokerName &&
                candidate.ApplicationId == brokerMeta.ApplicationId);

            return _brokerColl.FindOneAndReplaceAsync(sameBrokerAndApp, brokerMeta, upsert);
        }
Exemple #4
0
        /// <summary>
        /// Creates the mocked node, routers and logger, then builds <c>_client</c> on top of a
        /// <c>Cluster</c> wired to those mocks.
        /// </summary>
        /// <param name="configuration">Configuration handed to both the cluster and the client.</param>
        private void Init(Configuration configuration)
        {
            var log = new Mock<ILogger>();
            _producer = new Mock<IProduceRouter>();
            _consumer = new Mock<IConsumeRouter>();
            _node = new Mock<INode>();

            // The mocked node answers FetchMetadata() with one default broker and no topics.
            var metadata = new MetadataResponse
            {
                BrokersMeta = new[] { new BrokerMeta() },
                TopicsMeta = new TopicMeta[0]
            };
            _node.Setup(n => n.FetchMetadata()).Returns(Task.FromResult(metadata));

            var cluster = new Cluster(
                configuration,
                log.Object,
                (h, p) => _node.Object,
                () => _producer.Object,
                () => _consumer.Object);

            _client = new ClusterClient(configuration, log.Object, cluster);
        }
Exemple #5
0
        /// <summary>
        /// Creates a fetcher for <paramref name="broker"/>: stores the collaborators, sets up the
        /// shared (<c>Publish().RefCount()</c>) fetch response stream, builds the received-message
        /// pipeline and arranges for the wake-up signal to fire when <paramref name="cancel"/> is cancelled.
        /// </summary>
        /// <param name="cluster">Cluster this fetcher belongs to.</param>
        /// <param name="broker">Broker this fetcher is created for.</param>
        /// <param name="protocol">Protocol instance stored for later use.</param>
        /// <param name="consumerConfig">Consumer configuration; its <c>Topic</c> is reported to ETW.</param>
        /// <param name="cancel">Token whose cancellation pushes <c>true</c> into <c>_wakeupSignal</c>.</param>
        public Fetcher(Cluster cluster, BrokerMeta broker, Protocol protocol, ConsumerConfiguration consumerConfig, CancellationToken cancel)
        {
            // Store every collaborator first; the calls below run against these fields.
            _consumerConfig = consumerConfig;
            _cancel = cancel;
            _protocol = protocol;
            _broker = broker;
            _cluster = cluster;

            var fetchLoop = FetchLoop();
            _fetchResponses = fetchLoop.Publish().RefCount();
            BuildReceivedMessages();

            _cancel.Register(() => _wakeupSignal.OnNext(true));

            if (_log.IsDebugEnabled)
            {
                _log.Debug("Created new fetcher #{0} for broker: {1}", _id, _broker);
            }

            EtwTrace.Log.FetcherStart(_id, consumerConfig.Topic);
        }
Exemple #6
0
        /// <summary>
        /// Initializes a fetcher bound to <paramref name="broker"/>. After the arguments are stored,
        /// the fetch loop is published as a ref-counted stream, the received-message pipeline is
        /// built, and a cancellation callback is registered that signals <c>_wakeupSignal</c>.
        /// </summary>
        /// <param name="cluster">Owning cluster.</param>
        /// <param name="broker">Target broker.</param>
        /// <param name="protocol">Protocol instance kept for later requests.</param>
        /// <param name="consumerConfig">Consumer settings; <c>Topic</c> is passed to the ETW start event.</param>
        /// <param name="cancel">Cancellation token observed by this fetcher.</param>
        public Fetcher(Cluster cluster, BrokerMeta broker, Protocol protocol, ConsumerConfiguration consumerConfig, CancellationToken cancel)
        {
            this._cluster = cluster;
            this._broker = broker;
            this._protocol = protocol;
            this._cancel = cancel;
            this._consumerConfig = consumerConfig;

            // Order matters here: the response stream must exist before the message pipeline is built.
            this._fetchResponses = FetchLoop().Publish().RefCount();
            BuildReceivedMessages();

            // Wake the fetcher up as soon as cancellation is requested.
            this._cancel.Register(() => _wakeupSignal.OnNext(true));

            var debugEnabled = _log.IsDebugEnabled;
            if (debugEnabled)
            {
                _log.Debug("Created new fetcher #{0} for broker: {1}", _id, _broker);
            }

            EtwTrace.Log.FetcherStart(_id, consumerConfig.Topic);
        }
Exemple #7
0
        /// <summary>
        /// Sends a metadata request and returns the correlated response.
        /// </summary>
        /// <param name="request">Topics to query.</param>
        /// <param name="broker">
        /// Broker to ask. When <c>null</c>, the connection and client come from
        /// <c>_cluster.GetAnyClientAsync()</c>.
        /// </param>
        /// <param name="noTransportErrors">Forwarded to <c>GetClientAsync</c> when a broker is given.</param>
        /// <returns>The deserialized metadata response.</returns>
        internal async Task<MetadataResponse> MetadataRequest(TopicRequest request, BrokerMeta broker = null, bool noTransportErrors = false)
        {
            Connection connection;
            TcpClient client;

            if (broker == null)
            {
                // No explicit broker: take whichever connection/client pair the cluster offers.
                var any = await _cluster.GetAnyClientAsync();
                connection = any.Item1;
                client = any.Item2;
            }
            else
            {
                connection = broker.Conn;
                client = await connection.GetClientAsync(noTransportErrors);
            }

            _log.Debug("Sending MetadataRequest to {0}", client.Client.RemoteEndPoint);
            if (_etw.IsEnabled())
            {
                _etw.ProtocolMetadataRequest(request.ToString());
            }

            var response = await connection.Correlation.SendAndCorrelateAsync(
                id => Serializer.Serialize(request, id),
                Serializer.DeserializeMetadataResponse,
                client,
                CancellationToken.None);

            if (_etw.IsEnabled())
            {
                // Without a broker, the trace carries placeholders: "" for the host, -1 for port and node id.
                _etw.ProtocolMetadataResponse(
                    response.ToString(),
                    broker != null ? broker.Host : "",
                    broker != null ? broker.Port : -1,
                    broker != null ? broker.NodeId : -1);
            }

            return response;
        }
Exemple #8
0
        /// <summary>
        /// Issues a <c>TopicRequest</c> for metadata, either to the given broker or to any client
        /// provided by the cluster, and returns the deserialized response.
        /// </summary>
        /// <param name="request">The metadata request to serialize and send.</param>
        /// <param name="broker">Optional target broker; <c>null</c> means "any client from the cluster".</param>
        /// <param name="noTransportErrors">Passed through to <c>GetClientAsync</c> for the broker's connection.</param>
        /// <returns>The response produced by <c>Serializer.DeserializeMetadataResponse</c>.</returns>
        internal async Task<MetadataResponse> MetadataRequest(TopicRequest request, BrokerMeta broker = null, bool noTransportErrors = false)
        {
            Connection link;
            TcpClient socket;

            if (broker != null)
            {
                link = broker.Conn;
                socket = await link.GetClientAsync(noTransportErrors);
            }
            else
            {
                // The cluster hands back a (connection, client) pair.
                var pair = await _cluster.GetAnyClientAsync();
                link = pair.Item1;
                socket = pair.Item2;
            }

            _log.Debug("Sending MetadataRequest to {0}", socket.Client.RemoteEndPoint);

            if (_etw.IsEnabled())
            {
                _etw.ProtocolMetadataRequest(request.ToString());
            }

            var correlator = link.Correlation;
            var response = await correlator.SendAndCorrelateAsync(
                id => Serializer.Serialize(request, id),
                Serializer.DeserializeMetadataResponse,
                socket, CancellationToken.None);

            if (_etw.IsEnabled())
            {
                // Placeholders ("" / -1 / -1) are traced when the request was not aimed at a specific broker.
                var text = response.ToString();
                _etw.ProtocolMetadataResponse(text,
                    broker != null ? broker.Host : "",
                    broker != null ? broker.Port : -1,
                    broker != null ? broker.NodeId : -1);
            }

            return response;
        }
        /// <summary>
        /// Repeatedly asks <paramref name="broker"/> for metadata of the topics in <c>_failedList</c>
        /// until <c>_cancel</c> is requested. Partitions the broker reports as healthy are then
        /// confirmed by sending a metadata request to their reported leader; confirmed partitions
        /// (and newly discovered brokers) are published through <c>_newMetadataEvent</c>.
        /// </summary>
        /// <param name="broker">Broker that is polled for the state of the failed partitions.</param>
        /// <returns>A task that completes when the loop exits.</returns>
        private async Task RecoveryLoop(BrokerMeta broker)
        {
            _log.Debug("{0} Starting recovery loop on broker: {1}", this, broker);
            EtwTrace.Log.RecoveryMonitor_RecoveryLoopStarted(_id, broker.Host, broker.Port, broker.NodeId);
            while (!_cancel.IsCancellationRequested)
            {
                //
                // Check whether there is any job for the given broker
                //
                if (_failedList.Count == 0)
                {
                    // TODO: await for the list to receive 1st item instead of looping
                    await Task.Delay(1000, _cancel);
                    continue;
                }

                //
                // Query metadata from given broker for any failed topics.
                //
                MetadataResponse response;
                try
                {
                    EtwTrace.Log.RecoveryMonitor_SendingPing(_id, broker.Host, broker.Port);
                    response = await _protocol.MetadataRequest(new TopicRequest { Topics = _failedList.Keys.Select(t => t.Item1).Distinct().ToArray() }, broker, noTransportErrors: true);
                    EtwTrace.Log.RecoveryMonitor_PingResponse(_id, broker.Host, broker.Port);
                }
                catch (Exception ex)
                {
                    _log.Debug("PartitionRecoveryMonitor error. Broker: {0}, error: {1}", broker, ex.Message);
                    EtwTrace.Log.RecoveryMonitor_PingFailed(_id, broker.Host, broker.Port, ex.Message);
                    response = null;
                }

                if (response == null)
                {
                    // The ping failed; retry after a short pause.
                    await Task.Delay(1000, _cancel);
                    continue;
                }

                //
                // Join failed partitions with successful responses to find out recovered ones
                //
                Tuple<string, int, int>[] maybeHealedPartitions = (
                    from responseTopic in response.Topics
                    from responsePart in responseTopic.Partitions
                    let key = new Tuple<string, int>(responseTopic.TopicName, responsePart.Id)
                    where
                        responseTopic.ErrorCode.IsSuccess()
                        && responsePart.ErrorCode.IsSuccess()
                        && _failedList.ContainsKey(key)
                    select Tuple.Create(responseTopic.TopicName, responsePart.Id, responsePart.Leader)
                    ).ToArray();

                if (_log.IsDebugEnabled)
                {
                    if (maybeHealedPartitions.Length == 0)
                    {
                        _log.Debug("Out of {0} partitions returned from broker {2}, none of the {3} errored partitions are healed. Current partition states for errored partitions: [{1}]",
                            response.Topics.SelectMany(t => t.Partitions).Count(),
                            string.Join(",", response.Topics
                                .SelectMany(t => t.Partitions.Select(p => new { t.TopicName, TopicErrorCode = t.ErrorCode, PartitionId = p.Id, PartitionErrorCode = p.ErrorCode }))
                                .Where(p => _failedList.ContainsKey(new Tuple<string, int>(p.TopicName, p.PartitionId)))
                                .Select(p => string.Format("{0}:{1}:{2}:{3}", p.TopicName, p.TopicErrorCode, p.PartitionId, p.PartitionErrorCode))),
                            broker,
                            _failedList.Count
                            );
                    }
                    else
                    {
                        var str = new StringBuilder();
                        foreach (var leader in maybeHealedPartitions.GroupBy(p => p.Item3, (i, tuples) => new { Leader = i, Topics = tuples.GroupBy(t => t.Item1) }))
                        {
                            str.AppendFormat(" Leader: {0}\n", leader.Leader);
                            foreach (var topic1 in leader.Topics)
                            {
                                str.AppendFormat("  Topic: {0} ", topic1.Key);
                                str.AppendFormat("[{0}]\n", string.Join(",", topic1.Select(t => t.Item2)));
                            }
                        }
                        _log.Debug("Healed partitions found by broker {0} (will check broker availability):\n{1}", broker, str.ToString());
                    }
                }

                if (EtwTrace.Log.IsEnabled())
                {
                    if (maybeHealedPartitions.Length != 0)
                    {
                        EtwTrace.Log.RecoveryMonitor_PossiblyHealedPartitions(_id, maybeHealedPartitions.Length);
                    }
                    else
                    {
                        EtwTrace.Log.RecoveryMonitor_NoHealedPartitions(_id);
                    }
                }

                //
                // Make sure that brokers for healed partitions are accessible, because it is possible that
                // broker B1 said that partition belongs to B2 and B2 cannot be reached.
                // It is checked only that said broker responds to metadata request without exceptions.
                //
                // Each check is projected to a Task (instead of being launched from an async lambda passed
                // to ForEach) so that the loop can await all of them below.
                //
                var aliveChecks = maybeHealedPartitions.
                    GroupBy(p => p.Item3).
                    Select(async brokerGrp =>
                    {
                        BrokerMeta newBroker;
                        _brokers.TryGetValue(brokerGrp.Key, out newBroker);
                        if (newBroker == null)
                        {
                            newBroker = response.Brokers.SingleOrDefault(b => b.NodeId == brokerGrp.Key);

                            // If Cluster started when one of the brokers was down, and later it comes alive,
                            // it will be missing from our list of brokers. See issue #14.
                            _log.Debug("received MetadataResponse for broker that is not yet in our list: {0}", newBroker);

                            if (newBroker == null)
                            {
                                _log.Error("Got metadata response with partition refering to a broker which is not part of the response: {0}", response.ToString());
                                return;
                            }

                            // Broadcast only newly discovered broker and strip everything else, because this is the only
                            // confirmed data.
                            var filteredMeta = new MetadataResponse
                            {
                                Brokers = new[] { newBroker },
                                Topics = new TopicMeta[] { }
                            };

                            _newMetadataEvent.OnNext(filteredMeta);
                        }

                        try
                        {
                            EtwTrace.Log.RecoveryMonitor_CheckingBrokerAccessibility(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);
                            MetadataResponse response2 = await _protocol.MetadataRequest(new TopicRequest { Topics = brokerGrp.Select(g => g.Item1).Distinct().ToArray() }, newBroker, noTransportErrors: true);
                            EtwTrace.Log.RecoveryMonitor_BrokerIsAccessible(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);

                            // success!
                            // raise new metadata event
                            _log.Info("Alive brokers detected: {0} which responded with: {1}", newBroker, response2);

                            // Join maybe healed partitions with partitions which belong to alive broker
                            var confirmedHealedTopics =
                                (from maybeHealedPartition in brokerGrp
                                 from healedTopic in response2.Topics
                                 where healedTopic.TopicName == maybeHealedPartition.Item1
                                 from healedPart in healedTopic.Partitions
                                 where healedPart.Id == maybeHealedPartition.Item2 && healedPart.Leader == brokerGrp.Key
                                 group healedPart by new { healedTopic.TopicName, healedTopic.ErrorCode } into healedTopicGrp
                                 select healedTopicGrp
                                 );

                            // broadcast only truly healed partitions which belong to alive broker
                            var filteredResponse = new MetadataResponse
                            {
                                Brokers = response2.Brokers, // we may broadcast more than 1 broker, but it should be ok because discovery of new broker metadata does not cause any actions
                                Topics = confirmedHealedTopics.
                                    Where(t => t.Any()). // broadcast only topics which have healed partitions
                                    Select(t => new TopicMeta
                                    {
                                        ErrorCode = t.Key.ErrorCode,
                                        TopicName = t.Key.TopicName,
                                        Partitions = t.ToArray()
                                    }).ToArray()
                            };

                            _log.Debug("Broadcasting filtered response {0}", filteredResponse);
                            if (EtwTrace.Log.IsEnabled())
                            {
                                foreach (var topic in filteredResponse.Topics)
                                {
                                    EtwTrace.Log.RecoveryMonitor_HealedPartitions(_id, newBroker.Host, newBroker.Port, newBroker.NodeId, topic.TopicName, string.Join(",", topic.Partitions.Select(p => p.Id)));
                                }
                            }
                            _newMetadataEvent.OnNext(filteredResponse);
                        }
                        catch (Exception e)
                        {
                            _log.Warn("Metadata points to broker but it is not accessible. Error: {0}", e.Message);
                        }
                    });

                // Wait for every check to finish before starting the next iteration. Otherwise, if a broker
                // does not respond and holds the connection open until the tcp timeout, new checks would be
                // started faster than the old ones time out and pending requests would pile up in memory.
                // ToArray() materializes the query, which is what actually starts the checks.
                await Task.WhenAll(aliveChecks.ToArray());

                await Task.Delay(3000, _cancel);
            }

            _log.Debug("RecoveryLoop exiting. Setting completion");
            EtwTrace.Log.RecoveryMonitor_RecoveryLoopStop(_id);
        }
Exemple #10
0
 /// <summary>
 /// Marks all partitions led by <paramref name="broker"/> as failed with
 /// <c>ErrorCode.TransportError</c>, tracing and announcing each change.
 /// </summary>
 /// <param name="e">Exception that triggered the handling; it is not read here.</param>
 /// <param name="broker">Broker whose partitions (matched by <c>NodeId</c>) are marked.</param>
 private void HandleTransportError(Exception e, BrokerMeta broker)
 {
     _log.Info("Handling TransportError for broker {0}", broker);

     // Every (topic name, partition) pair whose leader is the failing broker.
     var ledByBroker =
         from topicMeta in _metadata.Topics
         from partitionMeta in topicMeta.Partitions
         where partitionMeta.Leader == broker.NodeId
         select new { Name = topicMeta.TopicName, Meta = partitionMeta };

     ledByBroker.ForEach(item =>
     {
         var partition = item.Meta;
         _log.Debug("Marking topic {2} partition {1} with transport error for broker {0}", broker, partition, item.Name);
         partition.ErrorCode = ErrorCode.TransportError;
         EtwTrace.Log.MetadataTransportError(item.Name, _id, partition.Id, partition.Leader);
         _partitionStateChangesSubject.OnNext(new PartitionStateChangeEvent(item.Name, partition.Id, ErrorCode.TransportError));
     });
 }
Exemple #11
0
        /// <summary>
        /// Repeatedly asks <paramref name="broker"/> for metadata of the topics in <c>_failedList</c>
        /// until <c>_cancel</c> is requested. Partitions the broker reports as healthy are then
        /// confirmed by sending a metadata request to their reported leader; confirmed partitions
        /// (and newly discovered brokers) are published through <c>_newMetadataEvent</c>.
        /// </summary>
        /// <param name="broker">Broker that is polled for the state of the failed partitions.</param>
        /// <returns>A task that completes when the loop exits.</returns>
        private async Task RecoveryLoop(BrokerMeta broker)
        {
            _log.Debug("{0} Starting recovery loop on broker: {1}", this, broker);
            EtwTrace.Log.RecoveryMonitor_RecoveryLoopStarted(_id, broker.Host, broker.Port, broker.NodeId);
            while (!_cancel.IsCancellationRequested)
            {
                //_log.Debug("RecoveryLoop iterating {0}", this);

                //
                // Check whether there is any job for the given broker
                //
                if (_failedList.Count == 0)
                {
                    // TODO: await for the list to receive 1st item instead of looping
                    await Task.Delay(1000, _cancel);

                    continue;
                }

                //
                // Query metadata from given broker for any failed topics.
                //
                MetadataResponse response;
                try
                {
                    EtwTrace.Log.RecoveryMonitor_SendingPing(_id, broker.Host, broker.Port);
                    response = await _protocol.MetadataRequest(new TopicRequest { Topics = _failedList.Keys.Select(t => t.Item1).Distinct().ToArray() }, broker, noTransportErrors : true);

                    EtwTrace.Log.RecoveryMonitor_PingResponse(_id, broker.Host, broker.Port);
                }
                catch (Exception ex)
                {
                    _log.Debug("PartitionRecoveryMonitor error. Broker: {0}, error: {1}", broker, ex.Message);
                    EtwTrace.Log.RecoveryMonitor_PingFailed(_id, broker.Host, broker.Port, ex.Message);
                    response = null;
                }

                // A null response means the ping above failed; retry after a short pause.
                if (response == null)
                {
                    await Task.Delay(1000, _cancel);

                    continue;
                }

                //
                // Join failed partitions with successful responses to find out recovered ones
                //
                // Each tuple is (topic name, partition id, leader node id as reported by the broker).
                Tuple <string, int, int>[] maybeHealedPartitions = (
                    from responseTopic in response.Topics
                    from responsePart in responseTopic.Partitions
                    let key = new Tuple <string, int>(responseTopic.TopicName, responsePart.Id)
                              where
                              responseTopic.ErrorCode.IsSuccess() &&
                              responsePart.ErrorCode.IsSuccess() &&
                              _failedList.ContainsKey(key)
                              select Tuple.Create(responseTopic.TopicName, responsePart.Id, responsePart.Leader)
                    ).ToArray();

                // Building the diagnostic text below takes extra LINQ passes, so it is done only when debug logging is on.
                if (_log.IsDebugEnabled)
                {
                    if (maybeHealedPartitions.Length == 0)
                    {
                        _log.Debug("Out of {0} partitions returned from broker {2}, none of the {3} errored partitions are healed. Current partition states for errored partitions: [{1}]",
                                   response.Topics.SelectMany(t => t.Partitions).Count(),
                                   string.Join(",", response.Topics
                                               .SelectMany(t => t.Partitions.Select(p => new { t.TopicName, TopicErrorCode = t.ErrorCode, PartitionId = p.Id, PartitionErrorCode = p.ErrorCode }))
                                               .Where(p => _failedList.ContainsKey(new Tuple <string, int>(p.TopicName, p.PartitionId)))
                                               .Select(p => string.Format("{0}:{1}:{2}:{3}", p.TopicName, p.TopicErrorCode, p.PartitionId, p.PartitionErrorCode))),
                                   broker,
                                   _failedList.Count
                                   );
                    }
                    else
                    {
                        // Render the candidates grouped by leader, then by topic.
                        var str = new StringBuilder();
                        foreach (var leader in maybeHealedPartitions.GroupBy(p => p.Item3, (i, tuples) => new { Leader = i, Topics = tuples.GroupBy(t => t.Item1) }))
                        {
                            str.AppendFormat(" Leader: {0}\n", leader.Leader);
                            foreach (var topic1 in leader.Topics)
                            {
                                str.AppendFormat("  Topic: {0} ", topic1.Key);
                                str.AppendFormat("[{0}]\n", string.Join(",", topic1.Select(t => t.Item2)));
                            }
                        }
                        _log.Debug("Healed partitions found by broker {0} (will check broker availability):\n{1}", broker, str.ToString());
                    }
                }

                if (EtwTrace.Log.IsEnabled())
                {
                    if (maybeHealedPartitions.Length != 0)
                    {
                        EtwTrace.Log.RecoveryMonitor_PossiblyHealedPartitions(_id, maybeHealedPartitions.Length);
                    }
                    else
                    {
                        EtwTrace.Log.RecoveryMonitor_NoHealedPartitions(_id);
                    }
                }

                //
                // Make sure that brokers for healed partitions are accessible, because it is possible that
                // broker B1 said that partition belongs to B2 and B2 cannot be reached.
                // It is checked only that said broker responds to metadata request without exceptions.
                //
                // One check (a Task) is produced per reported leader; the query is lazy and is
                // materialized by the ToArray() call passed to Task.WhenAll below.
                var aliveChecks = maybeHealedPartitions.
                                  GroupBy(p => p.Item3).
                                  Select(async brokerGrp =>
                {
                    BrokerMeta newBroker;
                    _brokers.TryGetValue(brokerGrp.Key, out newBroker);
                    if (newBroker == null)
                    {
                        // Leader is not in _brokers; fall back to the broker list of the response itself.
                        newBroker = response.Brokers.SingleOrDefault(b => b.NodeId == brokerGrp.Key);

                        // If Cluster started when one of the brokers was down, and later it comes alive,
                        // it will be missing from our list of brokers. See issue #14.
                        _log.Debug("received MetadataResponse for broker that is not yet in our list: {0}", newBroker);

                        if (newBroker == null)
                        {
                            _log.Error("Got metadata response with partition refering to a broker which is not part of the response: {0}", response.ToString());
                            // Nothing can be verified for this leader; skip the group.
                            return;
                        }

                        // Broadcast only newly discovered broker and strip everything else, because this is the only
                        // confirmed data.
                        var filteredMeta = new MetadataResponse
                        {
                            Brokers = new[] { newBroker },
                            Topics  = new TopicMeta[] { }
                        };

                        _newMetadataEvent.OnNext(filteredMeta);
                    }

                    try
                    {
                        EtwTrace.Log.RecoveryMonitor_CheckingBrokerAccessibility(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);
                        MetadataResponse response2 = await _protocol.MetadataRequest(new TopicRequest {
                            Topics = brokerGrp.Select(g => g.Item1).Distinct().ToArray()
                        }, newBroker, noTransportErrors: true);
                        EtwTrace.Log.RecoveryMonitor_BrokerIsAccessible(_id, newBroker.Host, newBroker.Port, newBroker.NodeId);

                        // success!
                        // raise new metadata event
                        _log.Info("Alive brokers detected: {0} which responded with: {1}", newBroker, response2);

                        // Join maybe healed partitions with partitions which belong to alive broker
                        var confirmedHealedTopics =
                            (from maybeHealedPartition in brokerGrp
                             from healedTopic in response2.Topics
                             where healedTopic.TopicName == maybeHealedPartition.Item1
                             from healedPart in healedTopic.Partitions
                             where healedPart.Id == maybeHealedPartition.Item2 && healedPart.Leader == brokerGrp.Key
                             group healedPart by new { healedTopic.TopicName, healedTopic.ErrorCode } into healedTopicGrp
                             select healedTopicGrp
                            );


                        // broadcast only truly healed partitions which belong to alive broker
                        var filteredResponse = new MetadataResponse
                        {
                            Brokers = response2.Brokers,   // we may broadcast more than 1 broker, but it should be ok because discovery of new broker metadata does not cause any actions
                            Topics  = confirmedHealedTopics.
                                      Where(t => t.Any()). // broadcast only topics which have healed partitions
                                      Select(t => new TopicMeta
                            {
                                ErrorCode  = t.Key.ErrorCode,
                                TopicName  = t.Key.TopicName,
                                Partitions = t.ToArray()
                            }).ToArray()
                        };

                        _log.Debug("Broadcasting filtered response {0}", filteredResponse);
                        if (EtwTrace.Log.IsEnabled())
                        {
                            foreach (var topic in filteredResponse.Topics)
                            {
                                EtwTrace.Log.RecoveryMonitor_HealedPartitions(_id, newBroker.Host, newBroker.Port, newBroker.NodeId, topic.TopicName, string.Join(",", topic.Partitions.Select(p => p.Id)));
                            }
                        }
                        _newMetadataEvent.OnNext(filteredResponse);
                    }
                    catch (Exception e)
                    {
                        // An unreachable leader is expected here: log it and leave its partitions in the failed state.
                        _log.Warn("Metadata points to broker but it is not accessible. Error: {0}", e.Message);
                    }
                });

                // Wait for all checks to complete, otherwise, if a broker does not respond and hold connection open until tcp timeout,
                // we will keep accumulating responses in memory faster than they time out. See https://github.com/ntent-ad/kafka4net/issues/30
                await Task.WhenAll(aliveChecks.ToArray());

                // Pause before the next polling round.
                await Task.Delay(3000, _cancel);
            }

            _log.Debug("RecoveryLoop exiting. Setting completion");
            EtwTrace.Log.RecoveryMonitor_RecoveryLoopStop(_id);
        }