Пример #1
0
        static void Main(string[] args)
        {
            const int POISON_QUEUE_MAX_TRY = 5;

            Console.WriteLine("====== WORKER ======");

            using var fromFrontQueue = new PullSocket($">{ThreadSafeFactory.FrontQueue}");
            using var toBackQueue    = new PushSocket($">{ThreadSafeFactory.BackUrlQueue}");
            using var toPoisonQueue  = new PushSocket($">{ThreadSafeFactory.BackPosionQueue}");

            Console.WriteLine("Initialized Socket");
            //process tasks forever
            // Background pump: receives serialized items from the front-queue socket,
            // decodes them and appends them to _urlQueue. Runs until the process exits.
            var frontQueueProcessor = Task.Run(() =>
            {
                for (;;)
                {
                    try
                    {
                        Console.WriteLine("Waiting for front queue");
                        string payload = fromFrontQueue.ReceiveFrameString();
                        var item = payload.FromJson<UrlQueueItem>();
                        Console.WriteLine($"Request from frontqueue {item.Url}");
                        _urlQueue.Enqueue(item);
                    }
                    catch (Exception failure)
                    {
                        // Any failure while receiving or decoding one message is reported
                        // and the loop moves on to the next message.
                        Console.WriteLine($"Error in Front Queue Processor {failure.Message}");
                    }
                }
            });

            var processUrlTask = Task.Run(() =>
            {
                while (true)
                {
                    // One pass over the work queue: each iteration dequeues at most one URL,
                    // extracts its child links and forwards the not-yet-seen ones to the back queue.
                    Parallel.For(0, 2 * maxThread / 3, (threadIndex) =>
                    {
                        // Nothing queued for this iteration; the enclosing loop schedules another pass.
                        if (!_urlQueue.TryDequeue(out UrlQueueItem url))
                        {
                            return;
                        }

                        Console.WriteLine($"Processing {url.Url}");
                        url.CrawlStart = DateTime.UtcNow;
                        try
                        {
                            foreach (var childUrl in _htmlRenderer.FindChildLinks(url))
                            {
                                // Use TryAdd's result as the single "first time seen" check instead of
                                // ContainsKey followed by TryAdd: with the two-step form, two parallel
                                // iterations could both pass the check and send the same URL twice.
                                // (Assumes _visitedUrls is a ConcurrentDictionary - confirm at its declaration.)
                                if (!_visitedUrls.TryAdd(childUrl, false))
                                {
                                    continue;
                                }

                                Console.WriteLine($"Sending to backqueue");
                                // NetMQ sockets must not be used from several threads at once, and this
                                // lambda runs concurrently, so the send is serialized on the socket.
                                lock (toBackQueue)
                                {
                                    toBackQueue.SendFrame(childUrl);
                                }
                                Console.WriteLine($"Sent to backqueue");
                            }
                        }
                        catch (Exception ex)
                        {
                            // Log first so the failure is reported even if building the poison item fails.
                            Console.WriteLine($"Error in Url Processor {ex.Message}");

                            // Clone() may not yield a UrlPosionQueueItem; the previous unchecked `as` cast
                            // would then throw a NullReferenceException from inside this handler and
                            // fault the whole processing task.
                            if (url.Clone() is UrlPosionQueueItem poisoned)
                            {
                                poisoned.TryCount      = 0;
                                poisoned.CrawlFinished = DateTime.UtcNow;
                                poisoned.Errors        = new List<string>
                                {
                                    ex.Message + "|" + ex.InnerException?.Message
                                };
                                _urlPoisonQueue.Enqueue(poisoned);
                            }
                            else
                            {
                                Console.WriteLine($"Could not convert {url.Url} to a poison queue item; it was not queued for retry");
                            }
                        }
                        url.CrawlFinished = DateTime.UtcNow;
                    });
                }
Пример #2
0
        static void Main(string[] seed)
        {
            Console.WriteLine("====== FRONT QUEUE ======");
            using var toWorkers     = new PushSocket($"@{ThreadSafeFactory.FrontQueue}");
            using var fromBackQueue = new PullSocket($">{ThreadSafeFactory.BackQueue}");

            Console.WriteLine("Socket initialized");

            // Prime the work queue with every URL passed in through Main's arguments.
            foreach (string seedUrl in seed)
            {
                var seedItem = new UrlQueueItem { Url = seedUrl };
                _urlQueue.Enqueue(seedItem);
            }
            // Background task: receives URLs from the back-queue socket and enqueues those
            // that have not already been handed to a worker.
            var backQueueProcessor = Task.Run(() =>
            {
                // Receive from backqueue
                while (true)
                {
                    Console.WriteLine($"Waiting for backqueue");
                    var workload = fromBackQueue.ReceiveFrameString();
                    Console.WriteLine($"Got message from backqueue");
                    UrlQueueItem queueItem = new UrlQueueItem
                    {
                        Url = workload
                    };
                    // _urlVisiting is populated with lower-cased URLs (url.Url.ToLower() in the
                    // sending task), so the lookup key must be normalized the same way; looking up
                    // the raw URL never matched entries whose URL contained upper-case characters.
                    if (!_urlVisiting.ContainsKey(queueItem.Url.ToLower()))
                    {
                        _urlQueue.Enqueue(queueItem);
                    }
                    else
                    {
                        Console.WriteLine($"Url {queueItem.Url} already sent to processing");
                    }
                }
            });

            // Background task: repeatedly drains _urlQueue in parallel and pushes each URL the
            // politeness policy allows to the workers, recording it in _urlVisiting afterwards.
            var frontQueueProcessor = Task.Run(() =>
            {
                // send to worker
                while (true)
                {
                    Parallel.For(0, maxThread, (threadIndex) =>
                    {
                        // Nothing queued for this iteration; the enclosing loop schedules another pass.
                        if (!_urlQueue.TryDequeue(out UrlQueueItem url))
                        {
                            return;
                        }

                        // NOTE(review): a URL the policy rejects has already been dequeued and is
                        // dropped here (unchanged from the previous behavior) - confirm that is intended.
                        if (!_politenessPolicy.CanIGoThere(url, out long crawlDelay))
                        {
                            return;
                        }

                        // Task.Delay treats -1 as "wait forever" and throws on other negative values,
                        // so only wait when the policy reports a positive delay.
                        if (crawlDelay > 0)
                        {
                            Task.Delay((int)crawlDelay).Wait();
                        }

                        url.QueuedOn = DateTime.UtcNow;
                        Console.WriteLine($"Sending to worker [{url.Url}]");
                        // NetMQ sockets must not be used from several threads at once, and this
                        // lambda runs concurrently, so the send is serialized on the socket.
                        lock (toWorkers)
                        {
                            toWorkers.SendFrame(url.ToJson());
                        }
                        Console.WriteLine($"Sent to worker");
                        _urlVisiting.AddOrUpdate(url.Url.ToLower(), false, (key, val) => false);
                    });
                }
            });