/// <summary>
/// Runs one crawler slot: creates a <c>RotManager</c> for index <paramref name="i"/>, waits for it
/// to start, then repeatedly dequeues URLs and crawls them through a <c>ProxyManager</c> created
/// for the same index.
/// </summary>
/// <remarks>
/// The loop ends when cancellation is requested, when <c>rot.IsProcessOk()</c> returns false, or
/// after <c>maxCallPerTask</c> iterations (iterations that found an empty queue count as well).
/// <see cref="OperationCanceledException"/> is swallowed; any other exception is traced and not rethrown.
/// </remarks>
/// <param name="i">Slot index passed to the <c>RotManager</c> and <c>ProxyManager</c> constructors.</param>
/// <param name="cancellationToken">Token observed between steps and forwarded to every awaited call.</param>
private static async Task RunTask(int i, CancellationToken cancellationToken)
{
    try
    {
        //await RotManager.WaitFreePort(RotManager.Rotrc0Port + i, cancellationToken);
        using (RotManager rot = new RotManager(i)) // the same instance is kept for the whole batch of requests
        {
            // Crawling must not begin before the process behind the RotManager has started.
            await rot.WaitStartAsync(cancellationToken);

            // The proxy shares the RotManager's lifetime so both are recycled together.
            using (ProxyManager proxy = new ProxyManager(i))
            using (SqlManager sql = new SqlManager())
            {
                // maxCallPerTask bounds how many requests are made per RotManager instance
                // (original note: beyond some number of requests, different URLs start returning the same result).
                for (int callCount = 0;
                     !cancellationToken.IsCancellationRequested && rot.IsProcessOk() && callCount < maxCallPerTask;
                     callCount++)
                {
                    string nextUrl = await sql.CrawleRequestDequeueAsync(cancellationToken);

                    if (cancellationToken.IsCancellationRequested)
                    {
                        // The loop condition terminates the loop on the next evaluation.
                        continue;
                    }

                    if (string.IsNullOrEmpty(nextUrl))
                    {
                        // Empty queue: back off for one second before polling again.
                        await Task.Delay(1000, cancellationToken);
                        continue;
                    }

                    PerfCounter.CounterCrawleStarted.Increment();

                    if (await CrawleManager.CrawleOneAsync(proxy, nextUrl, sql, cancellationToken))
                    {
                        PerfCounter.CounterCrawleValided.Increment();
                    }
                    else if (!cancellationToken.IsCancellationRequested)
                    {
                        // Crawl failed without cancellation: put the URL back in the queue.
                        // NOTE(review): the original comment mentioned "P5" while the argument is 6;
                        // presumably this is a priority value - confirm against SqlManager.
                        await sql.CrawleRequestEnqueueAsync(nextUrl, 6, cancellationToken);
                    }
                }
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Cancellation is the normal way for this task to stop.
    }
    catch (Exception ex)
    {
        string details = ex.GetBaseException().ToString();
        Trace.TraceError("WorkerRole.RunTask Exception : " + details);
#if DEBUG
        if (Debugger.IsAttached)
        {
            Debugger.Break();
        }
#endif
    }
}
/// <summary>
/// Web role main loop: creates three <c>RotManager</c> instances (indices 0, 1 and 2), waits for
/// each of them to start, then checks every 30 seconds that all three still report
/// <c>IsProcessOk()</c>.
/// </summary>
/// <remarks>
/// Returns when cancellation is requested or when any instance stops reporting OK.
/// <see cref="OperationCanceledException"/> is swallowed; any other exception is traced and not rethrown.
/// </remarks>
/// <param name="cancellationToken">Token that stops the loop and is forwarded to every awaited call.</param>
protected override async Task RunAsync(CancellationToken cancellationToken)
{
    Trace.TraceInformation("WebRole.RunAsync : Start");
    RotManager.TryKillTorIfRequired();

    try
    {
        using (RotManager first = new RotManager(0))
        using (RotManager second = new RotManager(1))
        using (RotManager third = new RotManager(2))
        {
            // Startup is awaited sequentially, in index order.
            await first.WaitStartAsync(cancellationToken);
            await second.WaitStartAsync(cancellationToken);
            await third.WaitStartAsync(cancellationToken);

            // main loop
            while (true)
            {
                bool keepRunning = !cancellationToken.IsCancellationRequested
                    && first.IsProcessOk()
                    && second.IsProcessOk()
                    && third.IsProcessOk();
                if (!keepRunning)
                {
                    break;
                }

                await Task.Delay(30000, cancellationToken); // 30 s between health checks
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Cancellation is the normal way for the role to stop.
    }
    catch (Exception ex)
    {
        string details = ex.GetBaseException().ToString();
        Trace.TraceError("WebRole.RunAsync Exception : " + details);
#if DEBUG
        if (Debugger.IsAttached)
        {
            Debugger.Break();
        }
#endif
    }

    Trace.TraceInformation("WebRole.RunAsync : End");
}
/// <summary>
/// Worker role main loop: initialises counters and the crawler task pool, then keeps every pool
/// slot populated with a running <c>RunTask</c> until cancellation is requested.
/// </summary>
/// <remarks>
/// <para>
/// The pool holds <c>Settings.Default.NbCrawlersPerInstance</c> slots (half of that in DEBUG builds).
/// Once per outer iteration every finished slot is disposed and restarted, with a one second pause
/// after each start; the loop then sleeps for one minute.
/// </para>
/// <para>
/// Every <c>Settings.Default.GCFullCollectMin</c> outer iterations a compacting collection of the
/// large object heap is forced.
/// </para>
/// <para>
/// <see cref="OperationCanceledException"/> and <see cref="AggregateException"/> are swallowed; any
/// other exception is traced and not rethrown.
/// </para>
/// </remarks>
/// <param name="cancellationToken">Token that stops the loop and is forwarded to the crawler tasks.</param>
protected override async Task RunAsync(CancellationToken cancellationToken)
{
    Trace.TraceInformation("WorkerRole.RunAsync : Start");
    try
    {
        PerfCounter.Init();
        RotManager.TryKillTorIfRequired();

        // pools init
#if DEBUG
        taskPool = new List<Task>(Settings.Default.NbCrawlersPerInstance / 2);
#else
        taskPool = new List<Task>(Settings.Default.NbCrawlersPerInstance);
#endif
        // Fill every slot with null so slots can be addressed by index below.
        for (int i = 0; i < taskPool.Capacity; i++)
        {
            taskPool.Add(null);
        }

        // NormalizeUrl Init
        using (SqlManager sql = new SqlManager())
        {
            UriManager.NormalizeUrlInit(sql);
        }

        int gCFullCollectRemainingMin = Settings.Default.GCFullCollectMin;

        // The default value Int32.MaxValue is far too high: a call stack exception is raised long before (around 43k).
        HtmlDocument.MaxDepthLevel = Int16.MaxValue;

        // main loop
        while (!cancellationToken.IsCancellationRequested)
        {
            for (int i = 0; !cancellationToken.IsCancellationRequested && i < taskPool.Count; i++)
            {
                Task task = taskPool[i];

                // IsCompleted is true for every final state (ran to completion, faulted or cancelled).
                if (task != null && task.IsCompleted)
                {
                    task.Dispose();
                    taskPool[i] = null;
                }

                if (taskPool[i] == null && !cancellationToken.IsCancellationRequested)
                {
                    // Copy the index: the lambda must capture a per-iteration value, not the loop variable.
                    int slot = i;

                    // Pass the async delegate itself so the returned task completes when RunTask does,
                    // without blocking a thread-pool thread in Wait() for the crawler's whole lifetime.
                    taskPool[i] = Task.Run(() => RunTask(slot, cancellationToken), cancellationToken);

                    // Stagger the starts so that all crawler processes are not launched at the same instant.
                    await Task.Delay(1000, cancellationToken);
                }
            }

            await Task.Delay(60000, cancellationToken); // 1 min

            // Many large objects are used for a short time, so the large object heap is compacted periodically.
            gCFullCollectRemainingMin--;
            if (!cancellationToken.IsCancellationRequested && gCFullCollectRemainingMin == 0)
            {
                gCFullCollectRemainingMin = Settings.Default.GCFullCollectMin;
                // https://msdn.microsoft.com/en-us/library/system.runtime.gcsettings.largeobjectheapcompactionmode(v=vs.110).aspx
                GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
                GC.Collect(); // takes some time; frees RAM and reduces large object heap fragmentation
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Cancellation is the normal way for the role to stop.
    }
    catch (AggregateException)
    {
        // Deliberately ignored, as in the original implementation.
    }
    catch (Exception ex)
    {
        Trace.TraceError("WorkerRole.RunAsync Exception : " + ex.GetBaseException().ToString());
#if DEBUG
        if (Debugger.IsAttached)
        {
            Debugger.Break();
        }
#endif
    }

    Trace.TraceInformation("WorkerRole.RunAsync : End");
}