/// <summary>
/// Notifies the item's dequeue callback and dispatches the item to the
/// processing routine that matches its <c>ScraperType</c>. Any exception raised
/// along the way is forwarded to <c>ExceptionHandlerOnDownloadData</c>.
/// </summary>
/// <param name="runningId">Identifier passed through to the processing and error-handling routines.</param>
/// <param name="item">The wrapped scraper item to process.</param>
/// <param name="retryCount">Number of attempts already made for this item.</param>
private void GetDataFromWebServer(string runningId, ScraperDataWrapper item, int retryCount = 0)
{
    try
    {
        // Build the response object handed to callbacks and processors.
        var dequeuedResponse = ScrapperMapper.ToResponse(item);

        // Raise the dequeue event, if anyone subscribed.
        var dequeueHandler = item.OnDequeue;
        if (dequeueHandler != null)
        {
            dequeueHandler.Invoke(dequeuedResponse);
        }

        var kind = item.ScraperType;
        if (kind == ScraperType.String)
        {
            ProcessAsHtml(runningId, item, dequeuedResponse, retryCount);
        }
        else if (kind == ScraperType.Binary)
        {
            ProcessAsBinary(runningId, item, dequeuedResponse, retryCount);
        }
        else
        {
            // Thrown inside the try on purpose: it is routed to the handler below.
            throw new Exception("ScraperType " + item.ScraperType + " not valid");
        }
    }
    catch (Exception failure)
    {
        ExceptionHandlerOnDownloadData(failure, item, runningId, retryCount);
    }
}
/// <summary>
/// Handles an exception raised while downloading data for <paramref name="item"/>.
/// A <see cref="WebException"/> is retried (after a 2 second pause) while
/// <paramref name="retryCount"/> is below <c>MaxRetryCount</c>. Any other exception,
/// or a <see cref="WebException"/> once retries are exhausted, is reported through
/// <c>item.OnThrownException</c> (when set) and the item is then removed from the
/// running collection.
/// </summary>
/// <param name="ex">The exception that was caught.</param>
/// <param name="item">The item being processed when the exception occurred.</param>
/// <param name="runningId">Key of the item in the running collection.</param>
/// <param name="retryCount">Number of retries already performed for this item.</param>
private void ExceptionHandlerOnDownloadData(Exception ex, ScraperDataWrapper item, string runningId, int retryCount)
{
    // Network failure with retry budget left: wait, then try the download again.
    if (ex is WebException && retryCount < MaxRetryCount)
    {
        Thread.Sleep(2000);
        GetDataFromWebServer(runningId, item, retryCount + 1);
        return;
    }

    // Terminal failure: report it to the subscriber, if there is one.
    var response = ScrapperMapper.ToResponse(item);
    response.Exception = ex;

    try
    {
        item.OnThrownException?.Invoke(response);
    }
    finally
    {
        // Always release the running slot. Previously this only happened when an
        // OnThrownException handler was registered, so items without a handler
        // were never removed from the running collection after a failure.
        RemoveItemFromRunningCollection(item, runningId);
    }
}
/// <summary>
/// Enqueues an item to be scraped. Items are grouped in one queue per domain
/// (the URL authority); when no queue exists yet for the item's domain, a new
/// queue is created and a consumer task is started for it.
/// </summary>
/// <param name="data">Item to scrape.</param>
/// <exception cref="ArgumentNullException">If <paramref name="data"/> is null.</exception>
/// <exception cref="ArgumentException">
/// If Url or OnDataArrived is not provided, or Url is not a valid absolute URL.
/// </exception>
public void Enqueue(ScraperData data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (string.IsNullOrWhiteSpace(data.Url))
    {
        throw new ArgumentException("URL is required.");
    }

    if (data.OnDataArrived == null)
    {
        throw new ArgumentException("OnDataArrived is required.");
    }

    // A relative URI parses successfully but has no authority: reading
    // Uri.Authority on it throws InvalidOperationException, so reject it here.
    Uri uri;
    if (!Uri.TryCreate(data.Url, UriKind.RelativeOrAbsolute, out uri) || !uri.IsAbsoluteUri)
    {
        throw new ArgumentException($"URL '{data.Url}' is invalid", nameof(data));
    }

    // The domain is the queue key; lower-case it culture-independently.
    var domain = uri.Authority.ToLowerInvariant();

    // If Enqueue were called in parallel without a lock, several consumer
    // tasks could be started for the same domain. The lock prevents that.
    lock (LockerObj)
    {
        var wrapper = ScrapperMapper.ToWrapper(data, domain, uri);

        // Reuse the existing queue for this domain when there is one.
        ConcurrentQueue<ScraperDataWrapper> existingQueue;
        if (Queues.TryGetValue(domain, out existingQueue))
        {
            existingQueue.Enqueue(wrapper);
            return;
        }

        var newQueue = new ConcurrentQueue<ScraperDataWrapper>();
        newQueue.Enqueue(wrapper);

        if (!Queues.TryAdd(domain, newQueue) && !Queues.ContainsKey(domain))
        {
            throw new Exception("Unexpected error when try to create a new Queue for domain " + domain);
        }

        // Start a consumer for the new queue and keep track of its task.
        var consumerTask = Task.Factory.StartNew(() => ConsumeFromQueue(domain, newQueue));

        if (!_queueThreads.TryAdd(domain, consumerTask) && !_queueThreads.ContainsKey(domain))
        {
            throw new Exception("Unexpected error when try to add a task of queue on QueueThreads for domain " + domain);
        }
    }
}
/// <summary>
/// Removes the entry stored under <paramref name="key"/> from the running
/// collection. When removal fails, the attempt is repeated until
/// <paramref name="retryCount"/> reaches <c>MaxRetryCount</c>; after that the
/// failure is reported through <c>item.OnThrownException</c> (when set).
/// </summary>
/// <param name="item">The item whose running entry is being removed.</param>
/// <param name="key">Key of the entry in the running collection.</param>
/// <param name="retryCount">Number of removal attempts already made.</param>
private void RemoveItemFromRunningCollection(ScraperData item, string key, int retryCount = 0)
{
    var failureResponse = ScrapperMapper.ToResponse(item);

    string removedValue;
    if (Running.TryRemove(key, out removedValue))
    {
        // Entry removed; nothing else to do.
        return;
    }

    if (retryCount >= MaxRetryCount)
    {
        // Out of attempts: surface the failure to the subscriber.
        failureResponse.Exception = new Exception("The scraper data response cannot be deleted from running collection.");
        item.OnThrownException?.Invoke(failureResponse);
        return;
    }

    RemoveItemFromRunningCollection(item, key, retryCount + 1);
}