/// <summary>
/// Sends the HTTP request described by <paramref name="resourceLink"/> and turns the outcome into a list of content units.
/// </summary>
/// <param name="resourceLink">Link that supplies the target URI, configures the request and processes the response.</param>
/// <param name="processResponse">
/// When <c>true</c> the response is passed to <c>ProcessWebResponseAsync</c>;
/// when <c>false</c> it is passed to <c>ReadResponseStringAsync</c> instead.
/// </param>
/// <returns>
/// The units produced from the response (none if the request did not run to completion), always followed by
/// an <c>HttpResultUnit</c> as the last element. That unit carries the request URL, the response metadata when
/// a response was received, and the request task's exception (if any).
/// </returns>
/// <exception cref="NotSupportedException">The link's URI does not produce an <see cref="HttpWebRequest"/>.</exception>
public static async Task<IList<ResourceContentUnit>> CrawlAsync(ResourceLink resourceLink, bool processResponse = true)
{
    // WebRequest.Create picks the request type from the URI scheme; anything that is not an HttpWebRequest
    // used to surface later as a NullReferenceException, so reject it here with a descriptive error.
    var webRequest = WebRequest.Create(resourceLink.Uri) as HttpWebRequest;
    if (webRequest == null)
    {
        throw new NotSupportedException(
            string.Format("CrawlAsync supports only HTTP(S) resources, but was given '{0}'", resourceLink.Uri));
    }

    resourceLink.SetUpWebRequest(webRequest);

    // The continuation runs whether the request succeeded or not, so a failed request is reported
    // through the HttpResultUnit rather than by faulting the returned task.
    return await System.Threading.Tasks.TaskExtensions.Unwrap(
        webRequest.GetResponseAsync().ContinueWith(async webResponseTask =>
        {
            var result = new List<ResourceContentUnit>();
            var httpResultUnit = new HttpResultUnit
            {
                RequestUrl = resourceLink.Url,
                Exception = webResponseTask.Exception
            };

            if (webResponseTask.Status == TaskStatus.RanToCompletion)
            {
                // An HttpWebRequest yields an HttpWebResponse, so a direct cast states the expectation
                // and fails loudly if it is ever violated.
                using (var webResponse = (HttpWebResponse)webResponseTask.Result)
                {
                    webRequest.FixCookies(webResponse);

                    // Use job-based crawling state, if crawling is based off a job. Otherwise, use config-based.
                    var crawlingState = resourceLink.Job?.CrawlingState ?? resourceLink.Config.CrawlingState;
                    crawlingState.Cookies = webResponse.Cookies;

                    // Copy the response metadata while the response is still open.
                    httpResultUnit.ResponseUrl = webResponse.ResponseUri.ToString();
                    httpResultUnit.ContentEncoding = webResponse.ContentEncoding;
                    httpResultUnit.ContentLength = webResponse.ContentLength;
                    httpResultUnit.ContentType = webResponse.ContentType;
                    httpResultUnit.Cookies = webResponse.Cookies;
                    httpResultUnit.Headers = webResponse.Headers;
                    httpResultUnit.HttpStatus = webResponse.StatusCode;
                    httpResultUnit.HttpStatusDescription = webResponse.StatusDescription;

                    if (processResponse)
                    {
                        result.AddRange(await resourceLink.ProcessWebResponseAsync(webResponse));
                    }
                    else
                    {
                        result.Add(await resourceLink.ReadResponseStringAsync(webResponse));
                    }
                }
            }
            else
            {
                Trace.TraceError("CrawlAsync.GetWebResponse: Failed for queue item {0} with exception {1}", resourceLink, webResponseTask.Exception);
            }

            // The HTTP result unit is appended last, after any content units.
            result.Add(httpResultUnit);
            return result;
        }));
}
/// <summary>
/// Hands the file-downloaded event to <c>ExecuteOnAllInterceptorsAsync</c>, supplying a callback that calls
/// <c>OnFileDownloadedAsync</c> on an interceptor with the same three arguments.
/// </summary>
/// <param name="fileResourceLink">Passed through unchanged to the interceptor callback.</param>
/// <param name="downloadedFileUnit">Passed through unchanged to the interceptor callback.</param>
/// <param name="httpResultUnit">Passed through unchanged to the interceptor callback.</param>
/// <returns>The task returned by <c>ExecuteOnAllInterceptorsAsync</c>.</returns>
public override Task<bool> OnFileDownloadedAsync(FileLink fileResourceLink, DownloadedFilesUnit downloadedFileUnit, HttpResultUnit httpResultUnit)
    => ExecuteOnAllInterceptorsAsync(interceptor =>
        interceptor.OnFileDownloadedAsync(fileResourceLink, downloadedFileUnit, httpResultUnit));
/// <summary>
/// Hands the data-document-downloaded event to <c>ExecuteOnAllInterceptorsAsync</c>, supplying a callback that calls
/// <c>OnDataDocumentDownloadedAsync</c> on an interceptor with the same three arguments.
/// </summary>
/// <param name="resourceLink">Passed through unchanged to the interceptor callback.</param>
/// <param name="extractedDataUnit">Passed through unchanged to the interceptor callback.</param>
/// <param name="httpResultUnit">Passed through unchanged to the interceptor callback.</param>
/// <returns>The task returned by <c>ExecuteOnAllInterceptorsAsync</c>.</returns>
public override Task<bool> OnDataDocumentDownloadedAsync(ResourceLink resourceLink, ExtractedDataUnit extractedDataUnit, HttpResultUnit httpResultUnit)
    => ExecuteOnAllInterceptorsAsync(interceptor =>
        interceptor.OnDataDocumentDownloadedAsync(resourceLink, extractedDataUnit, httpResultUnit));
/// <summary>
/// Default file-downloaded handler: performs no work and reports success. Override to process the download.
/// </summary>
/// <param name="fileResourceLink">Ignored by this default implementation.</param>
/// <param name="downloadedFileUnit">Ignored by this default implementation.</param>
/// <param name="httpResultUnit">Ignored by this default implementation.</param>
/// <returns>
/// True - if file download result was successfully processed,
/// False - download result processing failed (should either re-enqueue, trace and continue or do something else).
/// This default implementation always completes with True.
/// </returns>
public virtual Task<bool> OnFileDownloadedAsync(FileLink fileResourceLink, DownloadedFilesUnit downloadedFileUnit, HttpResultUnit httpResultUnit)
{
    // Nothing is awaited, so return an already-completed task instead of an 'async' method
    // that triggers CS1998 and builds a state machine for no benefit.
    return Task.FromResult(true);
}
/// <summary>
/// Default data-document-downloaded handler: performs no work and reports success. Override to process the download.
/// </summary>
/// <param name="resourceLink">Ignored by this default implementation.</param>
/// <param name="extractedDataUnit">Ignored by this default implementation.</param>
/// <param name="httpResultUnit">Ignored by this default implementation.</param>
/// <returns>
/// True - if download result was successfully processed
/// False - download result processing failed (should either re-enqueue, trace and continue or use some kind of error queue)
///
/// NOTE: If there is a sequence of failed processing results - it's a sign that we should stop crawling this job
/// (either duplicates, or site is down, or we got blocked)
///
/// This default implementation always completes with True.
/// </returns>
public virtual Task<bool> OnDataDocumentDownloadedAsync(ResourceLink resourceLink, ExtractedDataUnit extractedDataUnit, HttpResultUnit httpResultUnit)
{
    // Nothing is awaited, so return an already-completed task instead of an 'async' method
    // that triggers CS1998 and builds a state machine for no benefit.
    return Task.FromResult(true);
}