Beispiel #1
0
        /// <summary>
        /// Sends the web request described by <paramref name="resourceLink"/> and collects the resulting content units.
        /// </summary>
        /// <param name="resourceLink">Link to crawl; supplies the request URI, the request set-up and the response handling.</param>
        /// <param name="processResponse">
        /// When true the response is handed to <c>ProcessWebResponseAsync</c>; otherwise it is read with <c>ReadResponseStringAsync</c>.
        /// </param>
        /// <returns>
        /// The content units produced from the response (none if the request did not run to completion), followed by an
        /// <c>HttpResultUnit</c> describing the request outcome as the last element.
        /// </returns>
        /// <exception cref="System.NotSupportedException">
        /// The link's URI does not yield an <see cref="HttpWebRequest"/> (surfaced through the returned task).
        /// </exception>
        public static async Task <IList <ResourceContentUnit> > CrawlAsync(ResourceLink resourceLink, bool processResponse = true)
        {
            var webRequest = WebRequest.Create(resourceLink.Uri) as HttpWebRequest;
            if (webRequest == null)
            {
                // WebRequest.Create returns non-HTTP request types for schemes such as file:// or ftp://;
                // fail with a clear message instead of a NullReferenceException further down.
                throw new System.NotSupportedException($"CrawlAsync: only HTTP(S) resource links are supported, got {resourceLink.Uri}");
            }

            resourceLink.SetUpWebRequest(webRequest);

            // ContinueWith (rather than a plain await) keeps the request failure available as the task's
            // AggregateException. The scheduler is given explicitly so the continuation does not depend on
            // whatever TaskScheduler.Current happens to be for the caller.
            return await webRequest.GetResponseAsync()
                .ContinueWith(async webResponseTask =>
                {
                    var result = new List<ResourceContentUnit>();

                    // Exception is null when the request succeeded.
                    var httpResultUnit = new HttpResultUnit
                    {
                        RequestUrl = resourceLink.Url,
                        Exception = webResponseTask.Exception
                    };

                    if (webResponseTask.Status == TaskStatus.RanToCompletion)
                    {
                        // The request is an HttpWebRequest, so its response is an HttpWebResponse.
                        using (var webResponse = (HttpWebResponse)webResponseTask.Result)
                        {
                            webRequest.FixCookies(webResponse);

                            // Use job-based crawling state, if crawling is based off a job. Otherwise, use config-based.
                            var crawlingState = resourceLink.Job?.CrawlingState ?? resourceLink.Config.CrawlingState;
                            crawlingState.Cookies = webResponse.Cookies;

                            httpResultUnit.ResponseUrl = webResponse.ResponseUri.ToString();
                            httpResultUnit.ContentEncoding = webResponse.ContentEncoding;
                            httpResultUnit.ContentLength = webResponse.ContentLength;
                            httpResultUnit.ContentType = webResponse.ContentType;
                            httpResultUnit.Cookies = webResponse.Cookies;
                            httpResultUnit.Headers = webResponse.Headers;
                            httpResultUnit.HttpStatus = webResponse.StatusCode;
                            httpResultUnit.HttpStatusDescription = webResponse.StatusDescription;

                            if (processResponse)
                            {
                                result.AddRange(await resourceLink.ProcessWebResponseAsync(webResponse));
                            }
                            else
                            {
                                result.Add(await resourceLink.ReadResponseStringAsync(webResponse));
                            }
                        }
                    }
                    else
                    {
                        Trace.TraceError("CrawlAsync.GetWebResponse: Failed for queue item {0} with exception {1}", resourceLink, webResponseTask.Exception);
                    }

                    // The HTTP outcome is always reported, even when the request failed.
                    result.Add(httpResultUnit);

                    return result;
                }, TaskScheduler.Default)
                .Unwrap();
        }
Beispiel #2
0
 /// <summary>
 /// Forwards the file-downloaded notification to the interceptors through <c>ExecuteOnAllInterceptorsAsync</c>.
 /// </summary>
 /// <returns>The task produced by <c>ExecuteOnAllInterceptorsAsync</c> for this notification.</returns>
 public override Task <bool> OnFileDownloadedAsync(FileLink fileResourceLink, DownloadedFilesUnit downloadedFileUnit, HttpResultUnit httpResultUnit)
     => ExecuteOnAllInterceptorsAsync(
         interceptor => interceptor.OnFileDownloadedAsync(fileResourceLink, downloadedFileUnit, httpResultUnit));
Beispiel #3
0
 /// <summary>
 /// Forwards the data-document-downloaded notification to the interceptors through <c>ExecuteOnAllInterceptorsAsync</c>.
 /// </summary>
 /// <returns>The task produced by <c>ExecuteOnAllInterceptorsAsync</c> for this notification.</returns>
 public override Task <bool> OnDataDocumentDownloadedAsync(ResourceLink resourceLink, ExtractedDataUnit extractedDataUnit, HttpResultUnit httpResultUnit)
     => ExecuteOnAllInterceptorsAsync(
         interceptor => interceptor.OnDataDocumentDownloadedAsync(resourceLink, extractedDataUnit, httpResultUnit));
Beispiel #4
0
 /// <summary>
 /// Default handler for a downloaded file: performs no work and reports success. Overridable.
 /// </summary>
 /// <returns>True - if file download result was successfully processed, False - download result processing failed (should either re-enqueue, trace and continue or do something else)</returns>
 public virtual Task <bool> OnFileDownloadedAsync(FileLink fileResourceLink, DownloadedFilesUnit downloadedFileUnit, HttpResultUnit httpResultUnit)
 {
     // Nothing is awaited here, so hand back an already-completed task instead of
     // compiling an async state machine (which also triggers warning CS1998).
     return Task.FromResult(true);
 }
Beispiel #5
0
 /// <summary>
 /// Default handler for a downloaded data document: performs no work and reports success. Overridable.
 /// </summary>
 /// <returns>
 ///     True - if download result was successfully processed
 ///     False - download result processing failed (should either re-enqueue, trace and continue or use some kind of error queue)
 ///
 ///     NOTE: If there is a sequence of failed processing results - it's a sign that we should stop crawling this job (either duplicates, or site is down, or we got blocked)
 /// </returns>
 public virtual Task <bool> OnDataDocumentDownloadedAsync(ResourceLink resourceLink, ExtractedDataUnit extractedDataUnit, HttpResultUnit httpResultUnit)
 {
     // Nothing is awaited here, so hand back an already-completed task instead of
     // compiling an async state machine (which also triggers warning CS1998).
     return Task.FromResult(true);
 }