/// <summary>
/// Transfers the downloaded content onto <paramref name="page"/> and, when the
/// content is HTML, parses it into a document and collects its links.
/// </summary>
/// <param name="downloaderResult">Download result whose content is copied to the page.</param>
/// <param name="page">Page that receives the raw bytes, the HTML text, the parsed document and the links.</param>
/// <param name="crawlerTask">Task whose <c>BaseUri</c> is handed to the link manager.</param>
/// <returns>
/// <c>true</c> when the content is HTML and parsing plus link extraction finished
/// without an exception; otherwise <c>false</c>.
/// </returns>
private bool ParseDownloadedPage(DownloaderResult downloaderResult, CrawlerPage page, CrawlerTask crawlerTask)
{
    var content = downloaderResult.Content;

    page.Data = content.Bytes;
    page.IsHtml = content.IsHtmlContent;
    page.Html = content.HtmlText;

    if (page.IsHtml)
    {
        try
        {
            // The document is attached to the page before loading, so it stays
            // assigned even if loading or link extraction fails.
            var document = new HtmlDocument();
            page.HtmlDoc = document;
            document.LoadHtml(content.HtmlText);

            page.Links = _webPageLinkManager.GetAllLinks(crawlerTask.BaseUri, document);
            return true;
        }
        catch (Exception parseError)
        {
            // Parsing problems are reported as a warning and turned into a "false" result.
            Log.Warn()
                .Message("Error while process [{0}]", downloaderResult.Uri.AbsoluteUri)
                .Exception(parseError)
                .Write();
        }
    }

    return false;
}
/// <summary>
/// Performs a web request for <paramref name="uri"/> and packs the outcome into a
/// <see cref="DownloaderResult"/>.
/// </summary>
/// <remarks>
/// Exceptions thrown while building or executing the request are recorded on the
/// result and logged instead of being propagated. When a <see cref="WebException"/>
/// carries a response, that response is used to populate the result.
/// The response, if any, is always closed before the method returns.
/// </remarks>
/// <param name="uri">Address to request.</param>
/// <returns>The result populated with response parameters, content and any recorded exception.</returns>
private DownloaderResult MakeRequest(Uri uri)
{
    var result = new DownloaderResult(uri);
    HttpWebResponse response = null;

    try
    {
        HttpWebRequest request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
        ProcessResponseObject(response);
    }
    catch (WebException e)
    {
        result.SetException(e);

        // Error responses still carry status and a body worth extracting.
        if (e.Response != null)
        {
            response = (HttpWebResponse)e.Response;
        }

        Log.Debug()
            .Message("Error occurred requesting url [{0}]", uri.AbsoluteUri)
            .Exception(e)
            .Write();
    }
    catch (Exception e)
    {
        result.SetException(e);

        Log.Debug()
            .Message("Error occurred requesting url [{0}]", uri.AbsoluteUri)
            .Exception(e)
            .Write();
    }
    finally
    {
        try
        {
            try
            {
                result.SetResponseParams(response);
                result.SetResponseData(_contentExtractor.GetContent(response));
            }
            finally
            {
                // Close in a nested finally so the response is released even when
                // reading its parameters or content throws; a failure of Close
                // itself is still handled by the catch below.
                response?.Close();
            }
        }
        catch (Exception e)
        {
            result.SetException(e);

            Log.Info()
                .Message("Error occurred finalizing requesting url [{0}]", uri.AbsoluteUri)
                .Exception(e)
                .Write();
        }
    }

    return result;
}
/// <summary>
/// Copies the outcome of <paramref name="result"/> into this instance and links this
/// instance's completion to the completion of <paramref name="result"/>.
/// </summary>
/// <remarks>
/// In debug builds it is asserted that both results refer to the same URI and that
/// this instance has not been completed yet.
/// </remarks>
/// <param name="result">Result for the same URI whose state is copied.</param>
internal void CopyFrom(DownloaderResult result)
{
    Debug.Assert(Uri.Equals(result.Uri));
    Debug.Assert(!_waitCompleteTsc.Task.IsCompleted);

    WebException = result.WebException;
    HasError = result.HasError;
    Content = result.Content;
    DownloadTimeout = result.DownloadTimeout;

    result.WaitCompliteTask.ContinueWith(t =>
    {
        // Reading t.Result on a faulted or cancelled task throws inside the
        // continuation, which would leave this instance's completion source
        // pending forever. Forward every terminal state explicitly instead.
        if (t.IsFaulted)
        {
            _waitCompleteTsc.TrySetException(t.Exception.InnerExceptions);
        }
        else if (t.IsCanceled)
        {
            _waitCompleteTsc.TrySetCanceled();
        }
        else
        {
            _waitCompleteTsc.TrySetResult(t.Result);
        }
    });
}
/// <summary>
/// Creates a result for <paramref name="uri"/>. If the link storage already holds
/// content for that URI, the content is set on the result; otherwise the result is
/// added to the download queue.
/// </summary>
/// <param name="uri">Address to download.</param>
/// <returns>The newly created result, either filled from storage or queued.</returns>
public DownloaderResult AddToDownloadQueue(string uri)
{
    var pending = new DownloaderResult(uri);

    bool alreadyStored = _linkDataStorage.TryGetLinkContent(uri, out var storedContent);
    if (alreadyStored)
    {
        // Stored content is used directly; nothing is queued in this case.
        pending.SetResponseData(storedContent);
    }
    else
    {
        _queue.Add(pending);
    }

    return pending;
}