/// <summary>
/// Copies the downloaded content onto <paramref name="page"/> and, when the content is HTML,
/// loads it into a new <c>HtmlDocument</c> and collects the links found in it.
/// </summary>
/// <param name="downloaderResult">Download result whose <c>Content</c> and <c>Uri</c> are read.</param>
/// <param name="page">Page that receives the raw bytes, HTML text, parsed document and links.</param>
/// <param name="crawlerTask">Task whose <c>BaseUri</c> is handed to the link manager.</param>
/// <returns>
/// <c>true</c> when the content is HTML and was parsed without an exception;
/// <c>false</c> for non-HTML content or when parsing / link extraction threw.
/// </returns>
private bool ParseDownloadedPage(DownloaderResult downloaderResult, CrawlerPage page, CrawlerTask crawlerTask)
{
    var content = downloaderResult.Content;

    // The raw payload is stored on the page regardless of its kind.
    page.Data = content.Bytes;
    page.IsHtml = content.IsHtmlContent;
    page.Html = content.HtmlText;

    if (page.IsHtml)
    {
        try
        {
            page.HtmlDoc = new HtmlDocument();
            page.HtmlDoc.LoadHtml(content.HtmlText);
            page.Links = _webPageLinkManager.GetAllLinks(crawlerTask.BaseUri, page.HtmlDoc);
            return true;
        }
        catch (Exception e)
        {
            // Parsing problems are logged and reported through the return value, not rethrown.
            Log.Warn()
                .Message("Error while process [{0}]", downloaderResult.Uri.AbsoluteUri)
                .Exception(e)
                .Write();
        }
    }

    return false;
}
/// <summary>
/// Saves <paramref name="page"/> through the file-system page storage, logging before and after.
/// </summary>
/// <param name="page">Page to save; its <c>Uri</c> is used in the log messages.</param>
/// <param name="crawlerTask">
/// Task supplying the target <c>LocalPath</c> and the <c>ReplaceUrlToLocal</c> setting.
/// </param>
public void SavePageToDisk(CrawlerPage page, CrawlerTask crawlerTask)
{
    var pageUrl = page.Uri.AbsoluteUri;
    var localPath = crawlerTask.LocalPath;
    var replaceUrlToLocal = crawlerTask.TaskSettings.ReplaceUrlToLocal;

    Log.Info()
        .Message("Start saving page [{0}] path {1} ReplaceUrlToLocal {2}", pageUrl, localPath, replaceUrlToLocal)
        .Write();

    _pageFileSystemStorage.SavePage(page, localPath, replaceUrlToLocal);

    Log.Info()
        .Message("Saving page complite. [{0}] path {1}", pageUrl, localPath)
        .Write();
}
/// <summary>
/// Queues a download for every link of <paramref name="parentPage"/> that should be followed
/// and returns a task that completes with the resulting child pages once every link has been
/// accounted for (downloaded, failed, or skipped).
/// </summary>
/// <param name="parentPage">Page whose <c>Links</c> are processed; children get <c>Level + 1</c>.</param>
/// <param name="crawlerTask">
/// Task whose settings (<c>CrawlDepth</c>, <c>IgnoreOtherDomains</c>) control which links are followed.
/// </param>
/// <returns>
/// A task yielding the child pages that had content. The list is empty when the parent is at or
/// beyond the crawl depth or has no links.
/// </returns>
private Task<List<CrawlerPage>> ProcessNextPageLevel(CrawlerPage parentPage, CrawlerTask crawlerTask)
{
    var tcs = new TaskCompletionSource<List<CrawlerPage>>();
    var result = new ConcurrentQueue<CrawlerPage>();

    // Links is only assigned by ParseDownloadedPage for successfully parsed HTML pages, so it may
    // be null here for other pages (assuming CrawlerPage does not initialise it - not visible here).
    if (parentPage.Level >= crawlerTask.TaskSettings.CrawlDepth
        || parentPage.Links == null
        || !parentPage.Links.Any())
    {
        tcs.SetResult(result.ToList());
        return tcs.Task;
    }

    var totalLinks = parentPage.Links.Count;
    var linksProcessed = 0;

    // Use the value returned by the atomic increment: exactly one caller observes the final
    // count, so the task is completed exactly once even when continuations run concurrently.
    Action markLinkProcessed = () =>
    {
        if (Interlocked.Increment(ref linksProcessed) == totalLinks)
        {
            tcs.SetResult(result.ToList());
        }
    };

    foreach (var lnkNodes in parentPage.Links)
    {
        var link = lnkNodes.Key;
        var isNeedToProcess = !crawlerTask.TaskSettings.IgnoreOtherDomains
                              || IsLinkBelongToDomain(parentPage.Uri, link);

        if (!isNeedToProcess)
        {
            // Skipped links still count towards completion.
            markLinkProcessed();
            continue;
        }

        var dItm = _downloadManager.AddToDownloadQueue(link);
        dItm.WaitCompliteTask.ContinueWith(t =>
        {
            try
            {
                // Throws if the download task faulted or was cancelled.
                var data = t.Result;
                if (data.HasContent)
                {
                    var page = new CrawlerPage { Level = parentPage.Level + 1, Uri = link };
                    result.Enqueue(page);
                    ParseDownloadedPage(data, page, crawlerTask);
                }
            }
            catch (Exception e)
            {
                // A failed download must not prevent the level from completing.
                Log.Warn()
                    .Message("Error while process [{0}]", link)
                    .Exception(e)
                    .Write();
            }
            finally
            {
                markLinkProcessed();
            }
        });
    }

    return tcs.Task;
}
/// <summary>
/// Downloads the task's base URI as the root page and then crawls level by level,
/// attaching the pages found for each page as its <c>ChildPages</c>.
/// </summary>
/// <param name="task">Crawler task; its <c>BaseUri</c> is the root of the crawl.</param>
/// <returns>
/// A task yielding the root page. If the root download has no content the processing
/// continuation throws <c>PageCrawlerException</c>, which faults the returned task.
/// </returns>
public Task<CrawlerPage> ProcessCrawlerTask(CrawlerTask task)
{
    // The fluent log entry is only emitted by the terminating Write() call.
    Log.Info().Message("Start crawling page [{0}]", task.BaseUri).Write();

    var rootPage = _downloadManager.AddToDownloadQueue(task.BaseUri);
    var rootCrawlerPage = new CrawlerPage { IsRoot = true, Level = 0, Uri = task.BaseUri };

    var processTask = rootPage.WaitCompliteTask.ContinueWith(t =>
    {
        var data = t.Result;
        if (!data.HasContent)
        {
            throw new PageCrawlerException();
        }

        if (!ParseDownloadedPage(data, rootCrawlerPage, task))
        {
            // Root is not parseable HTML: nothing to crawl below it.
            return;
        }

        var level = new List<CrawlerPage> { rootCrawlerPage };
        while (level.Count > 0)
        {
            var nextLevel = new List<CrawlerPage>();
            foreach (var page in level)
            {
                // Blocks until every link of this page has been processed.
                var children = ProcessNextPageLevel(page, task).Result;
                nextLevel.AddRange(children);
                page.ChildPages = children;
            }

            level = nextLevel;
        }
    });

    return Task.Factory.StartNew(() =>
    {
        processTask.Wait();
        Log.Info().Message("Crawling page complete [{0}]", task.BaseUri).Write();
        return rootCrawlerPage;
    });
}
/// <summary>
/// Lazily enumerates <paramref name="rootPage"/> and all of its descendants in breadth-first order.
/// </summary>
/// <param name="rootPage">Page to start from; it is always yielded first.</param>
/// <returns>The root page followed by its descendants, level by level.</returns>
private static IEnumerable<CrawlerPage> ToEnumerableAtBreadthFirst(CrawlerPage rootPage)
{
    var queue = new Queue<CrawlerPage>();
    queue.Enqueue(rootPage);

    while (queue.Count > 0)
    {
        var current = queue.Dequeue();
        yield return current;

        // ChildPages is only assigned while a level is being processed; a page that was never
        // processed (e.g. a root that failed to parse) may have none - treat that as a leaf.
        if (current.ChildPages == null)
        {
            continue;
        }

        foreach (var child in current.ChildPages)
        {
            queue.Enqueue(child);
        }
    }
}
/// <summary>
/// Derives a file name for <paramref name="page"/> from its URI.
/// </summary>
/// <param name="page">Page whose <c>Uri</c>, <c>IsRoot</c> and <c>IsHtml</c> are read.</param>
/// <returns>
/// The host name for the root page, otherwise the last segment of the URI's local path;
/// when that is blank, the hash code of the absolute URI. HTML pages get ".html" appended.
/// </returns>
public string ConvertToFileName(CrawlerPage page)
{
    string name;
    if (page.IsRoot)
    {
        name = page.Uri.Host;
    }
    else
    {
        name = Path.GetFileName(page.Uri.LocalPath);
    }

    // URIs ending in "/" have no file segment; fall back to a hash of the full address.
    if (string.IsNullOrWhiteSpace(name))
    {
        name = page.Uri.AbsoluteUri.GetHashCode().ToString();
    }

    return page.IsHtml ? name + ".html" : name;
}
/// <summary>
/// Writes a single page to disk: HTML pages are saved from their parsed document into
/// <paramref name="rootPath"/>, everything else is written as raw bytes into <paramref name="dataPath"/>.
/// </summary>
/// <param name="page">Page to write; the file name comes from the URI-to-file-name converter.</param>
/// <param name="rootPath">Directory that receives HTML pages.</param>
/// <param name="dataPath">Directory that receives non-HTML content.</param>
private void SavePage(CrawlerPage page, string rootPath, string dataPath)
{
    string path;
    var fileName = _uriToFileNameConverter.ConvertToFileName(page);

    if (page.IsHtml)
    {
        path = Path.Combine(rootPath, fileName);

        // File.Create truncates an existing file. File.OpenWrite would keep the old length and
        // leave stale trailing bytes when the new document is shorter than the previous one.
        using (var fs = File.Create(path))
        {
            page.HtmlDoc.Save(fs);
        }
    }
    else
    {
        path = Path.Combine(dataPath, fileName);
        File.WriteAllBytes(path, page.Data);
    }

    Log.Debug().Message("Write [{0}] to [{1}]", page.Uri.AbsoluteUri, path).Write();
}
/// <summary>
/// Saves <paramref name="page"/> and all of its descendants under <paramref name="rootPath"/>.
/// </summary>
/// <remarks>
/// Note: access checks, disk size checks and "directory not empty" checks are deliberately not performed.
/// A failure while saving one page is logged and does not stop the remaining pages from being saved.
/// </remarks>
/// <param name="page">Root of the page tree to save.</param>
/// <param name="rootPath">Target directory; created when it does not exist.</param>
/// <param name="replaceLinksToLocal">When <c>true</c>, <c>ReplaceLinks</c> is run over all pages before saving.</param>
public void SavePage(CrawlerPage page, string rootPath, bool replaceLinksToLocal)
{
    if (!Directory.Exists(rootPath))
    {
        Directory.CreateDirectory(rootPath);
    }

    var dataPath = GetDataPath(rootPath);
    if (!Directory.Exists(dataPath))
    {
        Directory.CreateDirectory(dataPath);
    }

    // Materialise the whole tree once so link replacement and saving see the same set of pages.
    var pagesToSave = ToEnumerableAtBreadthFirst(page).ToList();

    if (replaceLinksToLocal)
    {
        ReplaceLinks(pagesToSave);
    }

    foreach (var pageToSave in pagesToSave)
    {
        try
        {
            SavePage(pageToSave, rootPath, dataPath);
        }
        catch (Exception e)
        {
            Log.Error()
                .Message("Error while saving page on disk [{0}]. Location: [{1}], [{2}]", pageToSave.Uri.AbsoluteUri, rootPath, dataPath)
                .Exception(e)
                .Write();
        }
    }
}
/// <summary>
/// Builds the relative link for a saved page from its file name.
/// </summary>
/// <param name="page">Page whose <c>IsHtml</c> flag selects the prefix.</param>
/// <param name="fileName">File name to append to the prefix.</param>
/// <returns>"../" + file name for HTML pages, "../data/" + file name for everything else.</returns>
private string GetRelativePath(CrawlerPage page, string fileName)
{
    if (page.IsHtml)
    {
        return "../" + fileName;
    }

    return "../data/" + fileName;
}