Esempio n. 1
0
        /// <summary>
        /// Copies the downloaded content into <paramref name="page"/> and, for HTML content,
        /// builds the HTML document and collects the links found in it.
        /// </summary>
        /// <param name="downloaderResult">Download result whose content is copied into the page.</param>
        /// <param name="page">Page that receives the raw data, HTML text, parsed document and links.</param>
        /// <param name="crawlerTask">Task whose <c>BaseUri</c> is handed to the link manager.</param>
        /// <returns>
        /// <c>true</c> when the content is HTML and was parsed without an exception;
        /// <c>false</c> when the content is not HTML or parsing/link extraction threw.
        /// </returns>
        private bool ParseDownloadedPage(DownloaderResult downloaderResult, CrawlerPage page, CrawlerTask crawlerTask)
        {
            var content = downloaderResult.Content;

            page.Data   = content.Bytes;
            page.IsHtml = content.IsHtmlContent;
            page.Html   = content.HtmlText;

            // Only HTML content is parsed; anything else keeps just the raw data set above.
            if (page.IsHtml == false)
            {
                return false;
            }

            var parsed = true;
            try
            {
                var document = new HtmlDocument();
                page.HtmlDoc = document;
                page.HtmlDoc.LoadHtml(content.HtmlText);

                page.Links = _webPageLinkManager.GetAllLinks(crawlerTask.BaseUri, page.HtmlDoc);
            }
            catch (Exception e)
            {
                // A page that cannot be parsed is logged and reported as a failure instead of aborting the caller.
                Log.Warn()
                   .Message("Error while process [{0}]", downloaderResult.Uri.AbsoluteUri)
                   .Exception(e)
                   .Write();

                parsed = false;
            }

            return parsed;
        }
Esempio n. 2
0
        /// <summary>
        /// Saves <paramref name="page"/> through the file-system page storage, logging before and after.
        /// </summary>
        /// <param name="page">Page to save.</param>
        /// <param name="crawlerTask">
        /// Task supplying the target <c>LocalPath</c> and the <c>ReplaceUrlToLocal</c> setting
        /// that are passed on to the storage.
        /// </param>
        public void SavePageToDisk(CrawlerPage page, CrawlerTask crawlerTask)
        {
            var pageAddress       = page.Uri.AbsoluteUri;
            var targetPath        = crawlerTask.LocalPath;
            var replaceUrlToLocal = crawlerTask.TaskSettings.ReplaceUrlToLocal;

            Log.Info()
               .Message("Start saving page [{0}] path {1} ReplaceUrlToLocal {2}", pageAddress, targetPath, replaceUrlToLocal)
               .Write();

            _pageFileSystemStorage.SavePage(page, targetPath, replaceUrlToLocal);

            Log.Info()
               .Message("Saving page complite. [{0}] path {1}", pageAddress, targetPath)
               .Write();
        }
Esempio n. 3
0
        /// <summary>
        /// Queues every link of <paramref name="parentPage"/> for download and returns a task that
        /// completes with the pages that were downloaded with content.
        /// </summary>
        /// <param name="parentPage">Page whose links form the next crawl level.</param>
        /// <param name="crawlerTask">
        /// Task supplying the settings read here: <c>CrawlDepth</c> and <c>IgnoreOtherDomains</c>.
        /// </param>
        /// <returns>
        /// A task whose result is the list of child pages (level = parent level + 1). The list is empty
        /// when the parent is already at the crawl depth or has no links.
        /// </returns>
        /// <remarks>
        /// The task completes exactly once, after every link has either been skipped or its download
        /// continuation has finished - including continuations whose download failed.
        /// </remarks>
        private Task<List<CrawlerPage>> ProcessNextPageLevel(CrawlerPage parentPage, CrawlerTask crawlerTask)
        {
            var tcs    = new TaskCompletionSource<List<CrawlerPage>>();
            var result = new ConcurrentQueue<CrawlerPage>();

            // Links is only assigned by ParseDownloadedPage for HTML pages, so guard against null
            // (NOTE(review): harmless if CrawlerPage already initialises Links to an empty collection).
            if (parentPage.Level >= crawlerTask.TaskSettings.CrawlDepth ||
                parentPage.Links == null ||
                !parentPage.Links.Any())
            {
                tcs.SetResult(result.ToList());
                return tcs.Task;
            }

            var totalLinks     = parentPage.Links.Count;
            var linksProcessed = 0;

            // Use the value returned by the atomic increment: re-reading the shared counter afterwards
            // lets two threads both observe the final value and complete the task twice.
            Action markLinkProcessed = () =>
            {
                if (Interlocked.Increment(ref linksProcessed) == totalLinks)
                {
                    tcs.TrySetResult(result.ToList());
                }
            };

            foreach (var lnkNodes in parentPage.Links)
            {
                var link            = lnkNodes.Key;
                var isNeedToProcess = !crawlerTask.TaskSettings.IgnoreOtherDomains ||
                                      IsLinkBelongToDomain(parentPage.Uri, link);
                if (!isNeedToProcess)
                {
                    markLinkProcessed();
                    continue;
                }

                var dItm = _downloadManager.AddToDownloadQueue(link);
                dItm.WaitCompliteTask.ContinueWith(t =>
                {
                    try
                    {
                        // Throws when the download task faulted or was cancelled.
                        var data = t.Result;
                        if (data.HasContent)
                        {
                            var page = new CrawlerPage
                            {
                                Level = parentPage.Level + 1,
                                Uri   = link
                            };
                            result.Enqueue(page);
                            ParseDownloadedPage(data, page, crawlerTask);
                        }
                    }
                    catch (Exception e)
                    {
                        Log.Warn()
                           .Message("Error while process [{0}]", link)
                           .Exception(e)
                           .Write();
                    }
                    finally
                    {
                        // Always count the link, otherwise one failed download would leave the
                        // returned task incomplete forever.
                        markLinkProcessed();
                    }
                });
            }

            return tcs.Task;
        }
Esempio n. 4
0
        /// <summary>
        /// Downloads the page at <c>task.BaseUri</c> and crawls its links level by level,
        /// attaching the pages found at each level as <c>ChildPages</c> of their parent.
        /// </summary>
        /// <param name="task">Crawler task supplying the base URI and settings.</param>
        /// <returns>
        /// A task whose result is the root page. The task faults when the root download has no
        /// content (<see cref="PageCrawlerException"/>) or when crawling throws.
        /// </returns>
        public Task<CrawlerPage> ProcessCrawlerTask(CrawlerTask task)
        {
            // The fluent log entry is only emitted once Write() is called.
            Log.Info().Message("Start crawling page [{0}]", task.BaseUri).Write();

            var rootPage        = _downloadManager.AddToDownloadQueue(task.BaseUri);
            var rootCrawlerPage = new CrawlerPage
            {
                IsRoot = true,
                Level  = 0,
                Uri    = task.BaseUri
            };

            var processTask = rootPage.WaitCompliteTask.ContinueWith(t =>
            {
                var data = t.Result;
                if (!data.HasContent)
                {
                    throw new PageCrawlerException();
                }

                if (ParseDownloadedPage(data, rootCrawlerPage, task))
                {
                    var level = new List<CrawlerPage> { rootCrawlerPage };

                    // Breadth-first: finish every page of the current level before descending.
                    while (true)
                    {
                        var nextLevel = new List<CrawlerPage>();
                        foreach (var page in level)
                        {
                            var levelTask = ProcessNextPageLevel(page, task);
                            levelTask.Wait();

                            nextLevel.AddRange(levelTask.Result);
                            page.ChildPages = levelTask.Result;
                        }

                        // No new pages were produced, so the crawl is finished.
                        if (nextLevel.Count == 0)
                        {
                            return;
                        }

                        level = nextLevel;
                    }
                }
            });

            return Task.Factory.StartNew(() =>
            {
                // Wait() rethrows any failure of the crawl, which faults the returned task.
                processTask.Wait();
                Log.Info().Message("Crawling page complete [{0}]", task.BaseUri).Write();
                return rootCrawlerPage;
            });
        }
Esempio n. 5
0
        /// <summary>
        /// Lazily enumerates <paramref name="rootPage"/> and all of its descendants in breadth-first order.
        /// </summary>
        /// <param name="rootPage">Page to start from; it is the first element yielded.</param>
        /// <returns>The root page followed by its descendants, level by level.</returns>
        private static IEnumerable<CrawlerPage> ToEnumerableAtBreadthFirst(CrawlerPage rootPage)
        {
            var pending = new Queue<CrawlerPage>();
            pending.Enqueue(rootPage);

            while (pending.Count > 0)
            {
                var current = pending.Dequeue();
                yield return current;

                // A page whose children were never assigned is treated as a leaf
                // (NOTE(review): harmless if CrawlerPage already initialises ChildPages).
                var children = current.ChildPages;
                if (children == null)
                {
                    continue;
                }

                foreach (var child in children)
                {
                    pending.Enqueue(child);
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Builds the file name under which <paramref name="page"/> is stored.
        /// </summary>
        /// <param name="page">Page whose URI, root flag and HTML flag determine the name.</param>
        /// <returns>
        /// The URI host for the root page, otherwise the file-name part of the URI's local path;
        /// when that is empty or whitespace, the hash code of the absolute URI as text.
        /// HTML pages get ".html" appended.
        /// </returns>
        public string ConvertToFileName(CrawlerPage page)
        {
            string name;
            if (page.IsRoot)
            {
                name = page.Uri.Host;
            }
            else
            {
                name = Path.GetFileName(page.Uri.LocalPath);
            }

            // Fall back to a hash-based name when the URI yields no usable file name.
            if (string.IsNullOrWhiteSpace(name))
            {
                name = page.Uri.AbsoluteUri.GetHashCode().ToString();
            }

            return page.IsHtml ? name + ".html" : name;
        }
Esempio n. 7
0
        /// <summary>
        /// Writes a single page to disk: HTML pages go into <paramref name="rootPath"/>,
        /// everything else into <paramref name="dataPath"/>.
        /// </summary>
        /// <param name="page">Page to write; its file name comes from the URI-to-file-name converter.</param>
        /// <param name="rootPath">Directory that receives HTML pages.</param>
        /// <param name="dataPath">Directory that receives non-HTML content.</param>
        private void SavePage(CrawlerPage page, string rootPath, string dataPath)
        {
            string path;
            var fileName = _uriToFileNameConverter.ConvertToFileName(page);

            if (page.IsHtml)
            {
                path = Path.Combine(rootPath, fileName);

                // File.Create truncates an existing file. File.OpenWrite would keep the old bytes
                // past the end of a shorter document, leaving stale markup behind on re-save.
                using (var fs = File.Create(path))
                {
                    page.HtmlDoc.Save(fs);
                }
            }
            else
            {
                path = Path.Combine(dataPath, fileName);
                File.WriteAllBytes(path, page.Data);
            }

            Log.Debug().Message("Write [{0}] to [{1}]", page.Uri.AbsoluteUri, path).Write();
        }
Esempio n. 8
0
        // Note: access checks, disk-size checks and "directory not empty" checks are deliberately not performed.
        /// <summary>
        /// Saves <paramref name="page"/> and all of its descendants under <paramref name="rootPath"/>,
        /// creating the root and data directories when they do not exist.
        /// </summary>
        /// <param name="page">Root of the page tree to save.</param>
        /// <param name="rootPath">Target directory; the data directory is derived from it via <c>GetDataPath</c>.</param>
        /// <param name="replaceLinksToLocal">When <c>true</c>, <c>ReplaceLinks</c> is run over all pages before saving.</param>
        public void SavePage(CrawlerPage page, string rootPath, bool replaceLinksToLocal)
        {
            if (Directory.Exists(rootPath) == false)
            {
                Directory.CreateDirectory(rootPath);
            }

            var dataPath = GetDataPath(rootPath);
            if (Directory.Exists(dataPath) == false)
            {
                Directory.CreateDirectory(dataPath);
            }

            // Materialise the whole tree once so link replacement and saving see the same page list.
            var pagesToSave = ToEnumerableAtBreadthFirst(page).ToList();

            if (replaceLinksToLocal)
            {
                ReplaceLinks(pagesToSave);
            }

            foreach (var currentPage in pagesToSave)
            {
                try
                {
                    SavePage(currentPage, rootPath, dataPath);
                }
                catch (Exception e)
                {
                    // A failure on one page is logged and must not stop the remaining pages from being saved.
                    Log.Error()
                       .Message("Error while saving page on disk [{0}]. Location: [{1}], [{2}]",
                                currentPage.Uri.AbsoluteUri, rootPath, dataPath)
                       .Exception(e)
                       .Write();
                }
            }
        }
Esempio n. 9
0
 /// <summary>
 /// Builds the relative path used to refer to a saved page's file.
 /// </summary>
 /// <param name="page">Page whose HTML flag selects the target folder.</param>
 /// <param name="fileName">File name to append.</param>
 /// <returns>"../" + file name for HTML pages, "../data/" + file name otherwise.</returns>
 private string GetRelativePath(CrawlerPage page, string fileName)
 {
     if (page.IsHtml)
     {
         return "../" + fileName;
     }

     return "../data/" + fileName;
 }