Beispiel #1
0
        /// <summary>
        /// Produces a canned crawl result for test runs: blocks the calling thread for
        /// five seconds, builds a result containing a single placeholder external link,
        /// and hands it to <c>proxy.ReturnCrawlingResults</c> on a task that is started
        /// but not awaited.
        /// </summary>
        public void StartTestCrawl()
        {
            // Fixed five-second pause before the placeholder result is built.
            Thread.Sleep(5000);

            var placeholderLink = new ExternalLinkDTO
            {
                LinkAnchor = "Test Run",
                LinkPath = "",
                OriginalPageLink = "",
                PageSeedLink = ""
            };

            var testResults = new CrawlerResultsDTO
            {
                ExternalLinksList = new List<ExternalLinkDTO> { placeholderLink }
            };

            // Fire-and-forget: the returned task is intentionally not observed here.
            Task.Factory.StartNew(() =>
            {
                proxy.ReturnCrawlingResults(testResults);
            });
        }
Beispiel #2
0
        /// <summary>
        /// Crawls each seed in <paramref name="seedsToCrawl"/> in order and returns the
        /// collected bad, external and internal links together with batch statistics.
        /// </summary>
        /// <param name="seedsToCrawl">
        /// Seeds to crawl. Seeds whose <c>SeedDomainName</c> is null or empty are skipped.
        /// </param>
        /// <param name="maxCrawlLevel">
        /// Value stored in <c>_maxPageLevel</c> before crawling starts
        /// (presumably the depth limit honored by <c>FindLinks</c> - confirm there).
        /// </param>
        /// <returns>
        /// The crawl results. <c>BatchInfo.SeedId</c> and <c>ProcessedSeed</c> are taken from
        /// the first seed; when no seeds were supplied <c>ProcessedSeed</c> is the sequence's
        /// default value and <c>SeedId</c> is left unassigned.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="seedsToCrawl"/> is <c>null</c>.
        /// </exception>
        public CrawlerResultsDTO StartCrawlingProcess(IEnumerable<SeedDTO> seedsToCrawl, int maxCrawlLevel = 2)
        {
            if (seedsToCrawl == null)
            {
                throw new ArgumentNullException("seedsToCrawl");
            }

            var timetracker = Stopwatch.StartNew();
            var startTime = DateTime.Now;

            // Materialize once: the sequence is read by the crawl loop and again when the
            // batch info is built, and a lazy source could otherwise be re-evaluated.
            var seeds = seedsToCrawl.ToList();

            _maxPageLevel = maxCrawlLevel;
            _forceStop = false;

            foreach (var seed in seeds)
            {
                var startingAddress = seed.SeedDomainName;

                // The id counter restarts for every seed, including skipped ones.
                _internalLinksIdCounter = 1;
                if (string.IsNullOrEmpty(startingAddress))
                {
                    continue;
                }

                FindLinks(startingAddress, 0, startingAddress);

                while (_internalUnprocessedLinks.Count > 0 && !_forceStop)
                {
                    var selectedLink = _internalUnprocessedLinks.Pop();

                    selectedLink.IsProcessed = true;
                    FindLinks(selectedLink.PageLink, selectedLink.PageLevel, startingAddress);
                }

                // A stop request ends the whole run, not just the current seed's queue.
                if (_forceStop)
                {
                    break;
                }
            }

            // Clear any stop request so the next run starts clean.
            _forceStop = false;

            timetracker.Stop();

            // TotalSeconds, not Seconds: Seconds is only the 0-59 component of the elapsed time.
            var runningTime = (int)timetracker.Elapsed.TotalSeconds;

            var firstSeed = seeds.FirstOrDefault();

            var batchInfo = new BatchDTO();
            batchInfo.CrawlingTime = runningTime;
            batchInfo.StartTime = startTime;
            batchInfo.NumberOfCrawledExternalLinks = _externalLinksDictionary.Count;
            batchInfo.NumberOfCrawledInternalLinks = _internalLinksIdCounter;
            if (firstSeed != null)
            {
                batchInfo.SeedId = firstSeed.SeedIndex;
            }

            var result = new CrawlerResultsDTO
            {
                BadLinksList = _badLinksList.ToList(),
                ExternalLinksList = _externalLinksDictionary.Select(pair => pair.Value).ToList(),
                InternalLinksList = _internalLinksDictionary.Select(pair => pair.Value).ToList(),
                BatchInfo = batchInfo,
                ProcessedSeed = firstSeed
            };

            return result;
        }