예제 #1
0
        /// <summary>
        /// Verifies the feeder caps the in-memory download queue at its bound (1000)
        /// and backfills it from overflow as items are dequeued.
        /// Fixes: removed the unused <c>_downloadResults</c> local and the
        /// <c>mockProgrss</c> typo.
        /// </summary>
        public void TestMemoryQueueOverFlow()
        {
            BlockingCollection<ParentLink> _newLinks      = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            BlockingCollection<ParentLink> _downloadQueue = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            var mockProgress = new MockProgess();
            var feeder       = new Feeder(_newLinks, _downloadQueue, 1, 1, mockProgress);

            feeder.Start();
            // One more link than the queue's bound; the extra item overflows out of memory.
            for (int i = 0; i < 1001; i++) //Crawler.MaxCollectionSize + 1
            {
                _newLinks.Add(new ParentLink($"http://{i}.co.za", null));
            }

            //download queue should be at max size of 1000
            // NOTE(review): Sleep-based synchronization assumes the feeder drains
            // within the given window — flaky on a slow machine.
            Thread.Sleep(1000);
            Assert.AreEqual(1000, _downloadQueue.Count);

            //after dequeuing the item, the database item should be readded taking the total back to 1000
            _downloadQueue.Take();
            Thread.Sleep(2000);
            Assert.AreEqual(1000, _downloadQueue.Count);

            //queue should now have decreased by 1.
            _downloadQueue.Take();
            Thread.Sleep(1000);
            Assert.AreEqual(999, _downloadQueue.Count);
            feeder.Stop();
            Thread.Sleep(1000);
        }
예제 #2
0
        /// <summary>
        /// Steps a single link through feeder -> downloader -> processor for a page
        /// that links back to an already-crawled URL, then confirms a second feeder
        /// pass discards the duplicate instead of re-queuing it for download.
        /// </summary>
        public void FlowTest_OneLinkLoopFound()
        {
            var newLinks        = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            var downloadQueue   = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            var downloadResults = new BlockingCollection<DownloadResult>(new ConcurrentQueue<DownloadResult>(), 10);

            var mockProgress    = new MockProgess();
            var linkFeeder      = new Feeder(newLinks, downloadQueue, 1, 1, mockProgress, false);
            var pageDownloader  = new Downloader(downloadQueue, downloadResults, new Uri(Const.SEED), new MockSettings(), mockProgress, false);
            var resultProcessor = new Processor(downloadResults, newLinks, new Website(new Data.Website()
            {
                Id = 1, Seed = Const.SEED
            }), 1, new MockSettings(), mockProgress, false);

            // Mimic Crawler.Start: drop the looping link into the new-links queue.
            newLinks.Add(new ParentLink(Const.LINK2, 1));
            Assert.AreEqual(0, mockProgress.TotalLinks);
            Assert.AreEqual(1, newLinks.Count);

            // Feeder moves the link from newLinks to the download queue.
            linkFeeder.Start();
            Assert.AreEqual(0, mockProgress.TotalLinks);
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(0, newLinks.Count);
            Assert.AreEqual(1, downloadQueue.Count);
            Assert.AreEqual(0, downloadResults.Count);

            // Downloader consumes the queued link and produces one download result.
            pageDownloader.Start();
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(0, mockProgress.TotalLinks);
            Assert.AreEqual(1, mockProgress.TotalDownloadResult);
            Assert.AreEqual(0, newLinks.Count);
            Assert.AreEqual(0, downloadQueue.Count);
            Assert.AreEqual(1, downloadResults.Count);

            // Processor consumes the result and emits the discovered link back into newLinks.
            resultProcessor.Start();
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(1, mockProgress.TotalLinks);
            Assert.AreEqual(1, mockProgress.TotalDownloadResult);
            Assert.AreEqual(1, newLinks.Count);
            Assert.AreEqual(0, downloadQueue.Count);
            Assert.AreEqual(0, downloadResults.Count);

            // Second iteration: the link was already crawled, so the feeder should
            // discard it rather than queue a duplicate download.
            linkFeeder.Start();
            Assert.AreEqual(1, mockProgress.TotalDiscarded);
            Assert.AreEqual(1, mockProgress.TotalLinks);
            Assert.AreEqual(1, mockProgress.TotalDownloadResult);
            Assert.AreEqual(0, newLinks.Count);
            Assert.AreEqual(0, downloadQueue.Count);
            Assert.AreEqual(0, downloadResults.Count);
        }
예제 #3
0
        /// <summary>
        /// Feeds the same three links into Crawler.NewLinks nine times and verifies
        /// the feeder forwards each unique link to the download queue exactly once,
        /// in first-seen order.
        /// </summary>
        public void FeederTest_NoDuplicateLinks()
        {
            // Shared static Crawler collections must start empty, otherwise the
            // counts asserted below are meaningless.
            Assert.Empty(Crawler.NewLinks);
            Assert.Empty(Crawler.DownloadQueue);
            Assert.Empty(Crawler.DownloadResults);

            const string mockSeed        = "https://seed.co.za/";
            const string link2           = "https://seed.co.za/link2/";
            const string link3           = "https://seed.co.za/link3/";
            var          seedParentLink  = new ParentLink(mockSeed, null);
            var          link2ParentLink = new ParentLink(link2, 1);
            var          link3ParentLink = new ParentLink(link3, 1);
            var          mockProgrss     = new MockProgess();

            var feeder = new Feeder(1, 1, mockProgrss);

            feeder.Start();
            // Nine adds, but only three distinct links; repeats should be filtered out.
            Crawler.NewLinks.Add(seedParentLink);
            Crawler.NewLinks.Add(link2ParentLink);
            Crawler.NewLinks.Add(link3ParentLink);
            Crawler.NewLinks.Add(link2ParentLink);
            Crawler.NewLinks.Add(link3ParentLink);
            Crawler.NewLinks.Add(link3ParentLink);
            Crawler.NewLinks.Add(link2ParentLink);
            Crawler.NewLinks.Add(link3ParentLink);
            Crawler.NewLinks.Add(link2ParentLink);

            // NOTE(review): assumes the feeder drains NewLinks within 100 ms.
            Thread.Sleep(100);
            Assert.Empty(Crawler.NewLinks);
            //should only have 3 items, the duplicates shouldn't come through.
            Assert.Equal(3, Crawler.DownloadQueue.Count);

            //the links should come in the order they were added.
            Assert.Equal(mockSeed, Crawler.DownloadQueue.Take().Link);
            Assert.Equal(link2, Crawler.DownloadQueue.Take().Link);
            Assert.Equal(link3, Crawler.DownloadQueue.Take().Link);

            feeder.Stop();
            Thread.Sleep(1000);

            // Leave the shared collections empty for the next test.
            Assert.Empty(Crawler.NewLinks);
            Assert.Empty(Crawler.DownloadQueue);
        }
예제 #4
0
        /// <summary>
        /// Runs a single feeder -> downloader -> processor pass for the seed link
        /// when the downloaded page yields no new links, and checks the queues and
        /// progress counters at each stage.
        /// </summary>
        public void FlowTest_NoLinks()
        {
            var newLinks        = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            var downloadQueue   = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            var downloadResults = new BlockingCollection<DownloadResult>(new ConcurrentQueue<DownloadResult>(), 10);

            var mockProgress = new MockProgess();
            var feeder       = new Feeder(newLinks, downloadQueue, 1, 1, mockProgress);

            var downloader = new Downloader(downloadQueue, downloadResults, new Uri(Const.SEED), new MockSettings(), mockProgress, false);
            var processor  = new Processor(downloadResults, newLinks, new Website(new Data.Website()
            {
                Id = 1, Seed = Const.SEED
            }), 1, new MockSettings(), mockProgress, false);

            // Mimic Crawler.Start: count the seed and drop it into the new-links queue.
            mockProgress.TotalLinks++;
            newLinks.Add(new ParentLink(Const.SEED, null));

            Assert.AreEqual(1, mockProgress.TotalLinks);
            feeder.Start();
            Assert.AreEqual(1, mockProgress.TotalLinks);
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(0, downloadQueue.Count);

            // Downloader produces exactly one result for the seed.
            downloader.Start();
            Assert.AreEqual(1, mockProgress.TotalLinks);
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(1, mockProgress.TotalDownloadResult);
            Assert.AreEqual(0, newLinks.Count);
            Assert.AreEqual(0, downloadQueue.Count);
            Assert.AreEqual(1, downloadResults.Count);

            // Processor consumes the result; with no links on the page, every queue drains.
            processor.Start();
            Assert.AreEqual(1, mockProgress.TotalLinks);
            Assert.AreEqual(1, mockProgress.TotalDownloadResult);
            Assert.AreEqual(0, mockProgress.TotalDiscarded);
            Assert.AreEqual(0, newLinks.Count);
            Assert.AreEqual(0, downloadQueue.Count);
            Assert.AreEqual(0, downloadResults.Count);
        }
예제 #5
0
        /// <summary>
        /// Verifies the shared Crawler download queue is capped at 1000 entries and
        /// is refilled from overflow as items are taken off, then cleans the shared
        /// collections back to empty.
        /// </summary>
        public void TestMemoryQueueOverFlow()
        {
            Assert.Empty(Crawler.NewLinks);
            Assert.Empty(Crawler.DownloadQueue);
            Assert.Empty(Crawler.DownloadResults);

            var progress    = new MockProgess();
            var queueFeeder = new Feeder(1, 1, progress);

            queueFeeder.Start();
            // Push one link more than the queue bound (Crawler.MaxCollectionSize + 1).
            for (int linkIndex = 0; linkIndex < 1001; linkIndex++)
            {
                Crawler.NewLinks.Add(new ParentLink($"http://{linkIndex}.co.za", null));
            }

            // The download queue should sit at its maximum size of 1000.
            Thread.Sleep(1000);
            Assert.Equal(1000, Crawler.DownloadQueue.Count);

            // After one item is dequeued the overflow (database) item should be
            // re-added, taking the total back up to 1000.
            Crawler.DownloadQueue.Take();
            Thread.Sleep(2000);
            Assert.Equal(1000, Crawler.DownloadQueue.Count);

            // No overflow remains, so the queue should now shrink by one.
            Crawler.DownloadQueue.Take();
            Thread.Sleep(1000);
            Assert.Equal(999, Crawler.DownloadQueue.Count);
            queueFeeder.Stop();
            Thread.Sleep(1000);

            // Drain whatever is left so the shared collections are clean for other tests.
            while (Crawler.DownloadQueue.Count > 0)
            {
                Crawler.DownloadQueue.Take();
            }

            Assert.Empty(Crawler.NewLinks);
            Assert.Empty(Crawler.DownloadQueue);
            Assert.Empty(Crawler.DownloadResults);
        }
예제 #6
0
        /// <summary>
        /// Feeds the same three links repeatedly and verifies the feeder forwards
        /// each unique link exactly once, in the order it was first added.
        /// Fixes: wrong "2 items" comment (3 is asserted), Assert.IsTrue on a count
        /// replaced with Assert.AreEqual for a useful failure message, and the
        /// <c>mockProgrss</c> typo.
        /// </summary>
        public void FeederTest_NoDuplicateLinks()
        {
            BlockingCollection<ParentLink> _newLinks      = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);
            BlockingCollection<ParentLink> _downloadQueue = new BlockingCollection<ParentLink>(new ConcurrentQueue<ParentLink>(), 1000);

            var seedParentLink  = new ParentLink(Const.SEED, null);
            var link2ParentLink = new ParentLink(Const.LINK1, 1);
            var link3ParentLink = new ParentLink(Const.LINK2, 1);
            var mockProgress    = new MockProgess();

            var feeder = new Feeder(_newLinks, _downloadQueue, 1, 1, mockProgress);

            feeder.Start();
            // Nine adds, but only three distinct links; repeats should be filtered out.
            _newLinks.Add(seedParentLink);
            _newLinks.Add(link2ParentLink);
            _newLinks.Add(link3ParentLink);
            _newLinks.Add(link2ParentLink);
            _newLinks.Add(link3ParentLink);
            _newLinks.Add(link3ParentLink);
            _newLinks.Add(link2ParentLink);
            _newLinks.Add(link3ParentLink);
            _newLinks.Add(link2ParentLink);

            Thread.Sleep(100);
            // AreEqual (not IsTrue) so a failure reports the actual remaining count.
            Assert.AreEqual(0, _newLinks.Count);
            //should only have 3 items, the duplicates shouldn't come through.
            Assert.AreEqual(3, _downloadQueue.Count);

            //the links should come in the order they were added.
            Assert.AreEqual(Const.SEED, _downloadQueue.Take().Link);
            Assert.AreEqual(Const.LINK1, _downloadQueue.Take().Link);
            Assert.AreEqual(Const.LINK2, _downloadQueue.Take().Link);

            feeder.Stop();
            Thread.Sleep(1000);
        }