        /// <summary>
        /// Inserts the initial default values into the database. Must be called from the Seed() method
        /// of the migrations Configuration class (a sketch of that wiring follows the method below).
        /// </summary>
        /// <param name="ctx">The crawler database context to seed.</param>
        internal static void SeedDefaults(CrawlerDbContext ctx)
        {
            #region rules

            var linkRule = new CrawlRule {
                DataType      = DataBlockType.Link,
                RegExpression = "(<a.*?>.*?</a>)",
                Name          = "Link"
            };
            var picRule = new CrawlRule {
                DataType      = DataBlockType.Picture,
                RegExpression = "<(img)\b[^>]*>",
                Name          = "Picture"
            };
            var videoRule = new CrawlRule {
                DataType      = DataBlockType.Video,
                RegExpression = @"(?<=<iframe[^>]*?)(?:\s*width=[""'](?<width>[^""']+)[""']|\s*height=[""'](?<height>[^'""]+)[""']|\s*src=[""'](?<src>[^'""]+[""']))+[^>]*?>",
                Name          = "Video"
            };
            ctx.CrawlRules.AddOrUpdate(r => r.Name, linkRule, picRule, videoRule);

            #endregion

            #region settings

            var defaultSettings = new HostSetting {
                Host       = string.Empty,
                CrawlDelay = 60,
                Disallow   = string.Empty,
                RobotsTxt  = string.Empty
            };
            ctx.HostSettings.AddOrUpdate(s => s.Host, defaultSettings);

            #endregion

            #region urls

            var defaultUrl = new UrlItem {
                Url  = "http://binary-notes.ru",
                Host = "binary-notes.ru"
            };
            // seeding of the default URL is currently disabled; uncomment to enable it
            //ctx.UrlItems.AddOrUpdate(s => s.Url, defaultUrl);

            #endregion
        }
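
For reference, this is roughly how the method would be hooked into an EF 6 Code First migrations setup. A minimal sketch, assuming the standard DbMigrationsConfiguration base class; the Configuration class shown here is illustrative, not the project's actual one:

        // illustrative sketch of the migrations Configuration class assumed by the doc comment above
        internal sealed class Configuration : System.Data.Entity.Migrations.DbMigrationsConfiguration<CrawlerDbContext>
        {
            public Configuration()
            {
                AutomaticMigrationsEnabled = true; // illustrative setting
            }

            protected override void Seed(CrawlerDbContext ctx)
            {
                // Seed() runs after every database update, so SeedDefaults
                // relies on AddOrUpdate to stay idempotent
                SeedDefaults(ctx);
            }
        }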
        [Fact]
        public void Should_return_all_matching_text_block_when_parsing()
        {
            #region arrange data

            const string testUrl     = "url";
            const string testContent = "<a>text [text0] should be found in [text1] square [text2]brackets</a>";
            var          jobItem     = new JobItem
            {
                Url = new UrlItem
                {
                    Url = testUrl
                }
            };

            var crawlRule = new CrawlRule {
                DataType = DataBlockType.Link, RegExpression = @"\[\w+\d{1}\]"
            };

            #endregion

            var actual = new PipelineRoutines().ParseContent(new ParsingRulesData(jobItem, crawlRule, testContent)).ToList();

            Assert.NotNull(actual);
            Assert.Equal(3, actual.Count);

            Assert.Equal("[text0]", actual[0].Data);
            Assert.Equal("[text1]", actual[1].Data);
            Assert.Equal("[text2]", actual[2].Data);

            Assert.Equal(jobItem, actual[0].Job);
            Assert.Equal(jobItem, actual[1].Job);
            Assert.Equal(jobItem, actual[2].Job);

            Assert.Equal(DataBlockType.Link, actual[0].BlockType);
            Assert.Equal(DataBlockType.Link, actual[1].BlockType);
            Assert.Equal(DataBlockType.Link, actual[2].BlockType);
        }
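
The test above pins down the expected contract: one ParsedContentData per regex match, carrying the job, the rule's block type, and the matched text. A minimal sketch of a ParseContent implementation consistent with those assertions, assuming ParsingRulesData exposes Job, Rule, and Content properties (names inferred from its constructor, not confirmed) and requiring System.Linq and System.Text.RegularExpressions:

        public IEnumerable<ParsedContentData> ParseContent(ParsingRulesData data)
        {
            // one result per match of the rule's pattern against the downloaded content;
            // property names on ParsingRulesData are assumptions based on its constructor
            return Regex.Matches(data.Content, data.Rule.RegExpression)
                        .Cast<Match>()
                        .Select(m => new ParsedContentData(data.Job, data.Rule.DataType, m.Value));
        }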
Example #3
        [Fact]
        public void Should_call_pipeline_methods()
        {
            var runner = new Runner(); // constructing the Runner also initializes the ServiceLocator

            #region mocks configuration

            #region mock returns

            var jobItem = new JobItem();
            var downloadedContentData = new DownloadedContentData(jobItem, string.Empty);
            var crawlRule             = new CrawlRule();
            var parsingRulesDatas     = new[] {
                new ParsingRulesData(jobItem, crawlRule, string.Empty)
            };
            var parsedContentDatas = new[] {
                new ParsedContentData(jobItem, DataBlockType.Link, string.Empty)
            };
            var crawlRules = new[] {
                crawlRule
            };
            var urlItems = new[] {
                new UrlItem()
            };

            #endregion

            #region pipeline mock

            var mockPipeline = new Mock<IPipeline>();
            mockPipeline.Setup(m => m.DownloadContent(It.IsAny<JobItem>())).Returns(downloadedContentData);
            mockPipeline.Setup(m => m.GetParsingRules(It.IsAny<DownloadedContentData>())).Returns(parsingRulesDatas);
            mockPipeline.Setup(m => m.ParseContent(It.IsAny<ParsingRulesData>())).Returns(parsedContentDatas);
            mockPipeline.Setup(m => m.StoreData(It.IsAny<ParsedContentData>())).Returns(jobItem);
            ServiceLocator.RegisterForDependency(mockPipeline.Object);

            #endregion

            #region frontier mock

            // at least one URL must exist for content downloading to start
            var mockFrontier = new Mock<IUrlFrontierRepository>();
            mockFrontier.Setup(m => m.GetAvailableUrls(It.IsAny<int>(), It.IsAny<DateTime>())).Returns(urlItems);
            ServiceLocator.RegisterForDependency(mockFrontier.Object);

            #endregion

            #region jobs mock

            // a job must be created for each URL being crawled
            var mockJob = new Mock<IJobRepository>();
            mockJob.Setup(m => m.Start(It.IsAny<UrlItem>())).Returns(jobItem);
            ServiceLocator.RegisterForDependency(mockJob.Object);

            #endregion

            #region settings mock

            // at least one parsing rule must exist for the content to be processed
            var mockSettings = new Mock<ICrawlerSettingsRepository>();
            mockSettings.Setup(m => m.GetParsingRules(It.IsAny<JobItem>())).Returns(crawlRules);
            ServiceLocator.RegisterForDependency(mockSettings.Object);

            #endregion

            #endregion

            runner.Run();

            mockPipeline.Verify(m => m.DownloadContent(It.IsAny<JobItem>()), Times.Once);
            mockPipeline.Verify(m => m.GetParsingRules(It.IsAny<DownloadedContentData>()), Times.Once);
            mockPipeline.Verify(m => m.ParseContent(It.IsAny<ParsingRulesData>()), Times.Once);
            mockPipeline.Verify(m => m.StoreData(It.IsAny<ParsedContentData>()), Times.Once);
        }
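
For orientation, the Times.Once verifications imply a control flow along these lines. A sketch only; the repository fields (_frontier, _jobs, _pipeline) and their wiring through the ServiceLocator are assumptions based on the mock setups above, not the actual Runner source:

        public void Run()
        {
            // _frontier, _jobs, and _pipeline are hypothetical fields resolved via the ServiceLocator
            foreach (var url in _frontier.GetAvailableUrls(1, DateTime.UtcNow))
            {
                var job        = _jobs.Start(url);               // IJobRepository.Start
                var downloaded = _pipeline.DownloadContent(job); // IPipeline.DownloadContent
                foreach (var rules in _pipeline.GetParsingRules(downloaded))
                    foreach (var parsed in _pipeline.ParseContent(rules))
                        _pipeline.StoreData(parsed);
            }
        }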
Example #4
 public static bool CompareToCrawlRule(this CrawlRuleDefinition crawlRuleDefinition, CrawlRule sharepointRule)
 {
     // the rule matches when the paths are equal (case-insensitive) and the rule type
     // (exclusion vs. inclusion) agrees with the definition's configuration
     return StringComparer.InvariantCultureIgnoreCase.Compare(sharepointRule.Path, crawlRuleDefinition.Path) == 0 &&
            sharepointRule.Type == (crawlRuleDefinition.CrawlRuleConfiguration is Exclude
                ? CrawlRuleType.ExclusionRule
                : CrawlRuleType.InclusionRule);
 }
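
Typical usage would be to check whether a crawl rule definition already has a matching SharePoint rule before provisioning one. A hypothetical example; existingRules and definition are assumed inputs, not part of the original API:

 // returns the first existing SharePoint rule matching the definition, or null if none exists
 var match = existingRules.FirstOrDefault(rule => definition.CompareToCrawlRule(rule));
 var needsProvisioning = match == null;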