/// <summary>
/// Inserts initial values into the database; must be called from the Seed() method of the Configuration class.
/// </summary>
/// <param name="ctx">The crawler database context to seed.</param>
internal static void SeedDefaults(CrawlerDbContext ctx)
{
    #region rules

    var linkRule = new CrawlRule
    {
        DataType = DataBlockType.Link,
        RegExpression = "(<a.*?>.*?</a>)",
        Name = "Link"
    };
    // Verbatim string so \b is a regex word boundary, not a C# backspace escape.
    var picRule = new CrawlRule
    {
        DataType = DataBlockType.Picture,
        RegExpression = @"<(img)\b[^>]*>",
        Name = "Picture"
    };
    // The src group must close before the quote class: (?<src>[^'""]+)[""'], otherwise the
    // closing quote is captured inside the group and the alternative never matches cleanly.
    var videoRule = new CrawlRule
    {
        DataType = DataBlockType.Video,
        RegExpression = @"(?<=<iframe[^>]*?)(?:\s*width=[""'](?<width>[^""']+)[""']|\s*height=[""'](?<height>[^'""]+)[""']|\s*src=[""'](?<src>[^'""]+)[""'])+[^>]*?>",
        Name = "Video"
    };
    ctx.CrawlRules.AddOrUpdate(r => r.Name, linkRule, picRule, videoRule);

    #endregion

    #region settings

    var defaultSettings = new HostSetting
    {
        Host = string.Empty,
        CrawlDelay = 60,
        Disallow = string.Empty,
        RobotsTxt = string.Empty
    };
    ctx.HostSettings.AddOrUpdate(s => s.Host, defaultSettings);

    #endregion

    #region urls

    var defaultUrl = new UrlItem
    {
        Url = "http://binary-notes.ru",
        Host = "binary-notes.ru"
    };
    //ctx.UrlItems.AddOrUpdate(s => s.Url, defaultUrl);

    #endregion
}
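The summary above says this method is meant to be invoked from the migrations Configuration class. A minimal sketch of that call site, assuming Entity Framework 6 Code First migrations and a hypothetical DbInitializer class hosting SeedDefaults:

using System.Data.Entity.Migrations;

internal sealed class Configuration : DbMigrationsConfiguration<CrawlerDbContext>
{
    protected override void Seed(CrawlerDbContext context)
    {
        // Safe to run on every Update-Database: SeedDefaults uses AddOrUpdate,
        // so re-seeding does not duplicate rows. DbInitializer is an assumed name.
        DbInitializer.SeedDefaults(context);
    }
}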
public void Should_return_all_matching_text_block_when_parsing()
{
    #region arrange data

    const string testUrl = "url";
    const string testContent = "<a>text [text0] should be found in [text1] square [text2]brackets</a>";
    var jobItem = new JobItem { Url = new UrlItem { Url = testUrl } };
    var crawlRule = new CrawlRule { DataType = DataBlockType.Link, RegExpression = @"\[\w+\d{1}\]" };

    #endregion

    var actual = new PipelineRoutines().ParseContent(new ParsingRulesData(jobItem, crawlRule, testContent)).ToList();

    Assert.NotNull(actual);
    Assert.Equal(3, actual.Count);

    Assert.Equal("[text0]", actual[0].Data);
    Assert.Equal("[text1]", actual[1].Data);
    Assert.Equal("[text2]", actual[2].Data);

    Assert.Equal(jobItem, actual[0].Job);
    Assert.Equal(jobItem, actual[1].Job);
    Assert.Equal(jobItem, actual[2].Job);

    Assert.Equal(DataBlockType.Link, actual[0].BlockType);
    Assert.Equal(DataBlockType.Link, actual[1].BlockType);
    Assert.Equal(DataBlockType.Link, actual[2].BlockType);
}
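The contract the test pins down (every regex match becomes one item carrying the job and the rule's block type) can be satisfied by a plain Regex.Matches loop. A minimal sketch only, not the project's actual implementation; the Job, Rule, and Content properties on ParsingRulesData are assumed names:

using System.Collections.Generic;
using System.Text.RegularExpressions;

public IEnumerable<ParsedContentData> ParseContent(ParsingRulesData data)
{
    // Each match of the rule's pattern becomes one parsed block,
    // stamped with the originating job and the rule's data type.
    foreach (Match match in Regex.Matches(data.Content, data.Rule.RegExpression))
    {
        yield return new ParsedContentData(data.Job, data.Rule.DataType, match.Value);
    }
}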
public void Should_call_pipeline_methods()
{
    var runner = new Runner(); // creating the Runner initializes the ServiceLocator

    #region mocks configuration

    #region mock returns

    var jobItem = new JobItem();
    var downloadedContentData = new DownloadedContentData(jobItem, string.Empty);
    var crawlRule = new CrawlRule();
    var parsingRulesDatas = new[] { new ParsingRulesData(jobItem, crawlRule, string.Empty) };
    var parsedContentDatas = new[] { new ParsedContentData(jobItem, DataBlockType.Link, string.Empty) };
    var crawlRules = new[] { crawlRule };
    var urlItems = new[] { new UrlItem() };

    #endregion

    #region pipeline mock

    var mockPipeline = new Mock<IPipeline>();
    mockPipeline.Setup(m => m.DownloadContent(It.IsAny<JobItem>())).Returns(downloadedContentData);
    mockPipeline.Setup(m => m.GetParsingRules(It.IsAny<DownloadedContentData>())).Returns(parsingRulesDatas);
    mockPipeline.Setup(m => m.ParseContent(It.IsAny<ParsingRulesData>())).Returns(parsedContentDatas);
    mockPipeline.Setup(m => m.StoreData(It.IsAny<ParsedContentData>())).Returns(jobItem);
    ServiceLocator.RegisterForDependency(mockPipeline.Object);

    #endregion

    #region frontier mock

    // at least one URL must exist to allow the downloading of content
    var mockFrontier = new Mock<IUrlFrontierRepository>();
    mockFrontier.Setup(m => m.GetAvailableUrls(It.IsAny<int>(), It.IsAny<DateTime>())).Returns(urlItems);
    ServiceLocator.RegisterForDependency(mockFrontier.Object);

    #endregion

    #region jobs mock

    // a job must be created for the URL
    var mockJob = new Mock<IJobRepository>();
    mockJob.Setup(m => m.Start(It.IsAny<UrlItem>())).Returns(jobItem);
    ServiceLocator.RegisterForDependency(mockJob.Object);

    #endregion

    #region settings mock

    // at least one rule must exist to allow the parsing of content
    var mockSettings = new Mock<ICrawlerSettingsRepository>();
    mockSettings.Setup(m => m.GetParsingRules(It.IsAny<JobItem>())).Returns(crawlRules);
    ServiceLocator.RegisterForDependency(mockSettings.Object);

    #endregion

    #endregion

    runner.Run();

    mockPipeline.Verify(m => m.DownloadContent(It.IsAny<JobItem>()), Times.Once);
    mockPipeline.Verify(m => m.GetParsingRules(It.IsAny<DownloadedContentData>()), Times.Once);
    mockPipeline.Verify(m => m.ParseContent(It.IsAny<ParsingRulesData>()), Times.Once);
    mockPipeline.Verify(m => m.StoreData(It.IsAny<ParsedContentData>()), Times.Once);
}
/// <summary>
/// Returns true when the SharePoint crawl rule has the same path (case-insensitive)
/// and the same inclusion/exclusion type as the given crawl rule definition.
/// </summary>
public static bool CompareToCrawlRule(this CrawlRuleDefinition crawlRuleDefinition, CrawlRule sharepointRule)
{
    return StringComparer.InvariantCultureIgnoreCase.Compare(sharepointRule.Path, crawlRuleDefinition.Path) == 0
           && sharepointRule.Type == (crawlRuleDefinition.CrawlRuleConfiguration is Exclude
                                          ? CrawlRuleType.ExclusionRule
                                          : CrawlRuleType.InclusionRule);
}
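A hedged usage sketch: checking whether a definition is already represented among the crawl rules of a content source, so the caller can skip re-creating it. The helper name and the existingRules parameter are illustrative, not from the source:

using System.Collections.Generic;
using System.Linq;

// Returns true if any existing SharePoint rule is equivalent to the definition.
public static bool RuleAlreadyExists(CrawlRuleDefinition definition, IEnumerable<CrawlRule> existingRules)
{
    return existingRules.Any(rule => definition.CompareToCrawlRule(rule));
}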