public void TestWebCrawl()
{
    // Wires up a crawl job against a known seed page (single level deep)
    // to exercise the WebCrawler construction path. The actual network
    // run stays commented out so the test has no external side effects.
    var job = new WebCrawlJob
    {
        SeedUrl = "http://stormwater.wef.org/2016/12/",
        Depth = 1,
        SourceId = 103,
        CrawlUrlPattern = new List<string>
        {
            //"http://stormwater.wef.org/\\d.*/\\d.*/"
        },
        IndexUrlPattern = new List<string>
        {
            "http://stormwater.wef.org/\\d.*/\\d.*/.*",
        },
        TitlePattern = new List<string> { "\\body\\h1" },
        SummaryPattern = "/meta[@property='og:description']",
        ContentPattern = new List<string> { "/div[@id='content']" },
    };

    var crawler = new WebCrawler<WebCrawlBasePage, WebCrawlerSearchDoc>(job);
    //var results = crawler.Run();
}
private CrawlerResults RunCrawler(WebCrawlJobConfig jobConfig)
{
    // Builds a WebCrawlJob from the flat (newline-delimited) job config and
    // runs a page-extracting crawler over it, reporting progress through
    // StatusCallback. Returns the crawler's results.
    //
    // Splits a newline-delimited pattern blob into individual patterns.
    // NOTE: the previous code replaced '\r' with ' ' before splitting on
    // '\n', which left a trailing space on every pattern parsed from CRLF
    // input (and the `p != string.Empty` filter did not drop " " entries) —
    // a trailing space silently breaks regex/XPath matching downstream.
    // Trimming each entry fixes that and also drops whitespace-only lines.
    List<string> SplitPatterns(string raw)
    {
        return raw.Split('\n')
                  .Select(p => p.Trim())
                  .Where(p => p.Length > 0)
                  .ToList();
    }

    var job = new WebCrawlJob
    {
        SourceId = jobConfig.SourceId,
        SeedUrl = jobConfig.SeedUrl,
        Depth = jobConfig.Depth,
        SummaryPattern = jobConfig.SummaryPattern,
        // Strip session ids and fragment anchors so equivalent URLs dedupe.
        LinkCleanupPattern = new List<string> { ";jsessionid=.*$", "#.*$", },
    };

    if (!string.IsNullOrEmpty(jobConfig.CrawlUrlPattern))
    {
        job.CrawlUrlPattern = SplitPatterns(jobConfig.CrawlUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.CrawlSkipUrlPattern))
    {
        job.CrawlSkipUrlPattern = SplitPatterns(jobConfig.CrawlSkipUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.IndexUrlPattern))
    {
        job.IndexUrlPattern = SplitPatterns(jobConfig.IndexUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.IndexSkipUrlPattern))
    {
        job.IndexSkipUrlPattern = SplitPatterns(jobConfig.IndexSkipUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.ContentPattern))
    {
        job.ContentPattern = SplitPatterns(jobConfig.ContentPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.MetadataPattern))
    {
        job.MetadataPattern = SplitPatterns(jobConfig.MetadataPattern);
    }
    // Guarded now for consistency with the other pattern fields — the
    // original dereferenced TitlePattern unconditionally (NRE risk).
    if (!string.IsNullOrEmpty(jobConfig.TitlePattern))
    {
        job.TitlePattern = SplitPatterns(jobConfig.TitlePattern);
    }

    var crawler = new WebCrawler<WebCrawlerPage, WebCrawlerSearchDoc>(job, StatusCallback);
    return crawler.Run();
}
private CrawlerResults RunCrawler(WebCrawlJobConfig jobConfig)
{
    // Builds a WebCrawlJob from the flat (newline-delimited) job config,
    // attaches the shared search connection string and logger, and runs a
    // search-doc crawler over it, reporting progress through StatusCallback.
    //
    // Splits a newline-delimited pattern blob into individual patterns.
    // NOTE: the previous code replaced '\r' with ' ' before splitting on
    // '\n', which left a trailing space on every pattern parsed from CRLF
    // input (and the `p != string.Empty` filter did not drop " " entries) —
    // a trailing space silently breaks regex matching downstream. Trimming
    // each entry fixes that and also drops whitespace-only lines.
    List<string> SplitPatterns(string raw)
    {
        return raw.Split('\n')
                  .Select(p => p.Trim())
                  .Where(p => p.Length > 0)
                  .ToList();
    }

    var job = new WebCrawlJob
    {
        SourceId = jobConfig.SourceId,
        SeedUrl = jobConfig.SeedUrl,
        Depth = jobConfig.Depth,
        // Strip session ids and fragment anchors so equivalent URLs dedupe.
        LinkCleanupPattern = new List<string> { ";jsessionid=.*$", "#.*$", },
        SearchConnectionString = SearchFactory<WebCrawlerSearchDoc>.SearchClient.SrchConnStr,
        Logger = SearchFactory<WebCrawlerSearchDoc>.Logger,
    };

    if (!string.IsNullOrEmpty(jobConfig.CrawlUrlPattern))
    {
        job.CrawlUrlPattern = SplitPatterns(jobConfig.CrawlUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.CrawlSkipUrlPattern))
    {
        job.CrawlSkipUrlPattern = SplitPatterns(jobConfig.CrawlSkipUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.IndexUrlPattern))
    {
        job.IndexUrlPattern = SplitPatterns(jobConfig.IndexUrlPattern);
    }
    if (!string.IsNullOrEmpty(jobConfig.IndexSkipUrlPattern))
    {
        job.IndexSkipUrlPattern = SplitPatterns(jobConfig.IndexSkipUrlPattern);
    }

    var crawler = new WebCrawler<WebCrawlerSearchDoc>(job, StatusCallback);
    return crawler.Run();
}