Ejemplo n.º 1
0
        public void TestWebCrawl()
        {
            // Assembles a crawl job seeded at http://stormwater.wef.org/2016/12/ and
            // constructs a crawler for it. The crawl itself is not executed (see the
            // end of the method), so only job setup and crawler construction run.

            // The crawl-URL pattern list is left empty; a candidate pattern is kept for reference:
            //   "http://stormwater.wef.org/\\d.*/\\d.*/"
            var crawlUrlPatterns = new List<string>();

            var indexUrlPatterns = new List<string> { "http://stormwater.wef.org/\\d.*/\\d.*/.*" };
            var titlePatterns    = new List<string> { "\\body\\h1" };
            var contentPatterns  = new List<string> { "/div[@id='content']" };

            // Properties are assigned in the same order as the original initializer.
            var job = new WebCrawlJob();
            job.SeedUrl         = "http://stormwater.wef.org/2016/12/";
            job.Depth           = 1;
            job.SourceId        = 103;
            job.CrawlUrlPattern = crawlUrlPatterns;
            job.IndexUrlPattern = indexUrlPatterns;
            job.TitlePattern    = titlePatterns;
            job.SummaryPattern  = "/meta[@property='og:description']";
            job.ContentPattern  = contentPatterns;

            var crawler = new WebCrawler<WebCrawlBasePage, WebCrawlerSearchDoc>(job);

            // Running the crawl is currently disabled:
            //var results = crawler.Run();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Translates a <see cref="WebCrawlJobConfig"/> into a <see cref="WebCrawlJob"/>,
        /// runs a crawler over it and returns the crawl results.
        /// </summary>
        /// <param name="jobConfig">
        /// Job settings. The multi-valued pattern fields are strings holding one pattern per line.
        /// </param>
        /// <returns>The value returned by the crawler's <c>Run()</c> call.</returns>
        private CrawlerResults RunCrawler(WebCrawlJobConfig jobConfig)
        {
            var job = new WebCrawlJob();

            job.SourceId = jobConfig.SourceId;
            job.SeedUrl  = jobConfig.SeedUrl;
            job.Depth    = jobConfig.Depth;

            // Optional pattern lists: only assigned when the config supplies text, so the
            // job keeps whatever default it was constructed with otherwise.
            if (!string.IsNullOrEmpty(jobConfig.CrawlUrlPattern))
            {
                job.CrawlUrlPattern = SplitPatternLines(jobConfig.CrawlUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.CrawlSkipUrlPattern))
            {
                job.CrawlSkipUrlPattern = SplitPatternLines(jobConfig.CrawlSkipUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.IndexUrlPattern))
            {
                job.IndexUrlPattern = SplitPatternLines(jobConfig.IndexUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.IndexSkipUrlPattern))
            {
                job.IndexSkipUrlPattern = SplitPatternLines(jobConfig.IndexSkipUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.ContentPattern))
            {
                job.ContentPattern = SplitPatternLines(jobConfig.ContentPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.MetadataPattern))
            {
                job.MetadataPattern = SplitPatternLines(jobConfig.MetadataPattern);
            }

            // The title pattern list is always assigned (an empty list when nothing is
            // configured). The helper tolerates null, which previously threw here.
            job.TitlePattern   = SplitPatternLines(jobConfig.TitlePattern);
            job.SummaryPattern = jobConfig.SummaryPattern;

            // Patterns handed to the job for link cleanup: a ";jsessionid=..." suffix and a "#..." fragment.
            job.LinkCleanupPattern = new List<string>()
            {
                ";jsessionid=.*$",
                "#.*$",
            };

            var crawler = new WebCrawler<WebCrawlerPage, WebCrawlerSearchDoc>(job, StatusCallback);

            return crawler.Run();
        }

        /// <summary>
        /// Splits newline-separated pattern text into a list, trimming each line and
        /// dropping blank ones.
        /// </summary>
        /// <param name="patterns">Raw pattern text; may be null or empty.</param>
        /// <returns>The non-blank patterns in input order; an empty list for null or empty input.</returns>
        private static List<string> SplitPatternLines(string patterns)
        {
            if (string.IsNullOrEmpty(patterns))
            {
                return new List<string>();
            }

            // Trim() strips the '\r' left by CRLF line endings. The previous code replaced
            // it with a space, leaving a trailing blank inside every pattern.
            return patterns.Split('\n')
                           .Select(line => line.Trim())
                           .Where(line => line.Length > 0)
                           .ToList();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Translates a <see cref="WebCrawlJobConfig"/> into a <see cref="WebCrawlJob"/>,
        /// attaches the search connection string and logger, runs a crawler over it and
        /// returns the crawl results.
        /// </summary>
        /// <param name="jobConfig">
        /// Job settings. The URL pattern fields are strings holding one pattern per line.
        /// </param>
        /// <returns>The value returned by the crawler's <c>Run()</c> call.</returns>
        private CrawlerResults RunCrawler(WebCrawlJobConfig jobConfig)
        {
            var job = new WebCrawlJob();

            job.SourceId = jobConfig.SourceId;
            job.SeedUrl  = jobConfig.SeedUrl;
            job.Depth    = jobConfig.Depth;

            // Optional pattern lists: only assigned when the config supplies text, so the
            // job keeps whatever default it was constructed with otherwise.
            if (!string.IsNullOrEmpty(jobConfig.CrawlUrlPattern))
            {
                job.CrawlUrlPattern = ParseUrlPatterns(jobConfig.CrawlUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.CrawlSkipUrlPattern))
            {
                job.CrawlSkipUrlPattern = ParseUrlPatterns(jobConfig.CrawlSkipUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.IndexUrlPattern))
            {
                job.IndexUrlPattern = ParseUrlPatterns(jobConfig.IndexUrlPattern);
            }

            if (!string.IsNullOrEmpty(jobConfig.IndexSkipUrlPattern))
            {
                job.IndexSkipUrlPattern = ParseUrlPatterns(jobConfig.IndexSkipUrlPattern);
            }

            // Search connection string and logger both come from the shared search factory.
            job.SearchConnectionString = SearchFactory<WebCrawlerSearchDoc>.SearchClient.SrchConnStr;
            job.Logger = SearchFactory<WebCrawlerSearchDoc>.Logger;

            // Patterns handed to the job for link cleanup: a ";jsessionid=..." suffix and a "#..." fragment.
            job.LinkCleanupPattern = new List<string>()
            {
                ";jsessionid=.*$",
                "#.*$",
            };

            var crawler = new WebCrawler<WebCrawlerSearchDoc>(job, StatusCallback);

            return crawler.Run();
        }

        /// <summary>
        /// Splits newline-separated URL pattern text into a list, trimming each line and
        /// dropping blank ones.
        /// </summary>
        /// <param name="rawPatterns">Raw pattern text; may be null or empty.</param>
        /// <returns>The non-blank patterns in input order; an empty list for null or empty input.</returns>
        private static List<string> ParseUrlPatterns(string rawPatterns)
        {
            if (string.IsNullOrEmpty(rawPatterns))
            {
                return new List<string>();
            }

            // Trim() strips the '\r' left by CRLF line endings. The previous code replaced
            // it with a space, leaving a trailing blank inside every pattern.
            return rawPatterns.Split('\n')
                              .Select(line => line.Trim())
                              .Where(line => line.Length > 0)
                              .ToList();
        }