Example #1
        /// <summary>
        /// Builds a static export of the category tree into the specified destination folder.
        /// </summary>
        /// <param name="destinationFolder">The destination folder.</param>
        /// <param name="baseUrl">The base URL.</param>
        public void Build(string destinationFolder, string baseUrl)
        {
            generatedFiles.Clear();

            this.baseUrl           = baseUrl;
            this.destinationFolder = destinationFolder;
            rootFilePath           = Path.Combine(destinationFolder, DefaultPage);

            DirectoryHelper.Delete(destinationFolder);
            Directory.CreateDirectory(destinationFolder);

            downloader           = new PageDownloader(destinationFolder, new Uri(baseUrl));
            downloader.Processor = ClearHtml;

            folderTree = new StringBuilder(defaultHeader);
            IList<Category> categories = categoryManager.GetRootLevel();

            foreach (var category in categories)
            {
                ExportCategory(tocFile.ChildNodes, category, 0);
            }

            folderTree.AppendLine(defaultFooter);
            File.WriteAllText(rootFilePath, folderTree.ToString());
            generatedFiles.Add(rootFilePath);

            CreateJavascripts();
        }
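Example #1 drives a richer PageDownloader than the later examples: a (destinationFolder, baseUrl) constructor plus a pluggable Processor hook that rewrites each page before it is stored. A minimal sketch of that assumed surface, with every member name inferred from the call site above rather than from a published API:

        using System;
        using System.IO;
        using System.Net.Http;
        using System.Threading.Tasks;

        // Hypothetical reconstruction of the PageDownloader used in Example #1,
        // inferred from the call site only: a (folder, base Uri) constructor and
        // a Processor delegate applied to each page before it is written to disk.
        public class PageDownloader
        {
            private static readonly HttpClient http = new HttpClient();
            private readonly string destinationFolder;
            private readonly Uri baseUrl;

            public PageDownloader(string destinationFolder, Uri baseUrl)
            {
                this.destinationFolder = destinationFolder;
                this.baseUrl           = baseUrl;
            }

            // Example #1 assigns a method group (ClearHtml) here, so the hook is
            // presumably a string-to-string transform over the raw HTML.
            public Func<string, string> Processor { get; set; }

            public async Task<string> DownloadAsync(string relativePath)
            {
                var html = await http.GetStringAsync(new Uri(baseUrl, relativePath));
                if (Processor != null)
                    html = Processor(html);

                var filePath = Path.Combine(destinationFolder, Path.GetFileName(relativePath));
                File.WriteAllText(filePath, html);
                return filePath;
            }
        }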
Example #2
        public void Should_crawl_website()
        {
            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
            var persister     = new RavenDbCrawlPersister(documentStore);

            var urlHasher          = new UrlHasher();
            var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var websiteCrawler     = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new Website
            {
                RootUrl = "http://www.karenmillen.com/",
                MaxConcurrentConnections = 100
            });

            // task.Wait(new TimeSpan(0, 10, 0));
            // task.Wait(new TimeSpan(0, 2, 0));
            task.Wait();

            task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

            var result = task.Result;

            Console.WriteLine("Crawl completed: {0} urls crawled in {1}", result.NumberOfPagesCrawled, (result.CrawlEnded - result.CrawlStarted).ToString());
        }
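The two commented-out Wait overloads above bound the crawl to ten or two minutes, while the live task.Wait() blocks indefinitely. For reference, Task.Wait(TimeSpan) returns false on timeout rather than throwing, so a bounded wait that still surfaces the result needs nothing beyond the BCL; this helper is illustrative, not part of the crawler:

        using System;
        using System.Threading.Tasks;

        static class BoundedWait
        {
            // Waits up to `timeout` and reports whether the task actually
            // ran to completion within that window.
            public static bool TryResult<T>(Task<T> task, TimeSpan timeout, out T result)
            {
                if (task.Wait(timeout) && task.Status == TaskStatus.RanToCompletion)
                {
                    result = task.Result;
                    return true;
                }

                result = default(T);
                return false;
            }
        }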
Example #3
        // async void: fire-and-forget startup handler, so any exception here
        // surfaces on the UI synchronization context rather than on a Task.
        private async void GetPagesOnStart()
        {
            var tmp = await PageDownloader.DownloadPagesAsync(PageLibraryManager.GetLinks());

            pages = tmp;
            Log("Se ha completado la descarga inicial de páginas.");
            screenSecondary.Visibility = Visibility.Collapsed;
        }
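Example #3 awaits a static-looking PageDownloader.DownloadPagesAsync over a list of links. The real implementation is not shown anywhere on this page; a plausible sketch with HttpClient and Task.WhenAll, its signature inferred from this call site alone, would be:

        using System.Collections.Generic;
        using System.Linq;
        using System.Net.Http;
        using System.Threading.Tasks;

        public static class PageDownloader
        {
            private static readonly HttpClient http = new HttpClient();

            // Downloads all pages concurrently; the returned array preserves
            // the order of the input links.
            public static async Task<string[]> DownloadPagesAsync(IEnumerable<string> links)
            {
                var downloads = links.Select(url => http.GetStringAsync(url));
                return await Task.WhenAll(downloads);
            }
        }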
Example #4
 public BlogPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
 {
     _blogClient      = blogClient;
     _blogAccount     = blogAccount;
     _credentials     = credentials;
     _blogHomepageUrl = blogHomepageUrl;
     _pageDownloader  = pageDownloader;
 }
Example #6
        public void Should_extract_links_from_page()
        {
            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();

            var crawler = new SinglePageCrawler(htmlParser, pageDownloader);

            // Smoke test: no assertion here, the test only verifies the crawl does not throw.
            var result = crawler.Crawl(new Uri("http://vladpetroff.com"));
        }
Example #7
        static void Main()
        {
            XmlConfigurator.Configure();

            var log = LogManager.GetLogger(typeof(Program));

            var configuration  = new Configuration();
            var pageDownloader = new PageDownloader(configuration);
            var htmlParser     = new HtmlParser();
            var pageCrawler    = new SinglePageCrawler(htmlParser, pageDownloader);

            var urlHasher = new UrlHasher();

            var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
            // var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
            var persister = new RavenDbCrawlPersister(documentStore);

            // var crawlUrlRepository = new InMemoryCrawlUrlRepository();
            var crawlUrlRepository = new RedisCrawlUrlRepository();

            var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

            var task = websiteCrawler.RunAsync(new[] {
                new Website
                {
                    RootUrl = "http://www.karenmillen.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://uk.tommy.com/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://www.houseoffraser.co.uk/",
                    MaxConcurrentConnections = 25
                },
                new Website
                {
                    RootUrl = "http://vladpetroff.com/",
                    MaxConcurrentConnections = 25
                },
            });

            var result = task.Result;

            log.InfoFormat("Crawl completed: {0} urls crawled in {1}", result.Sum(x => x.NumberOfPagesCrawled), (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
        }
Example #8
        public static void Main(string[] args)
        {
            Console.WriteLine("Launch at " + DateTime.Now.ToString());
            Console.WriteLine("Version " + System.Reflection.Assembly.GetExecutingAssembly().GetName().Version.ToString());

            // Decide whether the update check should run at all
            if (!hasToCheckUpdate())
            {
                return;
            }

            // URL management
            UrlManager manager = new UrlManager();

            // URL extraction
            UrlExtractor extractor = new UrlExtractor();
            List<string> urls      = extractor.ExtractUrls();

            // Pick out the URLs we have not seen before
            List<string> newUrls = manager.selectNonExists(urls);

            Console.WriteLine("  total: " + urls.Count + ", new: " + newUrls.Count);

            // Download each new page, recording the URL only on success
            PageDownloader downloader = new PageDownloader();

            foreach (string url in newUrls)
            {
                System.Threading.Thread.Sleep(1000);

                Console.WriteLine("  new address: " + url);

                string fileName = downloader.Download(url);
                if (fileName != null)
                {
                    manager.addUrl(url);
                }
                else
                {
                    Console.WriteLine("    ...fail!!");
                }
            }

            Console.WriteLine("Finish at " + DateTime.Now.ToString());
        }
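Example #8 depends on a specific contract: Download(url) returns the saved file's name, or null on failure, which drives the manager.addUrl bookkeeping above. A minimal sketch honouring that contract (the class shape is inferred from the call sites; WebClient matches the era of this code):

        using System;
        using System.IO;
        using System.Net;

        public class PageDownloader
        {
            private readonly string saveFolder;

            // Example #8 constructs this without arguments, hence the default.
            public PageDownloader(string saveFolder = "pages")
            {
                this.saveFolder = saveFolder;
                Directory.CreateDirectory(saveFolder);
            }

            // Returns the path of the saved file, or null when the download
            // fails, matching the null check in Example #8.
            public string Download(string url)
            {
                var fileName = Path.Combine(saveFolder, Guid.NewGuid().ToString("N") + ".html");
                try
                {
                    using (var client = new WebClient())
                    {
                        client.DownloadFile(url, fileName);
                    }
                    return fileName;
                }
                catch (WebException)
                {
                    return null;
                }
            }
        }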
Example #9
        static void Main(string[] args)
        {
            var path     = Environment.GetFolderPath(Environment.SpecialFolder.Personal);
            var filename = Path.Combine(path, "change.csv");

            if (!File.Exists(filename))
            {
                IEnumerable<string> lines = new List<string>()
                {
                    "CHF/EUR EXCHANGE RATE",
                    "\"Date and time\";\"rate\""
                };
                File.AppendAllLines(filename, lines);
            }

            var _sut = new PageDownloader();

            var page   = _sut.DownloadPage();
            var result = page.GetChangeValue();

            File.AppendAllLines(filename, new[] { $"\"{DateTime.Now.ToString("s", CultureInfo.InvariantCulture)}\";\"{result.ToString(CultureInfo.InvariantCulture)}\"" });
        }
Example #10
 public TestsSample(PageDownloader sut)
 {
     _sut = sut;
 }
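A test class whose constructor receives the system under test, as in Example #10, is the shape xUnit uses for class fixtures: the framework builds the fixture once and hands it to every test in the class. A hedged sketch of that wiring (IClassFixture is real xUnit; the test body is illustrative and assumes PageDownloader has a parameterless constructor):

        using Xunit;

        public class TestsSample : IClassFixture<PageDownloader>
        {
            private readonly PageDownloader _sut;

            public TestsSample(PageDownloader sut)
            {
                _sut = sut;
            }

            [Fact]
            public void Fixture_is_injected()
            {
                Assert.NotNull(_sut);
            }
        }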
Example #11
 public TemporaryPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount,
                                           IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader, BlogPostRegionLocatorBooleanCallback promptForTempPost)
     : base(blogClient, blogAccount, credentials, blogHomepageUrl, pageDownloader)
 {
     this.containsBlogPosts = promptForTempPost;
 }
Example #12
 public RecentPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount,
                                        IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
     : base(blogClient, blogAccount, credentials, blogHomepageUrl, pageDownloader)
 {
 }
Example #15
 public WebBot(ILogger <WebBot> logger, PageDownloader pageDownloader)
 {
     this.logger         = logger;
     this.pageDownloader = pageDownloader;
 }
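Example #15's constructor pair (ILogger<WebBot> plus PageDownloader) is the usual Microsoft.Extensions.DependencyInjection shape. A minimal registration sketch, assuming PageDownloader needs no constructor arguments of its own:

        using Microsoft.Extensions.DependencyInjection;
        using Microsoft.Extensions.Logging;

        var services = new ServiceCollection();
        services.AddLogging(builder => builder.AddConsole());   // supplies ILogger<WebBot>
        services.AddSingleton<PageDownloader>();                // assumed parameterless
        services.AddTransient<WebBot>();

        using var provider = services.BuildServiceProvider();
        var bot = provider.GetRequiredService<WebBot>();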