Example #1
        public void Consume_PageProcessorThrowsException_DoesNotCrash()
        {
            //Arrange
            CrawlResult fakeResult = new CrawlResult {
                CrawlContext = GetCrawlContext(_dummyCrawlProcessors)
            };

            CrawledPage crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://www.adamthings.com"));

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult)
            .Callback(() => _fakeWebCrawler
                      .Raise(f => f.PageCrawlCompleted += null, new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), crawledPage)));
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
            _fakeProcessor1.Setup(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), crawledPage)).Throws(new Exception("oh no page"));

            //Act
            _uut.Consume(new Domain {
                DomainId = 1, Uri = new Uri("http://www.adamthings.com")
            }, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
            _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
            _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
        }
Example #2
        /// <summary>
        /// Crawl and parse data from a file.
        /// </summary>
        /// <param name="filename">Path and filename.</param>
        /// <returns>Parse result.</returns>
        public ParseResult ParseFromFile(string filename)
        {
            if (String.IsNullOrEmpty(filename))
            {
                throw new ArgumentNullException(nameof(filename));
            }

            ParseResult ret = new ParseResult();

            ret.Json = new ParseResult.JsonParseResult();

            FileCrawler crawler = new FileCrawler(filename);
            CrawlResult cr      = crawler.Get();

            if (!cr.Success)
            {
                ret.Time.End = DateTime.UtcNow;
                return(ret);
            }

            byte[] sourceData    = cr.Data;
            string sourceContent = Encoding.UTF8.GetString(sourceData);

            return(ProcessSourceContent(sourceContent));
        }
Example #3
        /// <summary>
        /// Download the object to the supplied filename.
        /// </summary>
        /// <param name="filename">The filename where the object should be saved.</param>
        /// <returns>Crawl result.</returns>
        public CrawlResult Download(string filename)
        {
            if (String.IsNullOrEmpty(filename))
            {
                throw new ArgumentNullException(nameof(filename));
            }

            CrawlResult ret = new CrawlResult();

            try
            {
                ret.Metadata      = CrawlResult.ObjectMetadata.FromFileInfo(new FileInfo(Filename));
                ret.ContentLength = new FileInfo(Filename).Length;

                using (FileStream source = new FileStream(Filename, FileMode.Open, FileAccess.Read))
                {
                    using (FileStream target = new FileStream(filename, FileMode.CreateNew, FileAccess.ReadWrite))
                    {
                        source.CopyTo(target);
                    }
                }

                ret.DataStream = null;
                ret.Success    = true;
            }
            catch (Exception e)
            {
                ret.Exception = e;
            }

            ret.Time.End = DateTime.UtcNow;
            return(ret);
        }
Example #4
        private static void Main(string[] args)
        {
            try
            {
                Uri uriToCrawl = GetSiteToCrawl();

                // I'm using the default crawler
                var crawler = new PoliteWebCrawler();

                // I need to subscribe to this event in order to process pages that have been crawled
                crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;

                // Start the crawl
                CrawlResult crawlResult = crawler.Crawl(uriToCrawl);

                // Generate report
                Task <ReportResult> reportTask = GenerateReport();

                PrintResultInformation(reportTask.Result);
            }
            catch (Exception ex)
            {
                System.Console.ForegroundColor = ConsoleColor.Red;
                System.Console.WriteLine("There was an error when trying to crawl page.");
                System.Console.Write(ex);
                System.Console.ReadKey();
            }
        }
Example #5
        public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection <IElement> elements)
        {
            Directory.CreateDirectory(@"c:\temp\WebScraper");

            using (var client = new HttpClient())
            {
                foreach (var img in elements.SelectMany(e => e.QuerySelectorAll("img")))
                {
                    var src = new Uri(crawlResult.RequestUrl, new Uri(img.Attributes["src"].Value, UriKind.RelativeOrAbsolute));

                    var fileName = Path.Combine(@"c:\temp\WebScraper", Path.GetFileName(src.LocalPath));

                    if (File.Exists(fileName) == false)
                    {
                        Console.WriteLine($"Downloading {src} to {fileName}");

                        using (var f = File.OpenWrite(fileName))
                            using (var s = client.GetStreamAsync(src).Result)
                            {
                                s.CopyTo(f);
                            }
                    }
                    else
                    {
                        Console.WriteLine($"Skipping download of {src} to {fileName}");
                    }

                    img.SetAttribute("data-local-src", fileName);
                }
            }

            return(elements.Select(this.htmlSelector).Aggregate((prod, next) => prod + "\n" + next));
        }
Example #6
        static void SqlCrawler()
        {
            string query = Common.InputString("Query:", null, true);

            if (String.IsNullOrEmpty(query))
            {
                return;
            }

            DbSettings db = new DbSettings(
                (DbType)(Enum.Parse(typeof(DbType), Common.InputString("DB type:", "Mysql", false))),
                Common.InputString("Hostname:", "localhost", false),
                Common.InputInteger("Port:", 3306, true, false),
                Common.InputString("Username:"******"root", false),
                Common.InputString("Password:"******"password", false),
                Common.InputString("Instance:", null, true),
                Common.InputString("Database name:", "dbname", false));

            SqlCrawler  sc = new SqlCrawler(db, query);
            CrawlResult cr = sc.Get();

            Console.WriteLine("Success    : " + cr.Success);
            Console.WriteLine("Start time : " + cr.Time.Start.ToString());
            Console.WriteLine("End time   : " + cr.Time.End.ToString());
            Console.WriteLine("Total ms   : " + cr.Time.TotalMs.ToString() + "ms");
            Console.WriteLine("Data       : ");
            if (cr.DataTable != null)
            {
                Console.WriteLine(Common.SerializeJson(Common.DataTableToListDynamic(cr.DataTable), true));
            }
            else
            {
                Console.WriteLine("  (null)");
            }
        }
Example #7
        static void S3Crawler()
        {
            string    endpoint  = Common.InputString("Endpoint:", null, true);
            bool      ssl       = Common.InputBoolean("SSL:", true);
            string    bucket    = Common.InputString("Bucket:", null, false);
            string    key       = Common.InputString("Key:", null, false);
            string    accessKey = Common.InputString("Access Key:", null, false);
            string    secretKey = Common.InputString("Secret Key:", null, false);
            AwsRegion region    = (AwsRegion)(Enum.Parse(typeof(AwsRegion), Common.InputString("Region:", "USWest1", false)));
            string    baseUrl   = Common.InputString("Base URL:", "http://localhost:8000/{bucket}/{key}", false);

            S3Crawler s3c = null;

            if (!String.IsNullOrEmpty(endpoint))
            {
                s3c = new S3Crawler(endpoint, ssl, bucket, key, accessKey, secretKey, region, baseUrl);
            }
            else
            {
                s3c = new S3Crawler(bucket, key, accessKey, secretKey, region);
            }

            CrawlResult cr = s3c.Get();

            Console.WriteLine("Success        : " + cr.Success);
            Console.WriteLine("Start time     : " + cr.Time.Start.ToString());
            Console.WriteLine("End time       : " + cr.Time.End.ToString());
            Console.WriteLine("Total ms       : " + cr.Time.TotalMs.ToString() + "ms");
            Console.WriteLine("Content length : " + cr.ContentLength + " bytes");
            Console.WriteLine("Metadata       : " + Common.SerializeJson(cr.Metadata, false));
            Console.WriteLine("Data           :" + Environment.NewLine + Encoding.UTF8.GetString(cr.Data));
        }
Example #8
        static void Main(string[] args)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 1;
            crawlConfig.MaxPagesToCrawl      = 1;


            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);


            crawler.PageCrawlStartingAsync  += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9")); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
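Examples #4 and #8 subscribe handlers such as crawler_ProcessPageCrawlCompleted that are not included in this listing. A minimal sketch of what they could look like, assuming Abot's standard PageCrawlStartingArgs/PageCrawlCompletedArgs event arguments (requires using Abot.Poco and System.Net; the handler bodies are illustrative only):

        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // Log the page that is about to be requested.
            Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
        }

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            // A failed request surfaces as a WebException or a non-200 status code.
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }
        }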
Example #9
        public CrawlResult Crawl()
        {
            IWebCrawler crawler = InitCrawler();

            Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list"); //http://rabota.ua/jobsearch/vacancy_list?pg=1000

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") &&
                    !pageToCrawl.Uri.AbsoluteUri.Contains(@"period"))
                {
                    return new CrawlDecision {
                        Allow = true
                    };
                }

                return(new CrawlDecision {
                    Allow = false, Reason = "Parse only job pages"
                });
            });

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(uriToCrawl);

            return(result);
        }
Example #10
        public void Consume_ValidDomain_CrawlerCrawlBagSet()
        {
            //Arrange
            Domain domain = new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            };
            CrawlContext context    = GetCrawlContext(_dummyCrawlProcessors);
            CrawlResult  fakeResult = new CrawlResult {
                CrawlContext = context
            };

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

            //Act
            DomainCrawlResult result = _uut.Consume(domain, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            Assert.AreEqual(domain, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
            Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
            Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
            Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
        }
Example #11
        /// <summary>
        /// Run the crawler.
        /// </summary>

        public void StartCrawl()
        {
            //Set up the crawler
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            //Register the crawl event handlers
            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //Start the crawl
            CrawlResult result = crawler.Crawl(new Uri(link)); //This is synchronous, it will not go to the next line until the crawl has completed

            //Report the result
            if (result.ErrorOccurred)
            {
                log.Error("链接" + result.RootUri.AbsoluteUri + "出现差错爬取完成:" + result.ErrorException.Message);
                Console.WriteLine("链接 {0} 出现差错爬取完成: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                log.Info("链接" + result.RootUri.AbsoluteUri + "无差错爬取完成!");
                Console.WriteLine("链接 {0} 无差错爬取完成.", result.RootUri.AbsoluteUri);
            }
            flag = false;
        }
Example #12
        public void Consume_DomainProcessorTimesOut_DoesNotCrash()
        {
            //Arrange
            _dummyConfig.MaxDomainProcessorTimeInMilliSecs = 1000;
            CrawlResult fakeResult = new CrawlResult {
                CrawlContext = GetCrawlContext(_dummyCrawlProcessors)
            };

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
            _fakeProcessor1.Setup(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()))
            .Callback((CrawlContext cc) => System.Threading.Thread.Sleep(10000));    //ten seconds

            //Act
            Stopwatch timer = Stopwatch.StartNew();

            _uut.Consume(new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            }, _dummyCancellationToken);
            timer.Stop();

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));
            _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));
            _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));

            Assert.IsTrue(timer.ElapsedMilliseconds > 900);
            Assert.IsTrue(timer.ElapsedMilliseconds < 2000);
        }
Example #13
        public void Consume_DomainProcessorThrowsException_DoesNotCrash()
        {
            //Arrange
            CrawlResult fakeResult = new CrawlResult {
                CrawlContext = GetCrawlContext(_dummyCrawlProcessors)
            };

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
            _fakeProcessor1.Setup(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>())).Throws(new Exception("oh no domain"));

            //Act
            _uut.Consume(new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            }, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));
            _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));
            _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(1));
        }
Example #14
        public void Consume_ValidDomain_AppConfigHttpStatusesToProcessNullWebResponse_CrawlsPerformed()
        {
            //Arrange
            CrawlResult fakeResult = new CrawlResult {
                CrawlContext = GetCrawlContext(_dummyCrawlProcessors)
            };

            CrawledPage crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://www.adamthings.com"));

            _dummyConfig.HttpStatusesToProcess = new string[] { };
            crawledPage.HttpWebResponse        = null;

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult)
            .Callback(() => _fakeWebCrawler
                      .Raise(f => f.PageCrawlCompleted += null, new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), crawledPage)));
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

            //Act
            _uut.Consume(new Domain {
                DomainId = 1, Uri = new Uri("http://www.adamthings.com")
            }, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
            _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
            _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(1));
        }
Example #15
        public bool Crawl(Site s)
        {
            m_currentSite = s;
            shouldAbort   = false;
            Program.Log("Beginning crawl of " + s.Name);
            Program.Status("Crawling " + s.Name);

            try
            {
                CrawlResult result = crawler.Crawl(new Uri(s.Uri));

                if (result.ErrorOccurred)
                {
                    Program.Log(String.Format("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message));
                    return(false);
                }
                else
                {
                    Program.Log(String.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri));
                }

                Program.Log(s.Name + " has been crawled and added to Milkshake successfully.");
                return(true);
            }
            catch (Exception e)
            {
                Program.Log(e.ToString());
                return(false);
            }
        }
Example #16
        /// <summary>
        /// Crawl all pages of the website and convert them to NLPTextDocuments
        /// </summary>
        public void ExtractNLPTextDocuments()
        {
            Perfs = new PerfMonitor();

            DisplayMessages(WriteStartMessage);
            DisplayMessages(Perfs.WriteStatusHeader);

            // This is synchronous, it will not go to the next line until the crawl has completed
            CrawlResult result = crawler.Crawl(ExtractorParams.RootUrl);

            Perfs.EndTime = DateTime.Now;

            // Write end status to log file
            Perfs.WriteStatus(messagesWriter);

            string endMessage = null;

            if (result.ErrorOccurred)
            {
                endMessage = "Extraction completed with fatal error \"" + result.ErrorException.Message + "\"";
            }
            else
            {
                endMessage = "Extraction completed";
            }
            DisplayMessages(WriteEndMessage, endMessage);
        }
Example #17
        /// <summary>
        /// Crawl, query, and parse data from a SQL database.
        /// </summary>
        /// <param name="dbSettings">Database settings.</param>
        /// <param name="query">Query to execute.</param>
        /// <returns>Parse result.</returns>
        public ParseResult ParseFromQuery(DbSettings dbSettings, string query)
        {
            if (dbSettings == null)
            {
                throw new ArgumentNullException(nameof(dbSettings));
            }
            if (String.IsNullOrEmpty(query))
            {
                throw new ArgumentNullException(nameof(query));
            }

            ParseResult ret = new ParseResult();

            ret.Sql = new ParseResult.SqlParseResult();

            SqlCrawler  crawler = new SqlCrawler(dbSettings, query);
            CrawlResult cr      = crawler.Get();

            if (!cr.Success)
            {
                ret.Time.End = DateTime.UtcNow;
                return(ret);
            }

            return(ProcessSourceContent(cr.DataTable));
        }
Example #18
        public async Task <CrawlResult> Crawl(Uri uri)
        {
            var httpClient = new HttpClient();
            var result     = new CrawlResult();

            try
            {
                var response = await httpClient.GetAsync(uri);

                var pageContents = await response.Content.ReadAsStringAsync();

                HtmlDocument pageDocument = new HtmlDocument();
                pageDocument.LoadHtml(pageContents);
                var links       = pageDocument.DocumentNode.Descendants("a");
                var linkedPages = links.Select(a => a.GetAttributeValue("href", null))
                                  .Where(u => !String.IsNullOrEmpty(u));
                result.ContainsHtml = true;
                result.Links        = links;
                result.LinkedPages  = linkedPages;
            } catch
            {
                result.ContainsHtml = false;
            }
            return(result);
        }
Example #19
        static void Main(string[] args)
        {
            log4net.Config.XmlConfigurator.Configure();
            PrintDisclaimer();

            Uri uriToCrawl = GetSiteToCrawl(args);

            IWebCrawler crawler;

            //Uncomment only one of the following to see that instance in action
            //crawler = GetDefaultWebCrawler();
            //crawler = GetManuallyConfiguredWebCrawler();
            crawler = GetCustomBehaviorUsingLambdaWebCrawler();

            //Subscribe to any of these asynchronous events; there are also synchronous versions of each.
            //This is where you process data about specific events of the crawl
            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //Start the crawl
            //This is a synchronous call
            CrawlResult result = crawler.Crawl(uriToCrawl);

            //Now go view the log.txt file that is in the same directory as this executable. It has
            //all the statements that you were trying to read in the console window :).
            //Not enough data being logged? Change the app.config file's log4net log level from "INFO" to "DEBUG"

            PrintDisclaimer();

            Console.ReadKey();
        }
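The factory methods referenced above (GetDefaultWebCrawler, GetManuallyConfiguredWebCrawler, GetCustomBehaviorUsingLambdaWebCrawler) are not part of this listing. As one illustration, a lambda-configured crawler could be built with Abot's ShouldCrawlPage decision delegate, in the same way Example #9 uses it; this is a sketch under that assumption, not the original helper:

        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            // Custom decision delegate: only crawl pages whose URL contains the
            // placeholder keyword "term"; everything else is skipped with a reason.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (!pageToCrawl.Uri.AbsoluteUri.Contains("term"))
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "Uri does not contain the required keyword"
                    };
                }

                return new CrawlDecision {
                    Allow = true
                };
            });

            return crawler;
        }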
Example #20
        public void Crawl(CrawlRequest request)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 1000;
            crawlConfig.UserAgentString      = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
            crawlConfig.MaxCrawlDepth            = 10;
            crawlConfig.DownloadableContentTypes = "text/html, text/plain";

            //Uses the manually created crawlConfig object from above
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
Example #21
        public async Task <CrawlResult> Scrape()
        {
            CrawlResult crawlResult = new CrawlResult();

            BrowserSettings browserSettings = new BrowserSettings
            {
                WindowlessFrameRate = 1
            };

            using (ChromiumWebBrowser browser = new ChromiumWebBrowser(browserSettings: browserSettings))
            {
                try
                {
                    await WaitForBrowserInit(browser);
                    await LoadAsync(browser, string.Format(CrawlDescription.SearchString, CrawlDescription.Keyword));

                    crawlResult.Ads = await BingExtractor.ExtractTextAds(browser);

                    crawlResult.CrawlResultID = CrawlResultID.Success;
                }
                catch (Exception ex)
                {
                    Log.Error("Scraper Exception({0}): {1}", ex.GetType(), ex.Message);
                    crawlResult.CrawlResultID = CrawlResultID.Failure;
                }
            }

            return(crawlResult);
        }
Example #22
        private void InitializeCommands()
        {
            openFileCommand = new CommandImpl(async() =>
            {
                LinkedList <string> sites = new LinkedList <string>();
                sites = processingOpenFileCommand();
                foreach (string link in sites)
                {
                    crawlerHandler.ConsoleOutput = link + "\n";
                }
                if (openFileCommand.CanExecute)
                {
                    openFileCommand.CanExecute = false;
                }

                result = await Task.Run(() => crawler.PerformCrawlingAsync(sites, 0));
                openFileCommand.CanExecute   = true;
                crawlerHandler.ConsoleOutput = result.getSite().Addres;
            });

            iAmAliveCommand = new CommandImpl(async() =>
            {
                ShowIAmAlive();
            });
        }
Example #23
        //protected void IsBrowserLoaded(object sender, System.EventArgs e)
        //{
        //	// No need more calls
        //	Browser.BrowserInitialized -= IsBrowserLoaded;

        //	// Continue main thread
        //	SemaphoreObj.Release();
        //}

        protected virtual /*async Task<*/ IEnumerable <CrawlResult> /*>*/ GetSitemapResults(IRobotsSitemap sitemap, CancellationTokenSource cancellationTokenSource)
        {
            List <CrawlResult> results = new List <CrawlResult>();

            if (!sitemap.IsLoaded)
            {
                sitemap = SitemapLoader.Load(sitemap);
            }

            if (sitemap.Sitemaps != null && sitemap.Sitemaps.Any())
            {
                Logger.InfoFormat("Sitemap: {0} | Inner sitemaps' count: {1}", sitemap.Location, sitemap.Sitemaps.Count());

                foreach (IRobotsSitemap derivedSitemap in sitemap.Sitemaps)
                {
                    results.AddRange(/*await*/ GetSitemapResults(derivedSitemap, cancellationTokenSource) /*.Result*/);
                }
            }

            if (sitemap.Items != null && sitemap.Items.Any())
            {
                Logger.InfoFormat("Sitemap: {0} | Uris' count: {1}", sitemap.Location, sitemap.Items.Count());

                CrawlContext.Scheduler.Add(sitemap.Items.Select(x => new PageToCrawl(x.Location)));

                CrawlResult crawlResult = new CrawlResult();
                CrawlComplete = false;
                //await Task.Run(() => ParallelCrawlSite(crawlResult));
                ParallelCrawlSite(crawlResult);
                results.Add(crawlResult);
            }

            return(results);
        }
Example #24
        /// <summary>
        /// Crawl and parse data from a URL.
        /// </summary>
        /// <param name="url">Source URL.</param>
        /// <returns>Parse result.</returns>
        public ParseResult ParseFromUrl(string url)
        {
            if (String.IsNullOrEmpty(url))
            {
                throw new ArgumentNullException(nameof(url));
            }

            ParseResult ret = new ParseResult();

            ret.Xml = new ParseResult.XmlParseResult();

            HttpCrawler crawler = new HttpCrawler(url);
            CrawlResult cr      = crawler.Get();

            if (!cr.Success)
            {
                ret.Time.End = DateTime.UtcNow;
                return(ret);
            }

            byte[] sourceData    = cr.Data;
            string sourceContent = Encoding.UTF8.GetString(sourceData);

            return(ProcessSourceContent(sourceContent));
        }
Example #25
        static void Main(string[] args)
        {
            CrawlConfiguration config = new CrawlConfiguration();

            config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

            // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
            // Make sure to dispose this instance or you will have a zombie process!
            IWebDriver driver = CreatePhantomJsDriver(config);

            // Create the content extractor that uses PhantomJS.
            IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

            // Create a PageRequester that will use the extractor.
            IPageRequester requester = new PageRequester(config, extractor);

            using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null)) {
                crawler.PageCrawlCompleted += OnPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
            }

            Console.Read();
        }
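Example #25 depends on a CreatePhantomJsDriver(config) helper that is not shown. A rough sketch of what it might do with Selenium's PhantomJS bindings follows; the service options and the use of HttpRequestTimeoutInSeconds are assumptions for illustration, not the original code:

        private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration config)
        {
            // Spawns a phantomjs.exe process; dispose the driver to avoid a zombie process.
            PhantomJSDriverService service = PhantomJSDriverService.CreateDefaultService();
            service.HideCommandPromptWindow = true;

            PhantomJSDriver driver = new PhantomJSDriver(service);

            // Assumption: reuse Abot's HTTP timeout setting for page loads.
            driver.Manage().Timeouts().SetPageLoadTimeout(TimeSpan.FromSeconds(config.HttpRequestTimeoutInSeconds));

            return driver;
        }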
Example #26
        public static void Main(string[] args)
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlCompletedAsync += Crawler_ProcessPageCrawlCompleted;
            var         start  = DateTime.Now;
            var         uri    = new Uri("https://lord.technology");
            CrawlResult result = crawler.Crawl(uri);

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
            var finish = DateTime.Now;

            Console.WriteLine((finish - start).TotalMinutes);

            using (FileStream fs = File.Open(@"./crawl.json", FileMode.Create))
                using (StreamWriter sw = new StreamWriter(fs))
                    using (JsonWriter jw = new JsonTextWriter(sw))
                    {
                        jw.Formatting = Formatting.Indented;
                        JsonSerializer serializer = new JsonSerializer();
                        serializer.Serialize(jw, new { nodes = _pages, edges = _relationships });
                    }
        }
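Example #26 serializes _pages and _relationships collections that are populated elsewhere. A minimal sketch of a completion handler that could fill such collections, assuming simple static lists and Abot's PageCrawlCompletedArgs; the element shapes are made up for illustration, and the lists are not thread-safe, so a multi-threaded crawl would need locking around the Add calls:

        private static readonly List <object> _pages         = new List <object>();
        private static readonly List <object> _relationships = new List <object>();

        private static void Crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            // One node per crawled page ...
            _pages.Add(new { id = crawledPage.Uri.AbsoluteUri });

            // ... and one edge from the page that linked to it, when there is one.
            if (crawledPage.ParentUri != null)
            {
                _relationships.Add(new { source = crawledPage.ParentUri.AbsoluteUri, target = crawledPage.Uri.AbsoluteUri });
            }
        }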
Example #27
        public static void StartCrawlEbuyer(string url)
        {
            try
            {
                PoliteWebCrawler crawler = new PoliteWebCrawler();
                crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;


                TimeSpan ts = new TimeSpan(0, 0, 5);
                CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
                CrawlResult             result = crawler.Crawl(new Uri(url), cancellationTokenSource);

                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
            }catch (Exception)
            {
            }
            ExtractingHtml.ExtractDetailsEbuyer();
        }
Example #28
//Crawling code for GSM
        public static void StartCrawlGSM(string url)
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStartingGSM;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompletedGSM;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowedGSM;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedGSM;


            TimeSpan ts = new TimeSpan(0, 0, 0);
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
            CrawlResult             result = crawler.Crawl(new Uri(url), cancellationTokenSource);

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }


            //FileStream fs = new FileStream("url.txt", FileMode.Open);
            //StreamReader sr = new StreamReader(fs);
            //string str = "";
            //while ((str = sr.ReadLine()) != null)
            //{
            //    StartCrawl(str);
            //}

            ExtractingHtml.ExtractingDetailsGSM();
        }
Example #29
        public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection <IElement> elements)
        {
            Directory.CreateDirectory(@"c:\temp\WebScraper");

            var result = new List <object>();

            using (var client = new HttpClient())
            {
                foreach (var element in elements.Where(e => e.TagName.Equals("a", StringComparison.OrdinalIgnoreCase)))
                {
                    var href     = new Uri(crawlResult.RequestUrl, new Uri(element.Attributes["href"].Value, UriKind.RelativeOrAbsolute));
                    var fileName = Path.Combine(@"c:\temp\WebScraper", Path.GetFileName(href.LocalPath));

                    if (File.Exists(fileName) == false)
                    {
                        Console.WriteLine($"Downloading {href} to {fileName}");

                        using (var f = File.OpenWrite(fileName))
                            using (var s = client.GetStreamAsync(href).Result)
                            {
                                s.CopyTo(f);
                            }
                    }
                    else
                    {
                        Console.WriteLine($"Skipping download of {href} to {fileName}");
                    }

                    result.Add(new { FileName = fileName, Title = element.TextContent });
                }

                return(result);
            }
        }
Example #30
        private void Start()
        {
            _logger.Info("Starting...");
            var url = this.UriTextBox.Text.Trim();

            if (_threadManager == null)
            {
                _threadManager = new TaskThreadManager(10);
                var crawler = CreateCrawler(_threadManager);
                _crawlerCancellationTS = new CancellationTokenSource();
                _products.Clear();
                _crawlerTask = new Task(() =>
                {
                    CrawlResult result = crawler.Crawl(new Uri(url), _crawlerCancellationTS);
                    OnCrawlerCompleted();
                }, _crawlerCancellationTS.Token);
                _crawlerTask.Start();
            }
            else
            {
                _threadManager.Resume();
            }

            NotifyUIOnStatusChange(true);
        }
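Example #30 builds its crawler through a CreateCrawler(_threadManager) helper that is not part of this listing. A minimal sketch, assuming it uses the nine-argument PoliteWebCrawler constructor seen elsewhere in this listing, where the third argument is the IThreadManager and every null falls back to Abot's default implementation:

        private IWebCrawler CreateCrawler(IThreadManager threadManager)
        {
            CrawlConfiguration config = new CrawlConfiguration();

            config.MaxConcurrentThreads = 10;
            config.MaxPagesToCrawl      = 1000;

            // Supply only the externally owned thread manager; let Abot create the rest.
            return new PoliteWebCrawler(config, null, threadManager, null, null, null, null, null, null);
        }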
 public FeedFinishedInfo(DbObjectStandardFeed feed, int index, int count, CrawlResult result)
 {
     this.Feed = feed;
     this.Index = index;
     this.Count = count;
     this.Result = result;
 }