Example #1
        private Dictionary<string, string> LookUpLinkAndScrapLinks(string url, string sectionName)
        {
            var subLinks = new Dictionary<string, string>();
            string targetSiteContent = CrawlerEngine.GetResponseString(url);
            var doc = new HtmlDocument();

            doc.LoadHtml(targetSiteContent);

            // Find anchors with a non-empty href whose text contains the section name.
            var nodes = doc.DocumentNode.SelectNodes($"//a[@href!=\"\"][contains(text(),'{sectionName}')]");

            if (nodes == null)
            {
                // No match for the full section name; retry with its first word only.
                sectionName = sectionName.Split(' ')[0];
                nodes       = doc.DocumentNode.SelectNodes($"//a[@href!=\"\"][contains(text(),'{sectionName}')]");
            }

            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    // Skip duplicate hrefs explicitly instead of swallowing the
                    // ArgumentException that Dictionary.Add would throw.
                    string href = node.GetAttributeValue("href", "");
                    if (!subLinks.ContainsKey(href))
                    {
                        subLinks.Add(href, node.InnerText);
                    }
                }
            }

            return subLinks;
        }
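On runtimes that provide Dictionary<TKey, TValue>.TryAdd (.NET Core 2.0+ / .NET Standard 2.1), the explicit duplicate check above collapses into a single call — a minimal sketch of the same loop:

                foreach (var node in nodes)
                {
                    // TryAdd returns false instead of throwing when the href is already present.
                    subLinks.TryAdd(node.GetAttributeValue("href", ""), node.InnerText);
                }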
Example #2
            public static void SimpleTaskSample(string url)
            {
                Console.WriteLine("Start Simple Task Sample");
                var engineConfig = new CrawlerEngineConfig();
                var engine       = new CrawlerEngine(engineConfig);

                engine.Start();
                Console.WriteLine();
                Console.WriteLine("Start Task");

                // The original sample read the URL from a UI textbox (URLbox2.Text),
                // which does not compile in a static method; take it as a parameter.
                Uri fileURI = new Uri(url);

                engine.AddTask(new SimpleTaskRequest {
                    Url = fileURI
                });

                // Poll until the engine has no queued or running tasks,
                // printing the links from each finished task as it completes.
                while (!engine.IsIdle)
                {
                    Thread.Sleep(100);
                    TaskResultBase[] results = engine.GetFinishedTaskResults();
                    if (results.Length > 0)
                    {
                        Console.WriteLine();
                        Console.WriteLine("Processing Results");
                        foreach (SimpleTaskResult result in results)
                        {
                            foreach (var link in result.Links)
                            {
                                Console.WriteLine(link);
                            }
                        }
                    }
                }
            }
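One caveat in the loop above: tasks that finish between the last poll and IsIdle turning true are never printed. Assuming GetFinishedTaskResults simply drains whatever has completed so far, one extra call after the loop closes that window:

                // Drain any results that completed after the final poll.
                foreach (SimpleTaskResult result in engine.GetFinishedTaskResults())
                {
                    foreach (var link in result.Links)
                    {
                        Console.WriteLine(link);
                    }
                }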
Example #3
        public void IsRobotRestricted()
        {
            var engine = new CrawlerEngine();

            Assert.False(engine.IsRobotRestricted(new Uri("https://www.penny-arcade.com/comic")).Result);
            Assert.True(engine.IsRobotRestricted(new Uri("https://www.penny-arcade.com/feed/test")).Result);
        }
Example #4
        public void DisallowedLinks()
        {
            var engine = new CrawlerEngine();
            var links  = engine.RobotRestrictions(new Uri("https://www.penny-arcade.com/comic")).Result;

            Assert.True(links.Any(x => x == "/feed/"), "/feed/");
        }
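Examples #3 and #4 together imply what the target site's robots.txt must contain for the assertions to hold — at minimum a rule like the following (inferred from the tests, not copied from the live file):

    User-agent: *
    Disallow: /feed/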
Example #5
        public void GoogleSearchTest()
        {
            ICrawler     crawler = new CrawlerEngine();
            SearchResult result  = crawler.Search(new GoogleSearchEngine(), "Java script");

            Assert.IsTrue(result.Value > 0);
        }
Example #6
        public void GetLinks_ReturnsLinks()
        {
            var engine = new CrawlerEngine();
            var links  = engine.GetLinks(new Uri("https://www.penny-arcade.com/comic")).Result;

            Assert.True(links.Any(x => x.OriginalString.Contains("http://paxsite.com/")), "http://paxsite.com/");
            Assert.True(links.Any(x => x.OriginalString.Contains("http://forums.penny-arcade.com/")), "http://forums.penny-arcade.com/");
            Assert.True(links.Any(x => x.OriginalString.Contains("https://www.penny-arcade.com/clubpa")), "/clubpa");
            Assert.False(links.Any(x => x.OriginalString.Contains("https://www.penny-arcade.com/archive/results/search")), "https://www.penny-arcade.com/archive/results/search");
        }
Example #7
 private void StartCrawler()
 {
     // Cap the engine at 10 new tasks per minute, 3 concurrent tasks,
     // and up to 1000 finished results kept in memory.
     CrawlerEngineConfig config = new CrawlerEngineConfig();
     config.MaxTasksPerMinute = 10;
     config.MaxFinishedTasks = 1000;
     config.MaxWorkingTasks = 3;
     using (CrawlerEngine engine = new CrawlerEngine(config))
     {
         // Start the engine and enqueue tasks here; leaving the using
         // block disposes the engine.
     }
 }
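The method above configures an engine and immediately disposes it without doing any work. Folding in the task API from Example #2 (Start, AddTask, SimpleTaskRequest, IsIdle) gives a fuller sketch, assuming those members behave as in that sample:

 private void StartCrawler(Uri target)
 {
     CrawlerEngineConfig config = new CrawlerEngineConfig();
     config.MaxTasksPerMinute = 10;
     config.MaxFinishedTasks = 1000;
     config.MaxWorkingTasks = 3;
     using (CrawlerEngine engine = new CrawlerEngine(config))
     {
         engine.Start();
         engine.AddTask(new SimpleTaskRequest { Url = target });

         // Let the queue drain before the using block disposes the engine.
         while (!engine.IsIdle)
         {
             Thread.Sleep(100);
         }
     }
 }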
Example #8
        static void Main(string[] args)
        {
            string strMIMETypes = @"text/richtext[0,0];text/html[0,0];audio/x-aiff[0,0];";

            strMIMETypes += @"audio/basic[0,0];audio/wav[0,0];image/gif[0,0];image/jpeg[0,0];";
            strMIMETypes += @"image/pjpeg[0,0];image/tiff[0,0];image/x-png[0,0];image/x-xbitmap[0,0];";
            strMIMETypes += @"image/bmp[0,0];image/x-jg[0,0];image/x-emf[0,0];image/x-wmf[0,0];";
            strMIMETypes += @"video/avi[0,0];video/mpeg[0,0];application/postscript[0,0];application/base64[0,0];";
            strMIMETypes += @"application/macbinhex40[0,0];application/pdf[0,0];application/x-compressed[0,0];";
            strMIMETypes += @"application/x-zip-compressed[0,0];application/x-gzip-compressed[0,0];";
            strMIMETypes += @"application/java[0,0];application/x-msdownload[0,0];";
            CrawlerSettings settings = new CrawlerSettings();

            settings.allowAllMIMETypes_ = false;
            settings.downloadfolder_    = "downloadfolder1";
            settings.excludeFiles_      = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" };
            settings.excludeHosts_      = new string[] { "" };
            settings.excludeWords_      = new string[] { "" };
            settings.keepAlive_         = false;
            settings.keepSameServer_    = false;
            settings.lastRequestCount_  = 0;
            settings.allowedMIMETypes_  = strMIMETypes;
            settings.requestTimeout_    = 10;
            settings.sleepConnectTime_  = 0;
            settings.sleepFetchTime_    = 0;
            settings.threadsCount_      = 1;
            settings.maxThreadCount_    = 20;
            settings.maxDepth_          = 1;
            settings.filePath_          = "CrawlerConsoleSettings.txt";
            settings.lastModified_      = DateTime.Now;
            settings.version_           = 1;
            settings.dataTypeName_      = "CrawlerConsoleSettings";
            // Persist the settings, then reload them from the same file.
            settings.WriteToFile();
            settings.ReadFromFile("CrawlerConsoleSettings.txt");

            CrawlerInput input = new CrawlerInput();

            input.domain_  = "baidu.com";
            input.fullUrl_ = "www.baidu.com";

            CrawlerOutput output = new CrawlerOutput();
            CrawlerEngine engine = new CrawlerEngine(settings, input, output);

            engine.RunCrawling();
        }
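The repeated += above rebuilds the string once per line; the same allowedMIMETypes_ value can be assembled from a plain array, which is easier to extend. A minimal equivalent (requires using System.Linq; each entry keeps the "[0,0];" suffix used above):

            string[] mimeTypes =
            {
                "text/richtext", "text/html", "audio/x-aiff", "audio/basic",
                "audio/wav", "image/gif", "image/jpeg", "image/pjpeg",
                "image/tiff", "image/x-png", "image/x-xbitmap", "image/bmp",
                "image/x-jg", "image/x-emf", "image/x-wmf", "video/avi",
                "video/mpeg", "application/postscript", "application/base64",
                "application/macbinhex40", "application/pdf",
                "application/x-compressed", "application/x-zip-compressed",
                "application/x-gzip-compressed", "application/java",
                "application/x-msdownload"
            };
            string strMIMETypes = string.Concat(mimeTypes.Select(t => t + "[0,0];"));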
Example #9
        public void SmokeTest()
        {
            var rootPath = @"C:\1\";
            var uri      = new Uri("http://ya.ru"); // forward slashes; the original @"http:\\ya.ru" relied on Uri's backslash normalization
            var dwnMock  = new Moq.Mock<IDownloader>();

            // Stub the downloader to return a canned UTF-8 HTML page for the seed URI.
            dwnMock.Setup(i => i.Download(uri))
            .Returns(() =>
            {
                var result = new DownloaderResult(uri);
                result.SetResponseData(new WebPageContent()
                {
                    Encoding      = Encoding.UTF8,
                    IsHtmlContent = true,
                    Bytes         = Encoding.UTF8.GetBytes(_yaSiteHtml)
                });
                return result;
            });

            var pageFileStorageMock = new Moq.Mock<IPageFileSystemStorage>();

            pageFileStorageMock.Setup(i => i.SavePage(null, rootPath, true));

            var downLoader         = dwnMock.Object;
            var settings           = new CrawlerSettings();
            var downloadManager    = new DownloadManager(downLoader, new InMemoryLinkDataStorage(), settings);
            var webPageLinkManager = new WebPageLinkManager();
            var engine             = new CrawlerEngine(downloadManager, webPageLinkManager, pageFileStorageMock.Object);

            var taskSettings = new CrawlerTaskSettings()
            {
                CrawlDepth         = 1,
                IgnoreOtherDomains = false,
                ReplaceUrlToLocal  = true
            };

            var task = new CrawlerTask(uri, rootPath, taskSettings);

            var page = engine.ProcessCrawlerTask(task).Result;

            // Assert.AreEqual takes the expected value first.
            Assert.AreEqual(uri, page.Uri);
            Assert.IsTrue(page.IsHtml);
            Assert.AreEqual(_yaSiteHtml, page.Html);
        }
Example #10
        public CrawlerEngineTests()
        {
            this.crawlerRepository  = Substitute.For<ICrawlerRepository>();
            this.httpMessageHandler = Substitute.For<IHttpMessageHandler>();

            var httpClient = new HttpClient(new FakeHttpMessageHandler(this.httpMessageHandler));

            this.crawlerEngine = new CrawlerEngine(this.crawlerRepository, httpClient);

            this.uri = new Uri("http://localhost.crawl.com");
            // The repository hands out the seed URL, and the fake handler
            // answers every request with a single-link HTML page.
            this.crawlerRepository.GetNext().Returns(new CrawlItem()
            {
                Url = uri.ToString()
            });
            this.httpMessageHandler.SendAsync(Arg.Any <HttpRequestMessage>(), Arg.Any <CancellationToken>())
            .Returns(new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent("<a href=\"http://localhost/\"></a>")
            });
        }
Example #11
        private string LookUpLinkAndScrapQuizes(string url, string sectionName)
        {
            string targetSiteContent = CrawlerEngine.GetResponseString(BaseUrl + url);

            // The quiz page embeds its content in an iframe; pull out the iframe's src.
            var iframeMatch = Regex.Match(targetSiteContent, "<iframe.+src=\\\"(?<1>.*?)\\\"");

            if (iframeMatch.Success)
            {
                url = BaseUrl + iframeMatch.Groups[1].Value;
                targetSiteContent = CrawlerEngine.GetResponseString(url);

                // The iframe page stores the quiz payload as Base64: var data = "...";
                var base64DataEncodedMatch = Regex.Match(targetSiteContent, "var data = \\\"(?<1>.*?)\\\";");

                if (base64DataEncodedMatch.Success)
                {
                    return CrawlerEngine.DecodeBase64String(base64DataEncodedMatch.Groups[1].Value);
                }
            }

            return null;
        }
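CrawlerEngine.DecodeBase64String is not shown anywhere in these examples; assuming it simply decodes the Base64 payload to UTF-8 text, a minimal stand-in looks like this:

        // Hypothetical equivalent of CrawlerEngine.DecodeBase64String:
        // decode the Base64 payload and read the bytes as UTF-8 text.
        private static string DecodeBase64String(string encoded)
        {
            byte[] bytes = Convert.FromBase64String(encoded);
            return Encoding.UTF8.GetString(bytes);
        }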
Example #12
 private void TestCrawl()
 {
     var crawlerInstance = new CrawlerEngine();
     var seed = new SeedDTO { SeedDomainName = "http://webometrics.krc.karelia.ru/" };
     crawlerInstance.StartCrawlingProcess(new[] { seed });
 }
Example #13
        public ICollection<QuizModuleGroup> Scrapping()
        {
            List<QuizModuleGroup> moduleGroups = new List<QuizModuleGroup>();

            QuizModuleGroup moduleGroup = new QuizModuleGroup();

            moduleGroup.Title  = "EnglishGrammar";
            moduleGroup.Source = "EnglishTestStore";
            moduleGroup.Status = QuizStatus.JustCreated;

            string targetSiteContent = CrawlerEngine.GetResponseString(BaseUrl + "/index.php?option=com_content&view=article&id=11387&Itemid=380");
            var    doc = new HtmlDocument();

            doc.LoadHtml(targetSiteContent);

            moduleGroup.QuizModules = new List<QuizModule>();

            // The first table in the article body lists the grammar section links.
            var englishGrammarSectionNodes = doc.DocumentNode.SelectNodes("//div[@itemprop='articleBody']//table[1]//li/a[@href]");

            if (englishGrammarSectionNodes != null)
            {
                QuizModule module = new QuizModule();
                module.Title             = "Basic English Grammar exercises and tests";
                module.Source            = "EnglishTestStore";
                module.Status            = QuizStatus.JustCreated;
                module.QuizGroupSections = new List<QuizGroupSection>();

                foreach (var node in englishGrammarSectionNodes.Take(1))
                {
                    QuizGroupSection section = new QuizGroupSection();
                    section.Title      = node.InnerText.Trim();
                    section.Status     = QuizStatus.JustCreated;
                    section.QuizGroups = new List<QuizGroup>();

                    string url      = BaseUrl + node.GetAttributeValue("href", "");
                    var    subLinks = LookUpLinkAndScrapLinks(url, section.Title);

                    foreach (var link in subLinks)
                    {
                        QuizGroup group = new QuizGroup();
                        // Use the sub-link's own text; the original reused the outer
                        // node's text here, giving every group the same title.
                        group.Title = link.Value.Trim();
                        string json = LookUpLinkAndScrapQuizes(link.Key, link.Value);
                        group.Quizes = ParseJsonToModel(json);

                        section.QuizGroups.Add(group);
                    }

                    module.QuizGroupSections.Add(section);
                }

                moduleGroup.QuizModules.Add(module);
            }

            moduleGroups.Add(moduleGroup);

            // TODO: scrape the English tense, advanced English grammar, and
            // commonly confused words sections the same way.

            return moduleGroups;
        }