/// <summary>
/// Downloads the page at <paramref name="url"/> and collects the anchors that have a non-empty
/// href and whose text contains <paramref name="sectionName"/>.
/// </summary>
/// <param name="url">Address passed as-is to <c>CrawlerEngine.GetResponseString</c>.</param>
/// <param name="sectionName">
/// Text the anchor must contain. When nothing matches, the lookup is retried with only the part
/// before the first space.
/// </param>
/// <returns>
/// A map from href to anchor inner text. When the same href occurs more than once the first
/// occurrence is kept. Empty when no anchor matches.
/// </returns>
private Dictionary<string, string> LookUpLinkAndScrapLinks(string url, string sectionName)
{
    var subLinks = new Dictionary<string, string>();

    string targetSiteContent = CrawlerEngine.GetResponseString(url);
    var doc = new HtmlDocument();
    doc.LoadHtml(targetSiteContent);

    var nodes = doc.DocumentNode.SelectNodes(BuildAnchorTextXPath(sectionName));
    if (nodes == null)
    {
        // Nothing matched the full title: fall back to its first word.
        sectionName = sectionName.Split(' ')[0];
        nodes = doc.DocumentNode.SelectNodes(BuildAnchorTextXPath(sectionName));
    }

    if (nodes == null)
    {
        return subLinks;
    }

    foreach (var node in nodes)
    {
        string href = node.GetAttributeValue("href", "");

        // Skip explicitly instead of relying on Dictionary.Add throwing and an empty catch:
        // a repeated (or unusable) href keeps the first entry, and real errors are no longer hidden.
        if (href == null || subLinks.ContainsKey(href))
        {
            continue;
        }

        subLinks.Add(href, node.InnerText);
    }

    return subLinks;
}

/// <summary>
/// Builds the XPath selecting anchors with a non-empty href whose text contains
/// <paramref name="text"/>, quoting the text so that apostrophes and double quotes
/// cannot break the expression.
/// </summary>
private static string BuildAnchorTextXPath(string text)
{
    string value = text ?? string.Empty;
    string literal;

    if (value.IndexOf('\'') < 0)
    {
        // Common case: yields exactly the expression the single-quoted form always produced.
        literal = "'" + value + "'";
    }
    else if (value.IndexOf('"') < 0)
    {
        literal = "\"" + value + "\"";
    }
    else
    {
        // XPath 1.0 has no escape character, so a value holding both quote kinds
        // has to be stitched together with concat().
        literal = "concat('" + string.Join("', \"'\", '", value.Split('\'')) + "')";
    }

    return "//a[@href!=\"\"][contains(text()," + literal + ")]";
}
/// <summary>
/// Starts a <c>CrawlerEngine</c> with a default configuration, queues one
/// <c>SimpleTaskRequest</c> for the URL typed into <c>URLbox2</c>, and prints every link of
/// every finished result to the console until the engine reports idle.
/// </summary>
/// <remarks>
/// NOTE(review): the engine is never stopped or disposed here — confirm whether
/// <c>CrawlerEngine</c> needs explicit cleanup.
/// </remarks>
public static void SimpleTaskSample()
{
    Console.WriteLine("Start Simple Task Sample");
    var engineConfig = new CrawlerEngineConfig();
    var engine = new CrawlerEngine(engineConfig);
    engine.Start();

    Console.WriteLine();
    Console.WriteLine("Start Task");
    Uri fileURI = new Uri(URLbox2.Text);
    engine.AddTask(new SimpleTaskRequest { Url = fileURI });

    bool idle = engine.IsIdle;
    while (!idle)
    {
        Thread.Sleep(100);

        // Sample the idle flag BEFORE draining. Checking it after the drain (as a plain
        // `while (!engine.IsIdle)` does) lets a task that finishes in between end the loop
        // with its results still unread.
        idle = engine.IsIdle;

        TaskResultBase[] results = engine.GetFinishedTaskResults();
        if (results.Length > 0)
        {
            Console.WriteLine();
            Console.WriteLine("Processing Results");
            foreach (SimpleTaskResult result in results)
            {
                foreach (var link in result.Links)
                {
                    Console.WriteLine(link);
                }
            }
        }
    }
}
/// <summary>
/// Asserts that <c>CrawlerEngine.IsRobotRestricted</c> reports the penny-arcade comic page as
/// not restricted and a URL under <c>/feed/</c> as restricted.
/// </summary>
/// <remarks>Both calls are awaited synchronously through <c>.Result</c>.</remarks>
public void IsRobotRestricted()
{
    var engine = new CrawlerEngine();
    var comicPage = new Uri("https://www.penny-arcade.com/comic");
    var feedPage = new Uri("https://www.penny-arcade.com/feed/test");

    var comicRestricted = engine.IsRobotRestricted(comicPage).Result;
    Assert.False(comicRestricted);

    var feedRestricted = engine.IsRobotRestricted(feedPage).Result;
    Assert.True(feedRestricted);
}
/// <summary>
/// Asserts that the robot restrictions returned for the penny-arcade comic page
/// contain the entry <c>/feed/</c>.
/// </summary>
/// <remarks>The call is awaited synchronously through <c>.Result</c>.</remarks>
public void DisallowedLinks()
{
    var engine = new CrawlerEngine();
    var comicPage = new Uri("https://www.penny-arcade.com/comic");

    var restrictions = engine.RobotRestrictions(comicPage).Result;
    var listsFeed = restrictions.Any(entry => entry == "/feed/");

    Assert.True(listsFeed, "/feed/");
}
/// <summary>
/// Runs a search for "Java script" through a <c>GoogleSearchEngine</c> and asserts that the
/// returned <c>SearchResult.Value</c> is greater than zero.
/// </summary>
public void GoogleSearchTest()
{
    ICrawler crawler = new CrawlerEngine();
    var google = new GoogleSearchEngine();

    SearchResult found = crawler.Search(google, "Java script");

    Assert.IsTrue(found.Value > 0);
}
/// <summary>
/// Asserts which links <c>CrawlerEngine.GetLinks</c> returns for the penny-arcade comic page:
/// the PAX site, the forums and the clubpa page must be present, the archive search page must not.
/// </summary>
/// <remarks>The call is awaited synchronously through <c>.Result</c>.</remarks>
public void GetLinks_ReturnsLinks()
{
    var engine = new CrawlerEngine();
    var links = engine.GetLinks(new Uri("https://www.penny-arcade.com/comic")).Result;

    // True when at least one returned link's original string contains the given fragment.
    Func<string, bool> hasLinkContaining =
        fragment => links.Any(link => link.OriginalString.Contains(fragment));

    Assert.True(hasLinkContaining("http://paxsite.com/"), "http://paxsite.com/");
    Assert.True(hasLinkContaining("http://forums.penny-arcade.com/"), "http://forums.penny-arcade.com/");
    Assert.True(hasLinkContaining("https://www.penny-arcade.com/clubpa"), "/clubpa");
    Assert.False(
        hasLinkContaining("https://www.penny-arcade.com/archive/results/search"),
        "https://www.penny-arcade.com/archive/results/search");
}
/// <summary>
/// Creates a <c>CrawlerEngine</c> configured with 10 tasks per minute, at most 1000 finished
/// tasks and at most 3 working tasks, then disposes it again without running anything.
/// </summary>
private void StartCrawler()
{
    var config = new CrawlerEngineConfig
    {
        MaxTasksPerMinute = 10,
        MaxFinishedTasks = 1000,
        MaxWorkingTasks = 3
    };

    using (var engine = new CrawlerEngine(config))
    {
        // Intentionally empty: the engine is only constructed and disposed.
    }
}
/// <summary>
/// Console entry point: fills a <c>CrawlerSettings</c> instance, writes it to disk and reads it
/// back from "CrawlerConsoleSettings.txt", then runs a <c>CrawlerEngine</c> for baidu.com.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Allowed MIME types, each followed by a "[0,0]" pair and a ';' separator.
    string allowedMimeTypes = string.Concat(
        @"text/richtext[0,0];text/html[0,0];audio/x-aiff[0,0];",
        @"audio/basic[0,0];audio/wav[0,0];image/gif[0,0];image/jpeg[0,0];",
        @"image/pjpeg[0,0];image/tiff[0,0];image/x-png[0,0];image/x-xbitmap[0,0];",
        @"image/bmp[0,0];image/x-jg[0,0];image/x-emf[0,0];image/x-wmf[0,0];",
        @"video/avi[0,0];video/mpeg[0,0];application/postscript[0,0];application/base64[0,0];",
        @"application/macbinhex40[0,0];application/pdf[0,0];application/x-compressed[0,0];",
        @"application/x-zip-compressed[0,0];application/x-gzip-compressed[0,0];",
        @"application/java[0,0];application/x-msdownload[0,0];");

    var settings = new CrawlerSettings
    {
        allowAllMIMETypes_ = false,
        downloadfolder_ = "downloadfolder1",
        excludeFiles_ = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" },
        excludeHosts_ = new string[] { "" },
        excludeWords_ = new string[] { "" },
        keepAlive_ = false,
        keepSameServer_ = false,
        lastRequestCount_ = 0,
        allowedMIMETypes_ = allowedMimeTypes,
        requestTimeout_ = 10,
        sleepConnectTime_ = 0,
        sleepFetchTime_ = 0,
        threadsCount_ = 1,
        maxThreadCount_ = 20,
        maxDepth_ = 1,
        filePath_ = "CrawlerConsoleSettings.txt",
        lastModified_ = DateTime.Now,
        version_ = 1,
        dataTypeName_ = "CrawlerConsoleSettings"
    };

    // Persist the settings, then load them back from the same file.
    settings.WriteToFile();
    settings.ReadFromFile("CrawlerConsoleSettings.txt");

    var input = new CrawlerInput
    {
        domain_ = "baidu.com",
        fullUrl_ = "www.baidu.com"
    };
    var output = new CrawlerOutput();

    var engine = new CrawlerEngine(settings, input, output);
    engine.RunCrawling();
}
/// <summary>
/// Runs a single crawler task through <c>CrawlerEngine.ProcessCrawlerTask</c> with a mocked
/// downloader that always returns <c>_yaSiteHtml</c> as UTF-8 HTML, and checks that the
/// resulting page carries the requested URI, is flagged as HTML and has the same markup.
/// </summary>
/// <remarks>The task is awaited synchronously through <c>.Result</c>.</remarks>
public void SmokeTest()
{
    var rootPath = @"C:\1\";
    var uri = new Uri(@"http:\\ya.ru");

    // Downloader stub: any download of `uri` yields the canned HTML.
    var dwnMock = new Moq.Mock<IDownloader>();
    dwnMock.Setup(i => i.Download(uri))
        .Returns(() =>
        {
            var result = new DownloaderResult(uri);
            result.SetResponseData(new WebPageContent()
            {
                Encoding = Encoding.UTF8,
                IsHtmlContent = true,
                Bytes = Encoding.UTF8.GetBytes(_yaSiteHtml)
            });
            return result;
        });

    var pageFileStorageMock = new Moq.Mock<IPageFileSystemStorage>();
    pageFileStorageMock.Setup(i => i.SavePage(null, rootPath, true));

    var downLoader = dwnMock.Object;
    var settings = new CrawlerSettings();
    var downloadManager = new DownloadManager(downLoader, new InMemoryLinkDataStorage(), settings);
    var webPageLinkManager = new WebPageLinkManager();
    var engine = new CrawlerEngine(downloadManager, webPageLinkManager, pageFileStorageMock.Object);

    var taskSettings = new CrawlerTaskSettings()
    {
        CrawlDepth = 1,
        IgnoreOtherDomains = false,
        ReplaceUrlToLocal = true
    };
    var task = new CrawlerTask(uri, rootPath, taskSettings);

    var page = engine.ProcessCrawlerTask(task).Result;

    // Expected value goes first so a failure reports "expected"/"actual" the right way round.
    Assert.AreEqual(uri, page.Uri);
    Assert.IsTrue(page.IsHtml);
    Assert.AreEqual(_yaSiteHtml, page.Html);
}
/// <summary>
/// Test fixture setup: builds a <c>CrawlerEngine</c> on top of a substituted repository and an
/// <c>HttpClient</c> whose handler is a substitute, then stubs both of them.
/// </summary>
/// <remarks>
/// The repository hands out a single <c>CrawlItem</c> for http://localhost.crawl.com, and every
/// HTTP request is answered with 200 OK and a body holding one anchor to http://localhost/.
/// </remarks>
public CrawlerEngineTests()
{
    crawlerRepository = Substitute.For<ICrawlerRepository>();
    httpMessageHandler = Substitute.For<IHttpMessageHandler>();

    var fakeHandler = new FakeHttpMessageHandler(httpMessageHandler);
    var httpClient = new HttpClient(fakeHandler);
    crawlerEngine = new CrawlerEngine(crawlerRepository, httpClient);

    uri = new Uri("http://localhost.crawl.com");
    var nextItem = new CrawlItem() { Url = uri.ToString() };
    crawlerRepository.GetNext().Returns(nextItem);

    var okResponse = new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent("<a href=\"http://localhost/\"></a>")
    };
    httpMessageHandler
        .SendAsync(Arg.Any<HttpRequestMessage>(), Arg.Any<CancellationToken>())
        .Returns(okResponse);
}
/// <summary>
/// Downloads <c>BaseUrl + url</c>, follows the <c>src</c> of an iframe found in it, and decodes
/// the Base64 payload assigned to <c>var data = "..."</c> on the framed page.
/// </summary>
/// <param name="url">Path appended to <c>BaseUrl</c> for the first request.</param>
/// <param name="sectionName">Not used by this method.</param>
/// <returns>
/// The result of <c>CrawlerEngine.DecodeBase64String</c> for the captured payload, or
/// <c>null</c> when either the iframe or the data assignment is not found.
/// </returns>
private string LookUpLinkAndScrapQuizes(string url, string sectionName)
{
    string pageHtml = CrawlerEngine.GetResponseString(BaseUrl + url);

    var iframe = Regex.Match(pageHtml, "<iframe.+src=\\\"(?<1>.*?)\\\"");
    if (!iframe.Success)
    {
        return null;
    }

    // The iframe src is treated as a path relative to BaseUrl.
    string frameUrl = BaseUrl + iframe.Groups[1].Value;
    string frameHtml = CrawlerEngine.GetResponseString(frameUrl);

    var payload = Regex.Match(frameHtml, "var data = \\\"(?<1>.*?)\\\";");
    if (!payload.Success)
    {
        return null;
    }

    return CrawlerEngine.DecodeBase64String(payload.Groups[1].Value);
}
/// <summary>
/// Starts a crawl with a single seed, http://webometrics.krc.karelia.ru/, on a freshly
/// created <c>CrawlerEngine</c>.
/// </summary>
private void TestCrawl()
{
    var engine = new CrawlerEngine();

    var karelia = new SeedDTO { SeedDomainName = "http://webometrics.krc.karelia.ru/" };
    var seeds = new[] { karelia };

    engine.StartCrawlingProcess(seeds);
}
/// <summary>
/// Scrapes the article page at <c>BaseUrl</c> + "/index.php?option=com_content&amp;view=article&amp;id=11387&amp;Itemid=380"
/// and builds one "EnglishGrammar" <c>QuizModuleGroup</c> from the links of the first table in
/// the article body.
/// </summary>
/// <returns>
/// A collection holding exactly one module group. Its <c>QuizModules</c> list is empty when the
/// page has no matching section links, otherwise it holds one module.
/// </returns>
public ICollection<QuizModuleGroup> Scrapping()
{
    List<QuizModuleGroup> moduleGroups = new List<QuizModuleGroup>();

    QuizModuleGroup moduleGroup = new QuizModuleGroup();
    moduleGroup.Title = "EnglishGrammar";
    moduleGroup.Source = "EnglishTestStore";
    moduleGroup.Status = QuizStatus.JustCreated;

    string targetSiteContent = CrawlerEngine.GetResponseString(BaseUrl + "/index.php?option=com_content&view=article&id=11387&Itemid=380");
    var doc = new HtmlDocument();
    doc.LoadHtml(targetSiteContent);

    moduleGroup.QuizModules = new List<QuizModule>();

    var englishGrammarSectionNodes = doc.DocumentNode.SelectNodes("//div[@itemprop='articleBody']//table[1]//li/a[@href]");
    if (englishGrammarSectionNodes != null)
    {
        QuizModule module = new QuizModule();
        module.Title = "Basic English Grammar exercises and tests";
        module.Source = "EnglishTestStore";
        module.Status = QuizStatus.JustCreated;
        module.QuizGroupSections = new List<QuizGroupSection>();

        // NOTE(review): Take(1) restricts the scrape to the first section link only —
        // confirm this limit is intended and not a debugging leftover.
        foreach (var node in englishGrammarSectionNodes.Take(1))
        {
            QuizGroupSection section = new QuizGroupSection();
            section.Title = node.InnerText.Trim();
            section.Status = QuizStatus.JustCreated;
            section.QuizGroups = new List<QuizGroup>();

            string sectionUrl = BaseUrl + node.GetAttributeValue("href", "");
            var subLinks = LookUpLinkAndScrapLinks(sectionUrl, section.Title);

            // Each sub-link (href -> link text) becomes one quiz group of the section.
            foreach (var link in subLinks)
            {
                QuizGroup group = new QuizGroup();

                // NOTE(review): every group receives the SECTION link's text as its title;
                // link.Value (the sub-link's own text) may have been intended — confirm.
                group.Title = node.InnerText.Trim();

                string json = LookUpLinkAndScrapQuizes(link.Key, link.Value);
                group.Quizes = ParseJsonToModel(json);
                section.QuizGroups.Add(group);
            }

            module.QuizGroupSections.Add(section);
        }

        moduleGroup.QuizModules.Add(module);
    }

    moduleGroups.Add(moduleGroup);
    return moduleGroups;
}