public void AddLinkToCrawl(LinkToCrawl link)
{
    // If the link is not already in the list of links to crawl or in the crawled links, add it.
    Thread.Sleep(100);

    var q = from l in LinksToCrawl.Values
            where l.SessionId == link.SessionId &&
                  string.Compare(l.SourceUrl, link.SourceUrl, true) == 0 &&
                  string.Compare(l.TargetUrl, link.TargetUrl, true) == 0
            select l;

    if (!q.Any())
    {
        var q2 = from l in CrawledLinks.Values
                 where l.SessionId == link.SessionId &&
                       string.Compare(l.SourceUrl, link.SourceUrl, true) == 0 &&
                       string.Compare(l.TargetUrl, link.TargetUrl, true) == 0
                 select l;

        if (!q2.Any())
        {
            link.Id = NextId;
            LinksToCrawl.Add(link.Id, link);
        }
    }
}
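// A minimal sketch of the repository state the repository-style methods in this
// collection (AddLinkToCrawl, ClearLinksToCrawl, DeleteLinkToCrawl) appear to rely
// on. The concrete types are assumptions: both maps are keyed by the link Id (a
// Guid, matching DeleteLinkToCrawl(Guid id) below), and NextId simply issues a
// fresh Guid. CrawledLink is assumed to expose the same SessionId/SourceUrl/
// TargetUrl properties that the queries compare against.
private readonly Dictionary<Guid, LinkToCrawl> LinksToCrawl = new Dictionary<Guid, LinkToCrawl>();
private readonly Dictionary<Guid, CrawledLink> CrawledLinks = new Dictionary<Guid, CrawledLink>();
private static Guid NextId => Guid.NewGuid();

// Hypothetical shape of the queued link, limited to the properties these snippets reference.
public class LinkToCrawl
{
    public Guid Id { get; set; }
    public int SessionId { get; set; }
    public string SourceUrl { get; set; }
    public string TargetUrl { get; set; }
    public string TargetBaseDomain { get; set; }
}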
private async Task RunCrawl(string hostname)
{
    HostUrl = VerifyUrlIntegrity(hostname);
    LinksToCrawl.Enqueue(HostUrl.OriginalString);

    while (LinksToCrawl.Any())
    {
        var tasks = new List<Task>();

        // Dequeue up to ConcurrencyLimit links per batch, skipping pages that have already been crawled.
        for (var thread = 1; thread <= ConcurrencyLimit && LinksToCrawl.Count > 0; thread++)
        {
            var link = new Uri(LinksToCrawl.Dequeue());
            if (WebsiteMap.ContainsKey(link.Host + link.AbsolutePath))
            {
                continue;
            }
            tasks.Add(CrawlPage(link.OriginalString));
        }

        await Task.WhenAll(tasks);
    }

    Console.WriteLine($"Found {WebsiteMap.Count} links");
}
private async Task CrawlPage(string link)
{
    var pageResults = await _parser.ParsePage(link);
    HandlePageResultLinks(pageResults);

    // Record the page by host + path, then queue its outgoing links for crawling.
    WebsiteMap.TryAdd(pageResults.PageUrl.Host + pageResults.PageUrl.AbsolutePath, pageResults);
    pageResults.Links.ForEach(x => LinksToCrawl.Enqueue(x));
}
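// A minimal sketch of the state RunCrawl and CrawlPage above appear to share. The
// types are assumptions: WebsiteMap exposes TryAdd, so a ConcurrentDictionary
// (System.Collections.Concurrent) fits, and PageResults is a hypothetical name for
// whatever _parser.ParsePage returns (only PageUrl and Links are used here). Note
// that a plain Queue<string> is not safe for the concurrent Enqueue calls made from
// CrawlPage tasks; a lock or a different queue type would be needed in practice.
private Uri HostUrl;
private const int ConcurrencyLimit = 10;
private readonly Queue<string> LinksToCrawl = new Queue<string>();
private readonly ConcurrentDictionary<string, PageResults> WebsiteMap =
    new ConcurrentDictionary<string, PageResults>();

public class PageResults
{
    public Uri PageUrl { get; set; }
    public List<string> Links { get; set; } = new List<string>();
}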
public void ClearLinksToCrawl(int sessionId, string baseDomain)
{
    var q = from l in LinksToCrawl.Values
            where l.SessionId == sessionId &&
                  string.Compare(l.TargetBaseDomain, baseDomain, false) == 0
            select l.Id;

    foreach (var id in q.ToList())
    {
        LinksToCrawl.Remove(id);
    }
}
public void DeleteLinkToCrawl(Guid id)
{
    Thread.Sleep(100);

    var q = from l in LinksToCrawl.Values
            where l.Id == id
            select l;

    var link = q.FirstOrDefault();
    if (link != null && LinksToCrawl.ContainsKey(link.Id))
    {
        LinksToCrawl.Remove(link.Id);
    }
}
public void DeleteLinkToCrawl(int sessionId, string srcUrl, string targetUrl)
{
    Thread.Sleep(100);

    var q = from l in LinksToCrawl.Values
            where l.SessionId == sessionId &&
                  l.SourceUrl == srcUrl &&
                  l.TargetUrl == targetUrl
            select l;

    var link = q.FirstOrDefault();
    if (link != null && LinksToCrawl.ContainsKey(link.Id))
    {
        LinksToCrawl.Remove(link.Id);
    }
}
public void GetLinksToCrawl()
{
    for (int i = 1; i < NumPageToCrawl; i++)
    {
        // Load each page of results for the encoded query.
        var url = BaseUrl + "?p=" + i + "&q=" + UrlEncodedQueryString;
        Driver.Navigate().GoToUrl(url);

        // Collect hrefs from anchors whose text matches the file name, skipping duplicates.
        var allLinks = Driver.FindElements(By.TagName("a"));
        foreach (var link in allLinks)
        {
            if (link.Text.Contains(FileName))
            {
                var linkHref = link.GetAttribute("href");
                if (!LinksToCrawl.Contains(linkHref))
                {
                    LinksToCrawl.Add(linkHref);
                }
            }
        }
    }
}
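// A minimal sketch of the fields GetLinksToCrawl relies on (using OpenQA.Selenium
// and OpenQA.Selenium.Chrome). All values below are placeholders for illustration,
// not the original project's configuration.
private readonly IWebDriver Driver = new ChromeDriver();
private readonly List<string> LinksToCrawl = new List<string>();
private const string BaseUrl = "https://example.com/search";              // hypothetical results page
private const int NumPageToCrawl = 10;                                    // result pages to walk
private readonly string UrlEncodedQueryString = Uri.EscapeDataString("sample query");
private const string FileName = "sample.pdf";                             // link text to match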
public void ProcessLinks(Abot.Poco.CrawledPage page)
{
    if (page.ParsedLinks == null || !page.ParsedLinks.Any())
    {
        _logger.DebugFormat("CrawledPage contained 0 parsed links");
        LinksToCrawl = new List<LinkToCrawl>();
        LinksToByPass = new List<CrawledLink>();
        return;
    }

    LinksToByPass = new List<CrawledLink>();
    MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var sessionId = page.PageBag.SessionId;
        var crawlerId = page.PageBag.CrawlerId;

        foreach (var targetUri in page.ParsedLinks)
        {
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
        }

        LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
        MapOfLinksToCrawl.Clear();
        MapOfLinksToCrawl = null;

        if (_logger.IsDebugEnabled)
        {
            _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                string.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
            _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                string.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
        }
    }
}
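// A minimal sketch of the members ProcessLinks assumes, inferred from how they are
// used. These declarations are assumptions: _logger matches a log4net-style ILog
// (DebugFormat/IsDebugEnabled), _provider is whatever container supplies
// IModelFactory, and ProcessLink (not shown in the snippet) presumably routes each
// parsed URI into either MapOfLinksToCrawl or LinksToByPass.
public List<LinkToCrawl> LinksToCrawl { get; private set; }
public List<CrawledLink> LinksToByPass { get; private set; }
private Dictionary<string, LinkToCrawl> MapOfLinksToCrawl;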