Пример #1
0
        /// <summary>
        /// Queues <paramref name="link"/> for crawling unless an equivalent entry already exists.
        /// </summary>
        /// <remarks>
        /// An entry is treated as equivalent when it has the same SessionId and its SourceUrl and
        /// TargetUrl match ignoring case. LinksToCrawl is checked first, then CrawledLinks.
        /// </remarks>
        /// <param name="link">The link to queue; its Id is overwritten with NextId when it is added.</param>
        public void AddLinkToCrawl(LinkToCrawl link)
        {
            // NOTE(review): fixed 100 ms pause kept as-is; presumably simulates storage latency - confirm.
            Thread.Sleep(100);

            bool alreadyQueued = LinksToCrawl.Values.Any(
                queued => queued.SessionId == link.SessionId
                          && string.Compare(queued.SourceUrl, link.SourceUrl, true) == 0
                          && string.Compare(queued.TargetUrl, link.TargetUrl, true) == 0);
            if (alreadyQueued)
            {
                return;
            }

            bool alreadyCrawled = CrawledLinks.Values.Any(
                crawled => crawled.SessionId == link.SessionId
                           && string.Compare(crawled.SourceUrl, link.SourceUrl, true) == 0
                           && string.Compare(crawled.TargetUrl, link.TargetUrl, true) == 0);
            if (alreadyCrawled)
            {
                return;
            }

            // Not known anywhere yet: assign a fresh id and register it under that id.
            link.Id = NextId;
            LinksToCrawl.Add(link.Id, link);
        }
Пример #2
0
        /// <summary>
        /// Crawls the site starting at <paramref name="hostname"/>, processing queued links in
        /// batches of at most ConcurrencyLimit pages until the queue is empty.
        /// </summary>
        /// <param name="hostname">Host to start from; passed through VerifyUrlIntegrity before use.</param>
        /// <returns>A task that completes once no links remain in the queue.</returns>
        private async Task RunCrawl(string hostname)
        {
            HostUrl = VerifyUrlIntegrity(hostname);

            LinksToCrawl.Enqueue(HostUrl.OriginalString);

            while (LinksToCrawl.Any())
            {
                var tasks = new List<Task>();

                // Keys already scheduled in this batch. WebsiteMap is only updated once a page
                // has been crawled, so it cannot detect duplicates inside the same batch.
                var keysInBatch = new HashSet<string>();

                // Fill the batch by the number of tasks actually started. Comparing a loop
                // counter against the shrinking queue length would stop after roughly half of
                // the available links, and skipped links would use up a slot.
                while (tasks.Count < ConcurrencyLimit && LinksToCrawl.Any())
                {
                    var link = new Uri(LinksToCrawl.Dequeue());
                    var key = link.Host + link.AbsolutePath;

                    if (WebsiteMap.ContainsKey(key) || !keysInBatch.Add(key))
                    {
                        continue;
                    }

                    tasks.Add(CrawlPage(link.OriginalString));
                }

                await Task.WhenAll(tasks);
            }

            Console.WriteLine($"Found {WebsiteMap.Count} links");
        }
Пример #3
0
        /// <summary>
        /// Parses the page at <paramref name="link"/>, records the result in WebsiteMap and
        /// queues every link found on the page.
        /// </summary>
        /// <param name="link">Address of the page to parse.</param>
        private async Task CrawlPage(string link)
        {
            var parsedPage = await _parser.ParsePage(link);

            HandlePageResultLinks(parsedPage);

            // The map key is host + path of the parsed page's own URL.
            var mapKey = parsedPage.PageUrl.Host + parsedPage.PageUrl.AbsolutePath;
            WebsiteMap.TryAdd(mapKey, parsedPage);

            foreach (var discovered in parsedPage.Links)
            {
                LinksToCrawl.Enqueue(discovered);
            }
        }
Пример #4
0
        /// <summary>
        /// Removes every queued link that belongs to <paramref name="sessionId"/> and whose
        /// TargetBaseDomain equals <paramref name="baseDomain"/> (case-sensitive comparison).
        /// </summary>
        /// <param name="sessionId">Session whose links are removed.</param>
        /// <param name="baseDomain">Base domain the links must target.</param>
        public void ClearLinksToCrawl(int sessionId, string baseDomain)
        {
            // Snapshot the matching ids first so the dictionary is not modified while it is enumerated.
            var idsToRemove = LinksToCrawl.Values
                .Where(entry => entry.SessionId == sessionId
                                && string.Compare(entry.TargetBaseDomain, baseDomain, false) == 0)
                .Select(entry => entry.Id)
                .ToList();

            foreach (var staleId in idsToRemove)
            {
                LinksToCrawl.Remove(staleId);
            }
        }
Пример #5
0
        /// <summary>
        /// Removes the queued link with the given <paramref name="id"/>; does nothing when no
        /// such link is queued.
        /// </summary>
        /// <param name="id">Id of the link to remove.</param>
        public void DeleteLinkToCrawl(Guid id)
        {
            // NOTE(review): fixed 100 ms pause kept as-is; presumably simulates storage latency - confirm.
            Thread.Sleep(100);

            // Entries are stored under their own Id, so the key can be removed directly instead of
            // scanning all values and then checking ContainsKey. Remove is a no-op for a missing key.
            LinksToCrawl.Remove(id);
        }
Пример #6
0
        /// <summary>
        /// Removes the first queued link matching the given session, source URL and target URL.
        /// </summary>
        /// <param name="sessionId">Session the link belongs to.</param>
        /// <param name="srcUrl">Source URL, compared with <c>==</c> (case-sensitive).</param>
        /// <param name="targetUrl">Target URL, compared with <c>==</c> (case-sensitive).</param>
        public void DeleteLinkToCrawl(int sessionId, string srcUrl, string targetUrl)
        {
            // NOTE(review): fixed 100 ms pause kept as-is; presumably simulates storage latency - confirm.
            Thread.Sleep(100);

            // NOTE(review): URLs are matched case-sensitively here; confirm this is intended.
            var match = LinksToCrawl.Values.FirstOrDefault(
                entry => entry.SessionId == sessionId
                         && entry.SourceUrl == srcUrl
                         && entry.TargetUrl == targetUrl);

            if (match == null)
            {
                return;
            }

            if (!LinksToCrawl.ContainsKey(match.Id))
            {
                return;
            }

            LinksToCrawl.Remove(match.Id);
        }
Пример #7
0
        /// <summary>
        /// Walks the paged result listing and collects into LinksToCrawl the href of every
        /// anchor whose text contains FileName, skipping duplicates and anchors without an href.
        /// </summary>
        public void GetLinksToCrawl()
        {
            // NOTE(review): this visits pages 1 .. NumPageToCrawl - 1; confirm whether the last
            // page is meant to be included (i <= NumPageToCrawl).
            for (int i = 1; i < NumPageToCrawl; i++)
            {
                var url = BaseUrl + "?p=" + i + "&q=" + UrlEncodedQueryString;
                Driver.Navigate().GoToUrl(url);

                foreach (var link in Driver.FindElementsByTagName("a"))
                {
                    if (!link.Text.Contains(FileName))
                    {
                        continue;
                    }

                    // GetAttribute returns null when the anchor has no href; such an entry is
                    // useless as a crawl target, so it is skipped instead of being queued.
                    var linkHref = link.GetAttribute("href");
                    if (string.IsNullOrEmpty(linkHref) || LinksToCrawl.Contains(linkHref))
                    {
                        continue;
                    }

                    LinksToCrawl.Add(linkHref);
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Processes the parsed links of <paramref name="page"/> and rebuilds LinksToCrawl and
        /// LinksToByPass from them.
        /// </summary>
        /// <remarks>
        /// When the page has no parsed links both lists are reset to empty. Otherwise each parsed
        /// link is passed to ProcessLink, and LinksToCrawl is then taken from the values of
        /// MapOfLinksToCrawl, which is cleared and set to null afterwards.
        /// </remarks>
        /// <param name="page">The crawled page whose ParsedLinks are processed.</param>
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            // Any() stops at the first element instead of counting the whole sequence.
            if (page.ParsedLinks == null || !page.ParsedLinks.Any())
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl  = new List <LinkToCrawl>();
                LinksToByPass = new List <CrawledLink>();
                return;
            }

            LinksToByPass     = new List <CrawledLink>();
            MapOfLinksToCrawl = new Dictionary <string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf <IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;

                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                // Copy the collected links out before the temporary map is released.
                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;

                // Guarded so the joined strings are only built when debug logging is on.
                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }