コード例 #1
0
ファイル: UrlFrontier.cs プロジェクト: Remus17/Crawler
        /// <summary>
        /// Cleans up a link and, if it passes the skip/format filters, adds it to one of the crawl queues.
        /// </summary>
        /// <remarks>
        /// The fragment (everything from the first <c>#</c>) is removed first.
        /// Absolute links that belong to the same site as <paramref name="baseUrl"/> go to
        /// <c>CurrentWebsiteUrlsQueue</c> unchanged; absolute links to other sites are reduced to
        /// <c>scheme://authority</c> and go to <c>UrlsQueue</c>. Any other link is resolved through
        /// <c>GetUrlFromAnchor</c> and, when that yields a URL, queued in <c>CurrentWebsiteUrlsQueue</c>.
        /// </remarks>
        /// <param name="baseUrl">URL whose authority identifies the current site; also passed to <c>GetUrlFromAnchor</c> for non-absolute links.</param>
        /// <param name="anchor">Raw link text (absolute or not). A <c>null</c> value is ignored.</param>
        public static void Enqueue(Uri baseUrl, string anchor)
        {
            if (anchor == null)
            {
                return;
            }

            // Strip the fragment so that "page#a" and "page#b" are treated as the same URL.
            var fragmentIndex = anchor.IndexOf('#');
            if (fragmentIndex >= 0)
            {
                anchor = anchor.Substring(0, fragmentIndex);
            }

            if (IsAbsoluteUrl(anchor))
            {
                if (UrlsQueue.Count > RunSettings.MaxQueuedUrlsSize || CanBeSkipped(anchor))
                {
                    return;
                }

                // Parse once and defensively: IsAbsoluteUrl may accept text that Uri itself rejects.
                Uri anchorUri;
                if (!Uri.TryCreate(anchor, UriKind.Absolute, out anchorUri) || !AvailableFormat(anchorUri.Segments.Last()))
                {
                    return;
                }

                // Compare parsed authorities instead of searching the raw string, so the site name
                // appearing in a path, query string or user-info part does not count as a match.
                // Subdomains of the current site are still treated as part of it.
                var siteAuthority   = baseUrl.Authority;
                var anchorAuthority = anchorUri.Authority;
                var isSameSite      = anchorAuthority.Equals(siteAuthority, StringComparison.OrdinalIgnoreCase)
                                      || anchorAuthority.EndsWith("." + siteAuthority, StringComparison.OrdinalIgnoreCase);

                if (isSameSite)
                {
                    CurrentWebsiteUrlsQueue.Enqueue(anchor);
                    return;
                }

                // Enqueue only the home page of a foreign site; the crawler will discover
                // whatever that site exposes once it is visited.
                var homePage = $"{anchorUri.Scheme}://{anchorUri.Authority}";
                if (!CanBeSkipped(homePage))
                {
                    UrlsQueue.Enqueue(homePage);
                }
                return;
            }

            // Cheap pre-filter on the raw text before attempting to resolve it.
            if (!AvailableFormat(anchor))
            {
                return;
            }

            var url = GetUrlFromAnchor(baseUrl, anchor);
            if (url == null)
            {
                return;
            }

            var absoluteUrl = url.AbsoluteUri;
            if (CanBeSkipped(absoluteUrl) || !AvailableFormat(url.Segments.Last()))
            {
                return;
            }

            // Note: unlike UrlsQueue, CurrentWebsiteUrlsQueue is not capped by RunSettings.MaxQueuedUrlsSize here.
            CurrentWebsiteUrlsQueue.Enqueue(absoluteUrl);
        }
コード例 #2
0
ファイル: UrlFrontier.cs プロジェクト: Remus17/Crawler
 /// <summary>
 /// Tells whether a URL should be ignored by the crawler.
 /// </summary>
 /// <param name="url">URL text to check.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="url"/> is null or empty, does not start with <c>http</c>,
 /// or is already present in <c>ApplicationCache.VisitedUrls</c>, <c>CurrentWebsiteUrlsQueue</c>
 /// or <c>UrlsQueue</c>; otherwise <c>false</c>.
 /// </returns>
 public static bool CanBeSkipped(string url)
 {
     // Nothing to crawl for a missing URL; also avoids a NullReferenceException below.
     if (string.IsNullOrEmpty(url))
     {
         return true;
     }

     // Ordinal comparison: this is a protocol prefix, not linguistic text, so the
     // result must not depend on the current culture. Case-sensitivity is kept as before.
     if (!url.StartsWith("http", StringComparison.Ordinal))
     {
         return true;
     }

     // Already visited or already waiting in one of the queues.
     return ApplicationCache.VisitedUrls.Contains(url)
            || CurrentWebsiteUrlsQueue.Contains(url)
            || UrlsQueue.Contains(url);
 }