Esempio n. 1
0
        /// <summary>
        /// Records where a request for <paramref name="sourceUri"/> finally resolved and, when that
        /// is a different standardized URI, merges the calling links of the two tracked entries.
        /// </summary>
        /// <param name="sourceUri">The URI that was requested.</param>
        /// <param name="redirectionTarget">The URI the request ended up at.</param>
        /// <returns>
        /// The entry for <paramref name="sourceUri"/> when both URIs standardize to an equal value;
        /// otherwise the entry for the redirection target.
        /// </returns>
        public UriSources CombineIfRedirection(StandardizedUri sourceUri, Uri redirectionTarget)
        {
            var origin         = GetOrCreateSources(sourceUri);
            var standardTarget = new StandardizedUri(redirectionTarget);

            // The final URI is stored on the origin entry whether or not a real redirect happened.
            origin.RedirectedUri = redirectionTarget;

            var isRedirect = !sourceUri.Equals(standardTarget);
            if (isRedirect)
            {
                // Source and destination are treated as the same page from here on:
                // flag the origin as redirected and pool the links that point at either one.
                origin.Status = SpiderPageStatus.Redirected;

                var target = GetOrCreateSources(standardTarget);
                var merged = origin.CallingLinks.Union(target.CallingLinks).ToHashSet();

                // Both entries reference the very same set instance.
                origin.CallingLinks = merged;
                target.CallingLinks = merged;

                return target;
            }

            // Same standardized URI on both sides: nothing to combine.
            return origin;
        }
Esempio n. 2
0
 /// <summary>
 /// Looks for the first tracked URI whose entry still has the
 /// <see cref="SpiderPageStatus.Unvisited"/> status.
 /// </summary>
 /// <param name="uri">
 /// Receives the first matching key, or the default value when no entry matches.
 /// </param>
 /// <returns>The result of the <c>uri is object</c> check on the value assigned to <paramref name="uri"/>.</returns>
 public bool TryNextUnvisited(out StandardizedUri uri)
 {
     uri = default(StandardizedUri);

     // Stop at the first unvisited entry, in the enumeration order of the underlying map.
     foreach (var entry in _uris)
     {
         if (entry.Value.Status == SpiderPageStatus.Unvisited)
         {
             uri = entry.Key;
             break;
         }
     }

     return uri is object;
 }
Esempio n. 3
0
 /// <summary>
 /// Returns the tracked entry for <paramref name="uri"/>, creating and registering a new one
 /// when none exists yet.
 /// </summary>
 /// <param name="uri">The URI used as the lookup key.</param>
 /// <returns>The existing or newly created <see cref="UriSources"/> for the URI.</returns>
 public UriSources GetOrCreateSources(StandardizedUri uri)
 {
     if (_uris.TryGetValue(uri, out var existing))
     {
         return existing;
     }

     // First time this URI is seen: register a fresh entry under it.
     var created = new UriSources(uri);
     _uris[uri]  = created;
     return created;
 }
Esempio n. 4
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, registers the outcome with the link
        /// tracker, and, for an unvisited HTML page, marks it visited and optionally follows
        /// the links found in it.
        /// </summary>
        /// <param name="uri">The URI to download and process.</param>
        /// <returns>A task that completes when the page has been processed.</returns>
        private async Task Visit(StandardizedUri uri)
        {
            UriSources entry  = null;
            string     markup = "";

            try
            {
                (Uri landedUri, string body) = await DownloadPageAsync(uri.Standardized);

                markup = body;
                entry  = _linkTracker.CombineIfRedirection(uri, landedUri);
            }
            catch (HttpRequestException ex)
            {
                // A failed download is recorded on the entry for the requested URI.
                entry        = _linkTracker.GetOrCreateSources(uri);
                entry.Status = SpiderPageStatus.Error;
                entry.Error  = ex.Message;
                Debug.WriteLine($"Error downloading page ({uri}): {ex.Message}");
            }

            // Only an entry that is still unvisited is processed further; any other status
            // (including the error recorded above) ends the visit here.
            if (entry.Status == SpiderPageStatus.Unvisited)
            {
                if (IsHtml(markup))
                {
                    entry.Status = SpiderPageStatus.Visited;

                    if (_options.FollowLinks)
                    {
                        var links = ParseLinks(entry.Uri.Uri, markup);
                        links.ForEach(FoundLink);
                    }
                }
                else
                {
                    // Content that is not HTML is excluded and never parsed for links.
                    entry.Status = SpiderPageStatus.Excluded;
                }
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Reports whether <paramref name="uri"/> is already a key in the tracked URI map.
 /// </summary>
 /// <param name="uri">The URI to look up.</param>
 /// <returns><c>true</c> when the map contains the key; otherwise <c>false</c>.</returns>
 public bool Contains(StandardizedUri uri)
 {
     return _uris.ContainsKey(uri);
 }
Esempio n. 6
0
 /// <summary>
 /// Creates an entry for the given URI.
 /// </summary>
 /// <param name="uri">The URI stored in the <c>Uri</c> member of the new entry.</param>
 public UriSources(StandardizedUri uri) => Uri = uri;