/// <summary>
/// Records where a request for <paramref name="sourceUri"/> ended up and, when
/// that is a different standardized URI, merges the calling links of both entries.
/// </summary>
/// <param name="sourceUri">The URI that was originally requested.</param>
/// <param name="redirectionTarget">The URI the request finally resolved to.</param>
/// <returns>
/// The entry for <paramref name="sourceUri"/> when the standardized target equals
/// the source; otherwise the entry for the redirection target.
/// </returns>
public UriSources CombineIfRedirection(StandardizedUri sourceUri, Uri redirectionTarget)
{
    var origin = GetOrCreateSources(sourceUri);
    var standardizedTarget = new StandardizedUri(redirectionTarget);

    // The final URI is stored on the origin entry even if it turns out to be the same page.
    origin.RedirectedUri = redirectionTarget;

    var landedOnSamePage = sourceUri.Equals(standardizedTarget);
    if (landedOnSamePage)
    {
        // Source and target standardize to the same URI: nothing to merge.
        return origin;
    }

    // A real redirection: mark the origin as redirected and treat both pages as
    // equivalent by giving them one shared set of calling links.
    origin.Status = SpiderPageStatus.Redirected;
    var target = GetOrCreateSources(standardizedTarget);

    var mergedLinks = origin.CallingLinks
        .Union(target.CallingLinks)
        .ToHashSet();

    origin.CallingLinks = mergedLinks;
    target.CallingLinks = mergedLinks;

    return target;
}
/// <summary>
/// Looks for the first tracked URI whose status is
/// <see cref="SpiderPageStatus.Unvisited"/>.
/// </summary>
/// <param name="uri">
/// Receives the first unvisited URI found, or the default value when there is none.
/// </param>
/// <returns><c>true</c> when <paramref name="uri"/> was set to a non-null value.</returns>
public bool TryNextUnvisited(out StandardizedUri uri)
{
    uri = default(StandardizedUri);

    foreach (var entry in _uris)
    {
        if (entry.Value.Status == SpiderPageStatus.Unvisited)
        {
            // Only the first match is needed.
            uri = entry.Key;
            break;
        }
    }

    return uri is object;
}
/// <summary>
/// Returns the <see cref="UriSources"/> entry registered for <paramref name="uri"/>,
/// creating and registering a new one when none exists yet.
/// </summary>
/// <param name="uri">The standardized URI used as the lookup key.</param>
/// <returns>The existing or newly created entry.</returns>
public UriSources GetOrCreateSources(StandardizedUri uri)
{
    if (_uris.TryGetValue(uri, out var existing))
    {
        return existing;
    }

    // First time this URI is seen: register a fresh entry for it.
    var created = new UriSources(uri);
    _uris[uri] = created;
    return created;
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, records any redirection or
/// download error in the link tracker, classifies the page, and, when link
/// following is enabled, reports every link parsed from its HTML.
/// </summary>
/// <param name="uri">The standardized URI of the page to visit.</param>
private async Task Visit(StandardizedUri uri)
{
    UriSources pageEntry = null;
    string pageHtml = "";

    try
    {
        (Uri landedUri, string body) = await DownloadPageAsync(uri.Standardized);
        pageHtml = body;

        // May hand back the entry of the redirection target instead of the requested URI.
        pageEntry = _linkTracker.CombineIfRedirection(uri, landedUri);
    }
    catch (HttpRequestException ex)
    {
        // Record the failure on the requested URI's entry.
        pageEntry = _linkTracker.GetOrCreateSources(uri);
        pageEntry.Status = SpiderPageStatus.Error;
        pageEntry.Error = ex.Message;
        Debug.WriteLine($"Error downloading page ({uri}): {ex.Message}");
    }

    // Any status other than Unvisited (for example the Error set above) means
    // there is nothing further to do for this entry.
    var alreadyHandled = pageEntry.Status != SpiderPageStatus.Unvisited;
    if (alreadyHandled)
    {
        return;
    }

    if (!IsHtml(pageHtml))
    {
        pageEntry.Status = SpiderPageStatus.Excluded;
        return;
    }

    pageEntry.Status = SpiderPageStatus.Visited;

    if (_options.FollowLinks)
    {
        // Links are parsed using the URI of the entry that was actually processed.
        var discoveredLinks = ParseLinks(pageEntry.Uri.Uri, pageHtml);
        discoveredLinks.ForEach(FoundLink);
    }
}
/// <summary>
/// Reports whether an entry is already registered for <paramref name="uri"/>.
/// </summary>
/// <param name="uri">The standardized URI to look up.</param>
/// <returns><c>true</c> when <paramref name="uri"/> is a key in the tracked URIs.</returns>
public bool Contains(StandardizedUri uri)
{
    return _uris.ContainsKey(uri);
}
/// <summary>
/// Creates an entry for the given URI.
/// </summary>
/// <param name="uri">The standardized URI stored in <c>Uri</c>.</param>
public UriSources(StandardizedUri uri) => this.Uri = uri;