/// <summary>
/// Compares this instance with <paramref name="other"/> option by option.
/// </summary>
/// <param name="other">The options to compare against; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when every compared option has the same value on both instances;
/// <c>false</c> otherwise, including when <paramref name="other"/> is <c>null</c>.
/// </returns>
protected bool Equals(QuerySourceOptions other)
{
    // An equality check must answer "not equal" for null rather than throw
    // a NullReferenceException on the first member access below.
    if (ReferenceEquals(other, null))
    {
        return false;
    }

    // The set of members compared here mirrors the set hashed by GetHashCode;
    // keep the two in sync when adding options.
    return string.Equals(ResultStreamName, other.ResultStreamName)
        && string.Equals(PartitionResultStreamNamePattern, other.PartitionResultStreamNamePattern)
        && ReorderEvents.Equals(other.ReorderEvents)
        && ProcessingLag == other.ProcessingLag
        && IsBiState.Equals(other.IsBiState)
        && DefinesStateTransform.Equals(other.DefinesStateTransform)
        && DefinesCatalogTransform.Equals(other.DefinesCatalogTransform)
        && ProducesResults.Equals(other.ProducesResults)
        && DefinesFold.Equals(other.DefinesFold)
        && HandlesDeletedNotifications.Equals(other.HandlesDeletedNotifications)
        && IncludeLinks.Equals(other.IncludeLinks);
}
/// <summary>
/// Produces a hash code from the same option values that <c>Equals</c> compares.
/// </summary>
/// <returns>
/// The combined hash: starting from the hash of <c>ResultStreamName</c>, each further
/// option is folded in as <c>(previous * 397) ^ next</c> with overflow wrapping.
/// </returns>
public override int GetHashCode()
{
    // The two string options may be null; a null contributes 0.
    int streamNameHash = ResultStreamName == null ? 0 : ResultStreamName.GetHashCode();
    int patternHash = PartitionResultStreamNamePattern == null
        ? 0
        : PartitionResultStreamNamePattern.GetHashCode();

    // Multiplication is expected to overflow; wrap silently instead of throwing.
    unchecked
    {
        int combined = streamNameHash;
        combined = (combined * 397) ^ patternHash;
        combined = (combined * 397) ^ ReorderEvents.GetHashCode();
        combined = (combined * 397) ^ ProcessingLag;
        combined = (combined * 397) ^ IsBiState.GetHashCode();
        combined = (combined * 397) ^ DefinesStateTransform.GetHashCode();
        combined = (combined * 397) ^ DefinesCatalogTransform.GetHashCode();
        combined = (combined * 397) ^ ProducesResults.GetHashCode();
        combined = (combined * 397) ^ DefinesFold.GetHashCode();
        combined = (combined * 397) ^ HandlesDeletedNotifications.GetHashCode();
        combined = (combined * 397) ^ IncludeLinks.GetHashCode();
        return combined;
    }
}
//This is where the rubber hits the road async Task DoScrape(Uri uri) { OnScrape(uri); if (endDateTime.HasValue && DateTimeProvider.UtcNow > endDateTime) { return; } if (!scrapedUris.TryAdd(uri)) { return; } if (!DisableRobotsProtocol && !Robots.PathIsAllowed(uri.PathAndQuery)) { return; } var htmlDoc = new HtmlDoc { Uri = uri }; try { htmlDoc.Html = await httpClient.GetString(uri); } catch (Exception exception) { OnHttpClientException(exception); } if (string.IsNullOrEmpty(htmlDoc.Html)) { return; } if (!(ObserverLinkFilter != null && !ObserverLinkFilter.IsMatch(uri.ToString()))) { NotifyObservers(htmlDoc); } var pageBase = htmlDoc.Uri.Segments.Last().Contains('.') ? htmlDoc.Uri.ToString().Substring(0, htmlDoc.Uri.ToString().LastIndexOf('/')) : htmlDoc.Uri.ToString(); if (!pageBase.EndsWith("/")) { pageBase += "/"; } var pageBaseUri = new Uri(pageBase); //only use of the CsQuery lib found so far CQ cq = htmlDoc.Html; //Doing some selecting: anchors and hrefs using JQuery-like syntax //out of the box, DoScrape does only the simplest finding of links var links = cq["a"].Select(x => x.GetAttribute("href")).Where(x => x != null); //looks like we're set up to not follow external links var localLinks = LocalLinks(links).Select(x => NormalizeLink(x, pageBaseUri)).Where(x => x.ToString().StartsWith(baseUri.ToString()) && x.ToString().Length <= 2048); if (IncludeLinks != null) { localLinks = localLinks.Where(x => IncludeLinks.IsMatch(x.ToString())); } if (IgnoreLinks != null) { localLinks = localLinks.Where(x => !IgnoreLinks.IsMatch(x.ToString())); } if (MaxDepth.HasValue) { localLinks = localLinks.Where(x => x.Segments.Length <= MaxDepth + 1); } var tasks = localLinks.Select(DoScrape).ToArray(); //recursive call to scrape the links found Task.WaitAll(tasks); }
/// <summary>
/// Downloads one page, hands it to the observers, and then recursively scrapes
/// the links on that page that pass the configured filters.
/// </summary>
/// <remarks>
/// <c>OnScrape</c> is raised for every call, before any of the skip checks. The page
/// is then skipped (nothing downloaded, no links followed) when the end time has
/// passed, when <c>scrapedUris.TryAdd</c> rejects the URI, or when the robots rules
/// disallow its path. A failed or empty download also stops processing of the page;
/// download exceptions are routed to <c>OnHttpClientException</c> rather than thrown.
/// </remarks>
/// <param name="uri">Address of the page to download.</param>
/// <returns>
/// A task that completes once this page and every page reached from it has been processed.
/// </returns>
async Task DoScrape(Uri uri)
{
    // Longest link (in characters) that will be followed.
    const int MaxLinkLength = 2048;

    OnScrape(uri);

    if (endDateTime.HasValue && DateTimeProvider.UtcNow > endDateTime)
    {
        return;
    }

    // NOTE(review): TryAdd returning false is taken to mean the URI has already
    // been claimed by another call - confirm against the collection's definition.
    if (!scrapedUris.TryAdd(uri))
    {
        return;
    }

    if (!DisableRobotsProtocol && !Robots.PathIsAllowed(uri.PathAndQuery))
    {
        return;
    }

    var htmlDoc = new HtmlDoc { Uri = uri };
    try
    {
        htmlDoc.Html = await httpClient.GetString(uri);
    }
    catch (Exception exception)
    {
        // Deliberately best-effort: report the failure and fall through to the
        // empty-content check below instead of aborting the whole crawl.
        OnHttpClientException(exception);
    }

    if (string.IsNullOrEmpty(htmlDoc.Html))
    {
        return;
    }

    // Observers see every page unless a filter is configured and rejects this URI.
    if (ObserverLinkFilter == null || ObserverLinkFilter.IsMatch(uri.ToString()))
    {
        NotifyObservers(htmlDoc);
    }

    // Relative links resolve against the page's directory: when the last path
    // segment contains a dot (e.g. "index.html"), cut the URL back to its last '/'.
    var pageUrl = htmlDoc.Uri.ToString();
    var pageBase = htmlDoc.Uri.Segments.Last().Contains('.')
        ? pageUrl.Substring(0, pageUrl.LastIndexOf('/'))
        : pageUrl;
    if (!pageBase.EndsWith("/", StringComparison.Ordinal))
    {
        pageBase += "/";
    }

    var pageBaseUri = new Uri(pageBase);

    // Collect the href of every anchor element using the CsQuery selector syntax.
    CQ cq = htmlDoc.Html;
    var links = cq["a"].Select(anchor => anchor.GetAttribute("href")).Where(href => href != null);

    // Stay under the crawl root. URLs are machine-readable text, so the prefix
    // test is ordinal rather than culture-sensitive.
    // NOTE(review): LocalLinks presumably discards off-site links - confirm.
    var rootPrefix = baseUri.ToString();
    var localLinks = LocalLinks(links)
        .Select(href => NormalizeLink(href, pageBaseUri))
        .Where(link =>
        {
            var linkText = link.ToString();
            return linkText.StartsWith(rootPrefix, StringComparison.Ordinal)
                && linkText.Length <= MaxLinkLength;
        });

    if (IncludeLinks != null)
    {
        localLinks = localLinks.Where(link => IncludeLinks.IsMatch(link.ToString()));
    }

    if (IgnoreLinks != null)
    {
        localLinks = localLinks.Where(link => !IgnoreLinks.IsMatch(link.ToString()));
    }

    if (MaxDepth.HasValue)
    {
        // Uri.Segments includes the leading "/" segment, hence the + 1.
        var maxSegments = MaxDepth.Value + 1;
        localLinks = localLinks.Where(link => link.Segments.Length <= maxSegments);
    }

    // ToArray starts every child scrape immediately. Await them instead of
    // blocking: Task.WaitAll would hold a thread for the whole subtree at each
    // level of recursion.
    var tasks = localLinks.Select(DoScrape).ToArray();
    await Task.WhenAll(tasks);
}