// Breadth-first, depth-limited crawl starting at rootUrl, yielding each
// successfully fetched page lazily (iterator). Only same-host HTTP/HTTPS
// links on pages whose media type is text/html are followed, and only when
// followPredicate approves them. "depth" is the number of link hops allowed
// beyond rootUrl (level 0).
static IEnumerable<HttpFetch<HttpContent>> CrawlImpl(Uri rootUrl, int depth, Func<Uri, bool> followPredicate)
{
    // Every URL ever enqueued; guards against re-visiting pages that link
    // to each other.
    var linkSet = new HashSet<Uri> { rootUrl };

    // BFS frontier; Key = crawl level (distance in hops from rootUrl),
    // Value = the URL to fetch.
    var queue = new Queue<KeyValuePair<int, Uri>>();
    queue.Enqueue(0.AsKeyTo(rootUrl));

    while (queue.Count > 0)
    {
        var dequeued = queue.Dequeue();
        var url = dequeued.Value;
        var level = dequeued.Key;

        // TODO retry intermittent errors?

        // WithReturnErroneousFetch(true) makes error responses come back as
        // a fetch object (instead of throwing) so they can be skipped below.
        // Buffer().Single() forces the single HTTP response eagerly here.
        var fetch = Http.Get(url).WithOptions(HttpOptions.Default.WithReturnErroneousFetch(true)).Buffer().Single();

        // Silently skip non-2xx responses; only successful fetches are yielded.
        if (!fetch.IsSuccessStatusCode)
        {
            continue;
        }

        yield return (fetch);

        // At the depth limit: yield the page but do not mine it for links.
        if (level >= depth)
        {
            continue;
        }

        // If content is HTML then sniff links and add them to the
        // queue assuming they are from the same domain and pass the
        // user-supplied condition to follow.

        var contentMediaType = fetch.Content.Headers.ContentType?.MediaType;
        if (!"text/html".Equals(contentMediaType, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Re-wrap the already-buffered fetch as an observable so the Links()
        // extractor can run over it. NOTE: the query deliberately reuses the
        // outer "url" local as the TryCreate out-target; the query is fully
        // enumerated (ToEnumerable below) before "url" is read again on the
        // next loop iteration, so the clobbering is harmless.
        var lq = from e in HttpObservable.Return(_ => Observable.Return(fetch)).Links().Content()
                 select Uri.TryCreate(e, UriKind.Absolute, out url) ? url : null
                 into e
                 where e != null
                       // Follow only web links (not mailto:, ftp:, etc.) ...
                       && (e.Scheme == Uri.UriSchemeHttp || e.Scheme == Uri.UriSchemeHttps)
                       // ... not seen before ...
                       && !linkSet.Contains(e)
                       // ... on the same host as the crawl root ...
                       && rootUrl.Host.Equals(e.Host, StringComparison.OrdinalIgnoreCase)
                       // ... and approved by the caller.
                       && followPredicate(e)
                 select e;

        foreach (var e in lq.ToEnumerable())
        {
            // Add may still return false here: the same URL can appear twice
            // within one page's link list.
            if (linkSet.Add(e))
            {
                queue.Enqueue((level + 1).AsKeyTo(e));
            }
        }
    }
}
// Builds a deferred HTTP query that parses the HTML produced by "query",
// locates a form (by CSS selector and/or zero-based index) and submits it —
// optionally overriding the form's action URL and merging in the given
// name/value data. Nothing executes until the returned observable is
// subscribed to.
static IHttpObservable Submit(IHttpObservable query, string formSelector, int? formIndex, Uri url, NameValueCollection data)
{
    var submission =
        query.Html()
             .Select(html => HttpQuery.Submit(html.Client, html.Content,
                                              formSelector, formIndex, url, data));
    return HttpObservable.Return(submission);
}