Exemple #1
0
        /// <summary>
        /// Crawls breadth-first from <paramref name="rootUrl"/>, lazily yielding
        /// every fetch that completed with a success status code.
        /// </summary>
        /// <param name="rootUrl">
        /// URL the crawl starts from (level 0). Only links whose host equals this
        /// URL's host (ignoring case) are followed.
        /// </param>
        /// <param name="depth">
        /// Level at which link expansion stops: a page fetched at a level greater
        /// than or equal to this value is yielded but its links are not queued.
        /// </param>
        /// <param name="followPredicate">
        /// Called for each not-yet-queued, same-host HTTP/HTTPS link; the link is
        /// queued only when this returns <c>true</c>.
        /// </param>
        /// <returns>
        /// The successful fetches, in the order their URLs were dequeued.
        /// </returns>
        static IEnumerable<HttpFetch<HttpContent>> CrawlImpl(Uri rootUrl, int depth, Func<Uri, bool> followPredicate)
        {
            // Every URL that has ever been queued, so that none is queued twice.
            var linkSet = new HashSet<Uri> { rootUrl };

            // FIFO of (level, URL) pairs; FIFO order is what makes the crawl breadth-first.
            var queue = new Queue<KeyValuePair<int, Uri>>();

            queue.Enqueue(0.AsKeyTo(rootUrl));

            while (queue.Count > 0)
            {
                var dequeued = queue.Dequeue();
                var url      = dequeued.Value;
                var level    = dequeued.Key;

                // TODO retry intermittent errors?
                var fetch = Http.Get(url).WithOptions(HttpOptions.Default.WithReturnErroneousFetch(true)).Buffer().Single();

                // Unsuccessful responses are neither yielded nor scanned for links.
                if (!fetch.IsSuccessStatusCode)
                {
                    continue;
                }

                yield return fetch;

                if (level >= depth)
                {
                    continue;
                }

                // If content is HTML then sniff links and add them to the
                // queue assuming they are from the same domain and pass the
                // user-supplied condition to follow.

                var contentMediaType = fetch.Content.Headers.ContentType?.MediaType;
                if (!"text/html".Equals(contentMediaType, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var links =
                    HttpObservable.Return(_ => Observable.Return(fetch))
                                  .Links()
                                  .Content()
                                  .Select(href =>
                                  {
                                      // Parse into a dedicated local rather than
                                      // overwriting the loop's "url" variable from
                                      // inside a deferred closure.
                                      Uri parsed;
                                      return Uri.TryCreate(href, UriKind.Absolute, out parsed) ? parsed : null;
                                  })
                                  .Where(link => link != null
                                              && (link.Scheme == Uri.UriSchemeHttp || link.Scheme == Uri.UriSchemeHttps)
                                              // Checked before the predicate so the user
                                              // callback is not invoked for known links.
                                              && !linkSet.Contains(link)
                                              && rootUrl.Host.Equals(link.Host, StringComparison.OrdinalIgnoreCase)
                                              && followPredicate(link));

                foreach (var link in links.ToEnumerable())
                {
                    // Add re-checks membership, so a link is never queued twice
                    // even if the filter above let a duplicate through.
                    if (linkSet.Add(link))
                    {
                        queue.Enqueue((level + 1).AsKeyTo(link));
                    }
                }
            }
        }
Exemple #2
0
 /// <summary>
 /// Wraps, via <c>HttpObservable.Return</c>, a sequence that maps each HTML
 /// result of <paramref name="query"/> to an <c>HttpQuery.Submit</c> call made
 /// with that result's client and content.
 /// </summary>
 /// <param name="query">Source whose <c>Html()</c> results supply the client and content.</param>
 /// <param name="formSelector">Passed through to <c>HttpQuery.Submit</c>.</param>
 /// <param name="formIndex">Passed through to <c>HttpQuery.Submit</c>; may be <c>null</c>.</param>
 /// <param name="url">Passed through to <c>HttpQuery.Submit</c>.</param>
 /// <param name="data">Passed through to <c>HttpQuery.Submit</c>.</param>
 /// <returns>The observable produced by <c>HttpObservable.Return</c>.</returns>
 static IHttpObservable Submit(IHttpObservable query, string formSelector, int? formIndex, Uri url, NameValueCollection data)
 {
     var submissions =
         query.Html()
              .Select(page => HttpQuery.Submit(page.Client, page.Content, formSelector, formIndex, url, data));

     return HttpObservable.Return(submissions);
 }