Beispiel #1
0
        /// <summary>
        /// Attempts to read the document's inline base URL from the
        /// <c>href</c> attribute of the <c>base</c> element selected by
        /// <c>html &gt; head &gt; base[href]</c>.
        /// </summary>
        /// <returns>
        /// The base URL when the attribute is present, parses as an absolute
        /// URI and uses the HTTP or HTTPS scheme; otherwise <c>null</c>.
        /// </returns>
        Uri TryGetInlineBaseUrl()
        {
            var baseRef = QuerySelector("html > head > base[href]")?.GetAttributeValue("href");

            if (baseRef == null)
            {
                return null;
            }

            var baseUrl = TryParse.Uri(baseRef, UriKind.Absolute);

            // The parse yields null when the attribute value is not a valid
            // absolute URI; treat that the same as a missing base element
            // instead of dereferencing null below.
            if (baseUrl == null)
            {
                return null;
            }

            var isWebScheme = baseUrl.Scheme == Uri.UriSchemeHttp
                           || baseUrl.Scheme == Uri.UriSchemeHttps;

            return isWebScheme ? baseUrl : null;
        }
Beispiel #2
0
        /// <summary>
        /// Builds a <see cref="Config"/> from the settings found in
        /// <paramref name="dictionary"/>.
        /// </summary>
        /// <remarks>
        /// When the "url" setting holds an absolute URI, the host, port, SSL
        /// flag and path are all taken from it. Otherwise they are read from
        /// the individual "host", "port", "useSsl" and "path" settings.
        /// </remarks>
        /// <param name="dictionary">The settings to read.</param>
        /// <returns>The populated configuration.</returns>
        /// <exception cref="ApplicationException">
        /// Neither a usable "url" nor a "host" setting is present, or the
        /// "project" setting is missing or empty.
        /// </exception>
        public static Config FromDictionary(IDictionary dictionary)
        {
            var url  = TryParse.Uri(dictionary.Get(ConfigKeys.Url), UriKind.RelativeOrAbsolute);
            var host = dictionary.Get(ConfigKeys.Host);

            // Host, Port, Scheme and AbsolutePath throw InvalidOperationException
            // on a relative Uri, so a relative "url" cannot supply the connection
            // details. Treat it as absent and fall back to the "host" setting.
            if (url != null && !url.IsAbsoluteUri)
            {
                url = null;
            }

            if (url == null && string.IsNullOrEmpty(host))
            {
                throw new ApplicationException("The \"url\" or \"host\" setting is not found in configuration");
            }

            var project = dictionary.Get(ConfigKeys.Project);

            if (string.IsNullOrEmpty(project))
            {
                throw new ApplicationException("The \"project\" setting is not found in configuration");
            }

            // Settings that are read the same way regardless of how the
            // endpoint is specified.
            var username = dictionary.Get(ConfigKeys.Username);
            var password = dictionary.Get(ConfigKeys.Passwrod);
            var reportAsynchronously = TryParse.Boolean(dictionary.Get(ConfigKeys.ReportAsynchronously)).GetValueOrDefault(true);

            if (url != null)
            {
                // Endpoint details come from the absolute URL.
                return new Config(url.Host, project)
                {
                    Port = url.Port,
                    UseSsl = url.Scheme.Contains("https"),
                    Path = url.AbsolutePath,
                    Username = username,
                    Password = password,
                    ReportAsynchronously = reportAsynchronously
                };
            }

            // Endpoint details come from the individual settings; the port
            // defaults to 80 and SSL defaults to off when not specified.
            return new Config(host, project)
            {
                Port = TryParse.Int32(dictionary.Get(ConfigKeys.Port)).GetValueOrDefault(80),
                UseSsl = TryParse.Boolean(dictionary.Get(ConfigKeys.UseSsl)).GetValueOrDefault(),
                Path = dictionary.Get(ConfigKeys.Path),
                Username = username,
                Password = password,
                ReportAsynchronously = reportAsynchronously
            };
        }
Beispiel #3
0
        // Crawls pages starting at rootUrl in breadth-first order (FIFO queue),
        // lazily yielding the fetch result of every page that was fetched with
        // a success status code.
        //
        //   context          initial query context; replaced with the context
        //                    returned by each fetch and each link result as the
        //                    crawl proceeds
        //   rootUrl          the page the crawl starts from (level 0)
        //   depth            links are only extracted from pages whose level is
        //                    below this value
        //   followPredicate  user-supplied filter; a discovered link is queued
        //                    only when this returns true for it
        static IEnumerable <QueryResultItem <HttpFetch <HttpContent> > > CrawlImpl(QueryContext context, Uri rootUrl, int depth, Func <Uri, bool> followPredicate)
        {
            // URLs already queued, so that no URL is enqueued twice.
            var linkSet = new HashSet <Uri> {
                rootUrl
            };
            // Pending work: key is the level of the URL, value is the URL.
            var queue = new Queue <KeyValuePair <int, Uri> >();

            queue.Enqueue(0.AsKeyTo(rootUrl));

            while (queue.Count > 0)
            {
                var dequeued = queue.Dequeue();
                var url      = dequeued.Value;
                var level    = dequeued.Key;
                // TODO retry intermittent errors?
                // NOTE(review): ReturnErrorneousFetch presumably makes failed
                // responses come back as results instead of throwing, which the
                // status check below relies on; confirm against its definition.
                // Single() requires the query to produce exactly one result.
                var fetchResult = Http.ReturnErrorneousFetch().Get(url).GetResult(context).Single();
                var fetch       = fetchResult.Value;

                // Unsuccessful fetches are neither yielded nor followed.
                if (!fetch.IsSuccessStatusCode)
                {
                    continue;
                }

                yield return(fetchResult);

                // Carry the context produced by the fetch into later queries.
                context = fetchResult.Context;

                // Pages at the maximum level are yielded but not followed.
                if (level >= depth)
                {
                    continue;
                }

                // If content is HTML then sniff links and add them to the
                // queue assuming they are from the same domain and pass the
                // user-supplied condition to follow.

                var contentMediaType = fetch.Content.Headers.ContentType?.MediaType;
                if (!"text/html".Equals(contentMediaType, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Keep only links that parse as absolute HTTP(S) URLs, have not
                // been seen yet, share the root URL's host (case-insensitive)
                // and are accepted by followPredicate.
                var lq =
                    from e in Query.Singleton(fetch).Links().Content()
                    select TryParse.Uri(e, UriKind.Absolute) into e
                        where e != null &&
                    (e.Scheme == Uri.UriSchemeHttp || e.Scheme == Uri.UriSchemeHttps) &&
                    !linkSet.Contains(e) &&
                    rootUrl.Host.Equals(e.Host, StringComparison.OrdinalIgnoreCase) &&
                    followPredicate(e)
                    select e;

                var links = lq.GetResult(context);
                foreach (var e in links)
                {
                    // Add returns false for a URL that is already in the set,
                    // so the same link appearing twice on a page is queued once.
                    // NOTE(review): e is a query result item while linkSet holds
                    // Uri; this presumably relies on an implicit conversion to
                    // the item's value. Confirm.
                    if (linkSet.Add(e))
                    {
                        queue.Enqueue((level + 1).AsKeyTo(e.Value));
                    }
                    context = e.Context;
                }
            }
        }
Beispiel #4
0
 /// <summary>
 /// Resolves <paramref name="href"/> against <c>BaseUrl</c> when one is set.
 /// </summary>
 /// <param name="href">The link reference to resolve.</param>
 /// <returns>
 /// The original string of the combined URI when <c>BaseUrl</c> is set and
 /// the combination succeeds; otherwise <paramref name="href"/> unchanged.
 /// </returns>
 public string TryBaseHref(string href)
 {
     // Without a base URL there is nothing to resolve against.
     if (BaseUrl == null)
     {
         return href;
     }

     var combined = TryParse.Uri(BaseUrl, href);
     var resolved = combined?.OriginalString;

     // Fall back to the input when the combination could not be parsed.
     return resolved ?? href;
 }
Beispiel #5
0
        /// <summary>
        /// Sample query: reads the first "wikitable" on Wikipedia's Queen
        /// discography page, follows the link found in each row-header cell,
        /// and dumps the album, title, writer(s) and length of every track
        /// row found in up to two ".tracklist" tables of each linked page.
        /// </summary>
        static void QueenSongs()
        {
            var q =

                from t in Http.Get(new Uri("https://en.wikipedia.org/wiki/Queen_discography")).Tables().Content()
                              .Where(t => t.HasClass("wikitable"))
                              .Take(1)
                from tr in t.TableRows((_, trs) => trs)
                // The row-header cell (scope="row") of each table row.
                select tr.FirstOrDefault(e => e?.AttributeValueEquals("scope", "row") == true) into th
                where th != null
                let a = th.QuerySelector("a[href]")
                // Skip header cells without a link; QuerySelector finds
                // nothing for them and there is no page to follow.
                where a != null
                select new
                {
                    Title = a.GetAttributeValue("title")?.Trim(),
                    Href  = a.Owner.TryBaseHref(a.GetAttributeValue("href")?.Trim()),
                }
                into e
                select new
                {
                    e.Title,
                    Url = TryParse.Uri(e.Href, UriKind.Absolute),
                }
                into e
                where !string.IsNullOrEmpty(e.Title) && e.Url != null
                select e
                into album

                from html in Http.Get(album.Url).Html().Content()

                from tb in html.Tables(".tracklist").Take(2)
                let trs = tb.QuerySelectorAll("tr")
                // Header texts of the first row having at least three <th> cells.
                let hdrs =
                    trs.FirstOrDefault(tr => tr.QuerySelectorAll("th").Take(4).Count() >= 3)
                       ?.QuerySelectorAll("th")
                       .Select(th => th.InnerTextSource.Decoded.Trim())
                       .ToArray()
                where hdrs != null
                // Position of each column of interest among the headers;
                // -1 when the header is absent.
                let idxs =
                    new[] { "Title", "Writer(s)", "Length" }
                        .Select(h => Array.FindIndex(hdrs, he => he == h))
                        .ToArray()
                let his = new
                {
                    Title   = idxs[0],
                    Writers = idxs[1],
                    Length  = idxs[2],
                }
                from tr in trs
                let tds =
                    tr.QuerySelectorAll("td")
                      .Select(td => td.InnerTextSource.Decoded)
                      .ToArray()
                // Only keep rows in which the required Title and Length
                // columns actually exist; indexing with -1 (header missing)
                // or past the end of a short row would throw.
                where tds.Length >= 3
                   && his.Title  >= 0 && his.Title  < tds.Length
                   && his.Length >= 0 && his.Length < tds.Length
                select new
                {
                    Album    = album.Title,
                    Title    = tds[his.Title],
                    // Writers is optional: null when the column is missing
                    // from the table or from this row.
                    Author   = his.Writers >= 0 && his.Writers < tds.Length ? tds[his.Writers] : null,
                    Duration = tds[his.Length],
                };

            q.Dump();
        }