/// <summary>
/// Looks up the document's inline base URL, taken from the <c>href</c>
/// attribute of the element selected by <c>html &gt; head &gt; base[href]</c>.
/// </summary>
/// <returns>
/// The parsed URL when it is an absolute HTTP or HTTPS URL; otherwise
/// <c>null</c> (no matching element, no attribute value, a value that does
/// not parse as an absolute URL, or a scheme other than HTTP/HTTPS).
/// </returns>
Uri TryGetInlineBaseUrl()
{
    var baseRef = QuerySelector("html > head > base[href]")?.GetAttributeValue("href");
    if (baseRef == null)
    {
        return null;
    }

    // The parse is expected to yield null for a relative or malformed href,
    // so bail out here rather than dereferencing a null Uri below.
    var baseUrl = TryParse.Uri(baseRef, UriKind.Absolute);
    if (baseUrl == null)
    {
        return null;
    }

    // Only web URLs qualify as a base; anything else is ignored.
    var isWebScheme = baseUrl.Scheme == Uri.UriSchemeHttp
                   || baseUrl.Scheme == Uri.UriSchemeHttps;

    return isWebScheme ? baseUrl : null;
}
/// <summary>
/// Builds a <see cref="Config"/> from the settings held in
/// <paramref name="dictionary"/>.
/// </summary>
/// <remarks>
/// The target is taken from the "url" setting when it parses as an absolute
/// URL (host, port, scheme and path all come from the URL). Otherwise the
/// "host" setting is used together with the separate port, SSL and path
/// settings; the port falls back to 80 when it is missing or unparsable.
/// Asynchronous reporting defaults to <c>true</c> in both cases.
/// </remarks>
/// <param name="dictionary">The settings to read.</param>
/// <returns>The populated configuration.</returns>
/// <exception cref="ApplicationException">
/// Neither a usable "url" nor a "host" setting is present, or the "project"
/// setting is missing or empty.
/// </exception>
public static Config FromDictionary(IDictionary dictionary)
{
    var url = TryParse.Uri(dictionary.Get(ConfigKeys.Url), UriKind.RelativeOrAbsolute);

    // Host, Port, Scheme and AbsolutePath throw InvalidOperationException on
    // a relative Uri, so a relative "url" is treated the same as a missing
    // one and the "host" setting (if any) takes over.
    if (url != null && !url.IsAbsoluteUri)
    {
        url = null;
    }

    var host = dictionary.Get(ConfigKeys.Host);
    if (url == null && string.IsNullOrEmpty(host))
    {
        throw new ApplicationException("The \"url\" or \"host\" setting is not found in configuration");
    }

    var project = dictionary.Get(ConfigKeys.Project);
    if (string.IsNullOrEmpty(project))
    {
        throw new ApplicationException("The \"project\" setting is not found in configuration");
    }

    Config config;
    if (url != null)
    {
        config = new Config(url.Host, project)
        {
            Port = url.Port,
            // Uri.Scheme is normalized to lower case, so an exact comparison
            // is sufficient (and stricter than a substring match).
            UseSsl = url.Scheme == Uri.UriSchemeHttps,
            Path = url.AbsolutePath,
        };
    }
    else
    {
        config = new Config(host, project)
        {
            Port = TryParse.Int32(dictionary.Get(ConfigKeys.Port)).GetValueOrDefault(80),
            UseSsl = TryParse.Boolean(dictionary.Get(ConfigKeys.UseSsl)).GetValueOrDefault(),
            Path = dictionary.Get(ConfigKeys.Path),
        };
    }

    // Settings that do not depend on how the target was specified.
    config.Username = dictionary.Get(ConfigKeys.Username);
    // NOTE: "Passwrod" (sic) is the identifier as referenced on ConfigKeys.
    config.Password = dictionary.Get(ConfigKeys.Passwrod);
    config.ReportAsynchronously = TryParse.Boolean(dictionary.Get(ConfigKeys.ReportAsynchronously)).GetValueOrDefault(true);

    return config;
}
/// <summary>
/// Crawls breadth-first from <paramref name="rootUrl"/>, lazily yielding
/// every fetch that completed with a success status code.
/// </summary>
/// <param name="context">
/// Query context used for the first fetch; it is replaced by the context
/// carried on each result as the crawl proceeds.
/// </param>
/// <param name="rootUrl">Starting URL; only links on its host are followed.</param>
/// <param name="depth">
/// Level (root is 0) at and beyond which pages are still yielded but their
/// links are no longer followed.
/// </param>
/// <param name="followPredicate">Caller-supplied filter deciding whether a link is followed.</param>
/// <returns>The successful fetches, in the order they were made.</returns>
static IEnumerable<QueryResultItem<HttpFetch<HttpContent>>> CrawlImpl(QueryContext context, Uri rootUrl, int depth, Func<Uri, bool> followPredicate)
{
    var seen = new HashSet<Uri> { rootUrl };
    var pending = new Queue<KeyValuePair<int, Uri>>();
    pending.Enqueue(0.AsKeyTo(rootUrl));

    while (pending.Count > 0)
    {
        var entry = pending.Dequeue();
        var level = entry.Key;
        var url = entry.Value;

        // TODO retry intermittent errors?

        var fetched = Http.ReturnErrorneousFetch().Get(url).GetResult(context).Single();
        var response = fetched.Value;
        if (!response.IsSuccessStatusCode)
        {
            continue;
        }

        yield return fetched;
        context = fetched.Context;

        // Links are only sniffed from HTML pages that sit above the depth
        // limit. The content type is not inspected once the limit is hit.
        if (level >= depth
            || !string.Equals("text/html", response.Content.Headers.ContentType?.MediaType, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Candidate links: absolute HTTP(S) URLs on the root's host that
        // have not been seen yet and that the caller agrees to follow.
        var candidates =
            Query.Singleton(response)
                 .Links()
                 .Content()
                 .Select(href => TryParse.Uri(href, UriKind.Absolute))
                 .Where(link => link != null
                             && (link.Scheme == Uri.UriSchemeHttp || link.Scheme == Uri.UriSchemeHttps)
                             && !seen.Contains(link)
                             && rootUrl.Host.Equals(link.Host, StringComparison.OrdinalIgnoreCase)
                             && followPredicate(link));

        foreach (var found in candidates.GetResult(context))
        {
            // Add doubles as the duplicate check for links repeated on the
            // same page.
            if (seen.Add(found))
            {
                pending.Enqueue((level + 1).AsKeyTo(found.Value));
            }

            context = found.Context;
        }
    }
}
/// <summary>
/// Attempts to resolve <paramref name="href"/> against <c>BaseUrl</c>.
/// </summary>
/// <param name="href">The reference to resolve.</param>
/// <returns>
/// The original string of the combined URL when <c>BaseUrl</c> is set and
/// the combination succeeds; otherwise <paramref name="href"/> unchanged.
/// </returns>
public string TryBaseHref(string href)
{
    // Nothing to resolve against: hand the reference back untouched.
    if (BaseUrl == null)
    {
        return href;
    }

    var combined = TryParse.Uri(BaseUrl, href);
    if (combined == null)
    {
        return href;
    }

    return combined.OriginalString ?? href;
}
/// <summary>
/// Sample query: reads the first "wikitable" on Wikipedia's Queen
/// discography page, follows the link in each row-header cell to the album
/// page, and dumps the songs found in up to two ".tracklist" tables there.
/// </summary>
static void QueenSongs()
{
    var q =
        from t in Http.Get(new Uri("https://en.wikipedia.org/wiki/Queen_discography"))
                      .Tables()
                      .Content()
                      .Where(t => t.HasClass("wikitable"))
                      .Take(1)
        from tr in t.TableRows((_, trs) => trs)
        // The row-header cell (scope="row") carries the album link.
        select tr.FirstOrDefault(e => e?.AttributeValueEquals("scope", "row") == true) into th
        where th != null
        let a = th.QuerySelector("a[href]")
        // A header cell without a link would otherwise fail on the
        // dereferences of "a" below.
        where a != null
        select new
        {
            Title = a.GetAttributeValue("title")?.Trim(),
            Href = a.Owner.TryBaseHref(a.GetAttributeValue("href")?.Trim()),
        }
        into e
        select new
        {
            e.Title,
            Url = TryParse.Uri(e.Href, UriKind.Absolute),
        }
        into e
        where !string.IsNullOrEmpty(e.Title) && e.Url != null
        select e
        into album
        from html in Http.Get(album.Url).Html().Content()
        from tb in html.Tables(".tracklist").Take(2)
        let trs = tb.QuerySelectorAll("tr")
        // The header row is the first row with at least three <th> cells
        // (only the first four are counted).
        let hdrs = trs.FirstOrDefault(tr => tr.QuerySelectorAll("th").Take(4).Count() >= 3)
                     ?.QuerySelectorAll("th")
                      .Select(th => th.InnerTextSource.Decoded.Trim())
                      .ToArray()
        where hdrs != null
        let idxs = new[] { "Title", "Writer(s)", "Length" }
                       .Select(h => Array.FindIndex(hdrs, he => he == h))
                       .ToArray()
        let his = new
        {
            Title = idxs[0],
            Writers = idxs[1],
            Length = idxs[2],
        }
        // Array.FindIndex yields -1 for a header that is absent. "Writer(s)"
        // is optional, but Title and Length are indexed unconditionally
        // below, so tables lacking either column are skipped.
        where his.Title >= 0 && his.Length >= 0
        from tr in trs
        let tds = tr.QuerySelectorAll("td")
                    .Select(td => td.InnerTextSource.Decoded)
                    .ToArray()
        // Skip rows with too few cells to cover every resolved column.
        where tds.Length >= 3
           && tds.Length > Math.Max(his.Title, Math.Max(his.Writers, his.Length))
        select new
        {
            Album = album.Title,
            Title = tds[his.Title],
            Author = his.Writers >= 0 ? tds[his.Writers] : null,
            Duration = tds[his.Length],
        };

    q.Dump();
}