/// <summary>
/// Scrapes an HTML directory listing laid out as one or more tables (one row per
/// entry: link cell, date cell, size cell) and yields a content item per row.
/// </summary>
/// <param name="dir">Absolute URI of the directory-listing page.</param>
/// <returns>Lazily produced stream of parsed <see cref="IContentItem"/>s.</returns>
protected override async IAsyncEnumerable<IContentItem> ListDirectoryAsync(Uri dir)
{
    var httpResponse = await httpClient.GetAsync(dir);
    var document = new HtmlDocument();
    document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

    foreach (var table in document.DocumentNode.Descendants("table"))
    {
        // Skip(1): the first row is the column-header row.
        foreach (var tableRow in table.Descendants("tr").Skip(1))
        {
            var linkNode = tableRow.Descendants("a").FirstOrDefault();

            // GetAttributeValue avoids the NRE Attributes["href"].Value threw
            // for anchors without an href; rows without a usable link are skipped.
            var relativeLink = linkNode?.GetAttributeValue("href", null);
            if (string.IsNullOrEmpty(relativeLink))
            {
                continue;
            }

            var completeUri = new Uri(dir, relativeLink);

            // Layout assumption (TODO confirm against the live page): the cell
            // after the link cell holds the date, the one after that the size.
            var dateNode = linkNode.ParentNode.NextSibling;
            var sizeNode = dateNode?.NextSibling;

            // A trailing '/' marks a sub-directory; files get a cached store entry.
            IContentItem item = relativeLink.EndsWith("/", StringComparison.Ordinal)
                ? (IContentItem)new FolderContentItem()
                : ContentItemStore.GetOrCreateByDownloadUrl<ContentItem>(completeUri);

            item.Source = dir;
            item.Title = relativeLink;
            item.DownloadLinks.Add(completeUri);
            item.ReportedSizeInBytes = FileSizeUtilities.ParseFromSuffixedString(sizeNode?.InnerText);

            // TryParse: a missing/garbled date leaves DatePosted unset instead of throwing.
            if (DateTime.TryParse(dateNode?.InnerText, out DateTime dateTimeParsed))
            {
                item.DatePosted = dateTimeParsed;
            }

            yield return item;
        }
    }
}
/// <summary>
/// Loads the syndication feed and yields up to <paramref name="count"/> newest
/// items, downloading and parsing an item's page only when it is not already
/// cached in the store.
/// </summary>
/// <param name="count">Maximum number of items to yield.</param>
/// <returns>Lazily produced stream of the newest <see cref="IContentItem"/>s.</returns>
public async IAsyncEnumerable<IContentItem> GetLatestItemsAsync(int count)
{
    // Dispose the reader so the underlying feed stream is released
    // (the original leaked it).
    SyndicationFeed feed;
    using (var reader = XmlReader.Create(feedUrlBase))
    {
        feed = SyndicationFeed.Load(reader);
    }

    var newestItems = feed.Items.OrderByDescending(i => i.PublishDate).Take(count);

    foreach (var item in newestItems)
    {
        // Items without any link cannot be resolved to a page; skip them.
        var uri = item.Links.FirstOrDefault()?.Uri;
        if (uri == null)
        {
            continue;
        }

        // Cache hit avoids re-downloading and re-parsing the page.
        var contentItem = ContentItemStore.GetBySourceUrl(uri)
                          ?? await ParsePageContentsAsync(uri);

        yield return contentItem;
    }
}
/// <summary>
/// Scrapes a plain (non-table) HTML directory listing where each anchor is
/// followed by a text node of the form "&lt;date&gt; &lt;size&gt;", yielding
/// one content item per anchor.
/// </summary>
/// <param name="dir">Absolute URI of the directory-listing page.</param>
/// <returns>Lazily produced stream of parsed <see cref="IContentItem"/>s.</returns>
protected override async IAsyncEnumerable<IContentItem> ListDirectoryAsync(Uri dir)
{
    var httpResponse = await httpClient.GetAsync(dir);
    var document = new HtmlDocument();
    document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

    // Skip(1): the first anchor is the header/parent-directory link.
    foreach (var linkNode in document.DocumentNode.Descendants("a").Skip(1))
    {
        // GetAttributeValue avoids the NRE Attributes["href"].Value threw
        // for anchors without an href.
        var relativeLink = linkNode.GetAttributeValue("href", null);
        if (string.IsNullOrEmpty(relativeLink))
        {
            continue;
        }

        var completeUri = new Uri(dir, relativeLink);

        // Metadata text node follows the anchor: every token but the last forms
        // the date, the last token is the size. NextSibling may be absent, in
        // which case both fields are simply left unset (original NRE'd here).
        var metadataTokens = linkNode.NextSibling?.InnerText.Trim().Split(" ")
                             ?? Array.Empty<string>();
        var dateText = string.Join(" ", metadataTokens.Take(metadataTokens.Length - 1));
        var sizeText = metadataTokens.LastOrDefault() ?? string.Empty;

        // A trailing '/' marks a sub-directory; files get a cached store entry.
        IContentItem item = relativeLink.EndsWith("/", StringComparison.Ordinal)
            ? (IContentItem)new FolderContentItem()
            : ContentItemStore.GetOrCreateByDownloadUrl<ContentItem>(completeUri);

        item.Source = dir;
        item.Title = relativeLink;
        item.DownloadLinks.Add(completeUri);

        if (DateTime.TryParse(dateText, out DateTime dateTimeParsed))
        {
            item.DatePosted = dateTimeParsed;
        }

        if (ulong.TryParse(sizeText, out ulong sizeParsed))
        {
            item.ReportedSizeInBytes = sizeParsed;
        }

        yield return item;
    }
}
/// <summary>
/// Downloads a post page and scrapes its title, date, reported size, and
/// download links into a (possibly cached) content item.
/// </summary>
/// <param name="url">Absolute URI of the post page.</param>
/// <returns>The populated content item.</returns>
private async Task<IContentItem> ParsePageContentsAsync(Uri url)
{
    // The timeout here is 3 hours, why? Based on my monitoring of the website,
    // it doesn't update too often - the individual pages even less often - therefore a 3 hour timeout
    // Should be a good balance. Also reduces hammering the poor, useful website.
    var contentItem = ContentItemStore.GetOrCreateBySourceUrl<ContentItem>(url, timeout: TimeSpan.FromHours(3));

    var httpResponse = await httpClient.GetAsync(url);
    var document = new HtmlDocument();
    document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

    // Null-conditional access throughout: a layout change on the site leaves
    // the corresponding field unset instead of crashing mid-scrape (the
    // original dereferenced FirstOrDefault() results unchecked).
    var titleNode = document.DocumentNode.Descendants()
        .FirstOrDefault(node => node.HasClass("post-title"));
    if (titleNode != null)
    {
        contentItem.Title = HttpUtility.HtmlDecode(titleNode.InnerText);
    }

    // TryParse: a missing <time> node or malformed datetime attribute no longer throws.
    var dateText = document.DocumentNode.Descendants()
        .FirstOrDefault(node => node.HasClass("post-date"))
        ?.Descendants("time").FirstOrDefault()
        ?.GetAttributeValue("datetime", "");
    if (DateTime.TryParse(dateText, out DateTime datePosted))
    {
        contentItem.DatePosted = datePosted;
    }

    contentItem.Source = url;

    var sizeLabelNode = document.DocumentNode.Descendants()
        .FirstOrDefault(node => node.InnerText.StartsWith("Size", StringComparison.OrdinalIgnoreCase));
    contentItem.ReportedSizeInBytes =
        FileSizeUtilities.ParseFromSuffixedString(sizeLabelNode?.NextSibling?.InnerText);

    // Filter down to relevant links based on what I deduced from looking at the page source
    var linkNodes = document.DocumentNode.Descendants("section")
        .Where(child => child.HasClass("post-contents"))
        .SelectMany(node => node.Descendants("div"))
        .Where(node => node.HasClass("aio-pulse"))
        .SelectMany(node => node.Descendants("a"));

    foreach (var linkNode in linkNodes)
    {
        // TryCreate: the original fed a placeholder string into new Uri(...) when
        // href was missing, which threw UriFormatException; skip such anchors.
        var link = linkNode.GetAttributeValue("href", null);
        if (Uri.TryCreate(link, UriKind.Absolute, out Uri linkUri))
        {
            contentItem.DownloadLinks.Add(linkUri);
        }
    }

    return contentItem;
}