public void TestSizing()
{
    Assert.AreEqual(FileSizeUtilities.ParseFromSuffixedString("1MB"), (ulong)1024 * 1024);
    Assert.AreEqual(FileSizeUtilities.ParseFromSuffixedString("4k"), (ulong)1024 * 4);
    Assert.AreEqual(FileSizeUtilities.ParseFromSuffixedString("23gb"), (ulong)1024 * 1024 * 1024 * 23);
    Assert.AreEqual(FileSizeUtilities.ParseFromSuffixedString("7.2MB"), (ulong)Math.Floor(1024 * 1024 * 7.2));
    Assert.AreEqual(FileSizeUtilities.ParseFromSuffixedString("14b"), (ulong)1 * 14);
}
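The parser itself isn't shown here, but the assertions above pin down what it has to do: suffixes are case-insensitive, b/k/mb/gb map to powers of 1024, fractional values are floored, and (since ListDirectoryAsync below feeds it sizeNode?.InnerText) it has to tolerate null or junk input. A minimal sketch consistent with those assumptions - the "kb" alias, the 0-on-failure behaviour, and all of the internals are my guesses, not the project's actual implementation:

using System;
using System.Globalization;

public static class FileSizeUtilities
{
    // Longest suffixes first so "mb" isn't matched by the bare "b" rule.
    private static readonly (string Suffix, ulong Multiplier)[] Suffixes =
    {
        ("gb", 1024UL * 1024 * 1024),
        ("mb", 1024UL * 1024),
        ("kb", 1024UL),
        ("k",  1024UL),
        ("b",  1UL),
    };

    public static ulong ParseFromSuffixedString(string text)
    {
        // Null, blank, or the "-" a directory listing shows for folders all parse as 0 (assumption).
        if (string.IsNullOrWhiteSpace(text))
        {
            return 0;
        }

        text = text.Trim().ToLowerInvariant();

        foreach (var (suffix, multiplier) in Suffixes)
        {
            if (!text.EndsWith(suffix))
            {
                continue;
            }

            var numberPart = text.Substring(0, text.Length - suffix.Length);
            return double.TryParse(numberPart, NumberStyles.Float, CultureInfo.InvariantCulture, out var value)
                ? (ulong)Math.Floor(value * multiplier)
                : 0;
        }

        // No recognised suffix: treat the whole string as a plain byte count.
        return ulong.TryParse(text, out var bytes) ? bytes : 0;
    }
}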
protected override async IAsyncEnumerable<IContentItem> ListDirectoryAsync(Uri dir)
{
    var httpResponse = await httpClient.GetAsync(dir);
    var responseBodyText = await httpResponse.Content.ReadAsStringAsync();

    var document = new HtmlDocument();
    document.LoadHtml(responseBodyText);

    var tables = document.DocumentNode.Descendants("table");
    foreach (var table in tables)
    {
        foreach (var tableRow in table.Descendants("tr").Skip(1)) // Skip the header row
        {
            var linkNode = tableRow.ChildNodes.Descendants("a").FirstOrDefault();
            if (linkNode == null)
            {
                continue;
            }

            var relativeLink = linkNode.Attributes["href"].Value;
            var completeUri = new Uri(dir, relativeLink);

            // Listing rows lay their cells out as: name (link), last-modified date, size.
            var dateNode = linkNode.ParentNode.NextSibling;
            var sizeNode = dateNode?.NextSibling;

            IContentItem item = relativeLink.EndsWith("/")
                ? (IContentItem)new FolderContentItem()
                : ContentItemStore.GetOrCreateByDownloadUrl<ContentItem>(completeUri);

            item.Source = dir;
            item.Title = relativeLink;
            item.DownloadLinks.Add(completeUri);
            item.ReportedSizeInBytes = FileSizeUtilities.ParseFromSuffixedString(sizeNode?.InnerText);

            if (DateTime.TryParse(dateNode?.InnerText, out DateTime dateTimeParsed))
            {
                item.DatePosted = dateTimeParsed;
            }

            yield return item;
        }
    }
}
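An IAsyncEnumerable<IContentItem> like this is consumed with await foreach (C# 8+). As a hypothetical illustration of the calling side - CollectRecursivelyAsync and its recurse-into-folders policy are my invention, assumed to live in the same scraper class, and not the project's real crawler:

// Hypothetical caller: walks a listing depth-first, collecting the file entries.
// Relies only on members visible above: FolderContentItem, DownloadLinks (which
// ListDirectoryAsync fills with each entry's resolved URI), and ListDirectoryAsync itself.
private async Task CollectRecursivelyAsync(Uri dir, ICollection<IContentItem> results)
{
    await foreach (var item in ListDirectoryAsync(dir))
    {
        if (item is FolderContentItem)
        {
            // For a folder row the only download link is the folder's own URI, so recurse into it.
            await CollectRecursivelyAsync(item.DownloadLinks.First(), results);
        }
        else
        {
            results.Add(item);
        }
    }
}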
private async Task<IContentItem> ParsePageContentsAsync(Uri url)
{
    // Why a 3 hour timeout? Based on my monitoring of the website, it doesn't update too often -
    // the individual pages even less often - so a 3 hour cache should be a good balance.
    // It also reduces hammering the poor, useful website.
    var contentItem = ContentItemStore.GetOrCreateBySourceUrl<ContentItem>(url, timeout: TimeSpan.FromHours(3));

    var httpResponse = await httpClient.GetAsync(url);
    var document = new HtmlDocument();
    document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

    contentItem.Title = HttpUtility.HtmlDecode(document.DocumentNode.Descendants()
        .Where(node => node.HasClass("post-title"))
        .FirstOrDefault().InnerText);

    contentItem.DatePosted = DateTime.Parse(document.DocumentNode.Descendants()
        .Where(node => node.HasClass("post-date")).FirstOrDefault()
        .Descendants("time").FirstOrDefault()
        .GetAttributeValue("datetime", ""));

    contentItem.Source = url;

    contentItem.ReportedSizeInBytes = FileSizeUtilities.ParseFromSuffixedString(document.DocumentNode.Descendants()
        .Where(node => node.InnerText.StartsWith("Size", StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault().NextSibling.InnerText);

    // Filter down to the relevant links, based on what I deduced from looking at the page source.
    var linkNodes = document.DocumentNode.Descendants("section")
        .Where(child => child.HasClass("post-contents"))
        .SelectMany(node => node.Descendants("div"))
        .Where(node => node.HasClass("aio-pulse"))
        .SelectMany(node => node.Descendants("a"));

    foreach (var linkNode in linkNodes)
    {
        var link = linkNode.GetAttributeValue("href", "<could not scrape link>");
        var linkUri = new Uri(link);
        var resolvedUri = linkUri;
        contentItem.DownloadLinks.Add(resolvedUri);
    }

    return contentItem;
}