public void TestSizing()
 {
     // Assert.AreEqual's signature is (expected, actual); the original had the
     // arguments reversed, which produces misleading failure messages
     // ("expected <parsed value> but was <constant>").
     Assert.AreEqual((ulong)1024 * 1024, FileSizeUtilities.ParseFromSuffixedString("1MB"));
     Assert.AreEqual((ulong)1024 * 4, FileSizeUtilities.ParseFromSuffixedString("4k"));
     Assert.AreEqual((ulong)1024 * 1024 * 1024 * 23, FileSizeUtilities.ParseFromSuffixedString("23gb"));
     // 7.2 MB is fractional; the parser is expected to floor to whole bytes.
     Assert.AreEqual((ulong)Math.Floor(1024 * 1024 * 7.2), FileSizeUtilities.ParseFromSuffixedString("7.2MB"));
     Assert.AreEqual((ulong)1 * 14, FileSizeUtilities.ParseFromSuffixedString("14b"));
 }
        /// <summary>
        /// Scrapes an HTML directory-index page (table-based listing) and yields one
        /// <see cref="IContentItem"/> per table row that contains a hyperlink.
        /// Rows ending in '/' become folders; everything else becomes a content item.
        /// </summary>
        /// <param name="dir">Absolute URI of the directory listing to fetch.</param>
        protected override async IAsyncEnumerable <IContentItem> ListDirectoryAsync(Uri dir)
        {
            var httpResponse = await httpClient.GetAsync(dir);

            // Fail fast on a non-2xx response instead of silently parsing an error page.
            httpResponse.EnsureSuccessStatusCode();

            var document = new HtmlDocument();
            document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

            foreach (var table in document.DocumentNode.Descendants("table"))
            {
                foreach (var tableRow in table.Descendants("tr").Skip(1)) // Skip header row
                {
                    var linkNode = tableRow.ChildNodes.Descendants("a").FirstOrDefault();

                    // Skip rows without an anchor, and anchors without an href.
                    // The original dereferenced Attributes["href"].Value unconditionally,
                    // which throws NullReferenceException on href-less anchors.
                    var relativeLink = linkNode?.Attributes["href"]?.Value;
                    if (relativeLink == null)
                    {
                        continue;
                    }

                    var completeUri = new Uri(dir, relativeLink);

                    // NOTE(review): assumes the classic index layout of
                    // <td>link</td><td>date</td><td>size</td> — confirm against the target site.
                    var dateNode = linkNode.ParentNode.NextSibling;
                    var sizeNode = dateNode?.NextSibling;

                    // A trailing '/' marks a sub-directory. Ordinal comparison: this is a
                    // URL path segment, not linguistic text.
                    IContentItem item = relativeLink.EndsWith("/", StringComparison.Ordinal)
                        ? (IContentItem) new FolderContentItem()
                        : ContentItemStore.GetOrCreateByDownloadUrl <ContentItem>(completeUri);

                    item.Source = dir;
                    item.Title  = relativeLink;
                    item.DownloadLinks.Add(completeUri);
                    // ParseFromSuffixedString tolerates a null argument (see the '?.' here
                    // and in the sibling parser), so a missing size cell is safe.
                    item.ReportedSizeInBytes = FileSizeUtilities.ParseFromSuffixedString(sizeNode?.InnerText);

                    // TryParse: a malformed or missing date cell must not abort the listing.
                    if (DateTime.TryParse(dateNode?.InnerText, out DateTime dateTimeParsed))
                    {
                        item.DatePosted = dateTimeParsed;
                    }

                    yield return item;
                }
            }
        }
// Example #3 (stray scrape artifact from the code-sharing site; commented out
// because raw text between methods does not compile)
// 0
        /// <summary>
        /// Scrapes a single post page into an <see cref="IContentItem"/>:
        /// title, post date, reported size and all download links found in the
        /// "post-contents" section.
        /// </summary>
        /// <param name="url">Absolute URI of the post page to fetch and parse.</param>
        private async Task <IContentItem> ParsePageContentsAsync(Uri url)
        {
            // The timeout here is 3 hours, why? Based on my monitoring of the website,
            // it doesn't update too often - the individual pages even less often - therefore a 3 hour timeout
            // Should be a good balance. Also reduces hammering the poor, useful website.
            var contentItem = ContentItemStore.GetOrCreateBySourceUrl <ContentItem>(url, timeout: TimeSpan.FromHours(3));

            var httpResponse = await httpClient.GetAsync(url);

            // Fail fast on a non-2xx response instead of scraping an error page.
            httpResponse.EnsureSuccessStatusCode();

            var document = new HtmlDocument();
            document.LoadHtml(await httpResponse.Content.ReadAsStringAsync());

            var root = document.DocumentNode;

            // Guard every scrape below: FirstOrDefault() returns null when the page
            // layout changes, and the original dereferenced those results
            // unconditionally (NullReferenceException).
            var titleNode = root.Descendants().FirstOrDefault(node => node.HasClass("post-title"));
            if (titleNode != null)
            {
                contentItem.Title = HttpUtility.HtmlDecode(titleNode.InnerText);
            }

            var dateText = root.Descendants()
                           .Where(node => node.HasClass("post-date"))
                           .SelectMany(node => node.Descendants("time"))
                           .Select(node => node.GetAttributeValue("datetime", ""))
                           .FirstOrDefault();

            // TryParse keeps a malformed/missing date from throwing, matching the
            // tolerant date handling used by ListDirectoryAsync.
            if (DateTime.TryParse(dateText, out DateTime datePosted))
            {
                contentItem.DatePosted = datePosted;
            }

            contentItem.Source = url;

            var sizeNode = root.Descendants()
                           .FirstOrDefault(node => node.InnerText.StartsWith("Size", StringComparison.OrdinalIgnoreCase));
            // ParseFromSuffixedString tolerates null, so '?.' is safe here.
            contentItem.ReportedSizeInBytes = FileSizeUtilities.ParseFromSuffixedString(sizeNode?.NextSibling?.InnerText);

            // Filter down to relevant links based on what I deduced from looking at the page source
            var linkNodes = root.Descendants("section")
                            .Where(child => child.HasClass("post-contents"))
                            .SelectMany(node => node.Descendants("div"))
                            .Where(node => node.HasClass("aio-pulse"))
                            .SelectMany(node => node.Descendants("a"));

            foreach (var linkNode in linkNodes)
            {
                // Skip anchors without a usable href. The original substituted the
                // placeholder "<could not scrape link>", which made new Uri(...)
                // throw UriFormatException; TryCreate skips bad links instead.
                var link = linkNode.Attributes["href"]?.Value;
                if (Uri.TryCreate(link, UriKind.Absolute, out Uri linkUri))
                {
                    contentItem.DownloadLinks.Add(linkUri);
                }
            }

            return contentItem;
        }