/// <summary>
/// Downloads the page at <c>book.Url</c> and fills in the book's duration, online-text link
/// and chapter list from it.
/// </summary>
/// <param name="book">Book that contains the url. Its <c>Chapters</c> list is replaced.</param>
/// <param name="htmlParser">HTML parser.</param>
/// <param name="webClient">Web page downloader.</param>
private static void LoadBookMeta(Book book, HtmlParser htmlParser, HtmlWeb webClient)
{
    book.Chapters = new List<Chapter>();

    // This method is synchronous, so the asynchronous download is waited on by blocking.
    string innerHtml = webClient.LoadFromWebAsync(book.Url).GetAwaiter().GetResult().DocumentNode.InnerHtml;
    IHtmlDocument document = htmlParser.Parse(innerHtml);

    // The duration is taken from the fifth <dd> element; leave it untouched when the page has fewer.
    string[] bookInfoSide = document.QuerySelectorAll("dd").Select(x => x.TextContent).ToArray();
    if (bookInfoSide.Length > 4)
    {
        book.Duration = bookInfoSide[4];
    }

    var bookTextNode = document.QuerySelectorAll("a")
        .FirstOrDefault(x => string.Equals(x.TextContent, "online text", System.StringComparison.OrdinalIgnoreCase));
    if (bookTextNode != null)
    {
        book.OnlineText = bookTextNode.GetAttribute("href");
    }

    // Without the chapter table (or its header/body) there is nothing to parse; keep the empty list.
    var chapterTable = document.QuerySelector("table.chapter-download");
    if (chapterTable == null)
    {
        return;
    }

    var tableHead = chapterTable.QuerySelector("thead");
    var tableBody = chapterTable.QuerySelector("tbody");
    if (tableHead == null || tableBody == null)
    {
        return;
    }

    // Columns are located by header text, so any of them may be absent (index -1).
    List<string> columns = tableHead.QuerySelectorAll("th").Select(x => x.TextContent).ToList();
    int chapterIndex = FindColumnIndex(columns, "chapter");
    int sectionIndex = FindColumnIndex(columns, "section");
    int readerIndex = FindColumnIndex(columns, "reader");
    int durationIndex = FindColumnIndex(columns, "time");
    int sourceTextIndex = FindColumnIndex(columns, "source");

    foreach (var chapterNode in tableBody.QuerySelectorAll("tr"))
    {
        AngleSharp.Dom.IElement[] cells = chapterNode.QuerySelectorAll("td").ToArray();

        AngleSharp.Dom.IElement chapterCell = GetCellOrNull(cells, chapterIndex);
        AngleSharp.Dom.IElement sectionCell = GetCellOrNull(cells, sectionIndex);
        AngleSharp.Dom.IElement readerCell = GetCellOrNull(cells, readerIndex);
        AngleSharp.Dom.IElement durationCell = GetCellOrNull(cells, durationIndex);
        AngleSharp.Dom.IElement sourceCell = GetCellOrNull(cells, sourceTextIndex);

        AngleSharp.Dom.IElement chapterLink = chapterCell != null ? chapterCell.QuerySelector("a") : null;
        AngleSharp.Dom.IElement readerLink = readerCell != null ? readerCell.QuerySelector("a") : null;
        AngleSharp.Dom.IElement sourceLink = sourceCell != null ? sourceCell.QuerySelector("a") : null;

        // Prefer the link inside the source cell; fall back to an href on the cell itself,
        // which is what was read previously.
        string textSource = string.Empty;
        if (sourceCell != null)
        {
            textSource = sourceLink != null ? sourceLink.GetAttribute("href") : sourceCell.GetAttribute("href");
        }

        book.Chapters.Add(new Chapter
        {
            AudioLink = chapterLink != null ? chapterLink.GetAttribute("href") : string.Empty,
            Section = sectionCell != null ? sectionCell.TextContent : string.Empty,
            Duration = durationCell != null ? durationCell.TextContent : string.Empty,
            Name = chapterLink != null ? chapterLink.TextContent : string.Empty,
            Reader = readerLink != null ? readerLink.TextContent : "",
            TextSource = textSource
        });
    }
}

/// <summary>
/// Returns the index of the first column header that contains <paramref name="keyword"/>
/// (case-insensitive), or -1 when no header matches.
/// </summary>
private static int FindColumnIndex(List<string> columns, string keyword)
{
    return columns.FindIndex(x => x.IndexOf(keyword, System.StringComparison.OrdinalIgnoreCase) >= 0);
}

/// <summary>
/// Returns the cell at <paramref name="index"/>, or null when the column is absent (-1)
/// or the row has fewer cells than the header.
/// </summary>
private static AngleSharp.Dom.IElement GetCellOrNull(AngleSharp.Dom.IElement[] cells, int index)
{
    return index >= 0 && index < cells.Length ? cells[index] : null;
}
/// <summary>
/// Collects metadata text for a package: the contents of every <c>.dsc</c> file linked from the
/// package's base URLs or, when none was found, the pages linked from a search on
/// <c>ENV_UBUNTU_ENDPOINT</c>.
/// </summary>
/// <param name="purl">Package URL to look up.</param>
/// <param name="useCache">Forwarded to the HTTP helper for every request made here.</param>
/// <returns>
/// The concatenated metadata (possibly empty), or <c>null</c> when <paramref name="purl"/> or
/// its name is <c>null</c>.
/// </returns>
public override async Task<string?> GetMetadataAsync(PackageURL purl, bool useCache = true)
{
    Logger.Trace("GetMetadata {0}", purl?.ToString());
    if (purl == null || purl.Name == null)
    {
        return null;
    }

    StringBuilder metadataContent = new();
    HttpClient httpClient = CreateHttpClient();

    foreach (string distroUrlPrefix in GetBaseURLs(purl))
    {
        try
        {
            string? html = await GetHttpStringCache(httpClient, distroUrlPrefix, useCache: useCache, neverThrow: true);
            if (html == null)
            {
                continue;
            }

            AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
            foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
            {
                // Anchors without an href used to throw here, which aborted the rest of the listing.
                string? anchorHref = anchor.GetAttribute("href");
                if (anchorHref == null || !anchorHref.EndsWith(".dsc", StringComparison.Ordinal))
                {
                    continue;
                }

                Logger.Debug("Found a .dsc file: {0}", anchorHref);
                string? dscContent = await GetHttpStringCache(httpClient, distroUrlPrefix + anchorHref, useCache: useCache, neverThrow: true);
                if (dscContent != null)
                {
                    metadataContent.AppendLine(dscContent);
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug("Error obtaining .dsc file for {0}: {1}", purl.ToString(), ex.Message);
        }
    }

    // Fallback to the package search only after every base URL has been checked, so search
    // results are never mixed with .dsc content found under a later prefix.
    if (metadataContent.Length == 0)
    {
        try
        {
            // The name is escaped because package names may contain characters such as '+'.
            string searchUrl = $"{ENV_UBUNTU_ENDPOINT}/search?keywords={Uri.EscapeDataString(purl.Name)}&searchon=names&exact=1&suite=all&section=all";
            string? searchResults = await GetHttpStringCache(httpClient, searchUrl, useCache);
            if (searchResults != null)
            {
                AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(searchResults);
                foreach (AngleSharp.Dom.IElement resultLink in document.QuerySelectorAll("a.resultlink"))
                {
                    // A missing or empty href would only re-fetch the endpoint root; skip it.
                    string? metadataUrl = resultLink.GetAttribute("href");
                    if (string.IsNullOrEmpty(metadataUrl))
                    {
                        continue;
                    }

                    string? metadata = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/{metadataUrl}", useCache);
                    if (metadata != null)
                    {
                        metadataContent.AppendLine(metadata);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug(ex, "Error fetching Ubuntu metadata: {0}", ex.Message);
        }
    }

    return metadataContent.ToString();
}
/// <summary>
/// Handles a completed page crawl: records the page as crawled and, for pages that returned
/// HTTP 200, optionally extracts meta data (when <c>IncludeMetaData</c> is set) and the links
/// on the page (when <c>ExtractLinks</c> is set).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments carrying the crawled page.</param>
private void crawler_CrawlerComplete(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage page = e.CrawledPage;
    string currentURL = page.Uri.ToString();

    // A page without any response object is reported as an error instead of dereferencing null.
    bool failed = page.WebException != null
        || page.HttpWebResponse == null
        || page.HttpWebResponse.StatusCode != System.Net.HttpStatusCode.OK;

    if (failed)
    {
        Console.WriteLine("## Error on {0}", currentURL);
        Console.WriteLine();
        AddToCrawledPages(currentURL);
        return;
    }

    Console.WriteLine("Crawl OK: {0}", currentURL);
    AddToCrawledPages(currentURL);

    if (IncludeMetaData)
    {
        Console.Write("Extracting meta: ");

        // Only URLs accepted by IsPageUrl get meta data extracted.
        if (IsPageUrl(currentURL))
        {
            MetaData.Add(ExtractMetaDataFromPage(page.AngleSharpHtmlDocument, currentURL));
            Console.WriteLine(" OK");
        }
        else
        {
            Console.WriteLine(" NO VALID PAGE");
        }
    }

    if (ExtractLinks)
    {
        // Indexer assignment: a URL reported twice overwrites its entry instead of throwing.
        this.PagesCrawledLinks[currentURL] = ExtractLinksFromPage(page.AngleSharpHtmlDocument, page.Uri);
    }

    Console.WriteLine();
}

/// <summary>
/// Reads the title and the "description"/"keywords" meta tags of a page, writing a progress
/// marker to the console for each value found.
/// </summary>
private static Models.MetaData ExtractMetaDataFromPage(AngleSharp.Dom.Html.IHtmlDocument htmlPage, string url)
{
    Models.MetaData metaDataDTO = new Models.MetaData();
    metaDataDTO.Url = url;

    AngleSharp.Dom.IElement titleElement = htmlPage.QuerySelector("title");
    metaDataDTO.Title = (titleElement != null) ? titleElement.TextContent : "";
    Console.Write("title ");

    foreach (AngleSharp.Dom.IElement item in htmlPage.QuerySelectorAll("meta"))
    {
        // GetAttribute returns null for a missing attribute, so incomplete tags are skipped.
        string name = item.GetAttribute("name");
        string content = item.GetAttribute("content");
        if (name == null || content == null)
        {
            continue;
        }

        // Meta names are matched case-insensitively ("Description" is common in the wild).
        if (string.Equals(name, "description", StringComparison.OrdinalIgnoreCase))
        {
            metaDataDTO.Description = content;
            Console.Write("description ");
        }
        else if (string.Equals(name, "keywords", StringComparison.OrdinalIgnoreCase))
        {
            metaDataDTO.Keywords = content;
            Console.Write("keywords ");
        }
    }

    return metaDataDTO;
}

/// <summary>
/// Collects the http/https links of all anchors on the page, restricted to anchors below
/// <c>ClassSelector</c> when that prefix is set.
/// </summary>
private List<string> ExtractLinksFromPage(AngleSharp.Dom.Html.IHtmlDocument htmlPage, Uri pageUri)
{
    string selector = "a";
    if (!String.IsNullOrWhiteSpace(ClassSelector))
    {
        selector = ClassSelector + " " + selector;
    }

    List<string> linksFound = new List<string>();
    foreach (AngleSharp.Dom.IElement link in htmlPage.QuerySelectorAll(selector))
    {
        string resolved = ResolveLink(pageUri, link.GetAttribute("href"));
        if (resolved != null)
        {
            linksFound.Add(resolved);
        }
    }

    return linksFound;
}

/// <summary>
/// Turns an href into an absolute http/https URL, resolving relative values against the page
/// URI. Returns null for empty values, unparsable values and other schemes
/// (javascript:, mailto:, tel:, ...).
/// </summary>
private static string ResolveLink(Uri pageUri, string href)
{
    if (String.IsNullOrWhiteSpace(href))
    {
        return null;
    }

    string trimmed = href.Trim();

    // Absolute web links are passed through unchanged.
    if (trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return trimmed;
    }

    Uri resolved;
    if (!Uri.TryCreate(pageUri, trimmed, out resolved))
    {
        return null;
    }

    // Checking the resolved scheme rejects javascript:/mailto:/tel: without rejecting
    // ordinary URLs that merely contain those words.
    if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
    {
        return null;
    }

    return resolved.AbsoluteUri;
}