예제 #1
0
        /// <summary>
        /// Downloads the page at the book's URL and fills in the book's duration,
        /// online-text link and chapter list from the parsed HTML.
        /// </summary>
        /// <param name="book">Book that contains the url; its Chapters, Duration and OnlineText are populated.</param>
        /// <param name="htmlParser">HTML parser.</param>
        /// <param name="webClient">Web page downloader.</param>
        private static void LoadBookMeta(Book book, HtmlParser htmlParser, HtmlWeb webClient)
        {
            book.Chapters = new List<Chapter>();

            // The method is synchronous, so the asynchronous download is awaited by blocking.
            string        innerHtml = webClient.LoadFromWebAsync(book.Url).GetAwaiter().GetResult().DocumentNode.InnerHtml;
            IHtmlDocument document  = htmlParser.Parse(innerHtml);

            // The duration is taken from the fifth <dd> on the page; leave it untouched
            // when the page has fewer entries instead of throwing.
            string[] bookInfoSide = document.QuerySelectorAll("dd").Select(x => x.TextContent).ToArray();
            if (bookInfoSide.Length > 4)
            {
                book.Duration = bookInfoSide[4];
            }

            var bookTextNode = document.QuerySelectorAll("a")
                               .FirstOrDefault(x => string.Equals(x.TextContent, "online text", System.StringComparison.OrdinalIgnoreCase));
            if (bookTextNode != null)
            {
                book.OnlineText = bookTextNode.GetAttribute("href");
            }

            // Without the chapter table (or its head/body) there is nothing to list.
            var chapterTable = document.QuerySelector("table.chapter-download");
            if (chapterTable == null)
            {
                return;
            }

            var tableHead = chapterTable.QuerySelector("thead");
            var tableBody = chapterTable.QuerySelector("tbody");
            if (tableHead == null || tableBody == null)
            {
                return;
            }

            // Column positions are discovered from the header texts; -1 means "column not present".
            List<string> columns = tableHead.QuerySelectorAll("th").Select(x => x.TextContent).ToList();

            int chapterIndex    = FindColumnIndex(columns, "chapter");
            int sectionIndex    = FindColumnIndex(columns, "section");
            int readerIndex     = FindColumnIndex(columns, "reader");
            int durationIndex   = FindColumnIndex(columns, "time");
            int sourceTextIndex = FindColumnIndex(columns, "source");

            foreach (var chapterNode in tableBody.QuerySelectorAll("tr"))
            {
                AngleSharp.Dom.IElement[] cells = chapterNode.QuerySelectorAll("td").ToArray();
                if (cells.Length == 0)
                {
                    // Rows without data cells carry no chapter information.
                    continue;
                }

                AngleSharp.Dom.IElement chapterCell = GetCellOrNull(cells, chapterIndex);
                AngleSharp.Dom.IElement chapterLink = chapterCell != null ? chapterCell.QuerySelector("a") : null;

                AngleSharp.Dom.IElement readerCell = GetCellOrNull(cells, readerIndex);
                AngleSharp.Dom.IElement readerLink = readerCell != null ? readerCell.QuerySelector("a") : null;

                // Prefer an href on the cell itself (original behaviour) and fall back to the
                // first link inside the cell.
                // NOTE(review): assumes the source link is an anchor inside the cell - confirm against the page markup.
                AngleSharp.Dom.IElement sourceCell = GetCellOrNull(cells, sourceTextIndex);
                string textSource = string.Empty;
                if (sourceCell != null)
                {
                    textSource = sourceCell.GetAttribute("href");
                    if (textSource == null)
                    {
                        AngleSharp.Dom.IElement sourceLink = sourceCell.QuerySelector("a");
                        textSource = sourceLink != null ? sourceLink.GetAttribute("href") : null;
                    }
                }

                book.Chapters.Add(new Chapter
                {
                    AudioLink  = chapterLink != null ? chapterLink.GetAttribute("href") : string.Empty,
                    Section    = GetTextOrEmpty(GetCellOrNull(cells, sectionIndex)),
                    Duration   = GetTextOrEmpty(GetCellOrNull(cells, durationIndex)),
                    Name       = GetTextOrEmpty(chapterLink),
                    Reader     = GetTextOrEmpty(readerLink),
                    TextSource = textSource
                });
            }
        }

        /// <summary>Returns the index of the first header containing <paramref name="keyword"/> (case-insensitive), or -1.</summary>
        private static int FindColumnIndex(List<string> columns, string keyword)
        {
            return columns.FindIndex(x => x.IndexOf(keyword, System.StringComparison.OrdinalIgnoreCase) >= 0);
        }

        /// <summary>Returns the cell at <paramref name="index"/>, or null when the index is outside the row.</summary>
        private static AngleSharp.Dom.IElement GetCellOrNull(AngleSharp.Dom.IElement[] cells, int index)
        {
            return index >= 0 && index < cells.Length ? cells[index] : null;
        }

        /// <summary>Returns the element's text content, or an empty string when the element is null.</summary>
        private static string GetTextOrEmpty(AngleSharp.Dom.IElement element)
        {
            return element != null ? element.TextContent : string.Empty;
        }
예제 #2
0
        /// <summary>
        /// Collects metadata text for the given package: the contents of every <c>.dsc</c> file
        /// linked from the package's base URLs, or, when none were collected, the pages linked
        /// from a package-name search on the Ubuntu packages endpoint.
        /// </summary>
        /// <param name="purl">Package to look up; a null package or null name yields null.</param>
        /// <param name="useCache">Passed through to every HTTP fetch made by this method.</param>
        /// <returns>The concatenated metadata (possibly empty), or null when no package name is available.</returns>
        public override async Task<string?> GetMetadataAsync(PackageURL purl, bool useCache = true)
        {
            Logger.Trace("GetMetadata {0}", purl?.ToString());

            if (purl == null || purl.Name == null)
            {
                return null;
            }

            StringBuilder metadataContent = new();
            HttpClient    httpClient      = CreateHttpClient();

            foreach (string distroUrlPrefix in GetBaseURLs(purl))
            {
                try
                {
                    string? html = await GetHttpStringCache(httpClient, distroUrlPrefix, useCache: useCache, neverThrow: true);

                    if (html != null)
                    {
                        AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
                        foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
                        {
                            // Anchors without an href must be skipped individually; dereferencing null
                            // here would throw and abandon the rest of this listing.
                            string? anchorHref = anchor.GetAttribute("href");
                            if (anchorHref is null || !anchorHref.EndsWith(".dsc", StringComparison.Ordinal))
                            {
                                continue;
                            }

                            Logger.Debug("Found a .dsc file: {0}", anchorHref);
                            string? dscContent = await GetHttpStringCache(httpClient, distroUrlPrefix + anchorHref, useCache: useCache, neverThrow: true);

                            if (dscContent == null)
                            {
                                continue;
                            }
                            metadataContent.AppendLine(dscContent);
                        }
                    }
                }
                catch (Exception ex)
                {
                    Logger.Debug("Error obtaining .dsc file for {0}: {1}", purl.ToString(), ex.Message);
                }

                // Fallback to packages.ubuntu.com if we haven't seen any .dsc files.
                // This check runs after each base URL, so it is retried until something is collected.
                if (metadataContent.Length == 0)
                {
                    try
                    {
                        // Escape the name: characters such as '+' (e.g. "g++") are otherwise
                        // decoded as spaces in a query string.
                        string keywords = Uri.EscapeDataString(purl.Name);
                        string? searchResults = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/search?keywords={keywords}&searchon=names&exact=1&suite=all&section=all", useCache);

                        if (searchResults != null)
                        {
                            HtmlParser parser = new();
                            AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(searchResults);

                            AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> anchorItems = document.QuerySelectorAll("a.resultlink");
                            IEnumerable<string> metadataUrlList = anchorItems.Select(s => s.GetAttribute("href") ?? "");

                            foreach (string metadataUrl in metadataUrlList)
                            {
                                metadataContent.AppendLine(await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/{metadataUrl}", useCache: useCache));
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        Logger.Debug(ex, "Error fetching Ubuntu metadata: {0}", ex.Message);
                    }
                }
            }

            return metadataContent.ToString();
        }
예제 #3
0
        /// <summary>
        /// Handles a completed page crawl: records the page as crawled, reports the result on the
        /// console and, depending on the IncludeMetaData / ExtractLinks switches, stores the page's
        /// meta data and the links found on it.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data carrying the crawled page.</param>
        private void crawler_CrawlerComplete(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage page       = e.CrawledPage;
            string      currentURL = page.Uri.ToString();

            // A page without a response object is treated like any other failed request
            // rather than being dereferenced.
            bool failed = page.WebException != null
                          || page.HttpWebResponse == null
                          || page.HttpWebResponse.StatusCode != System.Net.HttpStatusCode.OK;
            if (failed)
            {
                Console.WriteLine("## Error on {0}", currentURL);
                Console.WriteLine();
                AddToCrawledPages(currentURL);
                return;
            }

            Console.WriteLine("Crawl OK: {0}", currentURL);
            AddToCrawledPages(currentURL);

            if (IncludeMetaData)
            {
                Console.Write("Extracting meta: ");
                // Only URLs that IsPageUrl accepts are inspected for meta data.
                if (IsPageUrl(currentURL))
                {
                    MetaData.Add(ReadPageMetaData(page.AngleSharpHtmlDocument, currentURL));
                    Console.WriteLine("     OK");
                }
                else
                {
                    Console.WriteLine("     NO VALID PAGE");
                }
            }

            if (ExtractLinks)
            {
                // store in dictionary
                this.PagesCrawledLinks.Add(currentURL, CollectPageLinks(page));
            }

            // new line
            Console.WriteLine();
        }

        /// <summary>Reads the title plus the "description" and "keywords" meta tags of a page, echoing each found part to the console.</summary>
        private static Models.MetaData ReadPageMetaData(AngleSharp.Dom.Html.IHtmlDocument htmlPage, string url)
        {
            Models.MetaData metaDataDTO = new Models.MetaData();
            metaDataDTO.Url = url;

            AngleSharp.Dom.IElement titleElement = htmlPage.QuerySelector("title");
            metaDataDTO.Title = (titleElement != null) ? titleElement.TextContent : "";
            Console.Write("title ");

            foreach (AngleSharp.Dom.IElement item in htmlPage.QuerySelectorAll("meta"))
            {
                string metaName = item.GetAttribute("name");
                string content  = item.GetAttribute("content");

                // A meta tag without a name or without a content attribute carries no value to store.
                if (metaName == null || content == null)
                {
                    continue;
                }

                if (metaName == "description")
                {
                    metaDataDTO.Description = content;
                    Console.Write("description ");
                }
                if (metaName == "keywords")
                {
                    metaDataDTO.Keywords = content;
                    Console.Write("keywords ");
                }
            }

            return metaDataDTO;
        }

        /// <summary>Returns the absolute URLs of all anchors on the page, restricted to ClassSelector when it is set.</summary>
        private List<string> CollectPageLinks(CrawledPage page)
        {
            // add prefix if it is set
            string selector = "a";
            if (!String.IsNullOrWhiteSpace(ClassSelector))
            {
                selector = ClassSelector + " " + selector;
            }

            List<string> linksFound = new List<string>();
            foreach (AngleSharp.Dom.IElement link in page.AngleSharpHtmlDocument.QuerySelectorAll(selector))
            {
                string resolved = ResolveCrawledLink(page.Uri, link.GetAttribute("href"));
                if (!string.IsNullOrWhiteSpace(resolved))
                {
                    linksFound.Add(resolved);
                }
            }

            return linksFound;
        }

        /// <summary>
        /// Turns an href value into an absolute URL, or returns null for missing hrefs,
        /// javascript:/mailto:/tel: links and values that cannot be resolved.
        /// </summary>
        private static string ResolveCrawledLink(Uri pageUri, string href)
        {
            if (href == null)
            {
                return null;
            }

            // dismiss non valid values
            if (href.Contains("javascript:") || href.Contains("mailto:") || href.Contains("tel:"))
            {
                return null;
            }

            // Absolute http(s) links are kept exactly as written.
            if (href.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || href.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                return href;
            }

            // Everything else is resolved against the page URL, so root-relative ("/a"),
            // page-relative ("a.html") and protocol-relative ("//host/a") links all come out
            // right; plain string concatenation produced "//" or wrong paths for these.
            Uri resolved;
            return Uri.TryCreate(pageUri, href, out resolved) ? resolved.AbsoluteUri : null;
        }