/// <summary>
        /// Starts loading the document imported by the link element.
        /// See http://www.w3.org/TR/html-imports/#dfn-import-request.
        /// </summary>
        /// <param name="configuration">Configuration passed to the options used for creating the imported document.</param>
        /// <param name="loader">Resource loader used to start the download of the import.</param>
        /// <returns>
        /// A task that finishes once the imported document has been opened. Nothing is downloaded
        /// when the import has been detected as a cycle.
        /// </returns>
        public override async Task LoadAsync(IConfiguration configuration, IResourceLoader loader)
        {
            var link = Link;
            var document = link.Owner;
            var list = ImportLists.GetOrCreateValue(document);
            var location = Url;
            var request = link.CreateRequestFor(location);
            var item = new ImportEntry 
            { 
                Relation = this,
                IsCycle = CheckCycle(document, location)
            };
            _isasync = link.HasAttribute(AttributeNames.Async);
            // The entry is recorded even for cyclic imports; only the download is skipped.
            list.Add(item);
            
            if (!item.IsCycle)
            {
                var nestedStatus = new TaskCompletionSource<Boolean>();
                var download = loader.DownloadAsync(request);
                SetDownload(download);

                await link.ProcessResponse(download, async response =>
                {
                    try
                    {
                        var context = new BrowsingContext(document.Context, Sandboxes.None);
                        var options = new CreateDocumentOptions(response, configuration)
                        {
                            ImportAncestor = document
                        };
                        _import = await context.OpenAsync(options, CancellationToken.None).ConfigureAwait(false);
                        nestedStatus.TrySetResult(true);
                    }
                    catch (Exception ex)
                    {
                        // Without this the completion source would never be completed when opening
                        // the import fails, and the await below would wait forever.
                        nestedStatus.TrySetException(ex);
                    }
                }).ConfigureAwait(false);
                // NOTE(review): if ProcessResponse can return without ever invoking the callback
                // (e.g. on a failed download) this still never completes - confirm against ProcessResponse.
                await nestedStatus.Task.ConfigureAwait(false);
            }
        }
 /// <summary>
 /// Creates the child document from the given response and stores it in ChildDocument.
 /// </summary>
 /// <param name="response">The response to build the document from.</param>
 /// <returns>A task that finishes once ChildDocument has been assigned.</returns>
 protected override async Task ProcessResponseAsync(IResponse response)
 {
     // The child gets its own browsing context derived from the parent document's context.
     var childContext = new BrowsingContext(_parentDocument.Context, Sandboxes.None);
     var creationOptions = new CreateDocumentOptions(response, _configuration, _parentDocument);
     var documentFactory = _configuration.GetFactory<IDocumentFactory>();
     var pendingDocument = documentFactory.CreateAsync(childContext, creationOptions, CancellationToken.None);
     var createdDocument = await pendingDocument.ConfigureAwait(false);
     ChildDocument = createdDocument;
 }
        /// <summary>
        /// Reads the package ids listed under the "NuGet" key of the input document, scrapes the
        /// nuget.org page of each one for download statistics and the version history, and stores
        /// the collected packages in the metadata under "NuGetPackages".
        /// </summary>
        /// <param name="input">Document whose "NuGet" list names the packages to look up.</param>
        /// <param name="context">Execution context used for logging.</param>
        /// <param name="metadata">Receives the "NuGetPackages" entry when at least one package could be read.</param>
        /// <returns>A task that finishes once all listed packages have been processed.</returns>
        private async Task GetProjectNuGetDataAsync(IDocument input, IExecutionContext context, ConcurrentDictionary<string, object> metadata)
        {
            List<Package> packageData = new List<Package>();
            IReadOnlyList<string> packages = input.GetList("NuGet", Array.Empty<string>());

            foreach (string package in packages.Where(x => !string.IsNullOrWhiteSpace(x)))
            {
                context.LogInformation($"Getting NuGet data for {package}");
                try
                {
                    IBrowsingContext browsingContext = BrowsingContext.New(AngleSharpConfig);
                    AngleSharp.Dom.IDocument document = await browsingContext.OpenAsync($"https://www.nuget.org/packages/{package}");

                    // The null check has to come first: reading StatusCode on a missing document
                    // would throw before the dedicated warning could ever be logged.
                    if (document == null)
                    {
                        context.LogWarning($"Could not get document for {package}");
                    }
                    else if (document.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        context.LogWarning($"Bad status code for {package}: {document.StatusCode}");
                    }
                    else
                    {
                        Package data = new Package
                        {
                            Id = package
                        };

                        // Get statistics: the element following the "Statistics" heading holds the numbers.
                        // First(...) throws when the page layout differs; the catch below logs that as a warning.
                        AngleSharp.Dom.IElement statistics = document
                            .QuerySelectorAll(".package-details-info h2")
                            .First(x => x.TextContent == "Statistics")
                            .NextElementSibling;

                        // Each statistic reads "<number> <label>"; only the leading token is kept.
                        data.TotalDownloads = statistics.Children
                            .First(x => x.TextContent.Contains("total downloads"))
                            .TextContent.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];
                        data.PerDayDownloads = statistics.Children
                            .First(x => x.TextContent.Contains("per day"))
                            .TextContent.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];

                        // Get versions: one entry per row of the version history table.
                        data.Versions = document
                            .QuerySelectorAll("#version-history table tbody tr")
                            .Select(x => new PackageVersion(x))
                            .ToList();

                        // Add the data
                        packageData.Add(data);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failure for one package must not stop the remaining ones.
                    context.LogWarning($"Error getting NuGet data for {package}: {ex.Message}");
                }
            }

            if (packageData.Count > 0)
            {
                metadata.TryAdd("NuGetPackages", packageData);
            }
        }
        /// <summary>
        /// Downloads the page behind <c>article.Url</c> and fills the article with the data found there:
        /// author, claims, conclusions, raw HTML/text, dates, title, image, description and links.
        /// </summary>
        /// <param name="article">The article to decorate; its Url is used as the address to load.</param>
        /// <returns>The same article instance with the scraped values assigned.</returns>
        public static async Task<Article> DecorateSingleArticle(Article article)
        {
            var config = Configuration.Default.WithDefaultLoader();

            IDocument document = await BrowsingContext.New(config).OpenAsync(article.Url);

            // A yyyy/MM/dd fragment in the URL serves as a fallback publish date; it is only
            // replaced further below when the page carries a parsable published-time meta tag.
            Match match = Regex.Match(article.Url, @"\d{4}\/\d{2}\/\d{2}", RegexOptions.IgnoreCase);

            if (match.Success && DateTime.TryParse(match.Groups[0].Value, out var urlDate))
            {
                article.ExactPublishDate = urlDate;
            }

            var claimCssSelector      = ".Claim >p";
            var authorCssSelector     = "a.author";
            var blogDateCssSelector   = ".blog-date";
            var conclusionCssSelector = ".conclusion >p";
            var articleCssSelector    = ".main-content-body .type-post";
            var allLinksSelector      = ".main-content-body .type-post a";

            var claimsResults     = document.QuerySelectorAll(claimCssSelector);
            var conclusionResults = document.QuerySelectorAll(conclusionCssSelector);
            var articleResult     = document.QuerySelector(articleCssSelector);
            var authorResult      = document.QuerySelector(authorCssSelector);
            var blogDateResult    = document.QuerySelector(blogDateCssSelector);

            // The meta tags of the head are queried once and searched for each property of interest.
            var metaTags             = document.Head.QuerySelectorAll("meta");
            var modifiedDateTime     = metaTags.FirstOrDefault(x => FindPartialWordMatch("article:modified_time", x.OuterHtml));
            var publishedDateTime    = metaTags.FirstOrDefault(x => FindPartialWordMatch("article:published_time", x.OuterHtml));
            var articleTitleResult   = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:title", x.OuterHtml));
            var imageUrlResult       = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:image", x.OuterHtml));
            var descriptionUrlResult = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:description", x.OuterHtml));

            // Links whose markup is exactly the source marker are proof links; all others are hoax sources.
            var allLinks       = document.QuerySelectorAll(allLinksSelector);
            var sourceLinks    = allLinks.Where(x => x.InnerHtml?.ToLower() == "πηγή");
            var restOfTheLinks = allLinks.Where(x => x.InnerHtml?.ToLower() != "πηγή");

            article.Author         = authorResult?.InnerHtml;
            article.Claim          = claimsResults?.Select(x => x.InnerHtml).ToList();
            article.Result         = conclusionResults?.Select(x => x.InnerHtml).ToList();
            article.RawArticleHtml = articleResult?.InnerHtml;
            article.RawText        = articleResult?.Text();
            article.PublishDate    = blogDateResult?.InnerHtml;

            // ModifiedDate keeps its previous behavior: default(DateTime) when the tag is missing or unparsable.
            DateTime.TryParse(modifiedDateTime?.GetAttribute("Content"), out var modifiedDateValue);
            article.ModifiedDate = modifiedDateValue;

            // Only a successfully parsed meta value may replace the publish date; otherwise the
            // URL-derived date from above would be wiped out by default(DateTime).
            if (DateTime.TryParse(publishedDateTime?.GetAttribute("Content"), out var publishedDateValue))
            {
                article.ExactPublishDate = publishedDateValue;
            }

            article.GreekHoaxTitle       = articleTitleResult?.GetAttribute("Content");
            article.ImageUrl             = imageUrlResult?.GetAttribute("Content");
            article.GreekHoaxDescription = descriptionUrlResult?.GetAttribute("Content");
            article.ProofLinks           = sourceLinks?.Select(x => x.GetAttribute("href")).ToList();
            article.HoaxSourceLinks      = restOfTheLinks?.Select(x => x.GetAttribute("href")).ToList();
            return article;
        }
// Example #5
// 0
        /// <summary>
        /// Loads the page at the given address and extracts the image URL together with the text of
        /// the taxonomy, habitat, niche, biology and behavior sections.
        /// </summary>
        /// <param name="address">
        /// Address of the page. The fifth '/'-separated segment is used as the name (after removing
        /// "_Ecology", replacing underscores with spaces and URL-decoding), so it must be present.
        /// </param>
        /// <returns>
        /// <c>{ name, image_url, taxonomy, habitat, niche, biology, behavior }</c> when every section
        /// was found; otherwise the single-element array <c>{ "ERROR" }</c>.
        /// </returns>
        public async Task<string[]> GetInfo(string address)
        {
            string name = System.Web.HttpUtility.UrlDecode(address.Split('/')[4].Replace("_Ecology", "").Replace("_", " "));

            Console.WriteLine(String.Format("Attempting to write data for {0}.", name));
            var config  = Configuration.Default.WithDefaultLoader();
            var context = BrowsingContext.New(config);
            var page    = await context.OpenAsync(address);

            // Text of the element that follows the parent of the anchor; null when any step is missing.
            string SectionText(string selector)
            {
                return page.QuerySelector(selector)?.ParentElement?.NextElementSibling?.TextContent;
            }

            // Image nested two levels below the element that follows the anchor's parent.
            // Returns null when the structure is not as expected or no usable URL is available,
            // so the caller can move on to the next candidate instead of relying on exceptions.
            string SectionImage(string selector)
            {
                var image = page.QuerySelector(selector)?.ParentElement?.NextElementSibling?
                                .FirstElementChild?.FirstElementChild as IHtmlImageElement;
                var source = image?.Source;

                if (source == null)
                {
                    return null;
                }

                // An inline data URI means the usable address is kept in the data-src attribute instead.
                return source.Contains("data:image") ? image.GetAttribute("data-src") : source;
            }

            string image_url = SectionImage("#In-Game_Information")
                               ?? SectionImage("#In-Game_Description")
                               ?? "https://vignette.wikia.nocookie.net/monsterhunter/images/2/2e/MHFU-Question_Mark_Icon.png/revision/latest?cb=20100610145952";

            // The taxonomy can contain list elements, so it needs to be handled separately.
            var tax = page.QuerySelector("#Taxonomy");
            var ul  = tax?.ParentElement?.QuerySelector("ul");

            string taxonomy = ul != null
                ? String.Join('\n', ul.QuerySelectorAll("li").Select(li => li.TextContent))
                : tax?.ParentElement?.NextElementSibling?.TextContent;
            string habitat = SectionText("#Habitat_Range");
            string niche   = SectionText("#Ecological_Niche");
            // The heading is misspelled on some pages, hence the second lookup.
            string biology  = SectionText("#Biological_Adaptations") ?? SectionText("#Biological_Adaptions");
            string behavior = SectionText("#Behavior");

            if (taxonomy == null || habitat == null || niche == null || biology == null || behavior == null)
            {
                Console.WriteLine("Either " + name + " is not a monster or some spelling error prevented the data from being accessed.");
                return new string[] { "ERROR" };
            }

            return new string[] { name, image_url, taxonomy, habitat, niche, biology, behavior };
        }
        /// <summary>
        /// Crawls the external URLs of every category, adds a post for each listing found and
        /// records the progress as "scheduler" event log entries.
        /// </summary>
        /// <remarks>
        /// A failure while crawling one category is logged and recorded, then the next category is
        /// processed. Any other failure is logged and ends the run without throwing.
        /// </remarks>
        /// <returns>A task that finishes once all categories have been processed.</returns>
        private async Task GetContent()
        {
            try
            {
                using (var scope = _scopeFactory.CreateScope())
                {
                    var      dbContext = scope.ServiceProvider.GetRequiredService<ApplicationDbContext>();
                    DateTime startTime = DateTime.Now;
                    // The configuration does not depend on the URL, so it is created once for the whole run.
                    var config = Configuration.Default.WithDefaultLoader();

                    dbContext.EventLogs.Add(NewSchedulerEvent("Start. Crawling content: "));
                    foreach (Category c in categories)
                    {
                        if (string.IsNullOrEmpty(c.ExternalUrl))
                        {
                            continue;
                        }
                        dbContext.EventLogs.Add(NewSchedulerEvent("Start. Crawling for category: " + c.Name + " from " + c.ExternalUrl));

                        try
                        {
                            int      categoryId = c.Id;
                            string[] urls       = c.ExternalUrl.SplitCommas();
                            foreach (string url in urls)
                            {
                                var context  = BrowsingContext.New(config);
                                var document = await context.OpenAsync(url);

                                // Selector syntax reference: https://www.w3schools.com/cssref/css_selectors.asp
                                var selectedItems = document.QuerySelectorAll("table.listingsTBL td");
                                foreach (var item in selectedItems)
                                {
                                    // Only cells that carry an image inside the date block are turned into posts.
                                    if (item.QuerySelector("div.ListingNewNDate>img") == null)
                                    {
                                        continue;
                                    }

                                    // A cell without a title link is skipped instead of failing the whole category.
                                    var titleLink = item.QuerySelector("div.ListingDescription>a");
                                    if (titleLink == null)
                                    {
                                        continue;
                                    }

                                    Post model = new Post
                                    {
                                        OwnerId      = _userId,
                                        CreatedDate  = DateTime.Now,
                                        ModifiedDate = DateTime.Now,
                                        Title        = titleLink.Text().Trim(),
                                        Content      = null,
                                        CityId       = null,
                                        CategoryId   = categoryId,
                                        PostalCode   = null,
                                        ContactEmail = null,
                                        ContactPhone = null
                                    };

                                    dbContext.Posts.Add(model);
                                }
                            }

                            dbContext.EventLogs.Add(NewSchedulerEvent("Done. Crawling for category: " + c.Name + " from " + c.ExternalUrl));
                            await dbContext.SaveChangesAsync();
                        }
                        catch (Exception ex1)
                        {
                            // Keep the cause of the failure; the event log entry alone does not carry it.
                            _logger.LogError(ex1.ToString());

                            // NOTE(review): posts added before the failure are presumably still tracked and
                            // get saved together with this entry - confirm that this is intended.
                            dbContext.EventLogs.Add(NewSchedulerEvent("Error. Crawling for category: " + c.Name + " from " + c.ExternalUrl));
                            await dbContext.SaveChangesAsync();
                        }
                    }

                    dbContext.EventLogs.Add(NewSchedulerEvent("End. Crawling content: " + (DateTime.Now - startTime).TotalSeconds + " (s)"));
                    await dbContext.SaveChangesAsync();
                }
            }
            catch (Exception ex)
            {
                _logger.LogError(ex.ToString());
            }
        }

        /// <summary>
        /// Creates an event log entry of type "scheduler" with the given message, stamped with the current local time.
        /// </summary>
        private static EventLog NewSchedulerEvent(string message)
        {
            return new EventLog
            {
                Message     = message,
                EventType   = "scheduler",
                CreatedDate = DateTime.Now
            };
        }
 /// <summary>
 /// Creates a scraper whose browsing context is built from the default configuration
 /// extended with the default loader.
 /// </summary>
 public Scraper()
 {
     var loaderConfiguration = Configuration.Default.WithDefaultLoader();

     // The context is created from the very configuration that is kept in the field.
     _config  = loaderConfiguration;
     _context = BrowsingContext.New(_config);
 }
// Example #8
// 0
        private static async Task <Bike> GetBikeInfo(BikesUrl b)
        {
            // Get DOM
            var context  = BrowsingContext.New(Configuration.Default);
            var document = await context.OpenAsync(async req => req.Content(await GetContent("https://www.bikebd.com/bikes/hero-achiever-150/")));

            //var document = await context.OpenAsync(async req => req.Content(await GetContent(b.Url)));

            Bike bike = new Bike();

            bike.BrandId = b.BrandId;

            bike.PostTitle = document.Title;

            var imageOwls = document
                            .QuerySelectorAll(
                "body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.single_post_thumb")
                            .FirstOrDefault()?.ChildNodes[0].ChildNodes;


            List <string> bikeImages = new List <string>();

            foreach (var img in imageOwls)
            {
                var imageDiv = img.ToHtml();

                string src = Regex.Match(imageDiv, "<img.+?src=[\"'](.+?)[\"'].*?>", RegexOptions.IgnoreCase).Groups[1].Value;
                bikeImages.Add(src);
            }

            bike.Images = bikeImages.ToArray();


            bike.Name = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(1) > div.col-sm-9 > div > h4").FirstOrDefault()?.TextContent;

            // Basic
            bike.Features       = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(1) > table > tbody > tr > td:nth-child(1)").FirstOrDefault()?.TextContent;
            bike.DisplacementCC = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(1) > table > tbody > tr > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Mileage        = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(1) > table > tbody > tr > td:nth-child(3)").FirstOrDefault()?.TextContent;

            // Bike Overview
            bike.Price                    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.FuelSupplySystem         = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.StartingMethod           = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.CoolingSystem            = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.EngineeOilRecommendation = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.TyresType                = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(6) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.TopSpeed                 = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child(7) > td:nth-child(2)").FirstOrDefault()?.TextContent;

            // Specifications
            bike.EngineeType      = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.MaximumPower     = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.MaximumTorque    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Bore             = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Stroke           = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.CompressionRatio = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(6) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.NoOfCylinders    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(5) > table > tbody > tr:nth-child(7) > td:nth-child(2)").FirstOrDefault()?.TextContent;

            // Transmission
            bike.TransmissionType = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(6) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.NoOfGears        = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(6) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.ClutchType       = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(6) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;


            // Chassis & Suspension
            bike.ChassisType     = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(7) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.FrontSuspension = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(7) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.RearSuspension  = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(7) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;


            // Brakes
            bike.FrontBrakeType            = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(8) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.RearBrakeType             = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(8) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.FrontBrakeDiameter        = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(8) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.RearBrakeDiameter         = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(8) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.AntiLockBrakingSystem_ABS = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(8) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;


            // Wheels & Tires
            bike.FrontTireSize = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(9) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.RearTireSize  = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(9) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.TubelessTires = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(9) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;


            // Dimensions
            bike.OverallLength    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.OverallWidth     = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.OverallHeight    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.GroundClearance  = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Weight           = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.FuelTankCapacity = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(6) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Wheelbase        = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(10) > table > tbody > tr:nth-child(7) > td:nth-child(2)").FirstOrDefault()?.TextContent;

            // Electricals
            bike.BatteryType    = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(11) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.BatteryVoltage = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(11) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.HeadLight      = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(11) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.TailLight      = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(11) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Indicators     = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(11) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;

            // Features
            bike.Speedometer       = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(1) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.Odometer          = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(2) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.RPMMeter          = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(3) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.HandleType        = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(4) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.SeatType          = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(5) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.PassengerGrabRail = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(6) > td:nth-child(2)").FirstOrDefault()?.TextContent;
            bike.EngineKillSwitch  = document.QuerySelectorAll("body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div > div:nth-child(12) > table > tbody > tr:nth-child(7) > td:nth-child(2)").FirstOrDefault()?.TextContent;

            return(bike);
        }
Exemple #9
0
        /// <summary>
        /// Searches ettoday.net for <c>conditions.Keyword</c>, collects the article links of all
        /// matching search hits, then downloads every article and passes it to <paramref name="saver"/>.
        /// </summary>
        /// <param name="conditions">
        /// Supplies the search keyword and <c>timeSpan</c>; only hits dated later than
        /// <c>DateTime.Now.Add(conditions.timeSpan)</c> are kept.
        /// </param>
        /// <param name="saver">Receives one <c>EttodayNewsModel</c> per downloaded article.</param>
        public async Task FetchDataAsync(NewsConditions conditions, Saver saver)
        {
            var config  = Configuration.Default;
            var context = BrowsingContext.New(config);

            var datetimePattern = "2[0-9]{3}-[0|1][0-9]-[0-2][0-9] [0-2][0-9]:[0-5][0-9]";
            var hrefs           = new List<string>();

            // The client is only needed for the duration of this call, so release it afterwards.
            using (var client = new HttpClient())
            {
                // Walk the search result pages until a page yields no hit inside the time window.
                var page = 1;
                int paginationDataTotal;

                do
                {
                    var url             = $"https://www.ettoday.net/news_search/doSearch.php?keywords={conditions.Keyword}&daydiff=3&page={page}";
                    var responseMessage = await client.GetAsync(url);
                    var responseResult  = await responseMessage.Content.ReadAsStringAsync();

                    var document = await context.OpenAsync(res => res.Content(responseResult));

                    // Materialize once: the query is needed both for its count and its items,
                    // and re-enumerating it would repeat the regex matching and date parsing.
                    var paginationHrefs = document.QuerySelectorAll(".archive.clearfix")
                                          .Where(x => DateTime.Parse(Regex.Match(x.QuerySelector(".date").TextContent, datetimePattern).Value) > DateTime.Now.Add(conditions.timeSpan))
                                          .Select(x => x.QuerySelector("h2 a").GetAttribute("href"))
                                          .ToList();

                    paginationDataTotal = paginationHrefs.Count;
                    hrefs.AddRange(paginationHrefs);

                    page++;
                } while (paginationDataTotal > 0);

                foreach (var href in hrefs)
                {
                    var responseMessage = await client.GetAsync(href);
                    var responseResult  = await responseMessage.Content.ReadAsStringAsync();

                    var document = await context.OpenAsync(res => res.Content(responseResult));

                    var title = document.QuerySelector("h1.title").TextContent;
                    var story = document.QuerySelector("[itemprop=articleBody]");

                    // Strip embedded ads, links and images. Each matched element is removed itself;
                    // re-querying the first remaining match per iteration ran out of matches early
                    // (and dereferenced null) whenever one match was nested inside another.
                    foreach (var embedded in story.QuerySelectorAll("iframe,a,img").ToList())
                    {
                        embedded.Remove();
                    }

                    // Keep only real paragraphs: skip short fragments and image captions (marked with "▲").
                    var contentItems = story.QuerySelectorAll("p")
                                       .Where(x => x.TextContent.Length > 20 && !x.TextContent.StartsWith("▲"))
                                       .Select(x => x.TextContent);

                    var content  = string.Join(string.Empty, contentItems);
                    var postDate = DateTime.Parse(document.QuerySelector(".date").GetAttribute("datetime"));
                    var post     = new EttodayNewsModel
                    {
                        Title   = title,
                        Content = content,
                        Date    = postDate.ToString("yyyyMMdd"),
                        Source  = href
                    };

                    saver.Save(post);
                }
            }
        }
Exemple #10
0
 /// <summary>
 /// Creates an XML document that reads from the given text source.
 /// </summary>
 /// <param name="context">
 /// The browsing context to use; when <c>null</c>, a context obtained from
 /// <c>BrowsingContext.New()</c> is passed to the base constructor instead.
 /// </param>
 /// <param name="source">The text source handed to the base constructor.</param>
 internal XmlDocument(IBrowsingContext context, TextSource source)
     : base(context ?? BrowsingContext.New(), source)
 {
     // Documents of this type always report the XML MIME type.
     this.ContentType = MimeTypes.Xml;
 }
Exemple #11
0
        /// <summary>
        /// Event handler that answers a message with the page title of every http/https link
        /// found in its command arguments.
        /// </summary>
        /// <remarks>
        /// Links whose host is, or resolves to, an address for which <c>IsInternal</c> returns true
        /// are skipped. Title lookup is best effort: a link that cannot be downloaded or parsed is
        /// silently ignored so the remaining links are still answered.
        /// </remarks>
        /// <param name="sender">The event source (unused).</param>
        /// <param name="theMessage">The message to inspect and to answer on.</param>
        private async void Server_OnPostProcessingMessage(object sender, IrcMessage theMessage)
        {
            if (theMessage.IsIgnored)
            {
                return;
            }

            try
            {
                List<string> links = theMessage.CommandArgs
                                     .Where(x => x.StartsWith("http://", StringComparison.Ordinal) || x.StartsWith("https://", StringComparison.Ordinal))
                                     .Distinct()
                                     .ToList();
                if (links.Count == 0)
                {
                    return;
                }

                var tasks = new List<Task<HttpResponseMessage>>(links.Count);
                foreach (string link in links)
                {
                    // A malformed link (e.g. a bare "http://") must not abort the lookup of the other links.
                    if (!Uri.TryCreate(link, UriKind.Absolute, out Uri address))
                    {
                        continue;
                    }

                    if (IPAddress.TryParse(address.DnsSafeHost, out var ip) && IsInternal(ip))
                    {
                        continue;
                    }

                    try
                    {
                        // Resolve asynchronously so the handler does not block a thread on DNS.
                        var addresses = await Dns.GetHostAddressesAsync(address.DnsSafeHost);
                        if (addresses.Any(IsInternal))
                        {
                            continue;
                        }
                    }
                    catch (Exception ex)
                    {
                        // A failed resolution is only reported; the request below is still attempted.
                        Console.WriteLine(ex);
                    }

                    tasks.Add(Client.GetAsync(address, HttpCompletionOption.ResponseContentRead));
                }

                if (tasks.Count == 0)
                {
                    return;
                }

                try
                {
                    await Task.WhenAll(tasks);
                }
                catch
                {
                    // Individual failures are observed per task in the loop below.
                }

                foreach (var task in tasks)
                {
                    try
                    {
                        // Dispose the response (and with it the content stream) once the title is extracted.
                        using (var response = await task)
                        {
                            var stream = await response.Content.ReadAsStreamAsync();

                            using (IDocument doc = await BrowsingContext.New().OpenAsync(x => x.Content(stream, true)))
                            {
                                // NOTE(review): the "–" -> "–" replacement is a no-op as written; confirm which
                                // character or entity was originally meant to be normalized.
                                string title = Regex.Replace(doc.Title.Replace("\n", "").Replace("\r", "").Replace("–", "–"), "[ ]{2,}", " ");
                                if (!String.IsNullOrWhiteSpace(title))
                                {
                                    theMessage.Answer("[url] " + title);
                                }
                            }
                        }
                    }
                    catch
                    {
                        // Best effort: a page that cannot be fetched or parsed simply gets no answer.
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Fehler beim Downloaden der Webseite");
            }
        }
        /// <summary>
        /// Downloads the current GTFS zip linked from the DTPM "gtfs-vigente" page and bulk-loads
        /// each known feed file into its repository, reporting progress on the load channel.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The page contains no anchor whose address mentions both "gtfs" and ".zip".
        /// </exception>
        public async Task LoadGtfsData()
        {
            await _loadGtfsDataChannel.Writer.WriteAsync("Downloading GTFS data");

            using var client = new HttpClient();

            var content = await client.GetStringAsync("http://www.dtpm.cl/index.php/gtfs-vigente");

            using var document = await BrowsingContext
                                 .New(Configuration.Default)
                                 .OpenAsync(req => req.Content(content));

            // Ordinal, case-insensitive matching: lower-casing with the current culture can
            // miss ".ZIP" (e.g. under Turkish casing rules) and allocates a copy per check.
            var downloadLink = document
                               .QuerySelectorAll("a")
                               .OfType<IHtmlAnchorElement>()
                               .Select(a => a.Href)
                               .FirstOrDefault(href => href.Contains("gtfs", StringComparison.OrdinalIgnoreCase)
                                                       && href.Contains(".zip", StringComparison.OrdinalIgnoreCase));

            // Fail with a clear message instead of passing a null address to the HTTP client.
            if (downloadLink == null)
            {
                throw new InvalidOperationException("No GTFS zip download link was found on http://www.dtpm.cl/index.php/gtfs-vigente.");
            }

            await using var gtfsStreamZippedFile = await client.GetStreamAsync(downloadLink);

            using var file = new ZipArchive(gtfsStreamZippedFile);

            foreach (var entry in file.Entries)
            {
                using var reader = new StreamReader(entry.Open());
                using var csv    = new CsvReader(reader, CultureInfo.InvariantCulture);

                // Each known feed file is mapped with its class map and handed to its repository;
                // entries with any other name are ignored.
                switch (entry.Name)
                {
                case "agency.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk agencies data");

                    csv.Configuration.RegisterClassMap<AgencyMap>();
                    var agencyRecords = csv.GetRecords<Agency>();
                    await _agencyRepository.Bulk(agencyRecords);

                    break;

                case "calendar_dates.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk calendar dates data");

                    csv.Configuration.RegisterClassMap<CalendarDateMap>();
                    var calendarDatesRecords = csv.GetRecords<CalendarDate>();
                    await _calendarDateRepository.Bulk(calendarDatesRecords);

                    break;

                case "calendar.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk calendar data");

                    csv.Configuration.RegisterClassMap<CalendarMap>();
                    var calendarRecords = csv.GetRecords<Calendar>();
                    await _calendarRepository.Bulk(calendarRecords);

                    break;

                case "feed_info.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk feed info data");

                    csv.Configuration.RegisterClassMap<FeedInfoMap>();
                    var feedInfoRecords = csv.GetRecords<FeedInfo>();
                    await _feedInfoRepository.Bulk(feedInfoRecords);

                    break;

                case "frequencies.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk frequencies data");

                    csv.Configuration.RegisterClassMap<FrequencyMap>();
                    var frequencyRecords = csv.GetRecords<Frequency>();
                    await _frequencyRepository.Bulk(frequencyRecords);

                    break;

                case "routes.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk routes data");

                    csv.Configuration.RegisterClassMap<RouteMap>();
                    var routeRecords = csv.GetRecords<Route>();
                    await _routeRepository.Bulk(routeRecords);

                    break;

                case "shapes.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk shapes data");

                    csv.Configuration.RegisterClassMap<ShapeMap>();
                    var shapeRecords = csv.GetRecords<Shape>();
                    await _shapeRepository.Bulk(shapeRecords);

                    break;

                case "stop_times.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk stop times data");

                    csv.Configuration.RegisterClassMap<StopTimeMap>();
                    var stopTimeRecords = csv.GetRecords<StopTime>();
                    await _stopTimeRepository.Bulk(stopTimeRecords);

                    break;

                case "stops.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk stops data");

                    csv.Configuration.RegisterClassMap<StopMap>();
                    var stopRecords = csv.GetRecords<Stop>();
                    await _stopRepository.Bulk(stopRecords);

                    break;

                case "trips.txt":
                    await _loadGtfsDataChannel.Writer.WriteAsync("Bulk trips data");

                    csv.Configuration.RegisterClassMap<TripMap>();
                    var tripRecords = csv.GetRecords<Trip>();
                    await _tripRepository.Bulk(tripRecords);

                    break;
                }
            }
        }
Exemple #13
0
        /// <summary>
        /// Scratch entry point: loads one phys.org article, reads its image, credit, body text
        /// and headline, and writes the headline to the console.
        /// </summary>
        static async Task Main()
        {
            var loaderConfig = Configuration.Default.WithDefaultLoader();

            var articleAddress = new Url("https://phys.org/news/2020-11-rare-species-small-cats-inadequately.html");
            var article        = await BrowsingContext.New(loaderConfig).OpenAsync(articleAddress);

            // Image source of the article's lead figure.
            var image = article.GetElementsByClassName("article-img")[0]
                        .GetElementsByTagName("img")[0]
                        .GetAttribute("src");

            // Credit line taken from the same figure's caption.
            var credit = article.GetElementsByClassName("article-img")[0]
                         .GetElementsByTagName("figcaption")[0]
                         .TextContent
                         .Trim();

            // Body text: every paragraph trimmed and followed by an empty line.
            var body = new StringBuilder();
            foreach (var paragraph in article.GetElementsByClassName("article-main")[0].GetElementsByTagName("p"))
            {
                body.AppendLine(paragraph.TextContent.Trim());
                body.AppendLine();
            }

            // NOTE(review): the trimmed text is computed but never stored or printed.
            body.ToString().TrimEnd();

            // Headline.
            var headline = article.GetElementsByClassName("news-article")[0]
                           .GetElementsByTagName("h1")[0]
                           .TextContent;

            Console.WriteLine(headline);

            // NOTE(review): starting at 1 with the condition "< 1" means this listing scrape never
            // runs; confirm whether it was disabled on purpose before changing the bound.
            var pageNumber = 1;
            while (pageNumber < 1)
            {
                var listingAddress = new Url("https://phys.org/biology-news/ecology/sort/date/all/page" + pageNumber + ".html");
                var listing        = await BrowsingContext.New(loaderConfig).OpenAsync(listingAddress);

                // Only the first teaser of the listing page is inspected.
                var teaser = listing.GetElementsByClassName("sorted-article")[0];

                var figure  = teaser.GetElementsByClassName("sorted-article-figure")[0];
                var summary = teaser.GetElementsByClassName("sorted-article-content")[0];
                var info    = teaser.GetElementsByClassName("article__info")[0];

                var mainPhoto  = figure.GetElementsByTagName("img")[0].GetAttribute("data-src");
                var mainPage   = figure.GetElementsByTagName("a")[0].GetAttribute("href");
                var tile       = summary.GetElementsByTagName("a")[0].TextContent;
                var shortIntro = summary.GetElementsByTagName("p")[0].TextContent.Trim();
                var category   = info.GetElementsByTagName("p")[0].TextContent.Trim();
                var postedOn   = info.GetElementsByTagName("p")[1].TextContent.Trim();

                // The extracted teaser fields are not printed or stored anywhere yet.

                pageNumber++;
            }
        }
Exemple #14
0
        /// <summary>
        /// Decides which action to take for <paramref name="userProfile"/> based on the comments
        /// that user left on the bot's Steam profile and the comment rules configured for the bot.
        /// </summary>
        /// <remarks>
        /// Rules are evaluated in configuration order and the first matching rule wins. When no
        /// rule matches, the action of the last "default" rule is returned, or "none" if there is
        /// no such rule. "none" is also returned when the bot has no comment rules or the comment
        /// list cannot be retrieved.
        /// </remarks>
        /// <param name="userProfile">Profile of the user being evaluated.</param>
        /// <param name="bot">Bot whose profile comments and configuration are consulted.</param>
        /// <returns>The action selected by the first matching rule, otherwise the default action.</returns>
        private static async Task<Action> processCommentedOnProfile(UserProfile userProfile, Bot bot)
        {
            // Without rules for this bot there is nothing to evaluate (and nothing worth downloading).
            if (!Config.FriendInviteConfigs.TryGetValue(bot, out Config config) || config?.Comments == null)
            {
                return new Action("none");
            }

            WebBrowser webBrowser = ASF.WebBrowser;
            Uri        url        = new("https://steamcommunity.com/comment/Profile/render/" + bot.SteamID);
            var        response   = (await webBrowser.UrlGetToJsonObject<JObject>(url))?.Content;

            // A failed request or a reply without success=true is treated like "no decision".
            if (response?.GetValue("success")?.ToObject<bool>() != true)
            {
                return new Action("none");
            }

            var context      = BrowsingContext.New(Configuration.Default);
            var commentsHtml = response.GetValue("comments_html")?.ToString() ?? string.Empty;
            var htmlDocument = await context.OpenAsync(req => req.Content(commentsHtml));

            // Collect (author profile id, comment text) pairs; the id is the last path segment
            // of the author's profile link.
            var comments = new List<KeyValuePair<string, string>>();
            foreach (var node in htmlDocument.QuerySelectorAll("div.commentthread_comment"))
            {
                var authorLinkNode = node.QuerySelector("a.commentthread_author_link");
                var commentNode    = node.QuerySelector("div.commentthread_comment_text");

                // Skip entries that lack a usable author link or a comment body.
                if (commentNode == null || !Uri.TryCreate(authorLinkNode?.GetAttribute("href"), UriKind.Absolute, out Uri authorUri))
                {
                    continue;
                }

                string comment         = commentNode.Text().Trim().Normalize();
                string authorProfileID = authorUri.Segments[authorUri.Segments.Length - 1].Replace(@"/", "");
                comments.Add(new KeyValuePair<string, string>(authorProfileID, comment));
            }

            var groupedData = comments.ToLookup(x => x.Key, x => x.Value);

            // A lookup yields an empty sequence for unknown keys, so a user without comments
            // simply has zero of them.
            string userKey      = userProfile.steamId64.ToString();
            var    userComments = groupedData[userKey];
            bool   hasComments  = groupedData.Contains(userKey);
            int    commentCount = userComments.Count();

            string defaultAction = "none";

            foreach (ConfigItem item in config.Comments)
            {
                switch (item.condition)
                {
                case "less_than":
                {
                    int threshold = Convert.ToInt32(item.value);
                    if (commentCount < threshold)
                    {
                        return new Action(item.action, "Number of comments < " + threshold);
                    }
                    break;
                }

                case "more_than":
                    // Only users who actually commented can exceed a threshold.
                    if (hasComments && commentCount > Convert.ToInt32(item.value))
                    {
                        return new Action(item.action, "Number of comments > " + Convert.ToInt32(item.value));
                    }
                    break;

                case "equal":
                    if (userComments.Contains(item.value))
                    {
                        return new Action(item.action, "Comment is " + item.value);
                    }
                    break;

                case "contain":
                    if (userComments.Any(comment => comment.Contains(item.value, StringComparison.OrdinalIgnoreCase)))
                    {
                        return new Action(item.action, "Profile comment contains " + item.value);
                    }
                    break;

                case "default":
                    defaultAction = item.action;
                    break;
                }
            }

            return new Action(defaultAction);
        }
Exemple #15
0
        /// <summary>
        /// Lazily scrapes the archive page at <c>_archiveDomain</c> and yields one
        /// <c>CWMVolume</c> for every volume link listed there.
        /// </summary>
        /// <returns>The volumes in the order their links appear on the page.</returns>
        /// <exception cref="FormatException">
        /// The year or volume number could not be parsed from a volume link's text.
        /// </exception>
        public static IEnumerable<CWMVolume> ScrapeVolumes()
        {
            var loaderConfiguration = Configuration.Default.WithDefaultLoader();
            var browsingContext     = BrowsingContext.New(loaderConfiguration);

            // The page load is waited on synchronously inside this iterator method.
            using (var archivePage = browsingContext.OpenAsync(_archiveDomain).GetAwaiter().GetResult())
            {
                // Descend step by step from the canvas wrapper to the content container.
                var contentPath = new[]
                {
                    "div#canvas",
                    "div#page-body-wrapper",
                    "div#page-body",
                    "div#content-wrapper",
                    "div#content"
                };

                var cursor = archivePage.GetElementById("canvas-wrapper");
                foreach (var step in contentPath)
                {
                    cursor = cursor.QuerySelector(step);
                }

                // The second child of the content container holds the archive layout.
                var archiveRow = cursor.Children[1]
                                 .QuerySelector("div.sqs-layout")
                                 .QuerySelector("div.row.sqs-row");

                var volumeEntries = archiveRow
                                    .QuerySelectorAll("ul.archive-group-list")
                                    .SelectMany(list => list.QuerySelectorAll("li.archive-group"))
                                    .ToArray();

                foreach (var volumeEntry in volumeEntries)
                {
                    var link         = volumeEntry.QuerySelector("a.archive-group-name-link");
                    var relativeHref = link.GetAttribute("href");

                    // The link text carries the year and the volume number.
                    var match      = _volumeLinkTextRegex.Match(link.TextContent.Trim());
                    var yearText   = match.Groups["year"].Value;
                    var numberText = match.Groups["volumeNumber"].Value;

                    if (!int.TryParse(yearText, out var volumeYear))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'year' from the text {yearText.Quote()}.");
                    }

                    if (!int.TryParse(numberText, out var volumeNumber))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'volumeNumber' from the text {numberText.Quote()}.");
                    }

                    yield return new CWMVolume(
                        volumeYear,
                        volumeNumber,
                        $"{_domain.TrimEnd('/')}{relativeHref}");
                }
            }
        }
Exemple #16
0
        /// <summary>
        /// Lazily scrapes the page of <paramref name="cwmIssue"/> and yields one
        /// <c>CWMArticle</c> per entry of the issue's product description.
        /// </summary>
        /// <remarks>
        /// An entry that fuzzily matches a known magazine section name becomes the category of
        /// the entries after it (initially "Unknown"). Any other entry is read as
        /// "name - author"; when it does not split into exactly two parts at '-', the whole
        /// text is used as the name and the author is "unknown".
        /// </remarks>
        /// <param name="cwmIssue">The issue whose page is scraped; also attached to every article.</param>
        /// <returns>The articles in page order.</returns>
        public static IEnumerable<CWMArticle> ScrapeIssueArticles(
            CWMIssue cwmIssue)
        {
            var loaderConfiguration = Configuration.Default.WithDefaultLoader();
            var browsingContext     = BrowsingContext.New(loaderConfiguration);

            // The page load is waited on synchronously inside this iterator method.
            using (var issuePage = browsingContext.OpenAsync(cwmIssue.IssuePageAbsoluteUrl).GetAwaiter().GetResult())
            {
                const string contentSelector =
                    "div#canvas > div#page-body-wrapper > div#page-body > div#content-wrapper > div#content";
                const string descriptionSelector =
                    "div#productWrapper > div.product-description > div.sqs-layout > div.row.sqs-row > " +
                    "div.col > div.sqs-block.html-block > div.sqs-block-content";

                // The third child of the content container holds the product block.
                var productContainer = issuePage
                                       .GetElementById("canvas-wrapper")
                                       .QuerySelector(contentSelector)
                                       .Children[2];

                var descriptionBlock = productContainer.QuerySelector(descriptionSelector);

                var sectionNames = ValueEnum
                                   .EnumerateValues<MagazineSection, string>()
                                   .ToArray();

                var category = "Unknown";

                foreach (var entry in descriptionBlock.Children)
                {
                    var entryText = entry.TextContent.Replace("&nbsp;", "").Trim();

                    // Section headings switch the running category and produce no article.
                    if (sectionNames.Contains(entryText, new FuzzyStringMatchingComparer(2)))
                    {
                        category = entryText;
                        continue;
                    }

                    var parts     = entryText.Split('-');
                    var hasAuthor = parts.Length == 2;

                    var articleName   = hasAuthor ? parts[0].Trim() : entryText.Trim();
                    var articleAuthor = hasAuthor ? parts[1].Trim() : "unknown";

                    yield return new CWMArticle(
                        category,
                        articleName,
                        articleAuthor,
                        cwmIssue);
                }
            }
        }
Exemple #17
0
        /// <summary>
        /// Lazily scrapes the page of <paramref name="cwmVolume"/> and yields one
        /// <c>CWMIssue</c> for every product link whose address matches the issue link pattern.
        /// </summary>
        /// <remarks>
        /// Links that do not match <c>_issueLinkTextRegex</c>, or whose issue number group is
        /// empty, are skipped silently.
        /// </remarks>
        /// <param name="cwmVolume">The volume whose page is scraped; also attached to every issue.</param>
        /// <returns>The issues in page order.</returns>
        /// <exception cref="FormatException">
        /// The volume or issue number of a matching link is not an integer.
        /// </exception>
        public static IEnumerable<CWMIssue> ScrapeVolumeIssues(
            CWMVolume cwmVolume)
        {
            var loaderConfiguration = Configuration.Default.WithDefaultLoader();
            var browsingContext     = BrowsingContext.New(loaderConfiguration);

            // The page load is waited on synchronously inside this iterator method.
            using (var volumePage = browsingContext.OpenAsync(cwmVolume.VolumePageAbsoluteUrl).GetAwaiter().GetResult())
            {
                const string contentSelector =
                    "div#canvas > div#page-body-wrapper > div#page-body > div#content-wrapper > div#content";

                // The third child of the content container holds the product list.
                var productList = volumePage
                                  .GetElementById("canvas-wrapper")
                                  .QuerySelector(contentSelector)
                                  .Children[2]
                                  .QuerySelector("div#productList");

                var issueLinks = productList.QuerySelectorAll("a.product").ToArray();

                foreach (var issueLink in issueLinks)
                {
                    var relativeHref = issueLink.GetAttribute("href");

                    // The link address itself encodes magazine, volume number and issue number.
                    var match = _issueLinkTextRegex.Match(relativeHref.Trim());
                    if (!match.Success)
                    {
                        continue;
                    }

                    var magazineText     = match.Groups["magazine"].Value;
                    var volumeNumberText = match.Groups["volumeNumber"].Value;
                    var issueNumberText  = match.Groups["issueNumber"].Value;

                    if (issueNumberText.IsNullOrEmptyEx())
                    {
                        continue;
                    }

                    var issueMagazine = Magazine.GetMagazineFromPrefix(magazineText);

                    if (!int.TryParse(volumeNumberText, out var volumeNumber))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'volumeNumber' from the text {volumeNumberText.Quote()}.");
                    }

                    if (!int.TryParse(issueNumberText, out var issueNumber))
                    {
                        throw new FormatException(
                                  $"Cannot parse the 'issueNumber' from the text {issueNumberText.Quote()}.");
                    }

                    yield return new CWMIssue(
                        volumeNumber,
                        issueNumber,
                        issueMagazine,
                        $"{_domain.TrimEnd('/')}{relativeHref}",
                        cwmVolume);
                }
            }
        }
        /// <summary>
        /// Opens every page in <paramref name="urls"/>, follows the post links found on it and
        /// collects the title and content of each linked post.
        /// </summary>
        /// <param name="urls">Addresses of the pages whose post links should be processed.</param>
        /// <returns>
        /// The posts collected so far; posts whose content is missing, empty or whitespace-only
        /// are left out.
        /// </returns>
        /// <remarks>
        /// Any exception thrown while processing is logged and ends the run early; the posts
        /// collected up to that point are still returned.
        /// </remarks>
        public async Task <List <DownloadedPost> > ProcessPostTask(IEnumerable <string> urls)
        {
            // Number of leading words used to build a fallback title.
            const int titleWordCount = 7;

            var angleSharpConfig = Configuration.Default
                                   .WithCulture("es-es")
                                   .WithDefaultLoader()
                                   .WithCss()
                                   .WithJs()
                                   .WithXPath();
            var angleSharpContext = BrowsingContext.New(angleSharpConfig);

            var result = new List <DownloadedPost>();

            try
            {
                foreach (var url in urls)
                {
                    _logger.LogInformation("Starts processing posts for URL {0}", url);
                    var mainPageDocument = await angleSharpContext.OpenAsync(url);

                    var links = await _strategyLoaderService.ParseLinksByStrategy(mainPageDocument);

                    // Materialize once: the grouping is both counted and iterated below, and a
                    // deferred GroupBy would otherwise be evaluated twice.
                    var groupedResult = links.GroupBy(x => x.Href).ToList();
                    _logger.LogInformation("Got a total of {0} links for URL {1}", groupedResult.Count, url);

                    foreach (var link in groupedResult)
                    {
                        // One download per distinct Href; the group key is that Href.
                        var postDocument = await angleSharpContext.OpenAsync(link.Key);

                        var title = await _strategyLoaderService.ParseTitleByStrategy(postDocument);

                        var content = await _strategyLoaderService.ParseContentByStrategy(postDocument);

                        if (string.IsNullOrWhiteSpace(title))
                        {
                            // Fall back to the leading words of the first paragraph that is long
                            // enough; if none is, the last paragraph's words are used.
                            // NOTE(review): the words are joined without a separator — confirm
                            // whether SplitSpaces keeps the spacing or " " was intended.
                            var extractedTitle = string.Empty;
                            foreach (var words in content.ToList().Select(paragraph => paragraph.SplitSpaces()))
                            {
                                extractedTitle = string.Join("", words.Take(titleWordCount));

                                // ">=" rather than "==": a paragraph with more than seven words
                                // already yields a full title and must stop the search too.
                                if (words.Length >= titleWordCount)
                                {
                                    break;
                                }
                            }

                            title = extractedTitle;
                        }

                        result.Add(new DownloadedPost
                        {
                            Title        = title,
                            PostContents = content
                        });
                    }
                }
            }
            catch (Exception e)
            {
                // Deliberate best effort: log and return what has been gathered so far.
                _logger.LogError(e, "Rest in pepperoni, innerEx: {0}", e.InnerException);
            }

            // All() is true for an empty sequence, so this also drops posts without any content;
            // the null check keeps this clean-up (which runs outside the try) from throwing.
            result.RemoveAll(x => x.PostContents == null || x.PostContents.All(string.IsNullOrWhiteSpace));
            return result;
        }
        /// <summary>
        /// Posts a show-date query to the AMC "getallmovies" endpoint, parses the HTML that comes
        /// back and adds the sessions found in it to the movies with a matching title.
        /// </summary>
        /// <param name="date">
        /// Day to query. It is sent as <c>year-month-day</c> (no zero padding) and is used as the
        /// date part of every session's show time.
        /// </param>
        /// <param name="cinemas">Cinemas searched by <c>Name</c> to obtain each session's <c>ExternalId</c>.</param>
        /// <param name="movies_proto">
        /// Movies searched by <c>Title</c>. The list is copied, but its elements are not, so the
        /// sessions are added to these same movie objects (assuming <c>Movie</c> is a reference type).
        /// </param>
        /// <returns>
        /// A success result carrying the movie list, or an error result when the HTTP call did not succeed.
        /// </returns>
        public async Task <ParseResult <Parse.Data> > GetMoviesByDateAsync(DateTime date, List <Cinema> cinemas, List <Movie> movies_proto)
        {
            List <Movie> movies = new List <Movie>();

            movies.AddRange(movies_proto);
            string param = "";

            param = $"{date.Year}-{date.Month}-{date.Day}";
            ConcurrentDictionary <string, string> headers = new ConcurrentDictionary <string, string>();

            headers.TryAdd("ContentType", "application/json");
            // NOTE(review): the Cookie header below is a hard-coded, captured session (it includes an
            // AccessTokenInfo value) — consider moving it out of source into configuration.
            headers.TryAdd("Cookie", "cookieLanguageCode=en-US; _gcl_au=1.1.1238722976.1602705939; _fbp=fb.1.1602705939926.1048953155; _gid=GA1.2.445262349.1603531359; moe_uuid=56694985-c80f-499b-b8ee-353d2ba58144; movieDetails=qeoEzVnzBc-6cYMbYWwS4scp_uWZetsLEfEwEONYBHu6Ug3EcLkPOkkIJRxNIl21LoXAHMoiLl-K2GFvWFxLoLetERKNJ9mg2n5nJuGQnCTm4cLcKFW7kZ4wKrzk7QcgU5mbqjSflnb4_PeFk656nOlvLWaS7p8iFkS-zpiG8sRM94UJOeX_RAJBHeSg-2ca0iolX65I08aDzQXISNe834N45rFvnlXKnczOqHQxCJvkiScCXPEMudUt0mZ50_zT4mA6cD08aIHg-Hc-kaOI0Sg3RjPFVqt3tsn14JDZoCeu0PFyc7VLVr79jwvPn6r--qKRLudWoNJO5aJNQv_2smOTfz0_ZUF7RxCpwQ8geT_vx7XKrH_0c9o23bsv5jiokSdBTwrkHbiwYk4ITrwCkmhHtkHeJVYs3sKRoroNevKirCwY585fkH59_SHcrTA8RUc6JMj2ixXsezJeeszlngooxw8GHDdH5o1ASeVsvKFbOcHeWPCNjA5XgQZvyIFKoHDQjZXS5N-Tu0-G_wNFmL4aQBnOdvIbFw0_DfbDHLaFTfeJY9NuTSyhoZJP8mOaebaIZMokCCFINDhBcgerB1CcAhP5SUnKdXkeLbBADZskqh_sU-f8QzGsesP1dOa1tNQmRDwenGOKb3MDof1EkQCYenShYRMmJC7VY1SBqmlM8n0aOMycbc1kcoLME98Z7F4BGJUKMMxa4PB1XU_e3OIEZYobFZP6czI1juhh9yYHWum4Yyd_JVGXqjh622CGWUhIXWAjkwDX1n6n5C-nb5ar5UVbclfatS1YKyuUXAU_zODz2OFT8Wxo5Lto1OFtqEGYkKiIu-m9A7w4l8yQIUpb1tLpOvJnEbXe_1CWiqM6hEaixfrc7jJHQ-FncGApH7i0WNF4iALHGV0aeIwjHiL5jbE5rMrHi1K_suW7TM2xrzVg93DQQBlgvhnd7g5UJ4jTCR4XyWMKKMcPBhw3R-aNQEnlbLyhWznB3ljwjYwOV4EGb8iBSqBw0I2V0uTfk7tPhv8DRZi-GkAZznbyGxcNPw32wxy2hU33iWPTbk09TfVWsyySmAhSvdSZ3240g2jYn36zA11QnAIT-UlJynV5Hi2wQA2bJKof7FPn_EMxrHrD9XTmtbTUJkQ8HpxjAwoeVLSoTKPQ4y2TWLCzR9Ei5wWaWcn3MjZnddtzlBGOiECIDhKg5cvAi5xIRSpTWuDwdV1P3tziBGWKGxWT-nMyNY7oYfcmB1rpuGMnfyVRl6o4ettghllISF1gI5MA19Z-p-Qvp2-Wq-VFW3JX2lovt8CIT7PaD5XH9yjF98wF5oI0EPF53Lc738yxFmYEfw1Kbiphe4_NiXlcBxA2rIj5MI_W0zvmFZchj8C0eES6TXP_fFrixhgjHoboPhh0jSDlwXIV-kd1zBprw6hS4YPF4MlAm57mDdEnjV8a-GNx2JO20V7SeT9Jz852f9ooX6uZGVYXcaF7tpJ-xlK_lQJOV3XVyz_KJZRfILUk-QCoND7wXTraBj3qRM7h7rfdJ3SbyxHW4kpaM6i2DODUh5ZT8_yOKDzi7Ueba1g_6F3Uf97Rk_w4uNL8C58OyDdurk9Q_OweYQuFfmV4aoKNnF5r5Vq_Kc69zTw79tbFWEL0GrFW9QTj8WTr7r6dyup5NhKbmuzd6OL1mdCdxQGASSVesopwH44N0UIx-_UCNAn3zhfAlxD53CiOdBMGW5Clq-CQyaBo9TFZ2UH-CqdlTqDK_2z0XZKxvnSlBp--b9VR
WoHoa9fnPMkB2lBmtrQHR1BO9A7Et6IPlpwqbOK2572nGc3FLffigcWEYxLiTv3N0QzKnxka6oldA7f0-Apqe40I5hu9OzFn-8kMwEnWxf1hWkUABpZeHdgdZkIRi5Ahz2mpBnqJwyDCX5pRE2tgjje_ST3iOzn7PbcXQhQzzt6nR0jz-ADG7-JkLXZ68Waud65ZrZ5oBkZDRk-JwMR0kM4LD9ARW-EVXsPJ9S5Y_fcAgUHIrU7AL_M_c5Eplx-OcA-zua4UhO5r5h69qLewm-E7V0KvfmJf77HDDjXxK-fFXsKQnjVzV18WLXLB-A35MgPPwaJ1R4ewcF8SNup8DeAykd6FzgS21uYAO4jMus-qf6rg4_RwUJ80eWBXSAoXHQV-EL5ULP74mXTdCQ5fsaaDv-NVkFreM1J8ci1geDlrqrH8QC0W5X-CJzqE-EzQ5RwkWYIdOx7v_LPhb5hi6bzodBbRzg_vCxKbz2dKDJs2K4vSvJPkXW50W1kUGWW6TXP_fFrixmV7HljSa-BEThFF1iSQwnPyGqH5QvX3kcOKuAqadrSKht423Ge2agI_uwUw9WL3I0zfewFuTT6RMFfaIWOAzRJtHzT-cK8X4e3OKggm-cCfFIA4KTo8DPGm48OQSnQHnDTzQPjeoSV0M-31M8cNvd6He6j6RS4QZmiaD4VShb8zkZu7h-UO-hsgt7Hlw8ufys580_sSRY0fzLEtSnB2As02oU262OM-_1ioC6hC_shFKq4XSUgj-lmi3iJsFVLL4ismpqxRrlaPrBdamlWixdh7sspXVzz_8r2dVcQxP57nmgmfg0kQ7e_lN68fsvocEbFsGrVmFvhBDBZR4C5ieSJZ2Mjx4jek_-c9epWdVKjG-7aYgR7b5Wy3TmnUpnzPLFSRzKMyQzF07yi9BRSlt0JNqJgxZut9aezSGoMxJOXEvmR28hbWK9fOHaqH_YgXWLnWwmPE_oE2w9v24I9mRqdz-Ev2Qrlb2itS5La3cXrApIqHcZdHKdQeXydAX9ZwzaRAcq1I8wLDToCntq1opN2sqkEYwOfK1iAz3kV16I7A-oXjWvBiYX848t7iua4KaYACrIc96WArIxMxW1zflgXJ1AAD0nLzLIZkMPd5lo7K257dGMCKo5TtlzVJNkrHMEXMV2FHgFqsS_UBHv9OLxynpA0ixs1Ixho-D4WpMgOiX2JLt3vyRG3CWNKrjoF5mY7AN_yiVifETdNtFtcWF6SJjku5ZJLHmD-pAW0Yx1gqFgbboUu1T8YhdHu58Km5Ri77GCZnj0wVcVFeePYSCMw=; _gat_UA-117333109-1=1; 
AccessTokenInfo=u2x96MRfae-glKK-nne0UV_A-GUIp1Pp36QiNk9wKOOkCGuHUNUsNAMdZQBNFno3kZ9VqVTPBMIxMGUOdzfyb1rGxtEq6lS8-iP8KeTvybVFqjc-5HctD_wkIJEF7-7kx8meVjzWxySwVgwkEZmqv5fGeVqv0EsLhQa05blbJJoF2HSdEMiOBwgyeTQ_lR6gOo_4s-N2npeI4xHYtG4p_TFRfLXvVa8llp8mIxBrHwUKfAJH_osh7K6v1RWm7ZvNIo4XwnuyUTmDeaeNmWtotuLjKE3MvCYYycGrak3AnBMuEX1EpR4synaZBqmGFgJK_RYyfuRoiqL-aFm9GVSfmKVGi2kiLBuAwTp0nuDlMoqjG6r53iG9x8D2RIRgv5DsgOgN8SS7KhNrQFoaL-rxlKmtrTSYPzWci8igX0lEBF7RrJV8qYnN1ER6V6KOQkwLQ1MhpROy3Lk3fu12Tz-_0UYHIzp9QNgFpL4y8b1ZsUmVxcyGkAqFg6TOPBi32mvnUdCre-QPTbA_dLwmF0yNETtVxAVZo4aeDOdihITWOZMCj6H5OLSglKMfWj615P9bWMf_Pihog7uRy-sDA5TjxLYhU4ZvH97xyQKEUWk6rnT_A6mH8DnfipmRBGFO1kkKUyrJvJBvADCt4nL567uFBvbGdFadUcq-JzBjzmFIe9CoU_VsEYtBWhAGV0sbajk_k_9Ls190ziS3d9lOSi5YRxAMLCipEKM3yGi2IduB_ERn1d5exZTCRolUrSdD4d3yFNlu1FQS7QtJD75FxN5BYCB03KgMBRUnyNejPU0RiqHx-liJQMQiPrRkDBpyxo-de5MBKoNSgt-nW44dmEWMrPK5h2jNpOwjtfbwxGxFFSp6txscWxaMDbo9mGSJeE6e27tb2WvX5YSNi68fccyo3PZLTF-H8ZixtfoTn4g1nL1JGA55-oXDCjq95ivHCJfGSdXmV99vayz6I_wp5O_JtUWqNz7kdy0Pf_SsPVbZS7bHyZ5WPNbHJLBWDCQRmaq_l8Z5Wq_QSwv4obJ14EiI6gO4QyDgrNISZkkg3fwMSDiTDKMoMVLi0xsXF7xJnGUoaH2Dvehd22ZFaozIpZB94wHIE0elFyUtXCLVqeNo-gyAUcIOdORSk7XqfCElKheoGzI4Op1kfF3rouycJtpjGcztaWblB-nY52VmuwOpsfRatMz60tA06ZSAQK3FzQmbJNPi_LyWagUDokf3pvAxfz1LJgnxe7cn55cPBW8IWOuHoxAPIZ7HePAxFtttra0C2F9_CB6SI11M9QSweb-rSgZwU_xc5wFsEb2CIWj3SgE4gbO3WxnbCJcXTd8RK_7yG9OzGhvLk-uzz7W6g1KXVFjb8mCWoxgGMK97SL_CjmkTQGcgwup24JzVo_IGkxVNJQyUDhfndbHoxLZstyyfKlrGxtEq6lS8JMPOGXaUEQK3yn_wkykYX0yrtmgM99sTXzHcpK1916aI_ev7q5X2KUjdDofJfybNKEVcSE8n-hSP8b652vCeFdkY7P4cXN29; _ga_EJRWLKGJNW=GS1.1.1603541129.5.1.1603542988.47; _ga=GA1.2.87188470.1602703684");
            // The show date travels inside a small JSON body: {"url":"showdate=<param>"}.
            var result = await _http.Execute(o =>
            {
                o.Host    = new Uri("https://www.amccinemas.com/showtime/getallmovies");
                o.Method  = HttpMethod.Post;
                o.Headers = headers;
                o.Body    = @"{""url"":""showdate=" + param + @"""}";
            });

            if (result.IsSuccess)
            {
                var config = Configuration.Default;

                //Create a new context for evaluating webpages with the given config
                var context = BrowsingContext.New(config);

                //Just get the DOM representation
                var document = await context.OpenAsync(req => req.Content(result.Response));

                //Select all sessions of films
                // Exact class match: only elements whose class attribute is exactly "amc-time-list".
                var listofgroups = document.All.Where(m => m.ClassName == "amc-time-list");

                foreach (var item in listofgroups)
                {
                    string cinemaName = item.GetAttribute("data-cinemaname");
                    string movieName  = item.GetAttribute("data-moviename");
                    // The group's inner HTML is parsed again as its own document before the time spans are looked up.
                    var    childdoc   = await context.OpenAsync(req => req.Content(item.InnerHtml));

                    var            listofsessions = childdoc.QuerySelectorAll("span").Where(m => m.ClassName == "amc-time");
                    List <Session> sessions       = new List <Session>();
                    foreach (var c_session in listofsessions)
                    {
                        Session  session     = new Session();
                        DateTime sessionDate = new DateTime(date.Year, date.Month, date.Day);
                        string   time        = c_session.InnerHtml;
                        // Drop the first and the last character of the span text (presumably surrounding
                        // whitespace — TODO confirm), then prefix a dummy date so that ParseExact can
                        // read the "h:mm tt" time of day; only its hour and minute are used.
                        time = time.Remove(0, 1);
                        time = time.Remove(time.Length - 1, 1);
                        time = "10/10/2000 " + time;
                        DateTime parsedTime = DateTime.ParseExact(time, "M/d/yyyy h:mm tt", CultureInfo.InvariantCulture);
                        sessionDate      = sessionDate.AddHours(parsedTime.Hour);
                        sessionDate      = sessionDate.AddMinutes(parsedTime.Minute);
                        session.ShowTime = sessionDate;
                        // First() throws InvalidOperationException when no cinema carries this name.
                        session.CinemaId = cinemas.First(x => x.Name == cinemaName).ExternalId;
                        sessions.Add(session);
                    }

                    // First() throws InvalidOperationException when no movie carries this title.
                    movies.First(x => x.Title == movieName).Sessions.AddRange(sessions);
                }
                return(new ParseResult <Data>(Result.Success)
                {
                    Data = new Data()
                    {
                        Movies = movies
                    }
                });
            }

            return(new ParseResult <Parse.Data>(Result.Error));
        }
        /// <summary>
        /// Validates all images if they are rotated correctly (when <paramref name="rotate"/> is set
        /// to <c>true</c>) and fit on the given <paramref name="pageSettings"/>.
        /// If an image does need to be rotated or does not fit then a local copy is made of
        /// the <paramref name="inputUri"/> file.
        /// </summary>
        /// <param name="inputUri">The uri of the webpage</param>
        /// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
        /// <param name="rotate">When set to <c>true</c> then the EXIF information of an
        ///     image is read and when needed the image is automatically rotated</param>
        /// <param name="sanitizeHtml">When set to <c>true</c> then the HTML will get sanitized</param>
        /// <param name="pageSettings"><see cref="PageSettings"/></param>
        /// <param name="outputUri">The uri of the rewritten local copy when this method returns
        ///     <c>false</c>, otherwise <c>null</c></param>
        /// <returns>Returns <c>false</c> when the HTML had to be changed (sanitized, or an image was
        ///     rotated or resized) and a new file was written; otherwise <c>true</c>, which includes the
        ///     cases where the page could not be parsed or the new file could not be written</returns>
        /// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
        public bool Validate(ConvertUri inputUri,
                             bool resize,
                             bool rotate,
                             bool sanitizeHtml,
                             PageSettings pageSettings,
                             out ConvertUri outputUri)
        {
            outputUri = null;

            string localDirectory = null;

            if (inputUri.IsFile)
            {
                localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
            }

            using (var webpage = inputUri.IsFile
                ? File.OpenRead(inputUri.OriginalString)
                : DownloadStream(inputUri))
            {
                // Printable area of the page, multiplied by 96 (presumably inches to pixels at 96 DPI — TODO confirm).
                var maxWidth  = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * 96.0;
                var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * 96.0;

                var htmlChanged = false;
                var config      = Configuration.Default.WithCss();
                var context     = BrowsingContext.New(config);

                IDocument document;

                try
                {
                    // When the caller supplied an encoding it is passed on through a Content-Type header.
                    // ReSharper disable AccessToDisposedClosure
                    document = inputUri.Encoding != null
                        ? context.OpenAsync(m =>
                                            m.Content(webpage).Header("Content-Type",
                                                                      $"text/html; charset={inputUri.Encoding.WebName}"))
                               .Result
                        : context.OpenAsync(m => m.Content(webpage)).Result;

                    // ReSharper restore AccessToDisposedClosure
                }
                catch (Exception exception)
                {
                    // Best effort: a page that cannot be parsed is reported as "nothing to change".
                    WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
                    return true;
                }

                if (sanitizeHtml)
                {
                    WriteToLog("Sanitizing HTML");
                    new HtmlSanitizer().DoSanitize(document as IHtmlDocument, document.DocumentElement);
                    htmlChanged = true;
                    WriteToLog("HTML sanitized");
                }

                WriteToLog("Validating all images if they need to be rotated and if they fit the page");

                // Images left as they are; they get a local copy later if the HTML is rewritten.
                var unchangedImages = new List <IHtmlImageElement>();

                // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
                foreach (var htmlImage in document.Images)
                {
                    var imageChanged = false;

                    if (string.IsNullOrWhiteSpace(htmlImage.Source))
                    {
                        WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                        continue;
                    }

                    Image image  = null;

                    // Strip the query string before deriving the file extension.
                    var   source = htmlImage.Source.Contains("?")
                        ? htmlImage.Source.Split('?')[0]
                        : htmlImage.Source;

                    var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));

                    var fileName = GetTempFile(extension);

                    try
                    {
                        // The local width and height attributes always go before css width and height
                        var width  = htmlImage.DisplayWidth;
                        var height = htmlImage.DisplayHeight;

                        if (rotate)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);

                            if (image == null)
                            {
                                continue;
                            }

                            if (RotateImageByExifOrientationData(image))
                            {
                                WriteToLog($"Image rotated and saved to location '{fileName}'");
                                image.Save(fileName);

                                // Point the tag at the rotated copy and drop inline styling.
                                htmlImage.DisplayWidth  = image.Width;
                                htmlImage.DisplayHeight = image.Height;
                                htmlImage.SetStyle(string.Empty);
                                htmlImage.Source = new Uri(fileName).ToString();
                                htmlChanged      = true;
                                imageChanged     = true;
                            }

                            width  = image.Width;
                            height = image.Height;
                        }

                        if (resize)
                        {
                            // No size attributes: fall back to the computed CSS size.
                            if (height == 0 && width == 0)
                            {
                                var style = context.Current.GetComputedStyle(htmlImage);
                                if (style != null)
                                {
                                    width  = ParseValue(style.GetPropertyValue("width"));
                                    height = ParseValue(style.GetPropertyValue("height"));
                                }
                            }

                            // If we don't know the image size then get it from the image itself
                            if (width <= 0 || height <= 0)
                            {
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                width  = image.Width;
                                height = image.Height;
                            }

                            if (width > maxWidth || height > maxHeight)
                            {
                                // If we did not load the image already then load it
                                if (image == null)
                                {
                                    image = GetImage(htmlImage.Source, localDirectory);
                                }

                                if (image == null)
                                {
                                    continue;
                                }

                                // NOTE(review): scaling is driven by maxWidth only, also when just the
                                // height exceeded the page, and the pre-scale image is not disposed
                                // here — confirm what ScaleImage guarantees.
                                image = ScaleImage(image, (int)maxWidth);
                                WriteToLog(
                                    $"Image resized to width {image.Width} and height {image.Height} and saved to location '{fileName}'");
                                image.Save(fileName);
                                htmlImage.DisplayWidth  = image.Width;
                                htmlImage.DisplayHeight = image.Height;
                                htmlImage.SetStyle(string.Empty);
                                htmlImage.Source = new Uri(fileName).ToString();
                                htmlChanged      = true;
                                imageChanged     = true;
                            }
                        }
                    }
                    finally
                    {
                        // Runs for the early "continue" exits as well.
                        image?.Dispose();
                    }

                    if (!imageChanged)
                    {
                        unchangedImages.Add(htmlImage);
                    }
                }

                if (!htmlChanged)
                {
                    return true;
                }

                // The HTML is going to be written to a new file, so every untouched image is also
                // saved to a temp file and its source is pointed at that copy.
                foreach (var unchangedImage in unchangedImages)
                {
                    using (var image = GetImage(unchangedImage.Source, localDirectory))
                    {
                        if (image == null)
                        {
                            WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                            continue;
                        }

                        var extension = Path.GetExtension(unchangedImage.Source.Contains("?")
                            ? unchangedImage.Source.Split('?')[0]
                            : unchangedImage.Source);
                        var fileName = GetTempFile(extension);

                        WriteToLog($"Unchanged image saved to location '{fileName}'");
                        image.Save(fileName);
                        unchangedImage.Source = new Uri(fileName).ToString();
                    }
                }

                var outputFile = GetTempFile(".htm");
                var rewrittenUri = new ConvertUri(outputFile, inputUri.Encoding);

                try
                {
                    using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
                    {
                        if (inputUri.Encoding != null)
                        {
                            using (var textWriter = new StreamWriter(fileStream, inputUri.Encoding))
                            {
                                document.ToHtml(textWriter, new HtmlMarkupFormatter());
                            }
                        }
                        else
                        {
                            using (var textWriter = new StreamWriter(fileStream))
                            {
                                document.ToHtml(textWriter, new HtmlMarkupFormatter());
                            }
                        }
                    }

                    // Only hand out the new uri once the file has really been written, so that a
                    // "true" result never comes with a non-null outputUri.
                    outputUri = rewrittenUri;
                    return false;
                }
                catch (Exception exception)
                {
                    WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");
                    return true;
                }
            }
        }
Exemple #21
0
    /// <summary>
    /// Runs the solver and, when it reports no errors and an unanswered part remains, posts the
    /// next answer to the site, prints the verdict and stages/commits the work in the local git
    /// repository depending on that verdict.
    /// </summary>
    /// <param name="solver">Solver that supplies the year, the day and the computed answers.</param>
    /// <exception cref="HttpRequestException">The answer request did not return a success status.</exception>
    /// <exception cref="InvalidOperationException">The response page contains no answer article.</exception>
    public async Task Upload(Solver solver)
    {
        var color = Console.ForegroundColor;

        // Prints one line in the given color and switches back to the color captured above.
        void WriteColored(ConsoleColor foreground, string text)
        {
            Console.ForegroundColor = foreground;
            Console.WriteLine(text);
            Console.ForegroundColor = color;
        }

        Console.WriteLine();
        var solverResult = Runner.RunSolver(solver);

        Console.WriteLine();

        if (solverResult.errors.Any())
        {
            WriteColored(ConsoleColor.Red, "Uhh-ohh the solution doesn't pass the tests...");
            Console.WriteLine();
            return;
        }

        var problem = await DownloadProblem(GetContext(), GetBaseAddress(), solver.Year(), solver.Day());

        if (problem.Answers.Length == 2)
        {
            Console.WriteLine("Both parts of this puzzle are complete!");
            Console.WriteLine();
            return;
        }

        if (solverResult.answers.Length <= problem.Answers.Length)
        {
            // The solver has no answer yet for the next open part.
            Console.WriteLine($"You need to work on part {problem.Answers.Length + 1}");
            Console.WriteLine();
            return;
        }

        var level  = problem.Answers.Length + 1;
        var answer = solverResult.answers[problem.Answers.Length];
        Console.WriteLine($"Uploading answer ({answer}) for part {level}...");

        // https://adventofcode.com/{year}/day/{day}/answer
        // level={part}&answer={answer}

        var cookieContainer = new CookieContainer();
        using var handler = new HttpClientHandler()
        {
            CookieContainer = cookieContainer
        };
        using var client = new HttpClient(handler)
        {
            BaseAddress = GetBaseAddress()
        };

        // Request body and response are disposable too; release them when the method ends.
        using var content = new FormUrlEncodedContent(new[] {
            new KeyValuePair <string, string>("level", level.ToString()),
            new KeyValuePair <string, string>("answer", answer),
        });

        cookieContainer.Add(GetBaseAddress(), new Cookie("session", GetSession()));
        using var result = await client.PostAsync($"/{solver.Year()}/day/{solver.Day()}/answer", content);

        result.EnsureSuccessStatusCode();
        var responseString = await result.Content.ReadAsStringAsync();

        var config   = Configuration.Default;
        var context  = BrowsingContext.New(config);
        var document = await context.OpenAsync(req => req.Content(responseString));

        var articleElement = document.Body.QuerySelector("body > main > article");
        if (articleElement == null)
        {
            // Fail with a readable message instead of a bare NullReferenceException.
            throw new InvalidOperationException("The answer response does not contain the expected 'body > main > article' element.");
        }

        // Cut the trailing navigation/hint text and turn double spaces into line breaks.
        var article = articleElement.TextContent;
        article = Regex.Replace(article, @"\[Continue to Part Two.*", "", RegexOptions.Singleline);
        article = Regex.Replace(article, @"You have completed Day.*", "", RegexOptions.Singleline);
        article = Regex.Replace(article, @"\(You guessed.*", "", RegexOptions.Singleline);
        article = Regex.Replace(article, @"  ", "\n", RegexOptions.Singleline);

        using (var repo = new Git.Repository(".git"))
        {
            // Ordinal comparisons: these are fixed protocol texts, not linguistic matches.
            if (article.StartsWith("That's the right answer", StringComparison.Ordinal) || article.Contains("You've finished every puzzle"))
            {
                Git.Commands.Stage(repo, "*");

                WriteColored(ConsoleColor.Green, article);
                Console.WriteLine();
                await Update(solver.Year(), solver.Day());

                var signature = new Git.Signature(repo.Config.Get <string>("user.name").Value, repo.Config.Get <string>("user.email").Value, DateTime.Now);
                if (article.StartsWith('T'))
                {
                    // The article starts with 'T' (the "That's the right answer" text): two commits.
                    Git.Commands.Stage(repo, "**/input.refout");
                    repo.Commit("Solved P1", signature, signature, new());
                    Git.Commands.Stage(repo, "*");
                    repo.Commit("P2", signature, signature, new());
                }
                else
                {
                    Git.Commands.Stage(repo, "*");
                    repo.Commit("Solved P2", signature, signature, new());
                }
            }
            else if (article.StartsWith("That's not the right answer", StringComparison.Ordinal))
            {
                // Stages the work but does not commit it.
                Git.Commands.Stage(repo, "*");

                WriteColored(ConsoleColor.Red, article);
                Console.WriteLine();
            }
            else if (article.StartsWith("You gave an answer too recently", StringComparison.Ordinal))
            {
                WriteColored(ConsoleColor.Red, article);
                Console.WriteLine();
            }
            else
            {
                // Unrecognized verdict: show it as-is.
                WriteColored(ConsoleColor.White, article);
            }
        }
    }
Exemple #22
0
        /// <summary>
        /// Loads a page whose linked sheet and inline style both use <c>@import</c> and checks that
        /// every imported sheet, including the ones imported by an imported sheet, is reachable
        /// through the import rules with the expected address and rule count.
        /// </summary>
        public async Task CssWithImportRuleShouldBeAbleToHandleNestedStylesheets()
        {
            // Virtual site served by the test requester.
            var pages = new Dictionary <String, String>
            {
                ["index.html"]  = "<!doctype html><html><link rel=stylesheet href=origin.css type=text/css><style>@import url('linked2.css');</style>",
                ["origin.css"]  = "@import url(linked1.css);",
                ["linked1.css"] = "",
                ["linked2.css"] = "@import url(\"linked3.css\"); @import 'linked4.css';",
                ["linked3.css"] = "",
                ["linked4.css"] = "",
            };
            var loaderOptions = new LoaderOptions
            {
                IsResourceLoadingEnabled = true
            };
            var setup = Configuration.Default
                        .With(new TestServerRequester(pages))
                        .WithDefaultLoader(loaderOptions)
                        .WithCss();
            var browsing = BrowsingContext.New(setup);
            var page     = await browsing.OpenAsync("http://localhost/index.html");

            var linkElement  = page.QuerySelector <IHtmlLinkElement>("link");
            var styleElement = page.QuerySelector <IHtmlStyleElement>("style");

            // Presumably leaves time for the nested imports to finish loading — TODO confirm.
            await Task.Delay(100);

            Assert.IsNotNull(linkElement);
            Assert.IsNotNull(styleElement);

            // <link> -> origin.css -> linked1.css
            var originSheet = linkElement.Sheet as ICssStyleSheet;

            Assert.IsNotNull(originSheet);
            Assert.AreEqual("http://localhost/origin.css", originSheet.Href);
            Assert.AreEqual(1, originSheet.Rules.Length);
            Assert.AreEqual(CssRuleType.Import, originSheet.Rules[0].Type);

            var originImport = originSheet.Rules[0] as ICssImportRule;
            var firstSheet   = originImport.Sheet;

            Assert.IsNotNull(firstSheet);
            Assert.AreEqual("http://localhost/linked1.css", firstSheet.Href);
            Assert.AreEqual(0, firstSheet.Rules.Length);

            // <style> -> linked2.css -> linked3.css + linked4.css
            var inlineSheet = styleElement.Sheet as ICssStyleSheet;

            Assert.IsNotNull(inlineSheet);
            Assert.IsNull(inlineSheet.Href);
            Assert.AreEqual(1, inlineSheet.Rules.Length);
            Assert.AreEqual(CssRuleType.Import, inlineSheet.Rules[0].Type);

            var inlineImport = inlineSheet.Rules[0] as ICssImportRule;
            var secondSheet  = inlineImport.Sheet;

            Assert.IsNotNull(secondSheet);
            Assert.AreEqual("http://localhost/linked2.css", secondSheet.Href);
            Assert.AreEqual(2, secondSheet.Rules.Length);
            Assert.AreEqual(CssRuleType.Import, secondSheet.Rules[0].Type);
            Assert.AreEqual(CssRuleType.Import, secondSheet.Rules[1].Type);

            var thirdImport = secondSheet.Rules[0] as ICssImportRule;
            var thirdSheet  = thirdImport.Sheet;

            Assert.IsNotNull(thirdSheet);
            Assert.AreEqual("http://localhost/linked3.css", thirdSheet.Href);
            Assert.AreEqual(0, thirdSheet.Rules.Length);

            var fourthImport = secondSheet.Rules[1] as ICssImportRule;
            var fourthSheet  = fourthImport.Sheet;

            Assert.IsNotNull(fourthSheet);
            Assert.AreEqual("http://localhost/linked4.css", fourthSheet.Href);
            Assert.AreEqual(0, fourthSheet.Rules.Length);
        }
        /// <summary>
        /// Fetches the details of a novel from novelupdates.com, answering from the cache
        /// when an entry for the computed link is already stored.
        /// </summary>
        /// <param name="query">
        /// Series name. Spaces are turned into dashes and slashes into spaces before the
        /// value is escaped into the series URL.
        /// </param>
        /// <returns>
        /// The novel data, or <c>null</c> when the cover image element is not found or when
        /// any exception is thrown while loading/parsing (the exception is logged).
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="query"/> is null, empty or whitespace.
        /// </exception>
        public async Task <NovelResult> GetNovelData(string query)
        {
            if (string.IsNullOrWhiteSpace(query))
            {
                throw new ArgumentNullException(nameof(query));
            }

            var slug = query.Replace(" ", "-", StringComparison.InvariantCulture);
            try
            {
                var escapedSlug = Uri.EscapeDataString(slug.Replace("/", " ", StringComparison.InvariantCulture));
                var link = ("http://www.novelupdates.com/series/" + escapedSlug).ToLowerInvariant();

                var (isCached, cachedJson) = await _cache.TryGetNovelDataAsync(link).ConfigureAwait(false);
                if (isCached)
                {
                    // Cache hit: the stored JSON is all we need.
                    return JsonConvert.DeserializeObject<NovelResult>(cachedJson);
                }

                var config = Configuration.Default.WithDefaultLoader();
                using (var page = await BrowsingContext.New(config).OpenAsync(link).ConfigureAwait(false))
                {
                    // Renders every anchor that is a direct child of the selected container
                    // as a markdown link.
                    string[] MarkdownLinks(string containerSelector)
                    {
                        return page.QuerySelector(containerSelector)
                               .Children
                               .OfType<IHtmlAnchorElement>()
                               .Select(anchor => $"[{anchor.InnerHtml}]({anchor.Href})")
                               .ToArray();
                    }

                    var cover = page.QuerySelector("div.seriesimg > img");
                    if (cover == null)
                    {
                        // No cover image: treated as "series page not found".
                        return null;
                    }

                    var coverUrl    = ((IHtmlImageElement)cover).Source;
                    var description = page.QuerySelector("div#editdescription > p").InnerHtml;
                    var genreLinks  = MarkdownLinks("div#seriesgenre");
                    var authorLinks = MarkdownLinks("div#showauthors");

                    var scoreSpan = (IHtmlSpanElement)page.QuerySelector("h5.seriesother > span.uvotes");
                    var rating    = scoreSpan.InnerHtml;

                    var progress = page.QuerySelector("div#editstatus").InnerHtml;
                    var heading  = page.QuerySelector("div.w-blog-content > div.seriestitlenu").InnerHtml;

                    var novel = new NovelResult()
                    {
                        Description = description,
                        Authors     = authorLinks,
                        Genres      = genreLinks,
                        ImageUrl    = coverUrl,
                        Link        = link,
                        Score       = rating,
                        Status      = progress,
                        Title       = heading,
                    };

                    var serialized = JsonConvert.SerializeObject(novel);
                    await _cache.SetNovelDataAsync(link, serialized).ConfigureAwait(false);

                    return novel;
                }
            }
            catch (Exception ex)
            {
                // Any failure (network, missing elements, serialization) is logged and
                // reported to the caller as "no result".
                _log.Error(ex);
                return null;
            }
        }
Exemple #24
0
            /// <summary>
            /// Loads the MyAnimeList profile page of <paramref name="name"/>, scrapes the
            /// statistics, profile info and favourite anime from it, and posts the result
            /// as an embed to the current channel.
            /// </summary>
            /// <param name="name">
            /// MyAnimeList user name; the command silently does nothing when it is blank.
            /// </param>
            public async Task Mal([Remainder] string name)
            {
                if (string.IsNullOrWhiteSpace(name))
                {
                    return;
                }

                var fullQueryLink = "https://myanimelist.net/profile/" + name;

                var config = Configuration.Default.WithDefaultLoader();

                // The parsed document is disposable; release it once the embed has been sent.
                using (var document = await BrowsingContext.New(config).OpenAsync(fullQueryLink).ConfigureAwait(false))
                {
                    var imageElem = document.QuerySelector("body > div#myanimelist > div.wrapper > div#contentWrapper > div#content > div.content-container > div.container-left > div.user-profile > div.user-image > img");
                    // Fall back to a placeholder picture when the profile has no image.
                    var imageUrl  = ((IHtmlImageElement)imageElem)?.Source ?? "http://icecream.me/uploads/870b03f36b59cc16ebfe314ef2dde781.png";

                    var stats = document.QuerySelectorAll("body > div#myanimelist > div.wrapper > div#contentWrapper > div#content > div.content-container > div.container-right > div#statistics > div.user-statistics-stats > div.stats > div.clearfix > ul.stats-status > li > span").Select(x => x.InnerHtml).ToList();

                    var favorites = document.QuerySelectorAll("div.user-favorites > div.di-tc");

                    var favAnime = GetText("anime_no_fav");

                    // Only build the list when a favourites column exists and it does not
                    // contain a <p> element; otherwise keep the "no favourites" text.
                    if (favorites.Length > 0 && favorites[0].QuerySelector("p") == null)
                    {
                        favAnime = string.Join("\n", favorites[0].QuerySelectorAll("ul > li > div.di-tc.va-t > a")
                                               .Shuffle()
                                               .Take(3)
                                               .Select(x =>
                        {
                            var elem = (IHtmlAnchorElement)x;
                            return($"[{elem.InnerHtml}]({elem.Href})");
                        }));
                    }

                    // (label, value) pairs taken from the first two children of each row.
                    var info = document.QuerySelectorAll("ul.user-status:nth-child(3) > li.clearfix")
                               .Select(x => Tuple.Create(x.Children[0].InnerHtml, x.Children[1].InnerHtml))
                               .ToList();

                    // Each entry is the row text split on ':' and trimmed; index 0 is used as
                    // the field name and index 1 as the field value below.
                    var daysAndMean = document.QuerySelectorAll("div.anime:nth-child(1) > div:nth-child(2) > div")
                                      .Select(x => x.TextContent.Split(':').Select(y => y.Trim()).ToArray())
                                      .ToArray();

                    var embed = new EmbedBuilder()
                                .WithOkColor()
                                .WithTitle(GetText("mal_profile", name))
                                .AddField(efb => efb.WithName("💚 " + GetText("watching")).WithValue(stats[0]).WithIsInline(true))
                                .AddField(efb => efb.WithName("💙 " + GetText("completed")).WithValue(stats[1]).WithIsInline(true));

                    if (info.Count < 3)
                    {
                        embed.AddField(efb => efb.WithName("💛 " + GetText("on_hold")).WithValue(stats[2]).WithIsInline(true));
                    }
                    embed
                    .AddField(efb => efb.WithName("💔 " + GetText("dropped")).WithValue(stats[3]).WithIsInline(true))
                    .AddField(efb => efb.WithName("⚪ " + GetText("plan_to_watch")).WithValue(stats[4]).WithIsInline(true))
                    .AddField(efb => efb.WithName("🕐 " + daysAndMean[0][0]).WithValue(daysAndMean[0][1]).WithIsInline(true))
                    .AddField(efb => efb.WithName("📊 " + daysAndMean[1][0]).WithValue(daysAndMean[1][1]).WithIsInline(true))
                    .AddField(efb => efb.WithName(MalInfoToEmoji(info[0].Item1) + " " + info[0].Item1).WithValue(info[0].Item2.TrimTo(20)).WithIsInline(true))
                    .AddField(efb => efb.WithName(MalInfoToEmoji(info[1].Item1) + " " + info[1].Item1).WithValue(info[1].Item2.TrimTo(20)).WithIsInline(true));
                    if (info.Count > 2)
                    {
                        embed.AddField(efb => efb.WithName(MalInfoToEmoji(info[2].Item1) + " " + info[2].Item1).WithValue(info[2].Item2.TrimTo(20)).WithIsInline(true));
                    }
                    //if(info.Count > 3)
                    //    embed.AddField(efb => efb.WithName(MalInfoToEmoji(info[3].Item1) + " " + info[3].Item1).WithValue(info[3].Item2).WithIsInline(true))

                    // The description is a verbatim string: its continuation lines must stay
                    // at column 0 so no indentation leaks into the message.
                    embed
                    .WithDescription($@"
** https://myanimelist.net/animelist/{ name } **

**{GetText("top_3_fav_anime")}**
{favAnime}"

                                     //**[Manga List](https://myanimelist.net/mangalist/{name})**
                                     //💚`Reading:` {stats[5]}
                                     //💙`Completed:` {stats[6]}
                                     //💔`Dropped:` {stats[8]}
                                     //⚪`Plan to read:` {stats[9]}

                                     //**Top 3 Favorite Manga:**
                                     //{favManga}"

                                     )
                    .WithUrl(fullQueryLink)
                    .WithImageUrl(imageUrl);

                    await Context.Channel.EmbedAsync(embed).ConfigureAwait(false);
                }
            }
Exemple #25
0
        /// <summary>
        /// Application entry point: wires the crawler's services into the host builder,
        /// starts the host, seeds the waiting queue with one page and stops the host once
        /// the user presses enter.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        static async Task Main(string[] args)
        {
            ThreadPool.SetMaxThreads(100, Int32.MaxValue);
            // Create the token source.
            var cts = new CancellationTokenSource();

            await using var crawlerHost = new CrawlerHost();
            var hostBuilder = CrawlerHost.CreateHostBuilder(args, (hostContext, services) =>
            {
                //Add more services to the Dependency Injection here

                //1. Scheduler
                // services.AddTransient<ISchedulerService, DefaultScheduler>();
                // services.AddHostedService(serviceProvider => serviceProvider.GetService<ISchedulerService>());
                services.AddTransient<IScheduler, CoreScheduler>();
                services.AddHostedService(serviceProvider => serviceProvider.GetService<IScheduler>());

                services.AddSingleton<ObservableConcurrentQueue<WaitingPageModel>>();
                // services.AddSingleton<AsyncBulkheadPolicy>(serviceProvider =>
                //     Policy.BulkheadAsync(Convert.ToInt32(hostContext.Configuration.GetSection("AppConfig")["MaxThreads"]),
                //         Int32.MaxValue));
                services.AddTransient<IUriBucket<WaitingPage>, DefaultWaitingUriBucket>();
                services.AddDbContextPool<UriDBContext>(options =>
                {
                    options.UseSqlite(@"Data Source=./1_Scheduler/DB/UriDB.db;");
                }, 16);

                //2. Crawler
                services.AddSingleton<IBrowsingContext>(serviceProvider =>
                {
                    //Use the default configuration for AngleSharp
                    var config = Configuration.Default
                                 .WithRequesters()     // from AngleSharp.Io
                                 .WithDefaultLoader(); // from AngleSharp

                    //Create a new context for evaluating webpages with the given config
                    return BrowsingContext.New(config);
                });

                //3. Downloader
                services.AddHttpClient();
                services.AddTransient<IDownloader, Downloader>();

                //4. UriPolicies

                //5. Content Extractor
                services.AddTransient<IContentExtractor, ContentExtractor>();

                //6. Storage
                services.AddDbContextPool<ContentContext>((serviceProvider, options) =>
                {
                    options.UseSqlite(hostContext.Configuration.GetSection("AppConfig")["ConnectionString"]);
                    // options.UseSqlite(@"Data Source=./4_Storage/PageModels/DB/content.db;");
                }, 32);
            });

            try
            {
                Console.WriteLine("Application Starting!");

                using var host = crawlerHost.RunCrawlerEngine(hostBuilder, cts.Token);

                await host.StartAsync(cts.Token);

                Console.WriteLine("Application Started! Press <enter> to stop.");

                // Seed the crawl once the host (and therefore the scheduler) is running.
                var waitingQueue = host.Services.GetRequiredService<ObservableConcurrentQueue<WaitingPageModel>>();
                waitingQueue.Enqueue(new WaitingPageModel {
                    Uri = new Uri("https://www.webtoons.com"), Verb = "GET"
                });

                Console.ReadLine();

                // cts.Cancel();

                // Announce the shutdown before it begins, not after it has finished.
                Console.WriteLine("Application Stopping!");

                await host.StopAsync(cts.Token);

                Console.WriteLine("Main thread Stopped!");
            }
            catch (Exception e)
            {
                var trace = new StackTrace(e, true);
                // GetFrame returns null when the trace has no frames; report line 0 in that
                // case instead of throwing from inside the handler and hiding the original
                // exception.
                var line  = trace.GetFrame(trace.FrameCount - 1)?.GetFileLineNumber() ?? 0;
                Console.WriteLine("Exception at line " + line);
                Console.WriteLine(e);
            }
            finally
            {
                cts.Dispose();
            }
        }
Exemple #26
0
        /// <summary>
        /// Posts an image matching the search terms to the current channel. The Google
        /// image service is tried first; if anything in that path throws, the first hit
        /// of an Imgur search is used instead.
        /// </summary>
        /// <param name="terms">Search terms; the command does nothing when they are blank.</param>
        public async Task Image([Remainder] string terms = null)
        {
            var query = terms?.Trim();
            if (string.IsNullOrWhiteSpace(query))
            {
                return;
            }

            var encodedQuery = WebUtility.UrlEncode(query).Replace(' ', '+');

            // Both code paths produce the same embed layout; only the search link, the
            // author icon and the image link differ.
            EmbedBuilder BuildEmbed(string searchUrl, string iconUrl, string imageLink)
            {
                return new EmbedBuilder()
                       .WithOkColor()
                       .WithAuthor(eab => eab.WithName(GetText("image_search_for") + " " + query.TrimTo(50))
                                   .WithUrl(searchUrl)
                                   .WithIconUrl(iconUrl))
                       .WithDescription(imageLink)
                       .WithImageUrl(imageLink)
                       .WithTitle(Context.User.ToString());
            }

            try
            {
                var googleHit = await _google.GetImageAsync(query).ConfigureAwait(false);

                var googleEmbed = BuildEmbed(
                    "https://www.google.rs/search?q=" + encodedQuery + "&source=lnms&tbm=isch",
                    "http://i.imgur.com/G46fm8J.png",
                    googleHit.Link);
                await Context.Channel.EmbedAsync(googleEmbed).ConfigureAwait(false);
            }
            catch
            {
                // Best effort: any failure of the Google path switches to Imgur scraping.
                _log.Warn("Falling back to Imgur search.");

                var imgurSearchUrl = $"http://imgur.com/search?q={encodedQuery}";
                var config         = Configuration.Default.WithDefaultLoader();
                using (var page = await BrowsingContext.New(config).OpenAsync(imgurSearchUrl).ConfigureAwait(false))
                {
                    // First child of the first result link, if it is an image element.
                    // An empty result list yields null here and is handled by the guard below.
                    var thumbnail = page.QuerySelectorAll("a.image-list-link")
                                    .FirstOrDefault()
                                    ?.Children
                                    ?.FirstOrDefault() as IHtmlImageElement;

                    if (thumbnail?.Source == null)
                    {
                        return;
                    }

                    var imageLink = thumbnail.Source.Replace("b.", ".", StringComparison.InvariantCulture);

                    var imgurEmbed = BuildEmbed(
                        imgurSearchUrl,
                        "http://s.imgur.com/images/logo-1200-630.jpg?",
                        imageLink);
                    await Context.Channel.EmbedAsync(imgurEmbed).ConfigureAwait(false);
                }
            }
        }
Exemple #27
0
 /// <summary>
 /// Opens a new document for <paramref name="url"/> in a freshly created browsing
 /// context and waits synchronously for it.
 /// </summary>
 /// <param name="url">Address passed to <c>OpenNewAsync</c>.</param>
 /// <returns>The document produced by the pending open operation.</returns>
 static IDocument CreateEmpty(String url)
 {
     var context = BrowsingContext.New();
     var pending = context.OpenNewAsync(url);

     // Blocks the calling thread until the task has completed.
     return pending.Result;
 }
        // Called by the runtime to register the application's services with the container.
        // Background on application configuration: https://go.microsoft.com/fwlink/?LinkID=398940
        public void ConfigureServices(IServiceCollection services)
        {
            // --- Web stack ---
            services.AddControllersWithViews();
            services.AddRazorPages();
            services.AddSignalR();

            // --- Persistence ---
            // TODO: Change the ServiceLifetime. The usage of ServiceLifetime.Transient is because multiple threads operations are running in the same dbcontext.
            services.AddDbContext<DbContext, ApplicationDbContext>(ServiceLifetime.Transient);
            services.AddOrcEntityFrameworkCore();
            services.AddDatabaseSeeder<ApplicationDbSeeder>();

            // --- Telegram notifications (registered only when a real token is configured) ---
            var token = this.Configuration.GetSection("TelegramBot")?["Token"];

            if (string.IsNullOrWhiteSpace(token))
            {
                Log.Warning(
                    "Telegram notification is disable. To enable it, add a TelegramBot section with a key Token.");
            }
            else if (token == "%TELEGRAM_BOT_TOKEN%")
            {
                Log.Warning(
                    "Telegram notification is disable. Replace %TELEGRAM_BOT_TOKEN% placeholder in the configuration file with a valid bot token.");
            }
            else
            {
                Log.Information("Telegram notification is enable.");

                services.AddTransient<ITelegramBotClient>(sp => new TelegramBotClient(token));
                services.AddSingleton<ITelegramCommander, TelegramCommander>();
            }

            // --- Scraping infrastructure ---
            services.AddTransient(sp => new CookieContainer());

            services.AddTransient(sp => BrowsingContext.New(AngleSharp.Configuration.Default));

            // Each resolution builds a fresh HttpClient with its own handler.
            services.AddTransient(
                sp =>
                {
                    var cookies = sp.GetService<CookieContainer>();

                    var handler = new HttpClientHandler();
                    handler.AutomaticDecompression = DecompressionMethods.GZip
                                                     | DecompressionMethods.Deflate
                                                     | DecompressionMethods.Brotli;
                    handler.AllowAutoRedirect = true;

                    if (cookies != null)
                    {
                        handler.CookieContainer = cookies;

                        // TODO: Review how to avoid the call to .GetAwaiter().GetResult()
                        var storedCookies = CookiesHelper.GetCollectitonAsync().GetAwaiter().GetResult();
                        if (storedCookies.Count > 0)
                        {
                            handler.CookieContainer.Add(new Uri("https://www.tuenvio.cu"), storedCookies);
                        }
                    }

                    var client = new HttpClient(handler);
                    client.Timeout = ScrappingConfiguration.HttpClientTimeout;

                    var headers = client.DefaultRequestHeaders;
                    headers.TryAddWithoutValidation("user-agent", ScrappingConfiguration.Agent);
                    headers.TryAddWithoutValidation("accept-encoding", "gzip, deflate, br");
                    headers.CacheControl = new CacheControlHeaderValue { NoCache = true };

                    return client;
                });

            // services.AddHttpClient(
            // "json",
            // (sp, httpClient) =>
            // {
            // httpClient.Timeout = ScrappingConfiguration.HttpClientTimeout;
            // httpClient.DefaultRequestHeaders.CacheControl =
            // new CacheControlHeaderValue { NoCache = true };
            // httpClient.DefaultRequestHeaders.TryAddWithoutValidation(
            // "user-agent",
            // "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36");
            // });
            services.AddScoped<IStoreService, StoreService>();

            // --- Caches (constructed with storeNullValues: true) ---
            services.AddSingleton<ICacheStorage<string, Product>>(
                provider => new CacheStorage<string, Product>(storeNullValues: true));
            services.AddSingleton<ICacheStorage<string, Department>>(
                provider => new CacheStorage<string, Department>(storeNullValues: true));
            services.AddSingleton<ICacheStorage<string, Store>>(
                provider => new CacheStorage<string, Store>(storeNullValues: true));

            // --- Scrappers ---
            services.AddTransient<IEntityScrapper<Product>, ProductScrapper>();
            services.AddTransient<IEntityScrapper<Department>, DepartmentScrapper>();
            services.AddTransient<IEntityScrapper<Store>, StoreScrapper>();

            services.AddTransient<IMultiEntityScrapper<Product>, InspectDepartmentProductsScrapper>();
            services.AddTransient<IMultiEntityScrapper<Department>, InspectStoreDepartmentsScrapper>();

            // --- Background services ---
            services.AddSingleton<ImportStoresHostedService>();

            services.AddHostedService<DepartmentMonitorHostedService>();
            services.AddHostedService<ProductMonitorHostedService>();
            services.AddHostedService<StoreMonitorHostedService>();

            // services.AddHostedService<SyncUsersFromTelegramHostedService>();
        }
Exemple #29
0
        void MakeForDir(System.IO.DirectoryInfo dirinfo)
        {
            foreach (var file in dirinfo.GetFiles())
            {
                var FileName = file.Name.Replace(".", "_");
                FileName      = FileName.Replace("-", "_");
                CurrentBlock += "public class " + FileName;
                CurrentBlock += "{";
                if (file.Name.ToLower().EndsWith("css"))
                {
                    MakeDataField(file.FullName);
                }
                else if (file.Name.ToLower().EndsWith("js"))
                {
                    CurrentBlock.NewBlock(() =>
                    {
                        CurrentBlock += "public static readonly string TextContent = ((Func<string>)(() =>";
                        CurrentBlock += "{";
                        CurrentBlock.NewBlock(() =>
                        {
                            CurrentBlock += "byte[] Result = null;";
#if DEBUG
                            MakeBytesAsB64(Encoding.UTF8.GetBytes(System.IO.File.ReadAllText(file.FullName)), "Result");
#else
                            MakeBytesAsbyte(System.IO.File.ReadAllBytes(file.FullName), "Result");
#endif
                            CurrentBlock += "return System.Text.Encoding.UTF8.GetString(Result);";
                        });
                        CurrentBlock += "}))();\n";
                    });
                }
                else if (file.Name.ToLower().EndsWith("html") | file.Name.ToLower().EndsWith("htm"))
                {
                    //Use the default configuration for AngleSharp
                    var config = Configuration.Default;

                    //Create a new context for evaluating webpages with the given config
                    var context = BrowsingContext.New(config);

                    //Source to be parsed
                    var source = System.IO.File.ReadAllText(file.FullName);

                    //Create a virtual request to specify the document to load (here from our fixed string)
                    var web = context.OpenAsync(req => req.Content(source)).GetAwaiter().GetResult();

                    var ElementIds  = new string[0];
                    var ElementTags = new string[0];
                    foreach (var Element in web.All)
                    {
                        string Attribute = null;
                        if (Element.GetAttribute("src") != null)
                        {
                            Attribute = "src";
                        }
                        else if (Element.GetAttribute("href") != null & Element.TagName.ToLower() == "link")
                        {
                            Attribute = "href";
                        }
                        if (Attribute != null)
                        {
                            string Address = file.DirectoryName + "\\" + Element.GetAttribute(Attribute).Replace("/", "\\");
                            if (System.IO.File.Exists(Address))
                            {
                                Address = new System.IO.FileInfo(Address).FullName;
                                if (Address.StartsWith(BaseDir))
                                {
                                    Element.RemoveAttribute(Attribute);
                                    Element.SetAttribute("MNsrc",
                                                         "Monsajem_Incs.Resources" +
                                                         Address.Substring(BaseDir.Length).
                                                         Replace(".", "_").
                                                         Replace("-", "_").
                                                         Replace("\\", "."));
                                }
                            }
                        }
                    }
                    foreach (var Element in web.All)
                    {
                        if (Element.Id != null)
                        {
                            Array.Resize(ref ElementIds, ElementIds.Length + 1);
                            ElementIds[ElementIds.Length - 1] = Element.Id;
                            Array.Resize(ref ElementTags, ElementTags.Length + 1);
                            ElementTags[ElementTags.Length - 1] = Element.TagName.ToUpper();
                        }
                    }

                    if (ElementIds.Length > 0)
                    {
                        string DocText = "<html>" +
                                         "<head>" + web.GetElementsByTagName("head")[0].InnerHtml + "</head>" +
                                         "<body>" + web.GetElementsByTagName("body")[0].InnerHtml + "</body></html>";

                        for (int i = 0; i < ElementIds.Length; i++)
                        {
                            if (ElementTags[i] == "INPUT")
                            {
                                ElementTags[i] = "HTMLInputElement";
                            }
                            else if (ElementTags[i] == "LABEL")
                            {
                                ElementTags[i] = "HTMLLabelElement";
                            }
                            else if (ElementTags[i] == "DIV")
                            {
                                ElementTags[i] = "HTMLDivElement";
                            }
                            else if (ElementTags[i] == "DL")
                            {
                                ElementTags[i] = "HTMLDListElement";
                            }
                            else if (ElementTags[i] == "BUTTON")
                            {
                                ElementTags[i] = "HTMLButtonElement";
                            }
                            else if (ElementTags[i] == "AREA")
                            {
                                ElementTags[i] = "HTMLAreaElement";
                            }
                            else if (ElementTags[i] == "IMG")
                            {
                                ElementTags[i] = "HTMLImageElement";
                            }
                            else if (ElementTags[i] == "TABLE")
                            {
                                ElementTags[i] = "HTMLTableElement";
                            }
                            else if (ElementTags[i] == "TR")
                            {
                                ElementTags[i] = "HTMLTableRowElement";
                            }
                            else if (ElementTags[i] == "TD")
                            {
                                ElementTags[i] = "HTMLTableDataCellElement";
                            }
                            else if (ElementTags[i] == "SELECT")
                            {
                                ElementTags[i] = "HTMLSelectElement";
                            }
                            else if (ElementTags[i] == "OPTION")
                            {
                                ElementTags[i] = "HTMLOptionElement";
                            }
                            else if (ElementTags[i] == "IFRAME")
                            {
                                ElementTags[i] = "HTMLIFrameElement";
                            }
                            else
                            {
                                ElementTags[i] = "HTMLElement";
                            }
                        }


                        CurrentBlock.NewBlock(() =>
                        {
                            CurrentBlock += "public static readonly string HtmlText = ((Func<string>)(() =>";
                            CurrentBlock += "{";
                            CurrentBlock.NewBlock(() =>
                            {
                                CurrentBlock += "byte[] ByteResult = null;";

                                CurrentBlock += $"var Result =\n@\"{DocText.Replace("\"", "\"\"")}\";";

                                CurrentBlock += "var Doc = Document.Parse(Result);";
                                CurrentBlock += "var Elements = Doc.GetElementsByTagName(\"*\").ToArray();";
                                CurrentBlock += "foreach(var Element in Elements)";
                                CurrentBlock += "{";
                                CurrentBlock.NewBlock(() =>
                                {
                                    CurrentBlock += "var MNsrc = Element.GetAttribute(\"MNsrc\");";
                                    CurrentBlock += "if (MNsrc == \"\")";
                                    CurrentBlock += "MNsrc = null;";
                                    CurrentBlock += "if (MNsrc != null)";
                                    CurrentBlock += "{";
                                    CurrentBlock.NewBlock(() =>
                                    {
                                        CurrentBlock += "Element.RemoveAttribute(\"MNsrc\");";
                                        CurrentBlock += "var TagName = Element.TagName.ToLower();";
                                        CurrentBlock += "switch(TagName)";
                                        CurrentBlock += "{";
                                        CurrentBlock.NewBlock(() =>
                                        {
                                            CurrentBlock += "case \"script\":";
                                            CurrentBlock.NewBlock(() =>
                                            {
                                                CurrentBlock += "Element.InnerHtml = (string)Type.GetType(MNsrc).GetField(\"TextContent\").GetValue(null);";
                                                CurrentBlock += "break;";
                                            });
                                            CurrentBlock += "case \"img\":";
                                            CurrentBlock.NewBlock(() =>
                                            {
                                                CurrentBlock += "Element.SetAttribute(\"src\",(string)Type.GetType(MNsrc).GetField(\"Url\").GetValue(null));";
                                                CurrentBlock += "break;";
                                            });
                                            CurrentBlock += "case \"link\":";
                                            CurrentBlock.NewBlock(() =>
                                            {
                                                CurrentBlock += "var LinkType = Element.GetAttribute(\"type\");";
                                                CurrentBlock += "if (LinkType!=null)";
                                                CurrentBlock.NewBlock(() => CurrentBlock += @"LinkType=LinkType.ToLower();");
                                                CurrentBlock += "var LinkRel = Element.GetAttribute(\"rel\");";
                                                CurrentBlock += "if (LinkRel!=null)";
                                                CurrentBlock.NewBlock(() => CurrentBlock += @"LinkRel=LinkRel.ToLower();");
                                                CurrentBlock += "if (LinkType==\"text/css\" || LinkRel==\"stylesheet\")";
                                                CurrentBlock += "{";
                                                CurrentBlock.NewBlock(() =>
                                                {
                                                    CurrentBlock += "var Style = Document.document.CreateElement<HTMLStyleElement>();";
                                                    CurrentBlock += "Element.SetAttribute(\"src\",(string)Type.GetType(MNsrc).GetField(\"Url\").GetValue(null));";
                                                    CurrentBlock += "Element.ParentElement.ReplaceChild(Style, Element);";
                                                });
                                                CurrentBlock += "}";
                                                CurrentBlock += "break;";
                                            });
                                        });
                                        CurrentBlock += "}";
                                    });
                                    CurrentBlock += "}";
                                });
                                CurrentBlock += "}";

                                CurrentBlock += "Result = \"<html>\\\"\" +" +
                                                "\"<head>\" + Doc.GetElementsByTagName(\"head\")[0].InnerHtml + \"</head>\" +" +
                                                "\"<body>\" + Doc.GetElementsByTagName(\"body\")[0].InnerHtml + \"</body></html>\";";
                                CurrentBlock += "return Result;";
                            });
                            CurrentBlock += "}))();\n";

                            for (int i = 0; i < ElementIds.Length; i++)
                            {
                                CurrentBlock += "public readonly " + ElementTags[i] + " " + ElementIds[i] + ";";
                            }
                            CurrentBlock += "public " + FileName + "():this(false){}";
                            CurrentBlock += "public " + FileName + "(bool IsGlobal)";
                            CurrentBlock += "{";
                            CurrentBlock.NewBlock(() =>
                            {
                                CurrentBlock += "if(IsGlobal==true)";
                                CurrentBlock += "{";
                                CurrentBlock.NewBlock(() =>
                                {
                                    CurrentBlock += "var Document = new Document();";
                                    for (int i2 = 0; i2 < ElementIds.Length; i2++)
                                    {
                                        CurrentBlock += ElementIds[i2] + "= Document.GetElementById<" + ElementTags[i2] + ">(\"" + ElementIds[i2] + "\");";
                                    }
                                    CurrentBlock += "return;";
                                });
                                CurrentBlock += "}";

                                CurrentBlock += "var doc =  Document.Parse(HtmlText);";
                                CurrentBlock += "var HeadTags = doc.Head.GetElementsByTagName(\"*\").ToArray();";
                                CurrentBlock += "foreach(var Tag in HeadTags)";
                                CurrentBlock += "Document.document.Head.AppendChild(Tag);";

                                for (int i2 = 0; i2 < ElementIds.Length; i2++)
                                {
                                    CurrentBlock += ElementIds[i2] + "= doc.GetElementById<" + ElementTags[i2] + ">(\"" + ElementIds[i2] + "\");";
                                }

                                CurrentBlock += "var div = Document.document.CreateElement(\"Div\");";
                                CurrentBlock += "div.AppendChild(doc.Body);";

                                CurrentBlock += "var Scripts = div.GetElementsByTagName(\"Script\").ToArray();";
                                CurrentBlock += "foreach(var Script in Scripts)";
                                CurrentBlock += "{";
                                CurrentBlock.NewBlock(() =>
                                {
                                    CurrentBlock += "var NewScript = Document.document.CreateElement(\"Script\");";
                                    CurrentBlock += "var Src = Script.GetAttribute(\"src\");";
                                    CurrentBlock += "if(Src!=null)";
                                    CurrentBlock.NewBlock(() =>
                                                          CurrentBlock += "NewScript.SetAttribute(\"src\",Src);");
                                    CurrentBlock += "NewScript.InnerHtml = Script.InnerHtml;";
                                    CurrentBlock += "Script.ParentElement.ReplaceChild(NewScript, Script);";
                                });
                                CurrentBlock += "}";

                                CurrentBlock += "div.SetStyleAttribute(\"display\",\"none\");";
                                CurrentBlock += "Document.document.Body.AppendChild(div);";

                                CurrentBlock += "Document.document.Body.RemoveChild(div);";
                                for (int i2 = 0; i2 < ElementIds.Length; i2++)
                                {
                                    CurrentBlock += ElementIds[i2] + ".Id=\"\";";
                                }
                            });
                            CurrentBlock += "}";
                        });
                    }
                }
                else
                {
                    MakeDataField(file.FullName);
                }

                CurrentBlock += "\n" + AddTabs() + "}";
            }

            foreach (var Dir in dirinfo.GetDirectories())
            {
                CurrentBlock += "\n" + AddTabs() + "namespace " + Dir.Name.Replace(".", "_") +
                                "\n" + AddTabs() + "{";
                Tabs += 1;
                MakeForDir(Dir);
                Tabs         -= 1;
                CurrentBlock += "\n" + AddTabs() + "}";
            }
        }
Exemple #30
0
        /// <summary>
        /// Runs a Google search for the given terms and posts up to five results
        /// to the current channel as an embed.
        /// </summary>
        /// <param name="terms">
        /// The text to search for. The command silently does nothing when this is
        /// null, empty or whitespace, or when the result page contains no hits.
        /// </param>
        public async Task Google([Remainder] string terms = null)
        {
            terms = terms?.Trim();
            if (string.IsNullOrWhiteSpace(terms))
            {
                return;
            }

            // Encode a separate copy for the URL and keep the user's own text for
            // display; previously the encoded form (e.g. "a+b%21") was shown in the
            // embed. UrlEncode already turns spaces into '+', so no extra Replace.
            var encodedTerms = WebUtility.UrlEncode(terms);

            var fullQueryLink = $"https://www.google.ca/search?q={ encodedTerms }&gws_rd=cr,ssl&cr=countryUS";
            var config        = Configuration.Default.WithDefaultLoader();

            using (var document = await BrowsingContext.New(config).OpenAsync(fullQueryLink).ConfigureAwait(false))
            {
                var elems = document.QuerySelectorAll("div.g");

                // Only the whole text of #resultStats is used: reading the time from
                // its first child does not work because the <nobr> element is
                // missing from the parsed collection.
                var resultsElem  = document.QuerySelectorAll("#resultStats").FirstOrDefault();
                var totalResults = resultsElem?.TextContent;

                if (!elems.Any())
                {
                    return;
                }

                // Each hit is expected to look like <div class="g"> -> <h3> -> <a>;
                // hits without a link, a title or a ".st" snippet are dropped.
                var results = elems.Select <IElement, GoogleSearchResult?>(elem =>
                {
                    var aTag = (elem.Children.FirstOrDefault()?.Children.FirstOrDefault() as IHtmlAnchorElement);
                    var href = aTag?.Href;
                    var name = aTag?.TextContent;
                    if (href == null || name == null)
                    {
                        return(null);
                    }

                    var txt = elem.QuerySelectorAll(".st").FirstOrDefault()?.TextContent;

                    if (txt == null)
                    {
                        return(null);
                    }

                    return(new GoogleSearchResult(name, href, txt));
                }).Where(x => x != null).Take(5);

                var embed = new EmbedBuilder()
                            .WithOkColor()
                            .WithAuthor(eab => eab.WithName(GetText("search_for") + " " + terms.TrimTo(50))
                                        .WithUrl(fullQueryLink)
                                        .WithIconUrl("http://i.imgur.com/G46fm8J.png"))
                            .WithTitle(Context.User.ToString())
                            .WithFooter(efb => efb.WithText(totalResults));

                // Links are shortened concurrently; each snippet is trimmed so that
                // title + link + text stay within a 400 character budget per entry.
                var desc = await Task.WhenAll(results.Select(async res =>
                                                             $"[{Format.Bold(res?.Title)}]({(await _google.ShortenUrl(res?.Link))})\n{res?.Text?.TrimTo(400 - res.Value.Title.Length - res.Value.Link.Length)}\n\n"))
                           .ConfigureAwait(false);

                var descStr = string.Concat(desc);
                _log.Info(descStr.Length);
                await Context.Channel.EmbedAsync(embed.WithDescription(descStr)).ConfigureAwait(false);
            }
        }
Exemple #31
0
        /// <summary>
        /// Opens the page at the given address in a new browsing context that is
        /// configured with the default loader.
        /// </summary>
        /// <param name="url">Address of the page to open.</param>
        /// <returns>The document produced by opening <paramref name="url"/>.</returns>
        public async Task <IDocument> GetDocument(string url)
        {
            var browsingContext = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
            var page            = await browsingContext.OpenAsync(url);

            return page;
        }
        /// <summary>
        /// Walks the tw.appledaily.com daily archive pages for the requested date
        /// range, downloads every story linked from them and saves the stories
        /// whose paragraph text contains the configured keyword.
        /// </summary>
        /// <param name="conditions">
        /// Supplies <c>timeSpan</c>, which is added to the current time to obtain the
        /// first archive day (presumably negative - confirm with callers), and
        /// <c>Keyword</c>, the text a story must contain to be saved. An empty
        /// keyword matches every story.
        /// </param>
        /// <param name="saver">Receives each matching story mapped to a <c>NewsDataModel</c>.</param>
        public async Task FetchDataAsync(AppledailyNewsConditions conditions, Saver saver)
        {
            const string platformUrl = "https://tw.appledaily.com/";

            // AutoMapper configuration. The object mapped below is an
            // AppledailyNewsModel, so the map is registered for that source type
            // (it was previously registered for ChinatimesNewsModel).
            var mapperConfig = new MapperConfiguration(cfg =>
            {
                cfg.CreateMap <AppledailyNewsModel, NewsDataModel>();
            });
            var mapper = mapperConfig.CreateMapper();

            var context = BrowsingContext.New(Configuration.Default);
            var newsUrls = new List <string>();

            using (var client = new HttpClient())
            {
                // Downloads a page body as text and releases the response afterwards.
                async Task <string> DownloadAsync(string url)
                {
                    using (var response = await client.GetAsync(url))
                    {
                        return await response.Content.ReadAsStringAsync();
                    }
                }

                // Use one clock reading for both ends of the range; comparing a local
                // start against a UTC end skewed the window by the UTC offset.
                var now = DateTime.Now;
                for (var day = now.Add(conditions.timeSpan); day < now; day = day.AddDays(1))
                {
                    // Archive of past articles for that day. The invariant culture
                    // keeps the URL stable regardless of the machine's calendar.
                    var dayStamp = day.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);
                    var dailyUrl = $"{platformUrl}archive/{dayStamp}/";

                    var archiveHtml = await DownloadAsync(dailyUrl);

                    using (var document = await context.OpenAsync(res => res.Content(archiveHtml)))
                    {
                        // Collect the story links; entries without an href are
                        // skipped because they would resolve to the site root.
                        var storyHrefs = document.QuerySelectorAll(".archive-story")
                                                 .Select(x => x.GetAttribute("href"))
                                                 .Where(href => !string.IsNullOrEmpty(href));
                        foreach (var href in storyHrefs)
                        {
                            newsUrls.Add(platformUrl + href);
                        }
                    }
                }

                foreach (var newsUrl in newsUrls)
                {
                    var storyHtml = await DownloadAsync(newsUrl);

                    using (var document = await context.OpenAsync(res => res.Content(storyHtml)))
                    {
                        var title = string.Join('\n', document.QuerySelectorAll(".text_medium").Select(x => x.TextContent));

                        var contentItems = document.QuerySelectorAll("p").Select(x => x.TextContent);
                        var content      = string.Join('\n', contentItems);

                        // NOTE(review): assumes the combined URL has at least five
                        // '/'-separated parts with the date in the fifth - confirm
                        // against the site's link format.
                        var postDate = newsUrl.Split('/')[4];

                        // Contains also accepts a keyword at the very start of the
                        // text, which the former "IndexOf(...) > 0" check rejected.
                        if (content.Contains(conditions.Keyword))
                        {
                            var model = new AppledailyNewsModel
                            {
                                Title   = title,
                                Content = content,
                                Date    = postDate,
                                Source  = newsUrl
                            };

                            // save result
                            var result = mapper.Map <NewsDataModel>(model);
                            saver.Save(result);
                        }
                    }
                }
            }
        }
Exemple #33
0
 /// <summary>
 /// Builds a child browsing context under the given parent and registers it
 /// in the cache under the supplied name through a weak reference.
 /// </summary>
 /// <param name="parent">The context that becomes the parent of the new one.</param>
 /// <param name="name">The key under which the child is stored in the cache.</param>
 /// <param name="security">The sandbox flags handed to the new context.</param>
 /// <returns>The newly created child context.</returns>
 public IBrowsingContext Create(IBrowsingContext parent, String name, Sandboxes security)
 {
     var child   = new BrowsingContext(parent, security);
     var weakRef = new WeakReference<IBrowsingContext>(child);

     // Any entry already stored under the same name is overwritten.
     _cache[name] = weakRef;

     return child;
 }