/// <summary>
/// Requests the resource referenced by the link element and opens it as an
/// import of the owning document, unless the import would form a cycle.
/// See http://www.w3.org/TR/html-imports/#dfn-import-request.
/// </summary>
/// <param name="configuration">Configuration passed to the options used to create the imported document.</param>
/// <param name="loader">Resource loader used to start the download.</param>
/// <returns>A task that finishes after the import has been opened, or right away for a cyclic import.</returns>
public override async Task LoadAsync(IConfiguration configuration, IResourceLoader loader)
{
    var element = Link;
    var owner = element.Owner;
    var entries = ImportLists.GetOrCreateValue(owner);
    var target = Url;
    var request = element.CreateRequestFor(target);
    var entry = new ImportEntry
    {
        Relation = this,
        IsCycle = CheckCycle(owner, target)
    };

    _isasync = element.HasAttribute(AttributeNames.Async);
    entries.Add(entry);

    // A cyclic import is still recorded in the list, but never downloaded.
    if (entry.IsCycle)
    {
        return;
    }

    var opened = new TaskCompletionSource<Boolean>();
    var download = loader.DownloadAsync(request);
    SetDownload(download);

    await element.ProcessResponse(download, async response =>
    {
        var importContext = new BrowsingContext(owner.Context, Sandboxes.None);
        var creation = new CreateDocumentOptions(response, configuration)
        {
            ImportAncestor = owner
        };

        _import = await importContext.OpenAsync(creation, CancellationToken.None).ConfigureAwait(false);
        opened.SetResult(true);
    }).ConfigureAwait(false);

    // Completion is signalled from inside the response callback above.
    await opened.Task.ConfigureAwait(false);
}
/// <summary>
/// Creates the child document from the given response, using a new browsing
/// context derived from the parent document's context with no sandbox flags.
/// </summary>
/// <param name="response">The response to build the child document from.</param>
/// <returns>A task that completes once <c>ChildDocument</c> has been assigned.</returns>
protected override async Task ProcessResponseAsync(IResponse response)
{
    var childContext = new BrowsingContext(_parentDocument.Context, Sandboxes.None);
    var creation = new CreateDocumentOptions(response, _configuration, _parentDocument);
    var documentFactory = _configuration.GetFactory<IDocumentFactory>();
    var pendingDocument = documentFactory.CreateAsync(childContext, creation, CancellationToken.None);

    ChildDocument = await pendingDocument.ConfigureAwait(false);
}
/// <summary>
/// Downloads the nuget.org page of every package listed under the input
/// document's "NuGet" key and extracts download statistics and the version
/// history. Packages that fail for any reason are logged as warnings and skipped.
/// </summary>
/// <param name="input">Document whose "NuGet" list names the packages to look up.</param>
/// <param name="context">Execution context used for logging.</param>
/// <param name="metadata">Receives the collected data under "NuGetPackages" when at least one package succeeded.</param>
private async Task GetProjectNuGetDataAsync(IDocument input, IExecutionContext context, ConcurrentDictionary<string, object> metadata)
{
    List<Package> packageData = new List<Package>();
    IReadOnlyList<string> packages = input.GetList("NuGet", Array.Empty<string>());

    foreach (string package in packages.Where(x => !string.IsNullOrWhiteSpace(x)))
    {
        context.LogInformation($"Getting NuGet data for {package}");
        try
        {
            IBrowsingContext browsingContext = BrowsingContext.New(AngleSharpConfig);
            AngleSharp.Dom.IDocument document = await browsingContext.OpenAsync($"https://www.nuget.org/packages/{package}");

            // The null check must come first: reading StatusCode on a missing
            // document would throw before the dedicated warning could be logged.
            if (document == null)
            {
                context.LogWarning($"Could not get document for {package}");
                continue;
            }

            if (document.StatusCode != System.Net.HttpStatusCode.OK)
            {
                context.LogWarning($"Bad status code for {package}: {document.StatusCode}");
                continue;
            }

            Package data = new Package
            {
                Id = package
            };

            // Get statistics: the block following the "Statistics" heading.
            // First(...) throws when the page layout differs; the catch below
            // turns that into a warning for this package only.
            AngleSharp.Dom.IElement statistics = document
                .QuerySelectorAll(".package-details-info h2")
                .First(x => x.TextContent == "Statistics")
                .NextElementSibling;
            data.TotalDownloads = statistics.Children
                .First(x => x.TextContent.Contains("total downloads"))
                .TextContent.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];
            data.PerDayDownloads = statistics.Children
                .First(x => x.TextContent.Contains("per day"))
                .TextContent.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];

            // Get versions: one entry per row of the version history table.
            data.Versions = document
                .QuerySelectorAll("#version-history table tbody tr")
                .Select(x => new PackageVersion(x))
                .ToList();

            // Add the data
            packageData.Add(data);
        }
        catch (Exception ex)
        {
            context.LogWarning($"Error getting NuGet data for {package}: {ex.Message}");
        }
    }

    if (packageData.Count > 0)
    {
        metadata.TryAdd("NuGetPackages", packageData);
    }
}
/// <summary>
/// Downloads the article page at <c>article.Url</c> and fills the article with
/// the author, claims, conclusions, raw content, dates, Open Graph metadata and
/// outgoing links found on the page.
/// </summary>
/// <param name="article">Article to decorate; its <c>Url</c> is the page that gets loaded.</param>
/// <returns>The same <paramref name="article"/> instance, populated.</returns>
public static async Task<Article> DecorateSingleArticle(Article article)
{
    var config = Configuration.Default.WithDefaultLoader();
    IDocument document = await BrowsingContext.New(config).OpenAsync(article.Url);

    // First guess for the publish date: a yyyy/MM/dd segment inside the URL.
    // TryParse keeps this best-effort without swallowing arbitrary exceptions.
    Match match = Regex.Match(article.Url, @"\d{4}\/\d{2}\/\d{2}", RegexOptions.IgnoreCase);
    if (match.Success && DateTime.TryParse(match.Groups[0].Value, out var urlDate))
    {
        article.ExactPublishDate = urlDate;
    }

    var claimCssSelector = ".Claim >p";
    var authorCssSelector = "a.author";
    var blogDateCssSelector = ".blog-date";
    var conclusionCssSelector = ".conclusion >p";
    var articleCssSelector = ".main-content-body .type-post";
    var allLinksSelector = ".main-content-body .type-post a";

    var claimsResults = document.QuerySelectorAll(claimCssSelector);
    var conclusionResults = document.QuerySelectorAll(conclusionCssSelector);
    var articleResult = document.QuerySelector(articleCssSelector);
    var authorResult = document.QuerySelector(authorCssSelector);
    var blogDateResult = document.QuerySelector(blogDateCssSelector);

    // Query the <meta> elements once and search the snapshot for each property.
    var metaTags = document.Head.QuerySelectorAll("meta").ToList();
    var modifiedDateTime = metaTags.FirstOrDefault(x => FindPartialWordMatch("article:modified_time", x.OuterHtml));
    var publishedDateTime = metaTags.FirstOrDefault(x => FindPartialWordMatch("article:published_time", x.OuterHtml));
    var articleTitleResult = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:title", x.OuterHtml));
    var imageUrlResult = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:image", x.OuterHtml));
    var descriptionUrlResult = metaTags.FirstOrDefault(x => FindPartialWordMatch("og:description", x.OuterHtml));

    // Links whose text is "πηγή" are the proof links; everything else is
    // treated as a link to the hoax source.
    var allLinks = document.QuerySelectorAll(allLinksSelector);
    var sourceLinks = allLinks.Where(x => x.InnerHtml?.ToLower() == "πηγή");
    var restOfTheLinks = allLinks.Where(x => x.InnerHtml?.ToLower() != "πηγή");

    article.Author = authorResult?.InnerHtml;
    article.Claim = claimsResults.Select(x => x.InnerHtml).ToList();
    article.Result = conclusionResults.Select(x => x.InnerHtml).ToList();
    article.RawArticleHtml = articleResult?.InnerHtml;
    article.RawText = articleResult?.Text();
    article.PublishDate = blogDateResult?.InnerHtml;

    DateTime.TryParse(modifiedDateTime?.GetAttribute("Content"), out var modifiedDateValue);
    article.ModifiedDate = modifiedDateValue;

    // The meta tag is the more precise source, but it only replaces the
    // URL-derived date when it actually parses.
    if (DateTime.TryParse(publishedDateTime?.GetAttribute("Content"), out var publishedDateValue))
    {
        article.ExactPublishDate = publishedDateValue;
    }

    article.GreekHoaxTitle = articleTitleResult?.GetAttribute("Content");
    article.ImageUrl = imageUrlResult?.GetAttribute("Content");
    article.GreekHoaxDescription = descriptionUrlResult?.GetAttribute("Content");
    article.ProofLinks = sourceLinks.Select(x => x.GetAttribute("href")).ToList();
    article.HoaxSourceLinks = restOfTheLinks.Select(x => x.GetAttribute("href")).ToList();

    return article;
}
/// <summary>
/// Loads an ecology page and extracts the image URL plus the text of the
/// Taxonomy, Habitat Range, Ecological Niche, Biological Adaptations and
/// Behavior sections.
/// </summary>
/// <param name="address">
/// Page URL. The fifth '/'-separated segment (index 4) is URL-decoded and used
/// as the display name after removing "_Ecology" and replacing '_' with ' '.
/// </param>
/// <returns>
/// <c>{ name, image URL, taxonomy, habitat, niche, biology, behavior }</c>, or
/// the single-element array <c>{ "ERROR" }</c> when any text section is missing.
/// </returns>
public async Task<string[]> GetInfo(string address)
{
    string name = System.Web.HttpUtility.UrlDecode(address.Split('/')[4].Replace("_Ecology", "").Replace("_", " "));
    Console.WriteLine($"Attempting to write data for {name}.");

    var config = Configuration.Default.WithDefaultLoader();
    var context = BrowsingContext.New(config);
    var page = await context.OpenAsync(address);

    // Text of the element that follows the parent of the given heading anchor;
    // null when any link of that chain is absent.
    string SectionText(string headingSelector) =>
        page.QuerySelector(headingSelector)?.ParentElement?.NextElementSibling?.TextContent;

    // Reads the image nested two levels below the element that follows the
    // heading. Lazy-loaded images carry a data: URI in src and the real
    // location in data-src.
    bool TryReadImageUrl(string headingSelector, out string url)
    {
        url = null;
        var image = page.QuerySelector(headingSelector)
            ?.ParentElement?.NextElementSibling?.FirstElementChild?.FirstElementChild as IHtmlImageElement;
        var source = image?.Source;
        if (source == null)
        {
            return false;
        }

        url = source.Contains("data:image") ? image.GetAttribute("data-src") : source;
        return true;
    }

    string image_url;
    if (!TryReadImageUrl("#In-Game_Information", out image_url) &&
        !TryReadImageUrl("#In-Game_Description", out image_url))
    {
        // Neither heading yielded an image: fall back to a placeholder icon.
        image_url = "https://vignette.wikia.nocookie.net/monsterhunter/images/2/2e/MHFU-Question_Mark_Icon.png/revision/latest?cb=20100610145952";
    }

    // Taxonomy can contain list elements, so it needs to be handled separately.
    var tax = page.QuerySelector("#Taxonomy");
    var ul = tax?.ParentElement?.QuerySelector("ul");
    string taxonomy = ul != null
        ? String.Join('\n', ul.QuerySelectorAll("li").Select(li => li.TextContent))
        : tax?.ParentElement?.NextElementSibling?.TextContent;

    string habitat = SectionText("#Habitat_Range");
    string niche = SectionText("#Ecological_Niche");
    // Some pages misspell the heading id as "Adaptions".
    string biology = SectionText("#Biological_Adaptations") ?? SectionText("#Biological_Adaptions");
    string behavior = SectionText("#Behavior");

    if (taxonomy == null || habitat == null || niche == null || biology == null || behavior == null)
    {
        Console.WriteLine("Either " + name + " is not a monster or some spelling error prevented the data from being accessed.");
        return new string[] { "ERROR" };
    }

    return new string[] { name, image_url, taxonomy, habitat, niche, biology, behavior };
}
/// <summary>
/// Crawls the external listing pages of every category that has an
/// <c>ExternalUrl</c>, stores one <c>Post</c> per matching listing cell and
/// writes progress entries to the event log table. Changes are saved once per
/// category and once more at the end.
/// </summary>
/// <returns>A task that completes when all categories have been processed.</returns>
private async Task GetContent()
{
    try
    {
        using (var scope = _scopeFactory.CreateScope())
        {
            var dbContext = scope.ServiceProvider.GetRequiredService<ApplicationDbContext>();
            DateTime startTime = DateTime.Now;

            dbContext.EventLogs.Add(new EventLog
            {
                Message = "Start. Crawling content: ",
                EventType = "scheduler",
                CreatedDate = DateTime.Now
            });

            foreach (Category c in categories)
            {
                if (string.IsNullOrEmpty(c.ExternalUrl))
                {
                    continue;
                }

                dbContext.EventLogs.Add(new EventLog
                {
                    Message = "Start. Crawling for category: " + c.Name + " from " + c.ExternalUrl,
                    EventType = "scheduler",
                    CreatedDate = DateTime.Now
                });

                try
                {
                    int categoryId = c.Id;

                    foreach (string url in c.ExternalUrl.SplitCommas())
                    {
                        var config = Configuration.Default.WithDefaultLoader();
                        var context = BrowsingContext.New(config);
                        var document = await context.OpenAsync(url);

                        // CSS selector syntax reference:
                        // https://www.w3schools.com/cssref/css_selectors.asp
                        var selectedItems = document.QuerySelectorAll("table.listingsTBL td");

                        foreach (var item in selectedItems)
                        {
                            // Only cells that contain an image inside
                            // div.ListingNewNDate are turned into posts.
                            if (item.QuerySelector("div.ListingNewNDate>img") == null)
                            {
                                continue;
                            }

                            string title = item.QuerySelector("div.ListingDescription>a").Text().Trim();

                            dbContext.Posts.Add(new Post
                            {
                                OwnerId = _userId,
                                CreatedDate = DateTime.Now,
                                ModifiedDate = DateTime.Now,
                                Title = title,
                                Content = null,
                                CityId = null,
                                CategoryId = categoryId,
                                PostalCode = null,
                                ContactEmail = null,
                                ContactPhone = null
                            });
                        }
                    }

                    dbContext.EventLogs.Add(new EventLog
                    {
                        Message = "Done. Crawling for category: " + c.Name + " from " + c.ExternalUrl,
                        EventType = "scheduler",
                        CreatedDate = DateTime.Now
                    });
                    await dbContext.SaveChangesAsync();
                }
                catch (Exception ex1)
                {
                    // Keep the cause of the failure; the event log row below only
                    // records that the category failed, not why.
                    _logger.LogError(ex1.ToString());

                    // NOTE(review): posts added before the failure are still tracked
                    // and are saved together with this entry - confirm that is intended.
                    dbContext.EventLogs.Add(new EventLog
                    {
                        Message = "Error. Crawling for category: " + c.Name + " from " + c.ExternalUrl,
                        EventType = "scheduler",
                        CreatedDate = DateTime.Now
                    });
                    await dbContext.SaveChangesAsync();
                }
            }

            dbContext.EventLogs.Add(new EventLog
            {
                Message = "End. Crawling content: " + (DateTime.Now - startTime).TotalSeconds + " (s)",
                EventType = "scheduler",
                CreatedDate = DateTime.Now
            });
            await dbContext.SaveChangesAsync();
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex.ToString());
    }
}
/// <summary>
/// Creates a scraper whose browsing context is built from the default
/// configuration extended with the default loader.
/// </summary>
public Scraper()
{
    var configuration = Configuration.Default.WithDefaultLoader();

    _config = configuration;
    _context = BrowsingContext.New(configuration);
}
/// <summary>
/// Downloads the bike page referenced by <paramref name="b"/> and maps its
/// title, gallery images and specification tables onto a new <c>Bike</c>.
/// Fields whose element is not found on the page are left null.
/// </summary>
/// <param name="b">Supplies the page URL and the brand id copied to the result.</param>
/// <returns>The populated bike.</returns>
private static async Task<Bike> GetBikeInfo(BikesUrl b)
{
    // Every selector on the page hangs off this container.
    const string PostArea = "body > div.full-width > section.bikebd_main_content_area > div > div > div.col-sm-7 > div.bikebd_posts_area > div";

    // Get DOM
    var context = BrowsingContext.New(Configuration.Default);
    var document = await context.OpenAsync(async req => req.Content(await GetContent(b.Url)));

    // Text of the first element matching a selector relative to the post area, or null.
    string Text(string relativeSelector) =>
        document.QuerySelector($"{PostArea} > {relativeSelector}")?.TextContent;

    // Cell of the single-row summary table (first block of full_specifications).
    string Basic(int column) =>
        Text($"div.full_specifications > div:nth-child(1) > table > tbody > tr > td:nth-child({column})");

    // Value column of the overview table (second block of full_specifications).
    string Overview(int row) =>
        Text($"div.full_specifications > div:nth-child(2) > table > tbody > tr:nth-child({row}) > td:nth-child(2)");

    // Value column of a specification table addressed by its position in the post area.
    string Spec(int section, int row) =>
        Text($"div:nth-child({section}) > table > tbody > tr:nth-child({row}) > td:nth-child(2)");

    Bike bike = new Bike();
    bike.BrandId = b.BrandId;
    bike.PostTitle = document.Title;

    // Gallery: children of the first child of the thumbnail container. Either
    // level can be missing, in which case the image list stays empty.
    var imageOwls = document
        .QuerySelector($"{PostArea} > div.single_post_thumb")
        ?.ChildNodes.FirstOrDefault()
        ?.ChildNodes;
    List<string> bikeImages = new List<string>();
    if (imageOwls != null)
    {
        foreach (var img in imageOwls)
        {
            var imageDiv = img.ToHtml();
            string src = Regex.Match(imageDiv, "<img.+?src=[\"'](.+?)[\"'].*?>", RegexOptions.IgnoreCase).Groups[1].Value;
            bikeImages.Add(src);
        }
    }
    bike.Images = bikeImages.ToArray();

    bike.Name = Text("div:nth-child(1) > div.col-sm-9 > div > h4");

    // Basic
    bike.Features = Basic(1);
    bike.DisplacementCC = Basic(2);
    bike.Mileage = Basic(3);

    // Bike Overview
    bike.Price = Overview(1);
    bike.FuelSupplySystem = Overview(2);
    bike.StartingMethod = Overview(3);
    bike.CoolingSystem = Overview(4);
    bike.EngineeOilRecommendation = Overview(5);
    bike.TyresType = Overview(6);
    bike.TopSpeed = Overview(7);

    // Specifications
    bike.EngineeType = Spec(5, 1);
    bike.MaximumPower = Spec(5, 2);
    bike.MaximumTorque = Spec(5, 3);
    bike.Bore = Spec(5, 4);
    bike.Stroke = Spec(5, 5);
    bike.CompressionRatio = Spec(5, 6);
    bike.NoOfCylinders = Spec(5, 7);

    // Transmission
    bike.TransmissionType = Spec(6, 1);
    bike.NoOfGears = Spec(6, 2);
    bike.ClutchType = Spec(6, 3);

    // Chassis & Suspension
    bike.ChassisType = Spec(7, 1);
    bike.FrontSuspension = Spec(7, 2);
    bike.RearSuspension = Spec(7, 3);

    // Brakes
    bike.FrontBrakeType = Spec(8, 1);
    bike.RearBrakeType = Spec(8, 2);
    bike.FrontBrakeDiameter = Spec(8, 3);
    bike.RearBrakeDiameter = Spec(8, 4);
    bike.AntiLockBrakingSystem_ABS = Spec(8, 5);

    // Wheels & Tires
    bike.FrontTireSize = Spec(9, 1);
    bike.RearTireSize = Spec(9, 2);
    bike.TubelessTires = Spec(9, 3);

    // Dimensions
    bike.OverallLength = Spec(10, 1);
    bike.OverallWidth = Spec(10, 2);
    bike.OverallHeight = Spec(10, 3);
    bike.GroundClearance = Spec(10, 4);
    bike.Weight = Spec(10, 5);
    bike.FuelTankCapacity = Spec(10, 6);
    bike.Wheelbase = Spec(10, 7);

    // Electricals
    bike.BatteryType = Spec(11, 1);
    bike.BatteryVoltage = Spec(11, 2);
    bike.HeadLight = Spec(11, 3);
    bike.TailLight = Spec(11, 4);
    bike.Indicators = Spec(11, 5);

    // Features
    bike.Speedometer = Spec(12, 1);
    bike.Odometer = Spec(12, 2);
    bike.RPMMeter = Spec(12, 3);
    bike.HandleType = Spec(12, 4);
    bike.SeatType = Spec(12, 5);
    bike.PassengerGrabRail = Spec(12, 6);
    bike.EngineKillSwitch = Spec(12, 7);

    return bike;
}
/// <summary>
/// Searches ettoday.net for the configured keyword, collects the links of all
/// results newer than <c>DateTime.Now + conditions.timeSpan</c>, then downloads
/// every article, strips embedded frames, links and images from its body and
/// hands the result to <paramref name="saver"/>.
/// </summary>
/// <param name="conditions">Search keyword and the time window applied to result dates.</param>
/// <param name="saver">Receives one <c>EttodayNewsModel</c> per article.</param>
/// <returns>A task that completes when every collected article has been saved.</returns>
public async Task FetchDataAsync(NewsConditions conditions, Saver saver)
{
    const string datetimePattern = "2[0-9]{3}-[0|1][0-9]-[0-2][0-9] [0-2][0-9]:[0-5][0-9]";

    var context = BrowsingContext.New(Configuration.Default);
    // The keyword is user-supplied text; escape it so characters such as '&'
    // or '#' cannot alter the query string.
    var keyword = Uri.EscapeDataString(conditions.Keyword ?? string.Empty);
    var hrefs = new List<string>();

    using (var client = new HttpClient())
    {
        // Downloads a page body as text; the response is released right after reading.
        async Task<string> DownloadAsync(string address)
        {
            using (var responseMessage = await client.GetAsync(address))
            {
                return await responseMessage.Content.ReadAsStringAsync();
            }
        }

        // Walk the result pages until one contains no sufficiently recent entry.
        var page = 1;
        int paginationDataTotal;
        do
        {
            var url = $"https://www.ettoday.net/news_search/doSearch.php?keywords={keyword}&daydiff=3&page={page}";
            var responseResult = await DownloadAsync(url);
            var document = await context.OpenAsync(res => res.Content(responseResult));
            var cutoff = DateTime.Now.Add(conditions.timeSpan);

            // Materialize once: the query is needed for both the count and the copy.
            var paginationHrefs = document.QuerySelectorAll(".archive.clearfix")
                .Where(x => DateTime.Parse(Regex.Match(x.QuerySelector(".date").TextContent, datetimePattern).Value) > cutoff)
                .Select(x => x.QuerySelector("h2 a").GetAttribute("href"))
                .ToList();

            paginationDataTotal = paginationHrefs.Count;
            hrefs.AddRange(paginationHrefs);
            page++;
        }
        while (paginationDataTotal > 0);

        foreach (var href in hrefs)
        {
            var responseResult = await DownloadAsync(href);
            var document = await context.OpenAsync(res => res.Content(responseResult));
            var title = document.QuerySelector("h1.title").TextContent;
            var story = document.QuerySelector("[itemprop=articleBody]");

            // Remove embedded ads, links and images. Each matched element is
            // removed directly: re-querying the body per iteration fails as soon
            // as a match was nested inside an element that is already gone.
            foreach (var embedded in story.QuerySelectorAll("iframe,a,img").ToList())
            {
                embedded.Remove();
            }

            // Skip short paragraphs and image captions (those start with "▲").
            var contentItems = story.QuerySelectorAll("p")
                .Where(x => x.TextContent.Length > 20 && !x.TextContent.StartsWith("▲"))
                .Select(x => x.TextContent);
            var content = string.Join(string.Empty, contentItems);
            var postDate = DateTime.Parse(document.QuerySelector(".date").GetAttribute("datetime"));

            var post = new EttodayNewsModel
            {
                Title = title,
                Content = content,
                Date = postDate.ToString("yyyyMMdd"),
                Source = href
            };

            // save result
            saver.Save(post);
        }
    }
}
/// <summary>
/// Creates an XML document over the given text source. A new browsing context
/// is created when <paramref name="context"/> is null. The content type is set
/// to <c>MimeTypes.Xml</c>.
/// </summary>
/// <param name="context">Browsing context to attach to, or null for a new one.</param>
/// <param name="source">Text source passed to the base document.</param>
internal XmlDocument(IBrowsingContext context, TextSource source)
    : base(context ?? BrowsingContext.New(), source)
{
    this.ContentType = MimeTypes.Xml;
}
/// <summary>
/// Event handler that looks for http/https links in a message, downloads every
/// link that does not point at an internal address and answers with the page
/// title of each one that could be loaded.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="theMessage">The message to inspect and answer to.</param>
private async void Server_OnPostProcessingMessage(object sender, IrcMessage theMessage)
{
    if (theMessage.IsIgnored)
    {
        return;
    }

    // True when the host is, or resolves to, an internal address. A failed DNS
    // lookup is reported on the console and treated as "not internal", so the
    // request is still attempted.
    async Task<bool> PointsToInternalHostAsync(Uri address)
    {
        if (IPAddress.TryParse(address.DnsSafeHost, out var ip))
        {
            return IsInternal(ip);
        }

        try
        {
            var addresses = await Dns.GetHostAddressesAsync(address.DnsSafeHost);
            return addresses.Any(IsInternal);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex);
            return false;
        }
    }

    try
    {
        List<string> links = theMessage.CommandArgs
            .Where(x => x.StartsWith("http://", StringComparison.Ordinal) || x.StartsWith("https://", StringComparison.Ordinal))
            .Distinct()
            .ToList();
        if (links.Count == 0)
        {
            return;
        }

        var tasks = new List<Task<HttpResponseMessage>>(links.Count);
        foreach (string link in links)
        {
            // A malformed link is skipped on its own instead of aborting the
            // handling of every other link in the message.
            if (!Uri.TryCreate(link, UriKind.Absolute, out var address))
            {
                continue;
            }

            if (await PointsToInternalHostAsync(address))
            {
                continue;
            }

            tasks.Add(Client.GetAsync(address, HttpCompletionOption.ResponseContentRead));
        }

        if (tasks.Count == 0)
        {
            return;
        }

        try
        {
            await Task.WhenAll(tasks);
        }
        catch
        {
            // Individual failures are observed per task below.
        }

        foreach (var task in tasks)
        {
            try
            {
                using (var response = await task)
                {
                    var stream = await response.Content.ReadAsStreamAsync();
                    // The second argument hands ownership of the stream to the request.
                    using (IDocument doc = await BrowsingContext.New().OpenAsync(x => x.Content(stream, true)))
                    {
                        string title = Regex.Replace(doc.Title.Replace("\n", "").Replace("\r", "").Replace("–", "–"), "[ ]{2,}", " ");
                        if (!String.IsNullOrWhiteSpace(title))
                        {
                            theMessage.Answer("[url] " + title);
                        }
                    }
                }
            }
            catch
            {
                // Best effort: a link that cannot be fetched or parsed simply
                // produces no answer.
            }
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Fehler beim Downloaden der Webseite");
    }
}
/// <summary>
/// Finds the current GTFS zip on the DTPM site, downloads it and bulk-loads
/// each known feed file into its repository, writing a progress message to the
/// channel before each step. Archive entries with other names are ignored.
/// </summary>
/// <returns>A task that completes when every archive entry has been processed.</returns>
/// <exception cref="InvalidOperationException">No GTFS zip link was found on the page.</exception>
public async Task LoadGtfsData()
{
    await _loadGtfsDataChannel.Writer.WriteAsync("Downloading GTFS data");

    using var client = new HttpClient();
    var content = await client.GetStringAsync("http://www.dtpm.cl/index.php/gtfs-vigente");

    using var document = await BrowsingContext
        .New(Configuration.Default)
        .OpenAsync(req => req.Content(content));

    // First anchor whose target mentions both "gtfs" and ".zip". The comparison
    // is ordinal and case-insensitive so the result does not depend on the
    // casing rules of the current culture.
    var downloadLink = document
        .QuerySelectorAll("a")
        .OfType<IHtmlAnchorElement>()
        .Select(a => a.Href)
        .FirstOrDefault(href =>
            href.IndexOf("gtfs", StringComparison.OrdinalIgnoreCase) >= 0 &&
            href.IndexOf(".zip", StringComparison.OrdinalIgnoreCase) >= 0);

    if (downloadLink == null)
    {
        throw new InvalidOperationException("No GTFS zip download link was found on http://www.dtpm.cl/index.php/gtfs-vigente.");
    }

    await using var gtfsStreamZippedFile = await client.GetStreamAsync(downloadLink);
    using var file = new ZipArchive(gtfsStreamZippedFile);

    foreach (var entry in file.Entries)
    {
        using var reader = new StreamReader(entry.Open());
        using var csv = new CsvReader(reader, CultureInfo.InvariantCulture);

        switch (entry.Name)
        {
            case "agency.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk agencies data");
                csv.Configuration.RegisterClassMap<AgencyMap>();
                var agencyRecords = csv.GetRecords<Agency>();
                await _agencyRepository.Bulk(agencyRecords);
                break;

            case "calendar_dates.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk calendar dates data");
                csv.Configuration.RegisterClassMap<CalendarDateMap>();
                var calendarDatesRecords = csv.GetRecords<CalendarDate>();
                await _calendarDateRepository.Bulk(calendarDatesRecords);
                break;

            case "calendar.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk calendar data");
                csv.Configuration.RegisterClassMap<CalendarMap>();
                var calendarRecords = csv.GetRecords<Calendar>();
                await _calendarRepository.Bulk(calendarRecords);
                break;

            case "feed_info.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk feed info data");
                csv.Configuration.RegisterClassMap<FeedInfoMap>();
                var feedInfoRecords = csv.GetRecords<FeedInfo>();
                await _feedInfoRepository.Bulk(feedInfoRecords);
                break;

            case "frequencies.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk frequencies data");
                csv.Configuration.RegisterClassMap<FrequencyMap>();
                var frequencyRecords = csv.GetRecords<Frequency>();
                await _frequencyRepository.Bulk(frequencyRecords);
                break;

            case "routes.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk routes data");
                csv.Configuration.RegisterClassMap<RouteMap>();
                var routeRecords = csv.GetRecords<Route>();
                await _routeRepository.Bulk(routeRecords);
                break;

            case "shapes.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk shapes data");
                csv.Configuration.RegisterClassMap<ShapeMap>();
                var shapeRecords = csv.GetRecords<Shape>();
                await _shapeRepository.Bulk(shapeRecords);
                break;

            case "stop_times.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk stop times data");
                csv.Configuration.RegisterClassMap<StopTimeMap>();
                var stopTimeRecords = csv.GetRecords<StopTime>();
                await _stopTimeRepository.Bulk(stopTimeRecords);
                break;

            case "stops.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk stops data");
                csv.Configuration.RegisterClassMap<StopMap>();
                var stopRecords = csv.GetRecords<Stop>();
                await _stopRepository.Bulk(stopRecords);
                break;

            case "trips.txt":
                await _loadGtfsDataChannel.Writer.WriteAsync("Bulk trips data");
                csv.Configuration.RegisterClassMap<TripMap>();
                var tripRecords = csv.GetRecords<Trip>();
                await _tripRepository.Bulk(tripRecords);
                break;
        }
    }
}
/// <summary>
/// Scraping experiment against phys.org: loads a single article, extracts its lead
/// image address, image credit, body text and title, and prints the title.
/// </summary>
/// <remarks>
/// Every <c>[0]</c> lookup throws when the expected element is absent from the page.
/// The listing-page loop is kept but disabled, exactly as in the original
/// (<c>for (int i = 1; i &lt; 1; i++)</c> never executed).
/// </remarks>
static async Task Main()
{
    var config = Configuration.Default.WithDefaultLoader();

    var articleUrl = new Url("https://phys.org/news/2020-11-rare-species-small-cats-inadequately.html");
    var article = await BrowsingContext.New(config).OpenAsync(articleUrl);

    // Lead image and its credit line both live inside the "article-img" figure.
    var figure = article.GetElementsByClassName("article-img")[0];
    var image = figure.GetElementsByTagName("img")[0].GetAttribute("src");
    var credit = figure.GetElementsByTagName("figcaption")[0].TextContent.Trim();

    // Body text: one paragraph per line, separated by blank lines.
    var textParagraphs = article.GetElementsByClassName("article-main")[0].GetElementsByTagName("p");
    var sb = new StringBuilder();
    foreach (var paragraph in textParagraphs)
    {
        sb.AppendLine(paragraph.TextContent.Trim());
        sb.AppendLine();
    }

    // The original called TrimEnd() and threw the result away; keep the trimmed text.
    var articleText = sb.ToString().TrimEnd();

    var title = article.GetElementsByClassName("news-article")[0].GetElementsByTagName("h1")[0].TextContent;
    Console.WriteLine(title);

    // Number of listing pages to walk. Zero keeps the loop disabled; raise it to
    // scrape "page1.html" .. "page{N}.html".
    const int listingPageCount = 0;
    for (var page = 1; page <= listingPageCount; page++)
    {
        var listingUrl = new Url("https://phys.org/biology-news/ecology/sort/date/all/page" + page + ".html");
        var listing = await BrowsingContext.New(config).OpenAsync(listingUrl);

        // Only the first teaser of each listing page is inspected.
        var teaser = listing.GetElementsByClassName("sorted-article")[0];
        var teaserFigure = teaser.GetElementsByClassName("sorted-article-figure")[0];
        var teaserContent = teaser.GetElementsByClassName("sorted-article-content")[0];
        var teaserInfo = teaser.GetElementsByClassName("article__info")[0];

        var mainPhoto = teaserFigure.GetElementsByTagName("img")[0].GetAttribute("data-src");
        var mainPage = teaserFigure.GetElementsByTagName("a")[0].GetAttribute("href");
        var teaserTitle = teaserContent.GetElementsByTagName("a")[0].TextContent;
        var shortIntro = teaserContent.GetElementsByTagName("p")[0].TextContent.Trim();
        var category = teaserInfo.GetElementsByTagName("p")[0].TextContent.Trim();
        var postedOn = teaserInfo.GetElementsByTagName("p")[1].TextContent.Trim();
    }
}
/// <summary>
/// Chooses the action for a friend invite from the comments the inviting user has
/// left on the bot's Steam profile, using the bot's configured comment rules.
/// </summary>
/// <param name="userProfile">Inviting user; <c>steamId64</c> is matched against comment authors.</param>
/// <param name="bot">Bot whose profile comments are fetched and whose invite configuration is applied.</param>
/// <returns>
/// The action of the first matching rule; otherwise the configured <c>"default"</c> action.
/// <c>"none"</c> is returned when the comments cannot be fetched, Steam reports failure,
/// or the bot has no invite configuration.
/// </returns>
private static async Task<Action> processCommentedOnProfile(UserProfile userProfile, Bot bot)
{
    WebBrowser webBrowser = ASF.WebBrowser;
    Uri url = new("https://steamcommunity.com/comment/Profile/render/" + bot.SteamID);

    // A failed request or a missing "success" field is treated like success == false.
    JObject response = (await webBrowser.UrlGetToJsonObject<JObject>(url))?.Content;
    if (response?.GetValue("success")?.ToObject<bool>() != true)
    {
        return new Action("none");
    }

    string commentsHtml = response.GetValue("comments_html")?.ToString() ?? string.Empty;
    var context = BrowsingContext.New(Configuration.Default);
    var htmlDocument = await context.OpenAsync(req => req.Content(commentsHtml));

    // Comment texts keyed by the last path segment of each author's profile URL.
    var comments = new List<KeyValuePair<string, string>>();
    foreach (var node in htmlDocument.QuerySelectorAll("div.commentthread_comment"))
    {
        string href = node.QuerySelector("a.commentthread_author_link")?.GetAttribute("href");
        var commentNode = node.QuerySelector("div.commentthread_comment_text");

        // Skip malformed entries instead of failing the whole evaluation.
        if (commentNode == null || !Uri.TryCreate(href, UriKind.Absolute, out Uri authorUri))
        {
            continue;
        }

        string comment = commentNode.Text().Trim().Normalize();
        string authorProfileID = authorUri.Segments[authorUri.Segments.Length - 1].Replace("/", "");
        comments.Add(new KeyValuePair<string, string>(authorProfileID, comment));
    }

    var groupedData = comments.ToLookup(x => x.Key, x => x.Value);

    if (!Config.FriendInviteConfigs.TryGetValue(bot, out Config config) || config == null)
    {
        return new Action("none");
    }

    // NOTE(review): the original also derived an ID from userProfile.profileUrl but never
    // used it; authors are matched by steamId64 only, so commenters whose profile link
    // ends in a custom name would not match - confirm whether that is intended.
    string senderId = userProfile.steamId64.ToString();

    // A lookup yields an empty sequence for unknown keys, so "no comments" is simply Count == 0.
    List<string> senderComments = groupedData[senderId].ToList();
    bool hasComments = senderComments.Count > 0;

    string defaultAction = "none";
    foreach (ConfigItem item in config.Comments)
    {
        switch (item.condition)
        {
            case "less_than":
            {
                int threshold = Convert.ToInt32(item.value);
                if (senderComments.Count < threshold)
                {
                    return new Action(item.action, "Number of comments < " + threshold);
                }

                break;
            }

            case "more_than":
                // Only evaluated for users who commented at all, as before.
                if (hasComments && senderComments.Count > Convert.ToInt32(item.value))
                {
                    return new Action(item.action, "Number of comments > " + Convert.ToInt32(item.value));
                }

                break;

            case "equal":
                if (senderComments.Contains(item.value))
                {
                    return new Action(item.action, "Comment is " + item.value);
                }

                break;

            case "contain":
                if (senderComments.Any(comment => comment.Contains(item.value, StringComparison.OrdinalIgnoreCase)))
                {
                    return new Action(item.action, "Profile comment contains " + item.value);
                }

                break;

            case "default":
                defaultAction = item.action;
                break;
        }
    }

    return new Action(defaultAction);
}
/// <summary>
/// Loads the archive page at <c>_archiveDomain</c> and lazily yields one
/// <see cref="CWMVolume"/> for every volume link listed on it.
/// </summary>
/// <returns>Volumes in page order, each with its year, number and absolute page address.</returns>
/// <exception cref="FormatException">
/// Thrown when the year or the volume number cannot be parsed from a link's text.
/// </exception>
public static IEnumerable<CWMVolume> ScrapeVolumes()
{
    var browsingContext = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

    // Iterator methods cannot await, so the page load is waited for synchronously.
    using (var page = browsingContext.OpenAsync(_archiveDomain).GetAwaiter().GetResult())
    {
        // Walk down the fixed wrapper chain one element at a time.
        var contentElement = page.GetElementById("canvas-wrapper");
        var wrapperSelectors = new[]
        {
            "div#canvas",
            "div#page-body-wrapper",
            "div#page-body",
            "div#content-wrapper",
            "div#content",
        };
        foreach (var wrapperSelector in wrapperSelectors)
        {
            contentElement = contentElement.QuerySelector(wrapperSelector);
        }

        // The archive lists live in the second child of the content element.
        var layoutRow = contentElement
            .Children[1]
            .QuerySelector("div.sqs-layout")
            .QuerySelector("div.row.sqs-row");

        var volumeEntries = layoutRow
            .QuerySelectorAll("ul.archive-group-list")
            .ToArray()
            .SelectMany(groupList => groupList.QuerySelectorAll("li.archive-group").ToArray());

        foreach (var volumeEntry in volumeEntries)
        {
            var anchor = volumeEntry.QuerySelector("a.archive-group-name-link");
            var relativeHref = anchor.GetAttribute("href");
            var linkText = anchor.TextContent.Trim();

            // A non-matching text leaves both groups empty, which fails the parses below.
            var match = _volumeLinkTextRegex.Match(linkText);
            var yearText = match.Groups["year"].Value;
            var numberText = match.Groups["volumeNumber"].Value;

            if (!int.TryParse(yearText, out var year))
            {
                throw new FormatException(
                    $"Cannot parse the 'year' from the text {yearText.Quote()}.");
            }

            if (!int.TryParse(numberText, out var number))
            {
                throw new FormatException(
                    $"Cannot parse the 'volumeNumber' from the text {numberText.Quote()}.");
            }

            yield return new CWMVolume(year, number, $"{_domain.TrimEnd('/')}{relativeHref}");
        }
    }
}
/// <summary>
/// Loads the page of <paramref name="cwmIssue"/> and lazily yields one
/// <see cref="CWMArticle"/> per line of its product description.
/// </summary>
/// <param name="cwmIssue">Issue whose <c>IssuePageAbsoluteUrl</c> is downloaded; attached to every yielded article.</param>
/// <returns>
/// Articles in page order. Lines that fuzzily match a magazine section name are not
/// yielded; they set the category of the articles that follow (initially "Unknown").
/// </returns>
public static IEnumerable<CWMArticle> ScrapeIssueArticles(CWMIssue cwmIssue)
{
    var browsingContext = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

    // Iterator methods cannot await, so the page load is waited for synchronously.
    using (var page = browsingContext.OpenAsync(cwmIssue.IssuePageAbsoluteUrl).GetAwaiter().GetResult())
    {
        var contentElement = page
            .GetElementById("canvas-wrapper")
            .QuerySelector("div#canvas > div#page-body-wrapper > div#page-body > div#content-wrapper > div#content");

        // The product markup lives in the third child of the content element.
        var descriptionBlock = contentElement
            .Children[2]
            .QuerySelector("div#productWrapper > div.product-description > div.sqs-layout > div.row.sqs-row > div.col > div.sqs-block.html-block > div.sqs-block-content");

        var sectionNames = ValueEnum
            .EnumerateValues<MagazineSection, string>()
            .ToArray();
        var category = "Unknown";

        foreach (var line in descriptionBlock.Children)
        {
            // NOTE(review): the replaced character is reproduced as it appears in this
            // file; confirm whether a non-breaking space was meant.
            var lineText = line.TextContent.Replace(" ", "").Trim();

            // Section headings switch the running category and produce no article.
            if (sectionNames.Contains(lineText, new FuzzyStringMatchingComparer(2)))
            {
                category = lineText;
                continue;
            }

            // Exactly one '-' means "name - author"; anything else is treated as a bare name.
            var parts = lineText.Split('-');
            var hasAuthor = parts.Length == 2;
            var name = hasAuthor ? parts[0].Trim() : lineText.Trim();
            var author = hasAuthor ? parts[1].Trim() : "unknown";

            yield return new CWMArticle(category, name, author, cwmIssue);
        }
    }
}
/// <summary>
/// Loads the page of <paramref name="cwmVolume"/> and lazily yields one
/// <see cref="CWMIssue"/> per product link whose address matches <c>_issueLinkTextRegex</c>.
/// </summary>
/// <param name="cwmVolume">Volume whose <c>VolumePageAbsoluteUrl</c> is downloaded; attached to every yielded issue.</param>
/// <returns>Issues in page order. Links that do not match, or match without an issue number, are skipped silently.</returns>
/// <exception cref="FormatException">
/// Thrown when the matched volume number or issue number is not an integer.
/// </exception>
public static IEnumerable<CWMIssue> ScrapeVolumeIssues(CWMVolume cwmVolume)
{
    var browsingContext = BrowsingContext.New(Configuration.Default.WithDefaultLoader());

    // Iterator methods cannot await, so the page load is waited for synchronously.
    using (var page = browsingContext.OpenAsync(cwmVolume.VolumePageAbsoluteUrl).GetAwaiter().GetResult())
    {
        var contentElement = page
            .GetElementById("canvas-wrapper")
            .QuerySelector("div#canvas > div#page-body-wrapper > div#page-body > div#content-wrapper > div#content");

        // The product list lives in the third child of the content element.
        var productList = contentElement.Children[2].QuerySelector("div#productList");

        foreach (var anchor in productList.QuerySelectorAll("a.product").ToArray())
        {
            var relativeHref = anchor.GetAttribute("href");

            // One Match call replaces the earlier IsMatch + Match pair.
            var match = _issueLinkTextRegex.Match(relativeHref.Trim());
            if (!match.Success)
            {
                continue;
            }

            var magazineText = match.Groups["magazine"].Value;
            var volumeText = match.Groups["volumeNumber"].Value;
            var issueText = match.Groups["issueNumber"].Value;

            if (issueText.IsNullOrEmptyEx())
            {
                continue;
            }

            var magazine = Magazine.GetMagazineFromPrefix(magazineText);

            if (!int.TryParse(volumeText, out var volumeNumber))
            {
                throw new FormatException(
                    $"Cannot parse the 'volumeNumber' from the text {volumeText.Quote()}.");
            }

            if (!int.TryParse(issueText, out var issueNumber))
            {
                throw new FormatException(
                    $"Cannot parse the 'issueNumber' from the text {issueText.Quote()}.");
            }

            yield return new CWMIssue(
                volumeNumber,
                issueNumber,
                magazine,
                $"{_domain.TrimEnd('/')}{relativeHref}",
                cwmVolume);
        }
    }
}
/// <summary>
/// Downloads every page in <paramref name="urls"/>, follows the post links found on it
/// (one request per distinct address) and collects each post's title and content.
/// </summary>
/// <param name="urls">Addresses of the listing pages to process.</param>
/// <returns>
/// The posts gathered before the first failure, without posts whose content is missing,
/// empty or whitespace only. Any exception is logged and ends processing early; it is
/// not rethrown.
/// </returns>
public async Task<List<DownloadedPost>> ProcessPostTask(IEnumerable<string> urls)
{
    // Number of leading words used to build a title when a post has none.
    const int titleWordCount = 7;

    var angleSharpConfig = Configuration.Default
        .WithCulture("es-es")
        .WithDefaultLoader()
        .WithCss()
        .WithJs()
        .WithXPath();
    var angleSharpContext = BrowsingContext.New(angleSharpConfig);
    var result = new List<DownloadedPost>();

    try
    {
        foreach (var url in urls)
        {
            _logger.LogInformation("Starts processing posts for URL {0}", url);

            var mainPageDocument = await angleSharpContext.OpenAsync(url);
            var links = await _strategyLoaderService.ParseLinksByStrategy(mainPageDocument);

            // Materialise once: the groups are both counted and iterated.
            var groupedResult = links.GroupBy(x => x.Href).ToList();
            _logger.LogInformation("Got a total of {0} links for URL {1}", groupedResult.Count, url);

            foreach (var link in groupedResult)
            {
                var postDocument = await angleSharpContext.OpenAsync(link.Key);
                var title = await _strategyLoaderService.ParseTitleByStrategy(postDocument);
                var content = await _strategyLoaderService.ParseContentByStrategy(postDocument);

                if (string.IsNullOrWhiteSpace(title))
                {
                    // Fall back to the first words of the content. Stop at the first paragraph
                    // that supplies a full set of words; the previous "== 7" test only stopped
                    // on paragraphs of exactly seven words, so longer ones were overwritten.
                    var extractedTitle = string.Empty;
                    foreach (var words in content.Select(paragraph => paragraph.SplitSpaces()))
                    {
                        // NOTE(review): words are joined without a separator, as before -
                        // confirm whether SplitSpaces keeps the spaces or " " was meant.
                        extractedTitle = string.Join("", words.Take(titleWordCount));
                        if (words.Length >= titleWordCount)
                        {
                            break;
                        }
                    }

                    title = extractedTitle;
                }

                result.Add(new DownloadedPost
                {
                    Title = title,
                    PostContents = content,
                });
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: keep whatever was collected so far.
        _logger.LogError(e, "Rest in pepperoni, innerEx: {0}", e.InnerException);
    }

    // All() is true for an empty sequence, so this also drops posts without any content;
    // the null check keeps a post with missing content from crashing the filter.
    result.RemoveAll(x => x.PostContents == null || x.PostContents.All(string.IsNullOrWhiteSpace));
    return result;
}
/// <summary>
/// Requests the show times of <paramref name="date"/> from amccinemas.com and attaches
/// the parsed sessions to the matching movies.
/// </summary>
/// <param name="date">Day to query; only its year, month and day are used.</param>
/// <param name="cinemas">Known cinemas; a session's cinema is resolved by <c>Name</c>.</param>
/// <param name="movies_proto">
/// Known movies, resolved by <c>Title</c>. The list is copied, but the movie objects are
/// shared, so sessions are added to the instances passed in.
/// </param>
/// <returns>
/// A success result carrying the movies, or an error result when the request fails.
/// Show-time groups naming a cinema or movie that is not in the supplied lists are skipped.
/// </returns>
public async Task<ParseResult<Parse.Data>> GetMoviesByDateAsync(DateTime date, List<Cinema> cinemas, List<Movie> movies_proto)
{
    List<Movie> movies = new List<Movie>(movies_proto);
    string param = $"{date.Year}-{date.Month}-{date.Day}";

    // NOTE(review): hard-coded browser session cookie including an access token. Secrets
    // do not belong in source; this should come from configuration. Left unchanged here
    // so the request stays identical. The value is split into pieces only for line length.
    string sessionCookie =
        "cookieLanguageCode=en-US; _gcl_au=1.1.1238722976.1602705939; _fbp=fb.1.1602705939926.1048953155; _gid=GA1.2.445262349.1603531359; moe_uuid=56694985-c80f-499b-b8ee-353d2ba58144; movieDetails=qeoEzVnzBc-6cYMbYWwS4scp_uWZetsLEfEwEONYBHu6Ug3EcLkPOkkIJRxNIl21LoXAHMoiLl-K2GFvWFxLoLetERKNJ9mg2n5nJuGQnCTm4cLcKFW7kZ4wKrzk7QcgU5mbqjSflnb4_PeFk656nOlvLWaS7p8iFkS-zpiG8sRM94UJOeX_RAJBHeSg-2ca0iolX65I08aDzQXISNe834N45rFvnlXKnczOqHQxCJvkiScCXPEMudUt0mZ50_zT4mA6cD08aIHg-Hc-kaOI0Sg3RjPFVqt3tsn14JDZoCeu0PFyc7VLVr79jwvPn6r--qKRLudWoNJO5aJNQv_2smOTfz0_ZUF7RxCpwQ8geT_vx7XKrH_0c9o23bsv5jiokSdBTwrkHbiwYk4ITrwCkmhHtkHeJVYs3sKRoroNevKirCwY585fkH59_SHcrTA8RUc6JMj2ixXsezJeeszlngooxw8GHDdH5o1ASeVsvKFbOcHeWPCNjA5XgQZvyIFKoHDQjZXS5N-Tu0-G_wNFmL4aQBnOdvIbFw0_DfbDHLaFTfeJY9NuTSyhoZJP8mOaebaIZMokCCFINDhBcgerB1CcAhP5SUnKdXkeLbBADZskqh_sU-f8QzGsesP1dOa1tNQmRDwenGOKb3MDof1EkQCYenShYRMmJC7VY1SBqmlM8n0aOMycbc1kcoLME98Z7F4BGJUKMMxa4PB1XU_e3OIEZYobFZP6czI1juhh9yYHWum4Yyd_JVGXqjh622CGWUhIXWAjkwDX1n6n5C-nb5ar5UVbclfatS1YKyuUXAU_zODz2OFT8Wxo5Lto1OFtqEGYkKiIu-m9A7w4l8yQIUpb1tLpOvJnEbXe_1CWiqM6hEaixfrc7jJHQ-FncGApH7i0WNF4iALHGV0aeIwjHiL5jbE5rMrHi1K_suW7TM2xrzVg93DQQBlgvhnd7g5UJ4jTCR4XyWMKKMcPBhw3R-aNQEnlbLyhWznB3ljwjYwOV4EGb8iBSqBw0I2V0uTfk7tPhv8DRZi-GkAZznbyGxcNPw32wxy2hU33iWPTbk09TfVWsyySmAhSvdSZ3240g2jYn36zA11QnAIT-UlJynV5Hi2wQA2bJKof7FPn_EMxrHrD9XTmtbTUJkQ8HpxjAwoeVLSoTKPQ4y2TWLCzR9Ei5wWaWcn3MjZnddtzlBGOiECIDhKg5cvAi5xIRSpTWuDwdV1P3tziBGWKGxWT-nMyNY7oYfcmB1rpuGMnfyVRl6o4ettghllISF1gI5MA19Z-p-Qvp2-Wq-VFW3JX2lovt8CIT7PaD5XH9yjF98wF5oI0EPF53Lc738yxFmYEfw1Kbiphe4_NiXlcBxA2rI" +
        "j5MI_W0zvmFZchj8C0eES6TXP_fFrixhgjHoboPhh0jSDlwXIV-kd1zBprw6hS4YPF4MlAm57mDdEnjV8a-GNx2JO20V7SeT9Jz852f9ooX6uZGVYXcaF7tpJ-xlK_lQJOV3XVyz_KJZRfILUk-QCoND7wXTraBj3qRM7h7rfdJ3SbyxHW4kpaM6i2DODUh5ZT8_yOKDzi7Ueba1g_6F3Uf97Rk_w4uNL8C58OyDdurk9Q_OweYQuFfmV4aoKNnF5r5Vq_Kc69zTw79tbFWEL0GrFW9QTj8WTr7r6dyup5NhKbmuzd6OL1mdCdxQGASSVesopwH44N0UIx-_UCNAn3zhfAlxD53CiOdBMGW5Clq-CQyaBo9TFZ2UH-CqdlTqDK_2z0XZKxvnSlBp--b9VRWoHoa9fnPMkB2lBmtrQHR1BO9A7Et6IPlpwqbOK2572nGc3FLffigcWEYxLiTv3N0QzKnxka6oldA7f0-Apqe40I5hu9OzFn-8kMwEnWxf1hWkUABpZeHdgdZkIRi5Ahz2mpBnqJwyDCX5pRE2tgjje_ST3iOzn7PbcXQhQzzt6nR0jz-ADG7-JkLXZ68Waud65ZrZ5oBkZDRk-JwMR0kM4LD9ARW-EVXsPJ9S5Y_fcAgUHIrU7AL_M_c5Eplx-OcA-zua4UhO5r5h69qLewm-E7V0KvfmJf77HDDjXxK-fFXsKQnjVzV18WLXLB-A35MgPPwaJ1R4ewcF8SNup8DeAykd6FzgS21uYAO4jMus-qf6rg4_RwUJ80eWBXSAoXHQV-EL5ULP74mXTdCQ5fsaaDv-NVkFreM1J8ci1geDlrqrH8QC0W5X-CJzqE-EzQ5RwkWYIdOx7v_LPhb5hi6bzodBbRzg_vCxKbz2dKDJs2K4vSvJPkXW50W1kUGWW6TXP_fFrixmV7HljSa-BEThFF1iSQwnPyGqH5QvX3kcOKuAqadrSKht423Ge2agI_uwUw9WL3I0zfewFuTT6RMFfaIWOAzRJtHzT-cK8X4e3OKggm-cCfFIA4KTo8DPGm48OQSnQHnDTzQPjeoSV0M-31M8cNvd6He6j6RS4QZmiaD4VShb8zkZu7h-UO-hsgt7Hlw8ufys580_sSRY0fzLEtSnB2As02oU262OM-_1ioC6hC_shFKq4XSUgj-lmi3iJsFVLL4ismpqxRrlaPrBdamlWixdh7sspXVzz_8r2dVcQxP57nmgmfg0kQ7e_lN68fsvocEbFsGrVmFvhBDBZR4C5ieSJZ2Mjx4jek_-c9epWdVKjG-7aYgR7b5Wy3TmnUpnzPLFSRzKMyQzF07yi9BRSlt0JNqJgxZut9aezSGoMxJOXEvmR28hbWK9fOHaqH_YgXWLnWwmPE_oE2w9v24I9mRqdz-Ev2Qrlb2itS5La3cXrApIqHcZdHKdQeXydAX9ZwzaRAcq1I8wLDToCntq1opN2sqkEYwOfK1iAz3kV16I7A-oXjWvBiYX848t7iua4KaYACrIc96WArIxMxW1zflgXJ1AAD0nLzLIZkMPd5lo7K257dGMCKo5TtlzVJNkrHMEXMV2FHgFqsS_UBHv9OLxynpA0ixs1Ixho-D4WpMgOiX2JLt3vyRG3CWNKrjoF5mY7AN_yiVifETdNtFtcWF6SJjku5ZJLHmD-pAW0Yx1gqFgbboUu1T8YhdHu58Km5Ri77GCZnj0wVcVFeePYSCMw=; _gat_UA-117333109-1=1; " +
        "AccessTokenInfo=u2x96MRfae-glKK-nne0UV_A-GUIp1Pp36QiNk9wKOOkCGuHUNUsNAMdZQBNFno3kZ9VqVTPBMIxMGUOdzfyb1rGxtEq6lS8-iP8KeTvybVFqjc-5HctD_wkIJEF7-7kx8meVjzWxySwVgwkEZmqv5fGeVqv0EsLhQa05blbJJoF2HSdEMiOBwgyeTQ_lR6gOo_4s-N2npeI4xHYtG4p_TFRfLXvVa8llp8mIxBrHwUKfAJH_osh7K6v1RWm7ZvNIo4XwnuyUTmDeaeNmWtotuLjKE3MvCYYycGrak3AnBMuEX1EpR4synaZBqmGFgJK_RYyfuRoiqL-aFm9GVSfmKVGi2kiLBuAwTp0nuDlMoqjG6r53iG9x8D2RIRgv5DsgOgN8SS7KhNrQFoaL-rxlKmtrTSYPzWci8igX0lEBF7RrJV8qYnN1ER6V6KOQkwLQ1MhpROy3Lk3fu12Tz-_0UYHIzp9QNgFpL4y8b1ZsUmVxcyGkAqFg6TOPBi32mvnUdCre-QPTbA_dLwmF0yNETtVxAVZo4aeDOdihITWOZMCj6H5OLSglKMfWj615P9bWMf_Pihog7uRy-sDA5TjxLYhU4ZvH97xyQKEUWk6rnT_A6mH8DnfipmRBGFO1kkKUyrJvJBvADCt4nL567uFBvbGdFadUcq-JzBjzmFIe9CoU_VsEYtBWhAGV0sbajk_k_9Ls190ziS3d9lOSi5YRxAMLCipEKM3yGi2IduB_ERn1d5exZTCRolUrSdD4d3yFNlu1FQS7QtJD75FxN5BYCB03KgMBRUnyNejPU0RiqHx-liJQMQiPrRkDBpyxo-de5MBKoNSgt-nW44dmEWMrPK5h2jNpOwjtfbwxGxFFSp6txscWxaMDbo9mGSJeE6e27tb2WvX5YSNi68fccyo3PZLTF-H8ZixtfoTn4g1nL1JGA55-oXDCjq95ivHCJfGSdXmV99vayz6I_wp5O_JtUWqNz7kdy0Pf_SsPVbZS7bHyZ5WPNbHJLBWDCQRmaq_l8Z5Wq_QSwv4obJ14EiI6gO4QyDgrNISZkkg3fwMSDiTDKMoMVLi0xsXF7xJnGUoaH2Dvehd22ZFaozIpZB94wHIE0elFyUtXCLVqeNo-gyAUcIOdORSk7XqfCElKheoGzI4Op1kfF3rouycJtpjGcztaWblB-nY52VmuwOpsfRatMz60tA06ZSAQK3FzQmbJNPi_LyWagUDokf3pvAxfz1LJgnxe7cn55cPBW8IWOuHoxAPIZ7HePAxFtttra0C2F9_CB6SI11M9QSweb-rSgZwU_xc5wFsEb2CIWj3SgE4gbO3WxnbCJcXTd8RK_7yG9OzGhvLk-uzz7W6g1KXVFjb8mCWoxgGMK97SL_CjmkTQGcgwup24JzVo_IGkxVNJQyUDhfndbHoxLZstyyfKlrGxtEq6lS8JMPOGXaUEQK3yn_wkykYX0yrtmgM99sTXzHcpK1916aI_ev7q5X2KUjdDofJfybNKEVcSE8n-hSP8b652vCeFdkY7P4cXN29; _ga_EJRWLKGJNW=GS1.1.1603541129.5.1.1603542988.47; _ga=GA1.2.87188470.1602703684";

    ConcurrentDictionary<string, string> headers = new ConcurrentDictionary<string, string>();
    headers.TryAdd("ContentType", "application/json");
    headers.TryAdd("Cookie", sessionCookie);

    var result = await _http.Execute(o =>
    {
        o.Host = new Uri("https://www.amccinemas.com/showtime/getallmovies");
        o.Method = HttpMethod.Post;
        o.Headers = headers;
        o.Body = @"{""url"":""showdate=" + param + @"""}";
    });

    if (!result.IsSuccess)
    {
        return new ParseResult<Parse.Data>(Result.Error);
    }

    // Parse the returned markup only; no further requests are made.
    var context = BrowsingContext.New(Configuration.Default);
    var document = await context.OpenAsync(req => req.Content(result.Response));

    // One "amc-time-list" element per cinema/movie pair, containing the session spans.
    foreach (var item in document.All.Where(m => m.ClassName == "amc-time-list"))
    {
        string cinemaName = item.GetAttribute("data-cinemaname");
        string movieName = item.GetAttribute("data-moviename");

        // Skip groups the caller does not know about instead of aborting the whole parse.
        Cinema cinema = cinemas.FirstOrDefault(x => x.Name == cinemaName);
        Movie movie = movies.FirstOrDefault(x => x.Title == movieName);
        if (cinema == null || movie == null)
        {
            continue;
        }

        List<Session> sessions = new List<Session>();

        // Query the element directly; re-parsing its InnerHtml as a new document is unnecessary.
        foreach (var sessionElement in item.QuerySelectorAll("span").Where(m => m.ClassName == "amc-time"))
        {
            // The time text is wrapped in one extra character on each side.
            string raw = sessionElement.InnerHtml;
            string time = raw.Substring(1, raw.Length - 2);

            // A time-only format is sufficient; only hour and minute are used.
            DateTime parsedTime = DateTime.ParseExact(time, "h:mm tt", CultureInfo.InvariantCulture);

            sessions.Add(new Session
            {
                ShowTime = new DateTime(date.Year, date.Month, date.Day, parsedTime.Hour, parsedTime.Minute, 0),
                CinemaId = cinema.ExternalId,
            });
        }

        movie.Sessions.AddRange(sessions);
    }

    return new ParseResult<Data>(Result.Success)
    {
        Data = new Data() { Movies = movies }
    };
}
/// <summary>
/// Validates all images if they are rotated correctly (when <paramref name="rotate"/> is set
/// to <c>true</c>) and fit on the given <paramref name="pageSettings"/>.
/// If an image does need to be rotated or does not fit then a local copy is made of
/// the <paramref name="inputUri"/> file.
/// </summary>
/// <param name="inputUri">The uri of the webpage</param>
/// <param name="resize">When set to <c>true</c> then an image is resized when needed</param>
/// <param name="rotate">When set to <c>true</c> then the EXIF information of an
/// image is read and when needed the image is automatically rotated</param>
/// <param name="sanitizeHtml">When set to <c>true</c> then the HTML will get sanitized</param>
/// <param name="pageSettings"><see cref="PageSettings"/></param>
/// <param name="outputUri">The uri of the rewritten local copy when this method returns
/// <c>false</c>, otherwise <c>null</c></param>
/// <returns>Returns <c>false</c> when the HTML was changed and a local copy was written,
/// otherwise <c>true</c> (also when the page could not be parsed or the copy could not be written)</returns>
/// <exception cref="WebException">Raised when the webpage from <paramref name="inputUri"/> could not be downloaded</exception>
public bool Validate(ConvertUri inputUri, bool resize, bool rotate, bool sanitizeHtml, PageSettings pageSettings, out ConvertUri outputUri)
{
    outputUri = null;
    string localDirectory = null;

    if (inputUri.IsFile)
    {
        localDirectory = Path.GetDirectoryName(inputUri.OriginalString);
    }

    using (var webpage = inputUri.IsFile ? File.OpenRead(inputUri.OriginalString) : DownloadStream(inputUri))
    {
        // Printable area, converted with a factor of 96 per paper unit.
        var maxWidth = (pageSettings.PaperWidth - pageSettings.MarginLeft - pageSettings.MarginRight) * 96.0;
        var maxHeight = (pageSettings.PaperHeight - pageSettings.MarginTop - pageSettings.MarginBottom) * 96.0;

        var htmlChanged = false;
        var config = Configuration.Default.WithCss();
        var context = BrowsingContext.New(config);

        IDocument document;
        try
        {
            // ReSharper disable AccessToDisposedClosure
            document = inputUri.Encoding != null
                ? context.OpenAsync(m => m.Content(webpage).Header("Content-Type", $"text/html; charset={inputUri.Encoding.WebName}")).Result
                : context.OpenAsync(m => m.Content(webpage)).Result;
            // ReSharper restore AccessToDisposedClosure
        }
        catch (Exception exception)
        {
            // A page that cannot be parsed is passed through unchanged.
            WriteToLog($"Exception occured in AngleSharp: {ExceptionHelpers.GetInnerException(exception)}");
            return true;
        }

        if (sanitizeHtml)
        {
            WriteToLog("Sanitizing HTML");
            new HtmlSanitizer().DoSanitize(document as IHtmlDocument, document.DocumentElement);
            htmlChanged = true;
            WriteToLog("HTML sanitized");
        }

        WriteToLog("Validating all images if they need to be rotated and if they fit the page");

        // Images that were inspected but not rewritten; they are copied locally later
        // when the HTML itself has to be rewritten.
        var unchangedImages = new List<IHtmlImageElement>();

        // ReSharper disable once PossibleInvalidCastExceptionInForeachLoop
        foreach (var htmlImage in document.Images)
        {
            var imageChanged = false;

            if (string.IsNullOrWhiteSpace(htmlImage.Source))
            {
                WriteToLog($"HTML image tag '{htmlImage.TagName}' has no image source '{htmlImage.Source}'");
                continue;
            }

            Image image = null;

            // Strip any query string before deriving the file extension.
            var source = htmlImage.Source.Contains("?")
                ? htmlImage.Source.Split('?')[0]
                : htmlImage.Source;
            var extension = Path.GetExtension(FileManager.RemoveInvalidFileNameChars(source));
            var fileName = GetTempFile(extension);

            try
            {
                // The local width and height attributes always go before css width and height
                var width = htmlImage.DisplayWidth;
                var height = htmlImage.DisplayHeight;

                if (rotate)
                {
                    image = GetImage(htmlImage.Source, localDirectory);
                    if (image == null)
                    {
                        continue;
                    }

                    if (RotateImageByExifOrientationData(image))
                    {
                        WriteToLog($"Image rotated and saved to location '{fileName}'");
                        image.Save(fileName);
                        htmlImage.DisplayWidth = image.Width;
                        htmlImage.DisplayHeight = image.Height;
                        htmlImage.SetStyle(string.Empty);
                        htmlImage.Source = new Uri(fileName).ToString();
                        htmlChanged = true;
                        imageChanged = true;
                    }

                    width = image.Width;
                    height = image.Height;
                }

                if (resize)
                {
                    // No size attributes: fall back to the computed CSS size.
                    if (height == 0 && width == 0)
                    {
                        var style = context.Current.GetComputedStyle(htmlImage);
                        if (style != null)
                        {
                            width = ParseValue(style.GetPropertyValue("width"));
                            height = ParseValue(style.GetPropertyValue("height"));
                        }
                    }

                    // If we don't know the image size then get it from the image itself
                    if (width <= 0 || height <= 0)
                    {
                        if (image == null)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);
                        }

                        if (image == null)
                        {
                            continue;
                        }

                        width = image.Width;
                        height = image.Height;
                    }

                    if (width > maxWidth || height > maxHeight)
                    {
                        // If we did not load the image already then load it
                        if (image == null)
                        {
                            image = GetImage(htmlImage.Source, localDirectory);
                        }

                        if (image == null)
                        {
                            continue;
                        }

                        image = ScaleImage(image, (int)maxWidth);
                        WriteToLog(
                            $"Image resized to width {image.Width} and height {image.Height} and saved to location '{fileName}'");
                        image.Save(fileName);
                        htmlImage.DisplayWidth = image.Width;
                        htmlImage.DisplayHeight = image.Height;
                        htmlImage.SetStyle(string.Empty);
                        htmlImage.Source = new Uri(fileName).ToString();
                        htmlChanged = true;
                        imageChanged = true;
                    }
                }
            }
            finally
            {
                image?.Dispose();
            }

            if (!imageChanged)
            {
                unchangedImages.Add(htmlImage);
            }
        }

        if (!htmlChanged)
        {
            return true;
        }

        // The HTML is rewritten to a temp file, so the untouched images are copied to
        // temp files as well and their sources repointed.
        foreach (var unchangedImage in unchangedImages)
        {
            using (var image = GetImage(unchangedImage.Source, localDirectory))
            {
                if (image == null)
                {
                    WriteToLog($"Could not load unchanged image from location '{unchangedImage.Source}'");
                    continue;
                }

                var extension = Path.GetExtension(unchangedImage.Source.Contains("?")
                    ? unchangedImage.Source.Split('?')[0]
                    : unchangedImage.Source);
                var fileName = GetTempFile(extension);
                WriteToLog($"Unchanged image saved to location '{fileName}'");
                image.Save(fileName);
                unchangedImage.Source = new Uri(fileName).ToString();
            }
        }

        var outputFile = GetTempFile(".htm");
        outputUri = new ConvertUri(outputFile, inputUri.Encoding);

        try
        {
            using (var fileStream = new FileStream(outputFile, FileMode.CreateNew, FileAccess.Write))
            using (var textWriter = inputUri.Encoding != null
                ? new StreamWriter(fileStream, inputUri.Encoding)
                : new StreamWriter(fileStream))
            {
                document.ToHtml(textWriter, new HtmlMarkupFormatter());
            }

            return false;
        }
        catch (Exception exception)
        {
            WriteToLog($"Could not generate new html file '{outputFile}', error: {ExceptionHelpers.GetInnerException(exception)}");

            // Honour the documented contract: no output uri when true is returned.
            outputUri = null;
            return true;
        }
    }
}
/// <summary>
/// Runs <paramref name="solver"/> against its tests and, when they pass and a part of the
/// puzzle is still open, posts the next answer to the site, prints the response and
/// commits the work to the local git repository when the answer was accepted.
/// </summary>
/// <param name="solver">Solver whose year, day and computed answers are used.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the response page does not contain the expected article element.
/// </exception>
public async Task Upload(Solver solver)
{
    var color = Console.ForegroundColor;

    Console.WriteLine();
    var solverResult = Runner.RunSolver(solver);
    Console.WriteLine();

    if (solverResult.errors.Any())
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("Uhh-ohh the solution doesn't pass the tests...");
        Console.ForegroundColor = color;
        Console.WriteLine();
        return;
    }

    var problem = await DownloadProblem(GetContext(), GetBaseAddress(), solver.Year(), solver.Day());

    if (problem.Answers.Length == 2)
    {
        Console.WriteLine("Both parts of this puzzle are complete!");
        Console.WriteLine();
    }
    else if (solverResult.answers.Length <= problem.Answers.Length)
    {
        Console.WriteLine($"You need to work on part {problem.Answers.Length + 1}");
        Console.WriteLine();
    }
    else
    {
        var level = problem.Answers.Length + 1;
        var answer = solverResult.answers[problem.Answers.Length];
        Console.WriteLine($"Uploading answer ({answer}) for part {level}...");

        // https://adventofcode.com/{year}/day/{day}/answer
        // level={part}&answer={answer}
        var cookieContainer = new CookieContainer();
        using var handler = new HttpClientHandler() { CookieContainer = cookieContainer };
        using var client = new HttpClient(handler) { BaseAddress = GetBaseAddress() };

        // Request content and response are disposable; release them with the client.
        using var content = new FormUrlEncodedContent(new[]
        {
            new KeyValuePair<string, string>("level", level.ToString()),
            new KeyValuePair<string, string>("answer", answer),
        });

        cookieContainer.Add(GetBaseAddress(), new Cookie("session", GetSession()));
        using var result = await client.PostAsync($"/{solver.Year()}/day/{solver.Day()}/answer", content);
        result.EnsureSuccessStatusCode();
        var responseString = await result.Content.ReadAsStringAsync();

        var config = Configuration.Default;
        var context = BrowsingContext.New(config);
        var document = await context.OpenAsync(req => req.Content(responseString));

        var articleElement = document.Body.QuerySelector("body > main > article");
        if (articleElement == null)
        {
            // Name the problem instead of failing with a bare NullReferenceException.
            throw new InvalidOperationException("The answer response does not contain the expected article element.");
        }

        // Strip the trailing boilerplate so only the verdict text remains.
        var article = articleElement.TextContent;
        article = Regex.Replace(article, @"\[Continue to Part Two.*", "", RegexOptions.Singleline);
        article = Regex.Replace(article, @"You have completed Day.*", "", RegexOptions.Singleline);
        article = Regex.Replace(article, @"\(You guessed.*", "", RegexOptions.Singleline);

        // NOTE(review): the pattern is a single space as written in this file, which turns
        // every space into a line break - confirm whether a run of two spaces was intended.
        article = Regex.Replace(article, @" ", "\n", RegexOptions.Singleline);

        using var repo = new Git.Repository(".git");

        if (article.StartsWith("That's the right answer") || article.Contains("You've finished every puzzle"))
        {
            Git.Commands.Stage(repo, "*");
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine(article);
            Console.ForegroundColor = color;
            Console.WriteLine();

            await Update(solver.Year(), solver.Day());

            var signature = new Git.Signature(
                repo.Config.Get<string>("user.name").Value,
                repo.Config.Get<string>("user.email").Value,
                DateTime.Now);

            if (article.StartsWith('T'))
            {
                // "That's the right answer": commit the reference output first, then the rest.
                Git.Commands.Stage(repo, "**/input.refout");
                repo.Commit($"Solved P1", signature, signature, new());
                Git.Commands.Stage(repo, "*");
                repo.Commit("P2", signature, signature, new());
            }
            else
            {
                Git.Commands.Stage(repo, "*");
                repo.Commit($"Solved P2", signature, signature, new());
            }
        }
        else if (article.StartsWith("That's not the right answer"))
        {
            Git.Commands.Stage(repo, "*");
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(article);
            Console.ForegroundColor = color;
            Console.WriteLine();
        }
        else if (article.StartsWith("You gave an answer too recently"))
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(article);
            Console.ForegroundColor = color;
            Console.WriteLine();
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.White;
            Console.WriteLine(article);
            Console.ForegroundColor = color;
        }
    }
}
/// <summary>
/// Loads a page whose linked and inline style sheets both use @import and
/// checks that every imported sheet, including imports of imports, is
/// reachable through the CSSOM with the expected URL and rule count.
/// </summary>
public async Task CssWithImportRuleShouldBeAbleToHandleNestedStylesheets()
{
    // Virtual site served by the test requester, keyed by file name.
    var pages = new Dictionary<String, String>
    {
        { "index.html", "<!doctype html><html><link rel=stylesheet href=origin.css type=text/css><style>@import url('linked2.css');</style>" },
        { "origin.css", "@import url(linked1.css);" },
        { "linked1.css", "" },
        { "linked2.css", "@import url(\"linked3.css\"); @import 'linked4.css';" },
        { "linked3.css", "" },
        { "linked4.css", "" },
    };
    var server = new TestServerRequester(pages);
    var loaderOptions = new LoaderOptions { IsResourceLoadingEnabled = true };
    var configuration = Configuration.Default
        .With(server)
        .WithDefaultLoader(loaderOptions)
        .WithCss();
    var browsing = BrowsingContext.New(configuration);
    var page = await browsing.OpenAsync("http://localhost/index.html");
    var linkElement = page.QuerySelector<IHtmlLinkElement>("link");
    var styleElement = page.QuerySelector<IHtmlStyleElement>("style");

    // Wait before inspecting the sheets.
    await Task.Delay(100);

    Assert.IsNotNull(linkElement);
    Assert.IsNotNull(styleElement);

    // <link> -> origin.css -> linked1.css
    var originSheet = linkElement.Sheet as ICssStyleSheet;
    Assert.IsNotNull(originSheet);
    Assert.AreEqual("http://localhost/origin.css", originSheet.Href);
    Assert.AreEqual(1, originSheet.Rules.Length);
    Assert.AreEqual(CssRuleType.Import, originSheet.Rules[0].Type);

    var firstImport = (originSheet.Rules[0] as ICssImportRule).Sheet;
    Assert.IsNotNull(firstImport);
    Assert.AreEqual("http://localhost/linked1.css", firstImport.Href);
    Assert.AreEqual(0, firstImport.Rules.Length);

    // <style> -> linked2.css -> linked3.css + linked4.css
    var inlineSheet = styleElement.Sheet as ICssStyleSheet;
    Assert.IsNotNull(inlineSheet);
    Assert.AreEqual(null, inlineSheet.Href);
    Assert.AreEqual(1, inlineSheet.Rules.Length);
    Assert.AreEqual(CssRuleType.Import, inlineSheet.Rules[0].Type);

    var secondImport = (inlineSheet.Rules[0] as ICssImportRule).Sheet;
    Assert.IsNotNull(secondImport);
    Assert.AreEqual("http://localhost/linked2.css", secondImport.Href);
    Assert.AreEqual(2, secondImport.Rules.Length);
    Assert.AreEqual(CssRuleType.Import, secondImport.Rules[0].Type);
    Assert.AreEqual(CssRuleType.Import, secondImport.Rules[1].Type);

    var thirdImport = (secondImport.Rules[0] as ICssImportRule).Sheet;
    Assert.IsNotNull(thirdImport);
    Assert.AreEqual("http://localhost/linked3.css", thirdImport.Href);
    Assert.AreEqual(0, thirdImport.Rules.Length);

    var fourthImport = (secondImport.Rules[1] as ICssImportRule).Sheet;
    Assert.IsNotNull(fourthImport);
    Assert.AreEqual("http://localhost/linked4.css", fourthImport.Href);
    Assert.AreEqual(0, fourthImport.Rules.Length);
}
/// <summary>
/// Looks up a series page on novelupdates.com, preferring a cached JSON
/// copy and falling back to fetching and parsing the page.
/// </summary>
/// <param name="query">The series name; spaces become dashes in the URL.</param>
/// <returns>
/// The parsed novel data, or <c>null</c> when the page has no cover image
/// element or any exception occurs (the exception is logged).
/// </returns>
/// <exception cref="ArgumentNullException">The query is null, empty or whitespace.</exception>
public async Task<NovelResult> GetNovelData(string query)
{
    if (string.IsNullOrWhiteSpace(query))
    {
        throw new ArgumentNullException(nameof(query));
    }

    var slug = query.Replace(" ", "-", StringComparison.InvariantCulture);

    try
    {
        var escapedSlug = Uri.EscapeDataString(slug.Replace("/", " ", StringComparison.InvariantCulture));
        var link = ("http://www.novelupdates.com/series/" + escapedSlug).ToLowerInvariant();

        var (cached, json) = await _cache.TryGetNovelDataAsync(link).ConfigureAwait(false);
        if (cached)
        {
            return JsonConvert.DeserializeObject<NovelResult>(json);
        }

        var config = Configuration.Default.WithDefaultLoader();
        using (var document = await BrowsingContext.New(config).OpenAsync(link).ConfigureAwait(false))
        {
            // No cover image is treated as "series not found".
            var coverElement = document.QuerySelector("div.seriesimg > img");
            if (coverElement == null)
            {
                return null;
            }

            var coverUrl = ((IHtmlImageElement)coverElement).Source;
            var description = document.QuerySelector("div#editdescription > p").InnerHtml;

            // Genres and authors are rendered as markdown links, one per anchor child.
            var genreLinks = document.QuerySelector("div#seriesgenre").Children
                .OfType<IHtmlAnchorElement>()
                .Select(anchor => $"[{anchor.InnerHtml}]({anchor.Href})")
                .ToArray();
            var authorLinks = document.QuerySelector("div#showauthors").Children
                .OfType<IHtmlAnchorElement>()
                .Select(anchor => $"[{anchor.InnerHtml}]({anchor.Href})")
                .ToArray();

            var scoreElement = (IHtmlSpanElement)document.QuerySelector("h5.seriesother > span.uvotes");
            var scoreText = scoreElement.InnerHtml;
            var statusText = document.QuerySelector("div#editstatus").InnerHtml;
            var titleText = document.QuerySelector("div.w-blog-content > div.seriestitlenu").InnerHtml;

            var novel = new NovelResult()
            {
                Description = description,
                Authors = authorLinks,
                Genres = genreLinks,
                ImageUrl = coverUrl,
                Link = link,
                Score = scoreText,
                Status = statusText,
                Title = titleText,
            };

            await _cache.SetNovelDataAsync(link, JsonConvert.SerializeObject(novel)).ConfigureAwait(false);
            return novel;
        }
    }
    catch (Exception ex)
    {
        _log.Error(ex);
        return null;
    }
}
/// <summary>
/// Fetches a MyAnimeList profile page, scrapes the anime statistics and
/// favourites from it and posts them to the current channel as an embed.
/// </summary>
/// <param name="name">The profile name; nothing happens when it is blank.</param>
/// <remarks>
/// Returns without sending anything when the page does not contain the
/// statistics the embed needs.
/// </remarks>
public async Task Mal([Remainder] string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return;
    }

    // Escape the user-supplied name so characters such as '/', '?' or '#' cannot change the requested path.
    var fullQueryLink = "https://myanimelist.net/profile/" + Uri.EscapeDataString(name);
    var config = Configuration.Default.WithDefaultLoader();

    // The document is disposable; keep it alive only while it is being scraped and sent.
    using (var document = await BrowsingContext.New(config).OpenAsync(fullQueryLink))
    {
        var imageElem = document.QuerySelector("body > div#myanimelist > div.wrapper > div#contentWrapper > div#content > div.content-container > div.container-left > div.user-profile > div.user-image > img");
        var imageUrl = ((IHtmlImageElement)imageElem)?.Source ?? "http://icecream.me/uploads/870b03f36b59cc16ebfe314ef2dde781.png";

        var stats = document.QuerySelectorAll("body > div#myanimelist > div.wrapper > div#contentWrapper > div#content > div.content-container > div.container-right > div#statistics > div.user-statistics-stats > div.stats > div.clearfix > ul.stats-status > li > span").Select(x => x.InnerHtml).ToList();

        // The first favourites column is used for anime; a <p> inside it is treated as "no favourites".
        var favoriteAnime = document.QuerySelectorAll("div.user-favorites > div.di-tc").FirstOrDefault();
        var favAnime = GetText("anime_no_fav");
        if (favoriteAnime != null && favoriteAnime.QuerySelector("p") == null)
        {
            favAnime = string.Join("\n", favoriteAnime.QuerySelectorAll("ul > li > div.di-tc.va-t > a")
                .Shuffle()
                .Take(3)
                .Select(x =>
                {
                    var elem = (IHtmlAnchorElement)x;
                    return $"[{elem.InnerHtml}]({elem.Href})";
                }));
        }

        var info = document.QuerySelectorAll("ul.user-status:nth-child(3) > li.clearfix")
            .Where(x => x.Children.Length > 1)
            .Select(x => Tuple.Create(x.Children[0].InnerHtml, x.Children[1].InnerHtml))
            .ToList();

        var daysAndMean = document.QuerySelectorAll("div.anime:nth-child(1) > div:nth-child(2) > div")
            .Select(x => x.TextContent.Split(':').Select(y => y.Trim()).ToArray())
            .ToArray();

        // Every index used below must exist; otherwise the page is not a
        // profile in the expected layout (unknown user, changed markup).
        if (stats.Count < 5
            || info.Count < 2
            || daysAndMean.Length < 2
            || daysAndMean[0].Length < 2
            || daysAndMean[1].Length < 2)
        {
            return;
        }

        var embed = new EmbedBuilder()
            .WithOkColor()
            .WithTitle(GetText("mal_profile", name))
            .AddField(efb => efb.WithName("💚 " + GetText("watching")).WithValue(stats[0]).WithIsInline(true))
            .AddField(efb => efb.WithName("💙 " + GetText("completed")).WithValue(stats[1]).WithIsInline(true));
        if (info.Count < 3)
        {
            embed.AddField(efb => efb.WithName("💛 " + GetText("on_hold")).WithValue(stats[2]).WithIsInline(true));
        }
        embed
            .AddField(efb => efb.WithName("💔 " + GetText("dropped")).WithValue(stats[3]).WithIsInline(true))
            .AddField(efb => efb.WithName("⚪ " + GetText("plan_to_watch")).WithValue(stats[4]).WithIsInline(true))
            .AddField(efb => efb.WithName("🕐 " + daysAndMean[0][0]).WithValue(daysAndMean[0][1]).WithIsInline(true))
            .AddField(efb => efb.WithName("📊 " + daysAndMean[1][0]).WithValue(daysAndMean[1][1]).WithIsInline(true))
            .AddField(efb => efb.WithName(MalInfoToEmoji(info[0].Item1) + " " + info[0].Item1).WithValue(info[0].Item2.TrimTo(20)).WithIsInline(true))
            .AddField(efb => efb.WithName(MalInfoToEmoji(info[1].Item1) + " " + info[1].Item1).WithValue(info[1].Item2.TrimTo(20)).WithIsInline(true));
        if (info.Count > 2)
        {
            embed.AddField(efb => efb.WithName(MalInfoToEmoji(info[2].Item1) + " " + info[2].Item1).WithValue(info[2].Item2.TrimTo(20)).WithIsInline(true));
        }
        //if(info.Count > 3)
        //    embed.AddField(efb => efb.WithName(MalInfoToEmoji(info[3].Item1) + " " + info[3].Item1).WithValue(info[3].Item2).WithIsInline(true))

        embed
            .WithDescription($@" ** https://myanimelist.net/animelist/{ name } ** **{GetText("top_3_fav_anime")}** {favAnime}"
            //**[Manga List](https://myanimelist.net/mangalist/{name})**
            //💚`Reading:` {stats[5]}
            //💙`Completed:` {stats[6]}
            //💔`Dropped:` {stats[8]}
            //⚪`Plan to read:` {stats[9]}
            //**Top 3 Favorite Manga:**
            //{favManga}"
            )
            .WithUrl(fullQueryLink)
            .WithImageUrl(imageUrl);

        await Context.Channel.EmbedAsync(embed).ConfigureAwait(false);
    }
}
/// <summary>
/// Entry point: registers the crawler services, starts the host, enqueues
/// the seed page and stops the host once a line is read from the console.
/// </summary>
/// <param name="args">Command-line arguments forwarded to the host builder.</param>
static async Task Main(string[] args)
{
    ThreadPool.SetMaxThreads(100, Int32.MaxValue);

    // Create the token source.
    var cts = new CancellationTokenSource();
    await using var crawlerHost = new CrawlerHost();
    var hostBuilder = CrawlerHost.CreateHostBuilder(args, (hostContext, services) =>
    {
        //Add more services to the Dependency Injection here

        //1. Scheduler
        // services.AddTransient<ISchedulerService, DefaultScheduler>();
        // services.AddHostedService(serviceProvider => serviceProvider.GetService<ISchedulerService>());
        services.AddTransient<IScheduler, CoreScheduler>();
        services.AddHostedService(serviceProvider => serviceProvider.GetService<IScheduler>());
        services.AddSingleton<ObservableConcurrentQueue<WaitingPageModel>>();
        // services.AddSingleton<AsyncBulkheadPolicy>(serviceProvider =>
        //     Policy.BulkheadAsync(Convert.ToInt32(hostContext.Configuration.GetSection("AppConfig")["MaxThreads"]),
        //         Int32.MaxValue));
        services.AddTransient<IUriBucket<WaitingPage>, DefaultWaitingUriBucket>();
        services.AddDbContextPool<UriDBContext>(options =>
        {
            options.UseSqlite(@"Data Source=./1_Scheduler/DB/UriDB.db;");
        }, 16);

        //2. Crawler
        services.AddSingleton<IBrowsingContext>(serviceProvider =>
        {
            //Use the default configuration for AngleSharp
            var config = Configuration.Default
                .WithRequesters() // from AngleSharp.Io
                .WithDefaultLoader(); // from AngleSharp

            //Create a new context for evaluating webpages with the given config
            var context = BrowsingContext.New(config);
            return context;
        });

        //3. Downloader
        services.AddHttpClient();
        services.AddTransient<IDownloader, Downloader>();

        //4. UriPolicies

        //5. Content Extractor
        services.AddTransient<IContentExtractor, ContentExtractor>();

        //6. Storage
        services.AddDbContextPool<ContentContext>((serviceProvider, options) =>
        {
            options.UseSqlite(hostContext.Configuration.GetSection("AppConfig")["ConnectionString"]);
            // options.UseSqlite(@"Data Source=./4_Storage/PageModels/DB/content.db;");
        }, 32);
    });

    try
    {
        Console.WriteLine("Application Starting!");
        using var host = crawlerHost.RunCrawlerEngine(hostBuilder, cts.Token);
        // var waitingQueue = host.Services.GetRequiredService<ObservableConcurrentQueue<WaitingPageModel>>();
        // waitingQueue.Enqueue(new WaitingPageModel {Uri = new Uri("https://www.webtoons.com"), Verb = "GET"});
        await host.StartAsync(cts.Token);
        Console.WriteLine("Application Started! Press <enter> to stop.");

        // Seed the crawl with the first page.
        var waitingQueue = host.Services.GetRequiredService<ObservableConcurrentQueue<WaitingPageModel>>();
        waitingQueue.Enqueue(new WaitingPageModel { Uri = new Uri("https://www.webtoons.com"), Verb = "GET" });

        // Console.WriteLine("Enter");
        Console.ReadLine();
        // waitingQueue = host.Services.GetRequiredService<ObservableConcurrentQueue<WaitingPageModel>>();
        // waitingQueue.Enqueue(new WaitingPageModel {Uri = new Uri("https://www.webtoons.com"), Verb = "GET"});
        //
        // Console.ReadLine();
        // cts.Cancel();
        await host.StopAsync(cts.Token);
        Console.WriteLine("Application Stopping!");
        Console.WriteLine("Main thread Stopped!");
    }
    catch (Exception e)
    {
        var trace = new StackTrace(e, true);
        // An exception may carry no stack frames; GetFrame would then return null
        // and a second exception here would hide the original error. Fall back to line 0.
        var line = trace.FrameCount > 0
            ? trace.GetFrame(trace.FrameCount - 1)?.GetFileLineNumber() ?? 0
            : 0;
        Console.WriteLine("Exception at line " + line);
        Console.WriteLine(e);
    }
    finally
    {
        cts.Dispose();
    }
}
/// <summary>
/// Posts an image for the given search terms: first via the Google image
/// service and, when anything in that path throws, via the first result of
/// an Imgur search page.
/// </summary>
/// <param name="terms">The search terms; nothing happens when they are blank.</param>
public async Task Image([Remainder] string terms = null)
{
    var rawTerms = terms?.Trim();
    if (string.IsNullOrWhiteSpace(rawTerms))
    {
        return;
    }

    terms = WebUtility.UrlEncode(rawTerms).Replace(' ', '+');

    // Both result paths produce the same embed; only the URLs differ.
    EmbedBuilder BuildEmbed(string searchUrl, string iconUrl, string imageLink)
    {
        return new EmbedBuilder()
            .WithOkColor()
            .WithAuthor(eab => eab.WithName(GetText("image_search_for") + " " + rawTerms.TrimTo(50))
                .WithUrl(searchUrl)
                .WithIconUrl(iconUrl))
            .WithDescription(imageLink)
            .WithImageUrl(imageLink)
            .WithTitle(Context.User.ToString());
    }

    try
    {
        var found = await _google.GetImageAsync(rawTerms).ConfigureAwait(false);
        var googleEmbed = BuildEmbed(
            "https://www.google.rs/search?q=" + terms + "&source=lnms&tbm=isch",
            "http://i.imgur.com/G46fm8J.png",
            found.Link);
        await Context.Channel.EmbedAsync(googleEmbed).ConfigureAwait(false);
    }
    catch
    {
        _log.Warn("Falling back to Imgur search.");

        var fullQueryLink = $"http://imgur.com/search?q={ terms }";
        var config = Configuration.Default.WithDefaultLoader();
        using (var document = await BrowsingContext.New(config).OpenAsync(fullQueryLink).ConfigureAwait(false))
        {
            var links = document.QuerySelectorAll("a.image-list-link");
            if (!links.Any())
            {
                return;
            }

            // The thumbnail is the first child of the first result link.
            var thumbnail = links.FirstOrDefault()?.Children?.FirstOrDefault() as IHtmlImageElement;
            if (thumbnail?.Source == null)
            {
                return;
            }

            var source = thumbnail.Source.Replace("b.", ".", StringComparison.InvariantCulture);
            var imgurEmbed = BuildEmbed(
                fullQueryLink,
                "http://s.imgur.com/images/logo-1200-630.jpg?",
                source);
            await Context.Channel.EmbedAsync(imgurEmbed).ConfigureAwait(false);
        }
    }
}
/// <summary>
/// Opens a new document for the given URL in a fresh browsing context and
/// waits synchronously for it.
/// </summary>
/// <param name="url">The URL passed to <c>OpenNewAsync</c>.</param>
/// <returns>The opened document.</returns>
static IDocument CreateEmpty(String url)
{
    var context = BrowsingContext.New();
    var pending = context.OpenNewAsync(url);

    // Blocks the calling thread until the task has completed.
    return pending.Result;
}
// This method gets called by the runtime. Use this method to add services to the container.
// For more information on how to configure your application, visit https://go.microsoft.com/fwlink/?LinkID=398940
public void ConfigureServices(IServiceCollection services)
{
    // Builds the HttpClient handed out for each resolution: automatic
    // decompression, redirects, the registered cookie container (pre-filled
    // from CookiesHelper) and the scrapping timeout and headers.
    HttpClient CreateScrappingClient(IServiceProvider provider)
    {
        var cookieContainer = provider.GetService<CookieContainer>();
        var handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate | DecompressionMethods.Brotli,
            AllowAutoRedirect = true
        };

        if (cookieContainer != null)
        {
            handler.CookieContainer = cookieContainer;

            // TODO: Review how to avoid the call to .GetAwaiter().GetResult()
            var storedCookies = CookiesHelper.GetCollectitonAsync().GetAwaiter().GetResult();
            if (storedCookies.Count > 0)
            {
                handler.CookieContainer.Add(new Uri("https://www.tuenvio.cu"), storedCookies);
            }
        }

        var client = new HttpClient(handler) { Timeout = ScrappingConfiguration.HttpClientTimeout };
        var headers = client.DefaultRequestHeaders;
        headers.TryAddWithoutValidation("user-agent", ScrappingConfiguration.Agent);
        headers.TryAddWithoutValidation("accept-encoding", "gzip, deflate, br");
        headers.CacheControl = new CacheControlHeaderValue { NoCache = true };

        return client;
    }

    services.AddControllersWithViews();
    services.AddRazorPages();
    services.AddSignalR();

    // TODO: Change the ServiceLifetime. The usage of ServiceLifetime.Transient is because multiple threads operations are running in the same dbcontext.
    services.AddDbContext<DbContext, ApplicationDbContext>(ServiceLifetime.Transient);
    services.AddOrcEntityFrameworkCore();
    services.AddDatabaseSeeder<ApplicationDbSeeder>();

    // Telegram services are only registered when a real token is configured.
    var token = this.Configuration.GetSection("TelegramBot")?["Token"];
    if (string.IsNullOrWhiteSpace(token))
    {
        Log.Warning(
            "Telegram notification is disable. To enable it, add a TelegramBot section with a key Token.");
    }
    else if (token == "%TELEGRAM_BOT_TOKEN%")
    {
        Log.Warning(
            "Telegram notification is disable. Replace %TELEGRAM_BOT_TOKEN% placeholder in the configuration file with a valid bot token.");
    }
    else
    {
        Log.Information("Telegram notification is enable.");
        services.AddTransient<ITelegramBotClient>(sp => new TelegramBotClient(token));
        services.AddSingleton<ITelegramCommander, TelegramCommander>();
    }

    services.AddTransient(sp => new CookieContainer());
    services.AddTransient(sp => BrowsingContext.New(AngleSharp.Configuration.Default));
    services.AddTransient(sp => CreateScrappingClient(sp));

    // services.AddHttpClient(
    //     "json",
    //     (sp, httpClient) =>
    //         {
    //             httpClient.Timeout = ScrappingConfiguration.HttpClientTimeout;
    //             httpClient.DefaultRequestHeaders.CacheControl =
    //                 new CacheControlHeaderValue { NoCache = true };
    //             httpClient.DefaultRequestHeaders.TryAddWithoutValidation(
    //                 "user-agent",
    //                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36");
    //         });

    services.AddScoped<IStoreService, StoreService>();

    // One cache per entity type.
    services.AddSingleton<ICacheStorage<string, Product>>(
        provider => new CacheStorage<string, Product>(storeNullValues: true));
    services.AddSingleton<ICacheStorage<string, Department>>(
        provider => new CacheStorage<string, Department>(storeNullValues: true));
    services.AddSingleton<ICacheStorage<string, Store>>(
        provider => new CacheStorage<string, Store>(storeNullValues: true));

    services.AddTransient<IEntityScrapper<Product>, ProductScrapper>();
    services.AddTransient<IEntityScrapper<Department>, DepartmentScrapper>();
    services.AddTransient<IEntityScrapper<Store>, StoreScrapper>();

    services.AddTransient<IMultiEntityScrapper<Product>, InspectDepartmentProductsScrapper>();
    services.AddTransient<IMultiEntityScrapper<Department>, InspectStoreDepartmentsScrapper>();

    services.AddSingleton<ImportStoresHostedService>();

    services.AddHostedService<DepartmentMonitorHostedService>();
    services.AddHostedService<ProductMonitorHostedService>();
    services.AddHostedService<StoreMonitorHostedService>();

    // services.AddHostedService<SyncUsersFromTelegramHostedService>();
}
// Appends generated C# source to CurrentBlock for every file in the given
// directory, then recurses into each sub-directory wrapped in a "namespace".
// Each file becomes a "public class" named after the file with '.' and '-'
// replaced by '_'. What goes inside the class depends on the file name ending:
//   css        -> MakeDataField
//   js         -> a static TextContent string field
//   html / htm -> a static HtmlText field, one field per element id, and two constructors
//   otherwise  -> MakeDataField
void MakeForDir(System.IO.DirectoryInfo dirinfo)
{
    foreach (var file in dirinfo.GetFiles())
    {
        var FileName = file.Name.Replace(".", "_");
        FileName = FileName.Replace("-", "_");
        CurrentBlock += "public class " + FileName;
        CurrentBlock += "{";
        // NOTE(review): the EndsWith checks below have no leading dot, so any name
        // that merely ends in "css"/"js"/"html"/"htm" matches - confirm this is intended.
        if (file.Name.ToLower().EndsWith("css")) { MakeDataField(file.FullName); }
        else if (file.Name.ToLower().EndsWith("js"))
        {
            CurrentBlock.NewBlock(() =>
            {
                // Generated field: the script text, decoded from embedded bytes by an
                // immediately-invoked lambda.
                CurrentBlock += "public static readonly string TextContent = ((Func<string>)(() =>";
                CurrentBlock += "{";
                CurrentBlock.NewBlock(() =>
                {
                    CurrentBlock += "byte[] Result = null;";
                    // The helper that emits the bytes into "Result" differs between DEBUG and
                    // non-DEBUG builds (presumably base64 vs. a byte literal, judging by the
                    // helper names - verify against the helpers).
#if DEBUG
                    MakeBytesAsB64(Encoding.UTF8.GetBytes(System.IO.File.ReadAllText(file.FullName)), "Result");
#else
                    MakeBytesAsbyte(System.IO.File.ReadAllBytes(file.FullName), "Result");
#endif
                    CurrentBlock += "return System.Text.Encoding.UTF8.GetString(Result);";
                });
                CurrentBlock += "}))();\n";
            });
        }
        // NOTE(review): '|' evaluates both operands (no short-circuit); the result is the same as '||' here.
        else if (file.Name.ToLower().EndsWith("html") | file.Name.ToLower().EndsWith("htm"))
        {
            //Use the default configuration for AngleSharp
            var config = Configuration.Default;
            //Create a new context for evaluating webpages with the given config
            var context = BrowsingContext.New(config);
            //Source to be parsed
            var source = System.IO.File.ReadAllText(file.FullName);
            //Create a virtual request to specify the document to load (here from our fixed string)
            // Blocks synchronously on the asynchronous parse.
            var web = context.OpenAsync(req => req.Content(source)).GetAwaiter().GetResult();
            var ElementIds = new string[0];
            var ElementTags = new string[0];
            // Pass 1: for elements with a "src" (any tag) or an "href" (link tags only) that points at
            // an existing file under BaseDir, replace that attribute with "MNsrc" holding a
            // dotted name derived from the file's path relative to BaseDir.
            foreach (var Element in web.All)
            {
                string Attribute = null;
                if (Element.GetAttribute("src") != null) { Attribute = "src"; }
                // NOTE(review): '&' is the non-short-circuit form; both operands are always evaluated.
                else if (Element.GetAttribute("href") != null & Element.TagName.ToLower() == "link") { Attribute = "href"; }
                if (Attribute != null)
                {
                    // Paths are built with '\' separators, i.e. Windows-style.
                    string Address = file.DirectoryName + "\\" + Element.GetAttribute(Attribute).Replace("/", "\\");
                    if (System.IO.File.Exists(Address))
                    {
                        // Normalize the path before comparing it with BaseDir.
                        Address = new System.IO.FileInfo(Address).FullName;
                        if (Address.StartsWith(BaseDir))
                        {
                            Element.RemoveAttribute(Attribute);
                            Element.SetAttribute("MNsrc", "Monsajem_Incs.Resources" + Address.Substring(BaseDir.Length). Replace(".", "_"). Replace("-", "_"). Replace("\\", "."));
                        }
                    }
                }
            }
            // Pass 2: collect the id and upper-cased tag name of every element that has an id.
            foreach (var Element in web.All)
            {
                if (Element.Id != null)
                {
                    Array.Resize(ref ElementIds, ElementIds.Length + 1);
                    ElementIds[ElementIds.Length - 1] = Element.Id;
                    Array.Resize(ref ElementTags, ElementTags.Length + 1);
                    ElementTags[ElementTags.Length - 1] = Element.TagName.ToUpper();
                }
            }
            // Nothing is generated for an HTML file without any element ids.
            if (ElementIds.Length > 0)
            {
                string DocText = "<html>" + "<head>" + web.GetElementsByTagName("head")[0].InnerHtml + "</head>" + "<body>" + web.GetElementsByTagName("body")[0].InnerHtml + "</body></html>";
                // Map each tag name to the type name used for the generated field;
                // unknown tags fall back to "HTMLElement".
                for (int i = 0; i < ElementIds.Length; i++)
                {
                    if (ElementTags[i] == "INPUT") { ElementTags[i] = "HTMLInputElement"; }
                    else if (ElementTags[i] == "LABEL") { ElementTags[i] = "HTMLLabelElement"; }
                    else if (ElementTags[i] == "DIV") { ElementTags[i] = "HTMLDivElement"; }
                    else if (ElementTags[i] == "DL") { ElementTags[i] = "HTMLDListElement"; }
                    else if (ElementTags[i] == "BUTTON") { ElementTags[i] = "HTMLButtonElement"; }
                    else if (ElementTags[i] == "AREA") { ElementTags[i] = "HTMLAreaElement"; }
                    else if (ElementTags[i] == "IMG") { ElementTags[i] = "HTMLImageElement"; }
                    else if (ElementTags[i] == "TABLE") { ElementTags[i] = "HTMLTableElement"; }
                    else if (ElementTags[i] == "TR") { ElementTags[i] = "HTMLTableRowElement"; }
                    else if (ElementTags[i] == "TD") { ElementTags[i] = "HTMLTableDataCellElement"; }
                    else if (ElementTags[i] == "SELECT") { ElementTags[i] = "HTMLSelectElement"; }
                    else if (ElementTags[i] == "OPTION") { ElementTags[i] = "HTMLOptionElement"; }
                    else if (ElementTags[i] == "IFRAME") { ElementTags[i] = "HTMLIFrameElement"; }
                    else { ElementTags[i] = "HTMLElement"; }
                }
                CurrentBlock.NewBlock(() =>
                {
                    // Generated field: HtmlText, computed once by an immediately-invoked lambda that
                    // parses the embedded markup and resolves every "MNsrc" attribute.
                    CurrentBlock += "public static readonly string HtmlText = ((Func<string>)(() =>";
                    CurrentBlock += "{";
                    CurrentBlock.NewBlock(() =>
                    {
                        CurrentBlock += "byte[] ByteResult = null;";
                        // The markup is embedded as a verbatim string literal, hence the doubled quotes.
                        CurrentBlock += $"var Result =\n@\"{DocText.Replace("\"", "\"\"")}\";";
                        CurrentBlock += "var Doc = Document.Parse(Result);";
                        CurrentBlock += "var Elements = Doc.GetElementsByTagName(\"*\").ToArray();";
                        CurrentBlock += "foreach(var Element in Elements)";
                        CurrentBlock += "{";
                        CurrentBlock.NewBlock(() =>
                        {
                            CurrentBlock += "var MNsrc = Element.GetAttribute(\"MNsrc\");";
                            CurrentBlock += "if (MNsrc == \"\")";
                            CurrentBlock += "MNsrc = null;";
                            CurrentBlock += "if (MNsrc != null)";
                            CurrentBlock += "{";
                            CurrentBlock.NewBlock(() =>
                            {
                                CurrentBlock += "Element.RemoveAttribute(\"MNsrc\");";
                                CurrentBlock += "var TagName = Element.TagName.ToLower();";
                                // The generated switch resolves MNsrc through reflection on the
                                // generated resource type, per tag kind.
                                CurrentBlock += "switch(TagName)";
                                CurrentBlock += "{";
                                CurrentBlock.NewBlock(() =>
                                {
                                    CurrentBlock += "case \"script\":";
                                    CurrentBlock.NewBlock(() =>
                                    {
                                        CurrentBlock += "Element.InnerHtml = (string)Type.GetType(MNsrc).GetField(\"TextContent\").GetValue(null);";
                                        CurrentBlock += "break;";
                                    });
                                    CurrentBlock += "case \"img\":";
                                    CurrentBlock.NewBlock(() =>
                                    {
                                        CurrentBlock += "Element.SetAttribute(\"src\",(string)Type.GetType(MNsrc).GetField(\"Url\").GetValue(null));";
                                        CurrentBlock += "break;";
                                    });
                                    CurrentBlock += "case \"link\":";
                                    CurrentBlock.NewBlock(() =>
                                    {
                                        CurrentBlock += "var LinkType = Element.GetAttribute(\"type\");";
                                        CurrentBlock += "if (LinkType!=null)";
                                        CurrentBlock.NewBlock(() => CurrentBlock += @"LinkType=LinkType.ToLower();");
                                        CurrentBlock += "var LinkRel = Element.GetAttribute(\"rel\");";
                                        CurrentBlock += "if (LinkRel!=null)";
                                        CurrentBlock.NewBlock(() => CurrentBlock += @"LinkRel=LinkRel.ToLower();");
                                        CurrentBlock += "if (LinkType==\"text/css\" || LinkRel==\"stylesheet\")";
                                        CurrentBlock += "{";
                                        CurrentBlock.NewBlock(() =>
                                        {
                                            // NOTE(review): the generated code sets "src" on the link and then
                                            // replaces the link with a new, empty Style element - the Url never
                                            // reaches the Style element. Confirm whether that is intended.
                                            CurrentBlock += "var Style = Document.document.CreateElement<HTMLStyleElement>();";
                                            CurrentBlock += "Element.SetAttribute(\"src\",(string)Type.GetType(MNsrc).GetField(\"Url\").GetValue(null));";
                                            CurrentBlock += "Element.ParentElement.ReplaceChild(Style, Element);";
                                        });
                                        CurrentBlock += "}";
                                        CurrentBlock += "break;";
                                    });
                                });
                                CurrentBlock += "}";
                            });
                            CurrentBlock += "}";
                        });
                        CurrentBlock += "}";
                        // NOTE(review): the first literal emits `"<html>\""`, i.e. the generated string
                        // starts with <html> followed by a double quote - looks unintended; confirm.
                        CurrentBlock += "Result = \"<html>\\\"\" +" + "\"<head>\" + Doc.GetElementsByTagName(\"head\")[0].InnerHtml + \"</head>\" +" + "\"<body>\" + Doc.GetElementsByTagName(\"body\")[0].InnerHtml + \"</body></html>\";";
                        CurrentBlock += "return Result;";
                    });
                    CurrentBlock += "}))();\n";
                    // One generated readonly field per element id, typed with the mapped tag type.
                    for (int i = 0; i < ElementIds.Length; i++) { CurrentBlock += "public readonly " + ElementTags[i] + " " + ElementIds[i] + ";"; }
                    // Generated constructors: the parameterless one forwards IsGlobal = false.
                    CurrentBlock += "public " + FileName + "():this(false){}";
                    CurrentBlock += "public " + FileName + "(bool IsGlobal)";
                    CurrentBlock += "{";
                    CurrentBlock.NewBlock(() =>
                    {
                        // IsGlobal: bind the fields to elements looked up by id on a new Document() and return.
                        CurrentBlock += "if(IsGlobal==true)";
                        CurrentBlock += "{";
                        CurrentBlock.NewBlock(() =>
                        {
                            CurrentBlock += "var Document = new Document();";
                            for (int i2 = 0; i2 < ElementIds.Length; i2++) { CurrentBlock += ElementIds[i2] + "= Document.GetElementById<" + ElementTags[i2] + ">(\"" + ElementIds[i2] + "\");"; }
                            CurrentBlock += "return;";
                        });
                        CurrentBlock += "}";
                        // Otherwise: parse HtmlText, move its head tags into the live document's
                        // head, and bind the fields to the parsed elements.
                        CurrentBlock += "var doc = Document.Parse(HtmlText);";
                        CurrentBlock += "var HeadTags = doc.Head.GetElementsByTagName(\"*\").ToArray();";
                        CurrentBlock += "foreach(var Tag in HeadTags)";
                        CurrentBlock += "Document.document.Head.AppendChild(Tag);";
                        for (int i2 = 0; i2 < ElementIds.Length; i2++) { CurrentBlock += ElementIds[i2] + "= doc.GetElementById<" + ElementTags[i2] + ">(\"" + ElementIds[i2] + "\");"; }
                        CurrentBlock += "var div = Document.document.CreateElement(\"Div\");";
                        CurrentBlock += "div.AppendChild(doc.Body);";
                        // Every script element is replaced by a freshly created one carrying the same
                        // src and inner text (presumably so the browser executes it - TODO confirm).
                        CurrentBlock += "var Scripts = div.GetElementsByTagName(\"Script\").ToArray();";
                        CurrentBlock += "foreach(var Script in Scripts)";
                        CurrentBlock += "{";
                        CurrentBlock.NewBlock(() =>
                        {
                            CurrentBlock += "var NewScript = Document.document.CreateElement(\"Script\");";
                            CurrentBlock += "var Src = Script.GetAttribute(\"src\");";
                            CurrentBlock += "if(Src!=null)";
                            CurrentBlock.NewBlock(() => CurrentBlock += "NewScript.SetAttribute(\"src\",Src);");
                            CurrentBlock += "NewScript.InnerHtml = Script.InnerHtml;";
                            CurrentBlock += "Script.ParentElement.ReplaceChild(NewScript, Script);";
                        });
                        CurrentBlock += "}";
                        // The hidden div is appended to the live body and removed again straight away.
                        CurrentBlock += "div.SetStyleAttribute(\"display\",\"none\");";
                        CurrentBlock += "Document.document.Body.AppendChild(div);";
                        CurrentBlock += "Document.document.Body.RemoveChild(div);";
                        // Finally the ids of the bound elements are cleared.
                        for (int i2 = 0; i2 < ElementIds.Length; i2++) { CurrentBlock += ElementIds[i2] + ".Id=\"\";"; }
                    });
                    CurrentBlock += "}";
                });
            }
        }
        else { MakeDataField(file.FullName); }
        // Close the generated class.
        CurrentBlock += "\n" + AddTabs() + "}";
    }
    // Each sub-directory becomes a nested namespace ('.' replaced by '_'), one tab level deeper.
    foreach (var Dir in dirinfo.GetDirectories())
    {
        CurrentBlock += "\n" + AddTabs() + "namespace " + Dir.Name.Replace(".", "_") + "\n" + AddTabs() + "{";
        Tabs += 1;
        MakeForDir(Dir);
        Tabs -= 1;
        CurrentBlock += "\n" + AddTabs() + "}";
    }
}
/// <summary>
/// Scrapes the first Google results for the given terms and posts up to
/// five of them (title, shortened link, snippet) to the current channel.
/// </summary>
/// <param name="terms">The search terms; nothing happens when they are blank.</param>
public async Task Google([Remainder] string terms = null)
{
    // Keep the user's text for display; only the URL gets the encoded form
    // (previously the embed author showed the URL-encoded terms).
    var oterms = terms?.Trim();
    if (string.IsNullOrWhiteSpace(oterms))
    {
        return;
    }

    terms = WebUtility.UrlEncode(oterms).Replace(' ', '+');

    var fullQueryLink = $"https://www.google.ca/search?q={ terms }&gws_rd=cr,ssl&cr=countryUS";
    var config = Configuration.Default.WithDefaultLoader();
    using (var document = await BrowsingContext.New(config).OpenAsync(fullQueryLink))
    {
        var elems = document.QuerySelectorAll("div.g");

        var resultsElem = document.QuerySelectorAll("#resultStats").FirstOrDefault();
        var totalResults = resultsElem?.TextContent;
        //var time = resultsElem.Children.FirstOrDefault()?.TextContent
        //^ this doesn't work for some reason, <nobr> is completely missing in parsed collection
        if (!elems.Any())
        {
            return;
        }

        // Results without a link, title or snippet are dropped.
        var results = elems.Select<IElement, GoogleSearchResult?>(elem =>
        {
            var aTag = (elem.Children.FirstOrDefault()?.Children.FirstOrDefault() as IHtmlAnchorElement); // <h3> -> <a>
            var href = aTag?.Href;
            var name = aTag?.TextContent;
            if (href == null || name == null)
            {
                return null;
            }

            var txt = elem.QuerySelectorAll(".st").FirstOrDefault()?.TextContent;
            if (txt == null)
            {
                return null;
            }

            return new GoogleSearchResult(name, href, txt);
        }).Where(x => x != null).Take(5);

        var embed = new EmbedBuilder()
            .WithOkColor()
            .WithAuthor(eab => eab.WithName(GetText("search_for") + " " + oterms.TrimTo(50))
                .WithUrl(fullQueryLink)
                .WithIconUrl("http://i.imgur.com/G46fm8J.png"))
            .WithTitle(Context.User.ToString())
            .WithFooter(efb => efb.WithText(totalResults));

        // NOTE(review): the snippet budget (400 - title - link) can become negative for
        // long titles or links; confirm how TrimTo treats a negative length.
        var desc = await Task.WhenAll(results.Select(async res =>
                $"[{Format.Bold(res?.Title)}]({(await _google.ShortenUrl(res?.Link))})\n{res?.Text?.TrimTo(400 - res.Value.Title.Length - res.Value.Link.Length)}\n\n"))
            .ConfigureAwait(false);
        var descStr = string.Concat(desc);
        _log.Info(descStr.Length);
        await Context.Channel.EmbedAsync(embed.WithDescription(descStr)).ConfigureAwait(false);
    }
}
/// <summary>
/// Opens the page at the given URL in a new browsing context that uses the
/// default loader.
/// </summary>
/// <param name="url">Page url</param>
/// <returns>The opened document.</returns>
public async Task<IDocument> GetDocument(string url)
{
    var configuration = Configuration.Default.WithDefaultLoader();
    var context = BrowsingContext.New(configuration);
    var document = await context.OpenAsync(url);
    return document;
}
/// <summary>
/// Walks the daily archive pages from <c>now + conditions.timeSpan</c> up to
/// now, downloads every linked story and saves those whose text contains
/// <c>conditions.Keyword</c>.
/// </summary>
/// <param name="conditions">Supplies the time span of the first archive day and the keyword to look for.</param>
/// <param name="saver">Receives every matching story, mapped to <c>NewsDataModel</c>.</param>
public async Task FetchDataAsync(AppledailyNewsConditions conditions, Saver saver)
{
    var PlatformUrl = "https://tw.appledaily.com/";

    //mapper config
    // The map must be registered for the model that is actually mapped below.
    var mapperConfig = new MapperConfiguration(cfg => { cfg.CreateMap<AppledailyNewsModel, NewsDataModel>(); });
    var mapper = mapperConfig.CreateMapper();

    var config = Configuration.Default;
    var context = BrowsingContext.New(config);

    using (var client = new HttpClient())
    {
        // Downloads the body of a page; the response is released as soon as the text is read.
        async Task<string> DownloadAsync(string url)
        {
            using (var responseMessage = await client.GetAsync(url))
            {
                return await responseMessage.Content.ReadAsStringAsync();
            }
        }

        List<string> NewsURLList = new List<string>();

        // Use one clock for both ends of the range; mixing local and UTC time
        // shifts the end of the loop by the machine's UTC offset.
        var now = DateTime.Now;
        for (DateTime day = now.Add(conditions.timeSpan); day < now; day = day.AddDays(1))
        {
            // Archive page of past articles for this day; invariant culture keeps
            // the date digits independent of the current culture's calendar.
            var dailyurl = $"{PlatformUrl}archive/{day.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture)}/";

            //find post url
            var archiveHtml = await DownloadAsync(dailyurl);
            using (var document = await context.OpenAsync(res => res.Content(archiveHtml)))
            {
                var storyHrefs = document.QuerySelectorAll(".archive-story")
                    .Select(x => x.GetAttribute("href"))
                    .Where(x => !string.IsNullOrEmpty(x)); // elements without an href carry no story link
                foreach (var href in storyHrefs)
                {
                    NewsURLList.Add(PlatformUrl + href);
                }
            }
        }

        foreach (var href in NewsURLList)
        {
            var storyHtml = await DownloadAsync(href);
            using (var document = await context.OpenAsync(res => res.Content(storyHtml)))
            {
                var title = string.Join('\n', document.QuerySelectorAll(".text_medium").Select(x => x.TextContent));
                var content = string.Join('\n', document.QuerySelectorAll("p").Select(x => x.TextContent));

                // The date is taken from the fifth '/'-separated piece of the URL when it exists.
                var segments = href.Split('/');
                var postDate = segments.Length > 4 ? segments[4] : string.Empty;

                // ">= 0" so that a keyword at the very start of the text also counts.
                if (content.IndexOf(conditions.Keyword, StringComparison.Ordinal) >= 0)
                {
                    var model = new AppledailyNewsModel
                    {
                        Title = title,
                        Content = content,
                        Date = postDate,
                        Source = href
                    };

                    // save result
                    var result = mapper.Map<NewsDataModel>(model);
                    saver.Save(result);
                }
            }
        }
    }
}
/// <summary>
/// Creates a new browsing context as child of the given parent and stores
/// a weak reference to it in the cache under the given name, replacing any
/// entry already stored under that name.
/// </summary>
/// <param name="parent">The parent context.</param>
/// <param name="name">The name of the child context.</param>
/// <param name="security">The security flags to apply.</param>
/// <returns>The newly created child context.</returns>
public IBrowsingContext Create(IBrowsingContext parent, String name, Sandboxes security)
{
    var child = new BrowsingContext(parent, security);

    // Only a weak reference is cached, so the cache alone does not keep the context alive.
    var reference = new WeakReference<IBrowsingContext>(child);
    _cache[name] = reference;

    return child;
}