/// <summary>
/// Downloads the page at the given URL and parses it into an <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The document produced by <c>HtmlWeb.LoadFromWebAsync</c>.</returns>
public static async ValueTask<HtmlDocument> LoadHtmlFromUrlAsync(string url)
{
    // The loader is created right here and can never be null, so the former
    // null-conditional call only obscured the flow (awaiting a null task would
    // have thrown a NullReferenceException anyway).
    var web = new HtmlWeb();
    return await web.LoadFromWebAsync(url);
}
/// <summary>
/// Downloads the news index page and converts every listed entry into a <see cref="News"/> item.
/// </summary>
/// <returns>The entries found on the page; an empty list when the page lists none.</returns>
public async Task<List<News>> GetNewsItems()
{
    var document = await newsPageWeb.LoadFromWebAsync("https://d2-megaten-l.sega.com/en/news/index.html");
    var newsItems = new List<News>();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var urls = document.DocumentNode.SelectNodes("//*[@class='news-list-title']/a");
    if (urls == null)
    {
        return newsItems;
    }

    foreach (var anchor in urls)
    {
        var link = anchor.GetAttributeValue("href", "");

        // An entry without a heading gets an empty title instead of aborting the whole list.
        var titleText = anchor.SelectSingleNode("h3")?.InnerText ?? string.Empty;

        var newsItem = new News();
        // Every double quote in the title is doubled.
        // NOTE(review): presumably CSV-style escaping for a downstream consumer - confirm.
        newsItem.Title = titleText.Replace("\"", "\"\"");
        newsItem.Url = baseUrl + link;
        newsItem.Image = anchor.SelectSingleNode("div/img")?.GetAttributeValue("src", "");
        newsItems.Add(newsItem);
    }

    return newsItems;
}
/// <summary>
/// Loads the page at <c>UrlParseHelper.Site9999d</c> and reads the sell and buy prices of one coin.
/// </summary>
/// <param name="xpath">XPath, evaluated against the extracted catalogue block, that selects the coin's node.</param>
/// <param name="acronim">Acronym stored on the returned rate.</param>
/// <returns>A <see cref="CoinsRate"/> stamped with the current local time.</returns>
/// <exception cref="InvalidOperationException">The page does not contain the expected elements or price pair.</exception>
static async Task<CoinsRate> parseCoin(string xpath, string acronim)
{
    var web = new HtmlWeb();
    var htmlDoc = await web.LoadFromWebAsync(UrlParseHelper.Site9999d);

    var selector = "//div[@class='catalog item-views table catalog_table_2' and @data-slice='Y']";
    var node = htmlDoc.DocumentNode.SelectSingleNode(selector);
    if (node == null)
    {
        // SelectSingleNode returns null when nothing matches; fail with a message that names the cause.
        throw new InvalidOperationException($"Catalogue block not found on {UrlParseHelper.Site9999d}.");
    }

    // Re-parse the catalogue block as its own document so the caller's XPath only sees that block.
    var htmlDoc2 = new HtmlDocument();
    htmlDoc2.LoadHtml("<div>" + node.InnerHtml + "</div>");

    var nodeCoin = htmlDoc2.DocumentNode.SelectSingleNode(xpath);
    if (nodeCoin == null)
    {
        throw new InvalidOperationException($"Coin node '{xpath}' not found on {UrlParseHelper.Site9999d}.");
    }

    // Collapse all whitespace runs so the marker words can be located reliably.
    var innerText = Regex.Replace(nodeCoin.InnerText, @"\s+", " ");
    var prices = GetBetweenTwoWords("ПРОДАЖА", "Цена за грамм", innerText);

    // The text between the markers holds "<sell> ПОКУПКА <buy>"; strip the currency sign from both parts.
    var pricePair = prices
        .Split(new string[1] { "ПОКУПКА" }, StringSplitOptions.RemoveEmptyEntries)
        .Select(x => x.Replace("₽", ""))
        .ToArray();
    if (pricePair.Length < 2)
    {
        throw new InvalidOperationException($"Sell/buy price pair not found for '{acronim}' on {UrlParseHelper.Site9999d}.");
    }

    CoinsRate coin = new CoinsRate() { Date = DateTime.Now, Site = UrlParseHelper.Site9999d, Acronim = acronim };
    coin.Sell = pricePair[0].ParseToDoubleFormat();
    coin.Buy = pricePair[1].ParseToDoubleFormat();
    return coin;
}
/// <summary>
/// Runs a search against the or.justice.cz register form and returns the subjects listed on the result page.
/// </summary>
/// <param name="criteria">Search filters; <c>Name</c> and <c>FictIdNumber</c> are each sent only when non-blank.</param>
/// <returns>One summary per result entry; an empty array when the page contains no result tables.</returns>
/// <exception cref="ArgumentNullException"><paramref name="criteria"/> is null.</exception>
public async Task<SubjectSummary[]> SearchSubjects(SearchCriteria criteria)
{
    if (criteria == null)
    {
        throw new ArgumentNullException(nameof(criteria));
    }

    // Optional filters contribute a query fragment only when they carry a value.
    var nameFilter = string.IsNullOrWhiteSpace(criteria.Name)
        ? string.Empty
        : "&nazev=" + Uri.EscapeDataString(criteria.Name);
    var idFilter = string.IsNullOrWhiteSpace(criteria.FictIdNumber)
        ? string.Empty
        : "&ico=" + Uri.EscapeDataString(criteria.FictIdNumber);

    var address =
        "https://or.justice.cz/ias/ui/rejstrik-$firma?p%3A%3Asubmit=x&.%2Frejstrik-%24firma="
        + nameFilter
        + idFilter
        + "&obec=&ulice=&forma=&oddil=&vlozka=&soud=&polozek=50&typHledani=STARTS_WITH&jenPlatne=VSECHNY";

    var loader = new HtmlWeb();
    var page = await loader.LoadFromWebAsync(address);

    // Each hit is the parent element of a 'result-details' table.
    var hits = page.DocumentNode.SelectNodes("//table[@class='result-details']/..");
    if (hits == null)
    {
        return new SubjectSummary[0];
    }

    var summaries = new List<SubjectSummary>();
    foreach (var hit in hits)
    {
        var summary = new SubjectSummary();
        // Second cell of the first row, with all whitespace removed.
        summary.FictIdNumber = Regex.Replace(hit.SelectSingleNode(".//tbody/tr[1]/td[2]").InnerText, @"\s+", string.Empty);
        // First cell of the first row, trimmed and with HTML entities decoded.
        summary.Name = HtmlEntity.DeEntitize(hit.SelectSingleNode(".//tbody/tr[1]/td[1]").InnerText.Trim());
        summary.SubjectId = GetSubjectId(hit, ".//li[1]/a");
        summaries.Add(summary);
    }

    return summaries.ToArray();
}
/// <summary>
/// Loads a member overlay page and extracts the member's age range and sex from it.
/// </summary>
/// <param name="uid">Value placed in the <c>uid</c> query parameter.</param>
/// <param name="memberId">Value placed in the <c>src</c> query parameter.</param>
/// <returns>
/// The <c>age</c> and <c>sex</c> groups matched by <c>ageSexRegEx</c> in the second list item of the
/// member description block, or <c>(null, null)</c> when the block, the item or the match is missing.
/// </returns>
public static async Task<(string ageRange, string sex)> GetAgeRangeAndSex(string uid, string memberId)
{
    string url = $"https://www.tripadvisor.com.au/MemberOverlay?Mode=owa&uid={uid}&c=&src={memberId}&fus=false&partner=false&LsoId=&metaReferer=ShowUserReviewsAttractions";
    var page = await Web.LoadFromWebAsync(url);

    var descriptionBlock = page.DocumentNode
        .Descendants()
        .FirstOrDefault(x => x.HasClass("memberdescriptionReviewEnhancements"));
    if (descriptionBlock == null)
    {
        return (null, null);
    }

    // Only the second <li> inside the block is inspected.
    var ageSexItem = descriptionBlock
        .Descendants()
        .Where(n => n.Name.Equals("li", System.StringComparison.OrdinalIgnoreCase))
        .ElementAtOrDefault(1);
    if (ageSexItem == null)
    {
        return (null, null);
    }

    var match = ageSexRegEx.Match(ageSexItem.InnerText);
    if (!match.Success)
    {
        return (null, null);
    }

    return (match.Groups["age"].Value, match.Groups["sex"].Value);
}
/// <summary>
/// Gets the trending repositories for a language within a time range.
/// </summary>
/// <param name="range">Time window; mapped to the page's <c>since</c> query value.</param>
/// <param name="language">Language whose <c>ToString()</c> value forms the URL path segment.</param>
/// <returns>One tuple per repository, built from the first two path segments of the repository link.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="range"/> is not a supported value.</exception>
public static async Task<List<Tuple<string, string>>> ExtractTrendingRepoNames(TimeRange range, Language language)
{
    string since;
    if (range == TimeRange.TODAY)
    {
        since = "daily";
    }
    else if (range == TimeRange.WEEKLY)
    {
        since = "weekly";
    }
    else if (range == TimeRange.MONTHLY)
    {
        since = "monthly";
    }
    else
    {
        // Previously an unknown range left the URL empty and failed later inside the loader.
        throw new ArgumentOutOfRangeException(nameof(range), range, "Unsupported time range.");
    }

    string url = "https://github.com/trending/" + language.ToString() + "?since=" + since;
    List<Tuple<string, string>> repoNames = new List<Tuple<string, string>>();

    //
    // The following code loads the HTML page and looks for specific tags so that we get what we want.
    // This code will have to be changed as and when the GitHub Trending page changes its HTML.
    // We will have to revise the logic as per the new page structure.
    //
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = await web.LoadFromWebAsync(url);

    foreach (var heading in doc.DocumentNode.Descendants("h3"))
    {
        // Headings without a link, or with a link that is not "/<first>/<second>", are skipped
        // instead of aborting the whole extraction.
        var href = heading.Descendants("a").FirstOrDefault()?.Attributes["href"]?.Value;
        if (href == null)
        {
            continue;
        }

        // A leading '/' yields an empty first element, so the two names sit at indexes 1 and 2.
        var names = href.Split('/');
        if (names.Length < 3)
        {
            continue;
        }

        repoNames.Add(new Tuple<string, string>(names[1], names[2]));
    }

    return repoNames;
}
/// <summary>
/// Builds a <see cref="Mini"/> from an atelierstoria.com product page.
/// </summary>
/// <param name="url">
/// Product address of the form
/// 'https://atelierstoria.com/collections/collection/mini?variant=variantId'; the variant query is optional.
/// </param>
/// <returns>The mini described by the page's meta data for the selected variant.</returns>
public async Task<Mini> ParseFromUrl(Uri url)
{
    var loader = new HtmlWeb();
    HtmlDocument page = await loader.LoadFromWebAsync(url, null, null);

    // Every mini on atelierstoria.com is made by atelierstoria itself.
    var creator = new Creator { Name = "atelierstoria" };
    var source = new AtelierStoriaSource(creator);
    creator.Sites.Add(source);

    // The only query parameter a product URL carries is 'variant', so whatever follows
    // the last '=' is the variant id (empty when the URL has no query).
    var variantId = url.Query.Split("=").Last();
    if (variantId.Length == 0)
    {
        // No variant requested: use the page's default variant when it is available.
        // When the page declares none, the id simply stays empty.
        variantId = GetValueFromMeta(page, "product:defaultvariant", "");
    }

    var name = GetValueFromMeta(page, "product:title:", variantId);
    var thumbnail = GetValueFromMeta(page, "product:image:", variantId);
    var cost = GetCost(page, variantId);

    var mini = new Mini();
    mini.Creator = creator;
    mini.Name = name;
    mini.Thumbnail = thumbnail;
    mini.Cost = cost;
    mini.Link = url.ToString();

    mini.Sources.Add(new MiniSourceSite(mini, source, url));
    return mini;
}
/// <summary>
/// Waits five seconds, loads the profile page into the shared <c>HtmlDoc</c> member and builds a view model from it.
/// </summary>
/// <param name="Uri">Address of the profile page.</param>
/// <param name="btag">Battle tag copied onto the result.</param>
/// <returns>The populated view model, or <c>null</c> when <c>IsValidAccount()</c> reports false.</returns>
public static async Task<OverwatchViewModel> GetOverwatchData(string Uri, string btag)
{
    var loader = new HtmlWeb();

    // Fixed pause before every request.
    await Task.Delay(5000);

    // NOTE(review): the helper calls below take no arguments, so they presumably read this
    // shared member - confirm, and note that concurrent calls would then overwrite each other.
    HtmlDoc = await loader.LoadFromWebAsync(Uri);

    if (!IsValidAccount())
    {
        return null;
    }

    var model = new OverwatchViewModel();
    model.BattleTag = btag;
    model.Level = GetLevel();
    model.RankPoint = GetRankPoint();
    model.PortraitLink = GetPortraitLink();
    model.Rank = GetRank();
    model.FavouriteHeroImage = GetHeroLink();
    model.TimePlayed = GetTimePlayed();
    model.GamesWon = GetGamesWon();
    return model;
}
/// <summary>
/// Downloads one page of the rent listing, retrying once per second until the download succeeds.
/// </summary>
/// <param name="pageNumber">Number appended to the listing URL.</param>
/// <param name="state">Not used by this method.</param>
/// <returns>The downloaded document.</returns>
private async Task<HtmlDocument> GetPage_ScrappyAsync(int pageNumber, ScraperHomeLessStateModel state)
{
    var web = new HtmlWeb();

    // NOTE(review): the retry loop is unbounded, exactly as before - a permanent failure never returns.
    while (true)
    {
        try
        {
            return await web.LoadFromWebAsync($"https://www.homeless.co.il/rent/{pageNumber}");
        }
        catch (Exception exception)
        {
            _log($"Error-g1. Wait 1 sec. {exception.Message}");
        }

        // Asynchronous wait: Thread.Sleep would block the calling thread for the whole pause.
        await Task.Delay(1000 * 1);
    }
}
/// <summary>
/// Loads a page and returns the inner text of its element with id <c>torrent_url</c>.
/// </summary>
/// <param name="uri">Address of the page.</param>
/// <returns>The inner text of the <c>torrent_url</c> element.</returns>
/// <exception cref="HtmlWebException">The document, its root node or the element is missing.</exception>
public async Task<string> GetMagnetLink(Uri uri)
{
    var loader = new HtmlWeb();
    var page = await loader.LoadFromWebAsync(uri.AbsoluteUri);

    if (page == null)
    {
        throw new HtmlWebException($"Null html document loaded, url: {uri.AbsoluteUri}");
    }

    var root = page.DocumentNode;
    if (root == null)
    {
        throw new HtmlWebException($"Null node in html document, url: {uri.AbsoluteUri}");
    }

    var linkNode = root.SelectSingleNode("//*[@id='torrent_url']");
    if (linkNode != null)
    {
        return linkNode.InnerText;
    }

    throw new HtmlWebException($"magnetLinkNode is null, url: {uri.AbsoluteUri}");
}
/// <summary>
/// Reads the fuel price table from the Lukoil page and returns every fuel with a positive price.
/// </summary>
/// <returns>One <see cref="Fuel"/> per table row that has a name cell and a positive numeric price cell.</returns>
/// <exception cref="InvalidOperationException">The page has no table with class <c>pricetable</c>.</exception>
public override async Task<Fuel[]> GetActiveFuelsAsync()
{
    var url = "http://www.lukoil.ge/?m=328";
    var web = new HtmlWeb();
    var doc = await web.LoadFromWebAsync(url);

    var priceTable = doc.DocumentNode.Descendants("table").FirstOrDefault(x => x.HasClass("pricetable"));
    if (priceTable == null)
    {
        // Same exception type First() used to raise, but with a message that names the cause.
        throw new InvalidOperationException($"No table with class 'pricetable' found at {url}.");
    }

    var fuels = new List<Fuel>();
    foreach (var row in priceTable.Descendants("tr"))
    {
        var tds = row.Descendants("td").ToArray();

        // Rows without both a name and a price cell (e.g. header or spacer rows) used to
        // throw IndexOutOfRangeException and abort the whole scrape.
        if (tds.Length < 2)
        {
            continue;
        }

        var name = tds[0].InnerText.Trim();

        // NOTE(review): parsing still uses the current culture, as before - confirm the page's
        // decimal separator and pin an explicit culture once known.
        if (!decimal.TryParse(tds[1].InnerText.Trim(), out var price) || price <= 0)
        {
            continue;
        }

        fuels.Add(new Fuel { Key = ConvertFuelNameToKey(name), Name = name, Price = price });
    }

    return fuels.ToArray();
}
/// <summary>
/// Checks the item's Komplett page and reports one entry per buy-button section whose button
/// text contains "Legg i handlevogn".
/// </summary>
/// <param name="item">Wanted item; it is only checked when it has a Komplett URL and the expected name.</param>
/// <returns>The in-stock entries found; empty when the item is skipped or nothing is purchasable.</returns>
public async Task<IEnumerable<InStockItem>> GetItemInStock(ItemsIWant item)
{
    var list = new List<InStockItem>();

    if (string.IsNullOrEmpty(item.KomplettUrl))
    {
        return list;
    }

    // Only this one product is handled; a missing name is treated as "not this product".
    if (item.Name == null || !item.Name.Contains("Komplett a240 Epic Gaming PC"))
    {
        return list;
    }

    var webCrawler = new HtmlWeb() { AutoDetectEncoding = false, OverrideEncoding = Encoding.UTF8 };
    var doc = await webCrawler.LoadFromWebAsync(item.KomplettUrl, Encoding.UTF8, CancellationToken.None);

    // Matches elements whose class list contains the whole token 'buy-button-section'.
    var products = doc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' buy-button-section ')]");
    if (products == null)
    {
        // SelectNodes returns null when nothing matches; iterating it would throw.
        return list;
    }

    foreach (var product in products)
    {
        var inStock = product.SelectSingleNode(".//div[contains(@class, 'buy-button')]");
        if (inStock != null && inStock.InnerText.Contains("Legg i handlevogn"))
        {
            list.Add(new InStockItem
            {
                Url = item.KomplettUrl,
                Name = item.Name,
                Count = 0,
                Channel = item.DiscordChannel,
                Store = "Komplett.no",
                ChannelId = item.DiscordChannelId
            });
        }
    }

    return list;
}
/// <summary>
/// Loads a page and, when it contains a <c>headbar</c> element, waits for the element's timer and then
/// posts its code and token to <c>rewardUrl</c>; otherwise waits ten seconds.
/// </summary>
/// <param name="url">Address of the page to process.</param>
/// <returns><c>false</c> when the page shows the reCAPTCHA prompt; otherwise <c>true</c>.</returns>
private async Task<bool> ProcessPage(string url)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument html = await web.LoadFromWebAsync(url);

    HtmlNode cap = html.DocumentNode.SelectSingleNode("/html/body/div[1]/div[2]/div/div/h6");
    if (cap?.InnerText == "Please solve the reCAPTCHA to continue:")
    {
        return false;
    }

    HtmlNode headbar = html.DocumentNode.SelectSingleNode("//*[@id=\"headbar\"]");
    if (headbar == null)
    {
        await Task.Delay(10000);
        return true;
    }

    string code = headbar.GetDataAttribute("code").Value;
    string token = headbar.GetDataAttribute("token").Value;

    // The timer is machine-written markup, so it is parsed independently of the host culture.
    int timer = int.Parse(
        headbar.GetDataAttribute("timer").Value,
        System.Globalization.CultureInfo.InvariantCulture);

    // The timer value is in seconds.
    await Task.Delay(timer * 1000);

    Dictionary<string, string> parameters = new Dictionary<string, string>
    {
        { "code", code },
        { "token", token }
    };

    // Request content and response are both disposable; release them as soon as the post completes.
    using (HttpContent content = new FormUrlEncodedContent(parameters))
    using (HttpResponseMessage response = await httpClient.PostAsync(rewardUrl, content))
    {
        // The response is not inspected, exactly as before.
    }

    return true;
}
/// <summary>
/// Downloads every image listed in the <c>fullimg</c> array of a reader page.
/// </summary>
/// <param name="url">Page address; a "/manga/" segment is rewritten to "/online/" before loading.</param>
/// <returns>
/// The downloaded images, named "1.jpg", "2.jpg", ... in listing order; empty when the page
/// contains no <c>fullimg</c> list.
/// </returns>
public async Task<IList<Image>> DownloadMangaAsync(string url)
{
    string mangaUrl = url.Contains("/manga/") ? url.Replace("/manga/", "/online/") : url;
    var web = new HtmlWeb();
    var htmlDoc = await web.LoadFromWebAsync(mangaUrl);

    var images = new List<Image>();

    // Both lookups can come back empty; previously either case ended in a NullReferenceException.
    // An unmatched regex already produced an empty list, so a missing script does the same now.
    var content = htmlDoc.GetElementbyId("content");
    var contentScript = content?.Descendants("script").FirstOrDefault(x => x.InnerHtml.Contains("fullimg"));
    if (contentScript == null)
    {
        return images;
    }

    // Captures the comma-separated, quoted URL list inside "fullimg":[ ... ].
    var imagesRegex = new Regex("\"fullimg\":\\[(?<images>[-a-zA-Z0-9/_\\.\",:]+)\\]");
    var match = imagesRegex.Match(contentScript.InnerText);
    var imageUrls = match.Groups["images"].Value
        .Replace("\"", string.Empty)
        .Split(',')
        .Where(x => !string.IsNullOrEmpty(x))
        .ToList();
    if (imageUrls.Count == 0)
    {
        return images;
    }

    // NOTE(review): a client per call is kept to leave the class untouched; prefer a shared
    // HttpClient if this method is called frequently.
    using (var httpClient = new HttpClient())
    {
        int i = 1;
        foreach (var imageUrl in imageUrls)
        {
            var bytes = await httpClient.GetByteArrayAsync(imageUrl);
            images.Add(new Image($"{i++}.jpg", bytes));
        }
    }

    return images;
}
/// <summary>
/// Queries the wsprnet.org "olddb" HTML listing and parses its table rows into spots.
/// </summary>
/// <param name="band">Band filter, sent as-is in the <c>band</c> query parameter.</param>
/// <param name="sort">Sort order, sent as-is in the <c>sort</c> query parameter.</param>
/// <param name="numberOfSpots">Requested row limit; capped at 10000.</param>
/// <param name="searchForCall">Optional value for the <c>findcall</c> parameter.</param>
/// <param name="showSpotsHeardBy">Optional value for the <c>findreporter</c> parameter.</param>
/// <param name="findUniqueCalls">Adds <c>unique=on</c> to the query.</param>
/// <param name="findUniqueReporters">Adds <c>uniquereporters=on</c> to the query.</param>
/// <returns>The parsed spots; an empty array when the expected table is not present.</returns>
public async Task<WsprnetSpot[]> GetSpots(WsprnetBand band, WsprnetSortOrder sort, int numberOfSpots = 10000, string searchForCall = null, string showSpotsHeardBy = null, bool findUniqueCalls = false, bool findUniqueReporters = false)
{
    // Caller-supplied text is escaped so characters such as '&' or '#' cannot alter the query.
    var findCall = Uri.EscapeDataString(searchForCall ?? string.Empty);
    var findReporter = Uri.EscapeDataString(showSpotsHeardBy ?? string.Empty);

    var url = $"http://wsprnet.org/olddb?mode=html&band={band}&limit={Math.Min(10000, numberOfSpots)}&findcall={findCall}&findreporter={findReporter}{(findUniqueCalls ? "&unique=on" : "")}{(findUniqueReporters ? "&uniquereporters=on" : "")}&sort={sort}";

    var htmlWeb = new HtmlWeb();
    if (!string.IsNullOrWhiteSpace(userAgent))
    {
        htmlWeb.UserAgent = userAgent;
    }

    var doc = await htmlWeb.LoadFromWebAsync(url);

    // SelectNodes returns null when nothing matches; Skip() on null would throw.
    var tableRows = doc.DocumentNode.SelectNodes("/body[1]/table[3]/tr");
    if (tableRows == null)
    {
        return new WsprnetSpot[0];
    }

    // The page is machine-formatted, so every number is parsed with the invariant culture -
    // matching the timestamp parsing. With the current culture a frequency such as "14.097123"
    // is misread on systems that use ',' as the decimal separator.
    var invariant = CultureInfo.InvariantCulture;
    var spots = new List<WsprnetSpot>();

    // The first two rows are skipped. Cells 7 and 11 are not read.
    foreach (var row in tableRows.Skip(2))
    {
        var spot = new WsprnetSpot();
        spot.Timestamp = DateTime.ParseExact(StripNbsp(row.ChildNodes[0].InnerText), "yyyy-MM-dd HH:mm", invariant, DateTimeStyles.AdjustToUniversal);
        spot.Call = StripNbsp(row.ChildNodes[1].InnerText);

        // Scaled by 1e6 to an integer. Rounding (instead of truncating) keeps values whose
        // double product lands just below the integer, e.g. 14097122.999..., from losing one unit.
        spot.Frequency = (long)Math.Round(double.Parse(StripNbsp(row.ChildNodes[2].InnerText), invariant) * 1000000);

        spot.Snr = int.Parse(StripNbsp(row.ChildNodes[3].InnerText), invariant);
        spot.Drift = int.Parse(StripNbsp(row.ChildNodes[4].InnerText), invariant);
        spot.Grid = StripNbsp(row.ChildNodes[5].InnerText);
        spot.Power = int.Parse(StripNbsp(row.ChildNodes[6].InnerText), invariant);
        spot.ReporterCallsign = StripNbsp(row.ChildNodes[8].InnerText);
        spot.ReporterLocator = StripNbsp(row.ChildNodes[9].InnerText);
        spot.Km = int.Parse(StripNbsp(row.ChildNodes[10].InnerText), invariant);
        spot.Mode = StripNbsp(row.ChildNodes[12].InnerText);
        spots.Add(spot);
    }

    return spots.ToArray();
}
/// <summary>
/// Loads the appointment slots page and pairs every available day with the text of its time slots.
/// </summary>
/// <remarks>
/// NOTE(review): the resulting array is neither returned nor stored, so the method currently has no
/// observable result besides the download and any parsing exceptions.
/// An earlier variant, previously kept here as commented-out code, loaded a saved HTML snapshot
/// from a gist instead of the live page.
/// </remarks>
public async Task Load()
{
    var loader = new HtmlWeb();
    var page = await loader.LoadFromWebAsync("https://pharmaconnect.ca/Appointment/8ab18efb-b158-4ca1-8103-34792852814d/Slots?serviceType=ImmunizationCovid");
    var root = page.DocumentNode;

    // Identifier of every day element, read from its selected-id attribute.
    var days = root
        .QuerySelectorAll(SELECTOR_APPOINTMENT_AVAILABILITY_DAY_ITEM)
        .Select(dayNode => dayNode.Attributes[ATTRIBUTE_DATA_SELECTED_ID].Value)
        .ToArray();

    // One entry per (day, time slot) pair; the day string is parsed for each pair.
    var dayAppointments = days
        .SelectMany(
            day => root.QuerySelectorAll(string.Format(SELECTOR_APPOINTMENT_AVAILABILITY_TIME_ITEM, day)),
            (day, appointment) => new { Day = DateTime.Parse(day), time = appointment.InnerText })
        .ToArray();
}
/// <summary>
/// Loads the rshb.ru coins page and reads the buy and sell prices of one coin.
/// </summary>
/// <param name="xpath">XPath selecting a node three levels below the coin's container element.</param>
/// <param name="acronim">Acronym stored on the returned rate.</param>
/// <returns>A <see cref="CoinsRate"/> stamped with the current local time.</returns>
/// <exception cref="InvalidOperationException">The page does not contain the expected elements.</exception>
static async Task<CoinsRate> getRshbRuCoin(string xpath, string acronim)
{
    var site = @"https://www.rshb.ru/natural/coins/";
    HtmlWeb web = new HtmlWeb();
    var htmlDoc = await web.LoadFromWebAsync(site);

    // The coin's container is the great-grandparent of the node the XPath selects.
    // Any missing link in that chain now fails with a message that names the cause.
    var idLink = htmlDoc.DocumentNode.SelectSingleNode(xpath);
    var coinNode = idLink?.ParentNode?.ParentNode?.ParentNode;
    if (coinNode == null)
    {
        throw new InvalidOperationException($"Coin container for '{xpath}' not found on {site}.");
    }

    // Re-parse the container as its own document so the price selectors only see this coin.
    var coinHtmlDoc = new HtmlDocument();
    coinHtmlDoc.LoadHtml("<div>" + coinNode.InnerHtml + "</div>");

    string buySelect = "//span[@class='b-coins-items-item-cost-b']";
    string sellSelect = "//div[@class='b-coins-items-item-quotes-price ']";

    var buyNode = coinHtmlDoc.DocumentNode.SelectSingleNode(buySelect);
    var sellNode = coinHtmlDoc.DocumentNode.SelectSingleNode(sellSelect);
    if (buyNode == null || sellNode == null)
    {
        throw new InvalidOperationException($"Buy/sell price elements not found for '{acronim}' on {site}.");
    }

    // Strip the currency letter before parsing.
    string buyPrice = buyNode.InnerText.Replace("Р", "");
    string sellPrice = sellNode.InnerText.Replace("Р", "");

    CoinsRate coin = new CoinsRate() { Acronim = acronim, Date = DateTime.Now, Site = site };
    coin.Buy = buyPrice.ParseToDoubleFormat();
    coin.Sell = sellPrice.ParseToDoubleFormat();
    return coin;
}
/// <summary>
/// Downloads a page and collects the targets of its links that start with "http".
/// </summary>
/// <param name="url">Address of the page to extract links from.</param>
/// <param name="log">Logger that receives any failure.</param>
/// <returns>
/// A document for <paramref name="url"/>; <c>ChildUrls</c> is assigned when the page was processed
/// and is left as constructed when processing failed.
/// </returns>
public static async Task<ExtractedDocument> Run(
    [ActivityTrigger] string url,
    ILogger log)
{
    var result = new ExtractedDocument(url);
    try
    {
        var doc = await Web.LoadFromWebAsync(url, Encoding.UTF8);

        // SelectNodes returns null when the page has no links at all. That is a normal outcome,
        // not an error, so it now yields an empty list instead of a logged exception.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        result.ChildUrls = anchors == null
            ? new List<string>()
            : anchors
                .Select(a => a.GetAttributeValue("href", string.Empty))
                // Scheme prefixes are not linguistic text: compare ordinally.
                .Where(a => a.StartsWith("http", StringComparison.Ordinal))
                .ToList();
    }
    catch (Exception e)
    {
        log.LogError(e, $"Exception while processing {url}.");
    }

    return result;
}
/// <summary>
/// Downloads one listing page and converts each <c>tr.athing</c> row of <c>table.itemlist</c> into an <see cref="HnItem"/>.
/// </summary>
/// <param name="uri">Address of the listing page.</param>
/// <param name="token">Cancellation token passed to the download.</param>
/// <returns>The items in page order.</returns>
private async Task<IList<HnItem>> ScrapePageAsync(string uri, CancellationToken token)
{
    var loader = new HtmlWeb();
    var page = await loader.LoadFromWebAsync(uri, token);

    var items = new List<HnItem>();
    foreach (var storyRow in page.DocumentNode.CssSelect("table.itemlist tr.athing"))
    {
        // Title link and rank come from the story row itself.
        var titleLink = storyRow.CssSelect("td.title a").First();
        var target = titleLink.GetAttributeValue("href");
        var rankNode = storyRow.CssSelect("td.title span.rank").First();

        // Author, score and comment link come from the row's next sibling.
        var detailRow = storyRow.NextSibling;
        var detailCell = detailRow.CssSelect("td.subtext").First();
        var authorNode = detailCell.CssSelect("a.hnuser").FirstOrDefault();
        var scoreNode = detailCell.CssSelect("span.score").FirstOrDefault();

        // At most one direct link in the detail cell may mention "comment".
        var commentsLink = detailRow
            .CssSelect("td.subtext > a")
            .SingleOrDefault(x => x.InnerText.Contains("comment", StringComparison.InvariantCultureIgnoreCase));

        items.Add(HnItemBuilder.Build(
            titleLink.InnerText,
            target,
            authorNode?.InnerText,
            ExtractPoints(scoreNode),
            ExtractComments(commentsLink),
            ExtractRank(rankNode)));
    }

    return items;
}
/// <summary>
/// Downloads a page and returns the inner HTML of its document node, retrying until it succeeds.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The inner HTML of the downloaded document.</returns>
public async Task<string> DownloadPageAsync(string url)
{
    int retries = 0;

    // NOTE(review): the retry loop is unbounded, exactly as before - a permanent failure never returns.
    while (true)
    {
        try
        {
            var doc = await _client.LoadFromWebAsync(url).ConfigureAwait(false);
            return doc.DocumentNode.InnerHtml;
        }
        catch (Exception ex)
        {
            retries += 1;
        
            _log.Log($"{ex.GetType().Name} occurred on retry {retries} loading {url}");
        }

        // Back off 500 ms longer on every attempt. The wait is asynchronous: Thread.Sleep
        // would block the calling thread for the whole pause.
        await Task.Delay(500 * retries).ConfigureAwait(false);
    }
}
/// <summary>
/// Downloads the configured website and returns the markup of its content node.
/// </summary>
/// <returns>
/// The node's inner HTML when <c>UseInnerHtml</c> is set, otherwise the result of <c>WriteTo()</c>;
/// an empty string (plus a warning in the log) when no content node is found.
/// </returns>
public async Task<string> ReadContent()
{
    var loader = new HtmlWeb();
    loader.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0";

    var page = await loader.LoadFromWebAsync(_config.WebsiteAddress, Encoding.UTF8);
    var node = GetContentNode(page);

    if (node != null)
    {
        if (_config.UseInnerHtml)
        {
            return node.InnerHtml;
        }

        return node.WriteTo();
    }

    _logger.LogWarning(
        "No content found at {Address} with {XPath}",
        _config.WebsiteAddress,
        _config.XPathSelector);
    return String.Empty;
}
/// <summary>
/// Loads an HTML document from the web, falling back to a file on disk when the web load fails.
/// </summary>
/// <param name="path">Web address or file path; also stored in <c>Path</c>.</param>
/// <returns>The loaded document, or <c>null</c> when neither source could be read.</returns>
public async Task<HtmlDocument> Download(string path)
{
    this.Path = path;

    // First attempt: treat the path as a web address.
    try
    {
        var doc = await web.LoadFromWebAsync(Path);
        if (doc != null)
        {
            return doc;
        }
    }
    catch (Exception e)
    {
        // Best effort by design: fall through to the disk attempt, but leave a trace of the
        // failure instead of discarding it silently.
        System.Diagnostics.Debug.WriteLine($"Download: web load of '{path}' failed: {e.Message}");
    }

    // Second attempt: treat the path as a local file.
    try
    {
        var docf = new HtmlDocument();
        docf.Load(path);
        return docf;
    }
    catch (Exception e)
    {
        System.Diagnostics.Debug.WriteLine($"Download: file load of '{path}' failed: {e.Message}");
    }

    return null;
}
/// <summary>
/// Builds a <see cref="Mini"/> from the Open Graph and product meta tags of a Gumroad product page.
/// </summary>
/// <param name="url">Address of the product page.</param>
/// <returns>
/// The mini described by the page. A missing meta tag falls back to the same default that a missing
/// <c>content</c> attribute already produced (null name/thumbnail, cost 0, link taken from <paramref name="url"/>).
/// </returns>
public async Task<Mini> ParseFromUrl(Uri url)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument htmlDoc = await web.LoadFromWebAsync(url, null, null);
    HtmlNode root = htmlDoc.DocumentNode;

    // Reads the content of the first <meta property="..."> tag. SelectNodes(...).First() used to
    // throw ArgumentNullException when the tag was absent, because SelectNodes returns null then.
    string MetaContent(string property, string fallback)
    {
        HtmlNode meta = root.SelectSingleNode("//meta[@property=\"" + property + "\"]");
        return meta == null ? fallback : meta.GetAttributeValue("content", fallback);
    }

    Uri link = new Uri(MetaContent("og:url", url.ToString()));

    // The creator name is the first label of the host name.
    string creatorName = link.Host.Split('.').First();

    Creator creator = new Creator { Name = creatorName };
    GumroadSource source = new GumroadSource(creator, creatorName);
    creator.Sites.Add(source);

    Mini mini = new Mini()
    {
        Creator = creator,
        Name = System.Web.HttpUtility.HtmlDecode(MetaContent("og:title", null)),
        Thumbnail = MetaContent("og:image", null),
        Link = "https://gumroad.com" + link.AbsolutePath
    };

    // The price is machine-written with '.' as decimal separator, so it is parsed with the
    // invariant culture; the current culture would read "9.99" as 999 on comma-decimal systems.
    double price = Convert.ToDouble(
        MetaContent("product:price:amount", "0"),
        System.Globalization.CultureInfo.InvariantCulture);
    mini.Cost = Convert.ToInt32(Math.Round(price));

    mini.Sources.Add(new MiniSourceSite(mini, source, url));
    return mini;
}
/// <summary>
/// Downloads a page, retrying up to <c>maxRetry</c> times with a one-second pause between attempts.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The downloaded document; an empty document when <c>maxRetry</c> allows no attempt at all.
/// </returns>
/// <remarks>Every failed attempt is appended to "logs.txt"; the last failure is rethrown.</remarks>
public static async Task<HtmlDocument> LoadSiteAsync(string url)
{
    int retryCount = maxRetry;

    while (retryCount > 0)
    {
        try
        {
            HtmlWeb web = new HtmlWeb();
            return await web.LoadFromWebAsync(url);
        }
        catch (Exception ex)
        {
            retryCount--;
            using (StreamWriter logWriter = File.AppendText("logs.txt"))
            {
                string errorInfo = $"Error: [{ex.InnerException}] @ [{url}], [{ex.StackTrace}]";
                logWriter.WriteLine(errorInfo);
            }

            // Out of attempts: rethrow immediately instead of waiting a pointless extra second.
            if (retryCount == 0)
            {
                throw;
            }
        }

        // Asynchronous pause between attempts; Thread.Sleep would block the calling thread.
        await Task.Delay(1000);
    }

    return new HtmlDocument();
}
/// <summary>
/// Looks up a show by its exact title on the shows index page and returns its download links.
/// </summary>
/// <param name="name">Title to match against the <c>title</c> attribute of each show link.</param>
/// <returns>
/// The result of <c>GetDls</c> for the first matching show, or <c>null</c> when no show matches
/// or the matching link has no address.
/// </returns>
public async Task<IList<HorribleSubsLink>?> Download(string name = "")
{
    const string url = baseUrl + "/shows/";
    var web = new HtmlWeb();
    var doc = await web.LoadFromWebAsync(url).ConfigureAwait(false);

    // SelectNodes returns null when nothing matches, so the lookup is null-safe throughout.
    // The list is scanned once; links without a title attribute are simply not a match.
    var showNodes = doc.DocumentNode.SelectNodes("//*[contains(@class, 'ind-show')]" + "/a");
    var showNode = showNodes?.FirstOrDefault(node => node.Attributes["title"]?.Value == name);

    var href = showNode?.Attributes["href"]?.Value;
    if (href == null)
    {
        return null;
    }

    return await GetDls(baseUrl + href, name).ConfigureAwait(false);
}
public async Task ExecuteAsync() { var hotnewsUrls = new List <HotNewsJobItem <string> > { new HotNewsJobItem <string> { Result = "https://www.cnblogs.com", Source = HotNewsEnum.cnblogs }, new HotNewsJobItem <string> { Result = "https://www.v2ex.com/?tab=hot", Source = HotNewsEnum.v2ex }, new HotNewsJobItem <string> { Result = "https://segmentfault.com/hottest", Source = HotNewsEnum.segmentfault }, new HotNewsJobItem <string> { Result = "https://web-api.juejin.im/query", Source = HotNewsEnum.juejin }, new HotNewsJobItem <string> { Result = "https://weixin.sogou.com", Source = HotNewsEnum.weixin }, new HotNewsJobItem <string> { Result = "https://www.douban.com/group/explore", Source = HotNewsEnum.douban }, new HotNewsJobItem <string> { Result = "https://www.ithome.com", Source = HotNewsEnum.ithome }, new HotNewsJobItem <string> { Result = "https://36kr.com/newsflashes", Source = HotNewsEnum.kr36 }, new HotNewsJobItem <string> { Result = "http://tieba.baidu.com/hottopic/browse/topicList", Source = HotNewsEnum.tieba }, new HotNewsJobItem <string> { Result = "http://top.baidu.com/buzz?b=341", Source = HotNewsEnum.baidu }, new HotNewsJobItem <string> { Result = "https://s.weibo.com/top/summary/summary", Source = HotNewsEnum.weibo }, new HotNewsJobItem <string> { Result = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true", Source = HotNewsEnum.zhihu }, new HotNewsJobItem <string> { Result = "https://daily.zhihu.com", Source = HotNewsEnum.zhihudaily }, new HotNewsJobItem <string> { Result = "http://news.163.com/special/0001386F/rank_whole.html", Source = HotNewsEnum.news163 }, new HotNewsJobItem <string> { Result = "https://github.com/trending", Source = HotNewsEnum.github }, new HotNewsJobItem <string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/word", Source = HotNewsEnum.douyin_hot }, new HotNewsJobItem <string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme", Source = 
HotNewsEnum.douyin_video }, new HotNewsJobItem <string> { Result = "https://www.iesdouyin.com/web/api/v2/hotsearch/billboard/aweme/?type=positive", Source = HotNewsEnum.douyin_positive }, }; var web = new HtmlWeb(); var list_task = new List <Task <HotNewsJobItem <object> > >(); hotnewsUrls.ForEach(item => { var task = Task.Run(async() => { var obj = new object(); if (item.Source == HotNewsEnum.juejin) { using var client = _httpClient.CreateClient(); client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36 Edg/83.0.478.13"); client.DefaultRequestHeaders.Add("X-Agent", "Juejin/Web"); var data = "{\"extensions\":{\"query\":{ \"id\":\"21207e9ddb1de777adeaca7a2fb38030\"}},\"operationName\":\"\",\"query\":\"\",\"variables\":{ \"first\":20,\"after\":\"\",\"order\":\"THREE_DAYS_HOTTEST\"}}"; var buffer = data.SerializeUtf8(); var byteContent = new ByteArrayContent(buffer); byteContent.Headers.ContentType = new MediaTypeHeaderValue("application/json"); var httpResponse = await client.PostAsync(item.Result, byteContent); obj = await httpResponse.Content.ReadAsStringAsync(); } else { // 针对GBK、GB2312编码网页,注册提供程序,否则获取到的数据乱码 Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); obj = await web.LoadFromWebAsync(item.Result, (item.Source == HotNewsEnum.baidu || item.Source == HotNewsEnum.news163) ? 
Encoding.GetEncoding("GB2312") : Encoding.UTF8); } return(new HotNewsJobItem <object> { Result = obj, Source = item.Source }); }); list_task.Add(task); }); Task.WaitAll(list_task.ToArray()); var hotNews = new List <HotNews>(); foreach (var list in list_task) { var item = await list; var sourceId = (int)item.Source; // 博客园 if (item.Source == HotNewsEnum.cnblogs) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='post_item_body']/h3/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } //// V2EX //if (item.Source == HotNewsEnum.v2ex) //{ // var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//span[@class='item_title']/a").ToList(); // nodes.ForEach(x => // { // hotNews.Add(new HotNews // { // Title = x.InnerText, // Url = $"https://www.v2ex.com{x.GetAttributeValue("href", "")}", // SourceId = sourceId, // CreateTime = DateTime.Now // }); // }); //} // SegmentFault if (item.Source == HotNewsEnum.segmentfault) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='news__item-info clearfix']/a").Where(x => x.InnerText.IsNotNullOrEmpty()).ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.SelectSingleNode(".//h4").InnerText, Url = $"https://segmentfault.com{x.GetAttributeValue("href", "")}", SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 掘金 if (item.Source == HotNewsEnum.juejin) { var obj = JObject.Parse((string)item.Result); var nodes = obj["data"]["articleFeed"]["items"]["edges"]; foreach (var node in nodes) { hotNews.Add(new HotNews { Title = node["node"]["title"].ToString(), Url = node["node"]["originalUrl"].ToString(), SourceId = sourceId, CreateTime = DateTime.Now }); } } // 微信热门 if (item.Source == HotNewsEnum.weixin) { var nodes = 
((HtmlDocument)item.Result).DocumentNode.SelectNodes("//ul[@class='news-list']/li/div[@class='txt-box']/h3/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 豆瓣精选 if (item.Source == HotNewsEnum.douban) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='channel-item']/div[@class='bd']/h3/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } // IT之家 if (item.Source == HotNewsEnum.ithome) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='lst lst-2 hot-list']/div[1]/ul/li/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 36氪 if (item.Source == HotNewsEnum.kr36) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='hotlist-main']/div[@class='hotlist-item-toptwo']/a[2]|//div[@class='hotlist-main']/div[@class='hotlist-item-other clearfloat']/div[@class='hotlist-item-other-info']/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = $"https://36kr.com{x.GetAttributeValue("href", "")}", SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 百度贴吧 if (item.Source == HotNewsEnum.tieba) { var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText); var nodes = obj["data"]["bang_topic"]["topic_list"]; foreach (var node in nodes) { hotNews.Add(new HotNews { Title = node["topic_name"].ToString(), Url = node["topic_url"].ToString().Replace("amp;", ""), SourceId = sourceId, CreateTime = DateTime.Now }); } } // 百度热搜 if (item.Source == HotNewsEnum.baidu) { var nodes = 
((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table[@class='list-table']//tr/td[@class='keyword']/a[@class='list-title']").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 微博热搜 if (item.Source == HotNewsEnum.weibo) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//table/tbody/tr/td[2]/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = $"https://s.weibo.com{x.GetAttributeValue("href", "").Replace("#", "%23")}", SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 知乎热榜 if (item.Source == HotNewsEnum.zhihu) { var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText); var nodes = obj["data"]; foreach (var node in nodes) { hotNews.Add(new HotNews { Title = node["target"]["title"].ToString(), Url = $"https://www.zhihu.com/question/{node["target"]["id"]}", SourceId = sourceId, CreateTime = DateTime.Now }); } } // 知乎日报 if (item.Source == HotNewsEnum.zhihudaily) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='box']/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = $"https://daily.zhihu.com{x.GetAttributeValue("href", "")}", SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 网易新闻 if (item.Source == HotNewsEnum.news163) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//div[@class='area-half left']/div[@class='tabBox']/div[@class='tabContents active']/table//tr/td[1]/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = x.InnerText, Url = x.GetAttributeValue("href", ""), SourceId = sourceId, CreateTime = DateTime.Now }); }); } // GitHub if (item.Source == HotNewsEnum.github) { var nodes = ((HtmlDocument)item.Result).DocumentNode.SelectNodes("//article[@class='Box-row']/h1/a").ToList(); nodes.ForEach(x => { hotNews.Add(new HotNews { Title = 
x.InnerText.Trim().Replace("\n", "").Replace(" ", ""), Url = $"https://github.com{x.GetAttributeValue("href", "")}", SourceId = sourceId, CreateTime = DateTime.Now }); }); } // 抖音热点 if (item.Source == HotNewsEnum.douyin_hot) { var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText); var nodes = obj["word_list"]; foreach (var node in nodes) { hotNews.Add(new HotNews { Title = node["word"].ToString(), Url = $"#{node["hot_value"]}", SourceId = sourceId, CreateTime = DateTime.Now }); } } // 抖音视频 & 抖音正能量 if (item.Source == HotNewsEnum.douyin_video || item.Source == HotNewsEnum.douyin_positive) { var obj = JObject.Parse(((HtmlDocument)item.Result).ParsedText); var nodes = obj["aweme_list"]; foreach (var node in nodes) { hotNews.Add(new HotNews { Title = node["aweme_info"]["desc"].ToString(), Url = node["aweme_info"]["share_url"].ToString(), SourceId = sourceId, CreateTime = DateTime.Now }); } } } if (hotNews.Any()) { await _hotNewsRepository.DeleteAsync(x => true); await _hotNewsRepository.BulkInsertAsync(hotNews); } // 发送Email var message = new MimeMessage { Subject = "【定时任务】每日热点数据抓取任务推送", Body = new BodyBuilder { HtmlBody = $"本次抓取到{hotNews.Count()}条数据,时间:{DateTime.Now:yyyy-MM-dd HH:mm:ss}" }.ToMessageBody()
/// <summary>
/// Scrapes every page of the logged-games ladder for a fixed player and returns the games as JSON.
/// </summary>
/// <returns>
/// A JSON list of game details, or a NotFound result when a page or its ranking rows cannot be read.
/// </returns>
public async Task<IActionResult> GetAllData()
{
    var playerName = "Fatso";
    var region = "Northrend";

    // Every game occupies this many consecutive cells in the flattened cell list.
    const int cellsPerGame = 11;

    string PageUrl(int pageNo)
    {
        return "http://classic.battle.net/war3/ladder/w3xp-player-logged-games.aspx?Gateway=" + region + "&PlayerName=" + playerName + "&SortField=Game_Date&SortDir=Asc&PageNo=" + pageNo;
    }

    var gameDetailsList = new List<GameDetail>();
    var web = new HtmlWeb();

    var doc = await web.LoadFromWebAsync(PageUrl(1));
    if (doc == null)
    {
        return NotFound("Could not get data from the page.");
    }

    // The text of the last pager link is the highest page number.
    // NOTE(review): a page without pager links is treated as a single-page result - confirm
    // against a player with few games. Previously this case threw ArgumentNullException.
    var maxPageNumber = 1;
    var pagerLinks = doc.DocumentNode.SelectNodes("//td[@class='rankingFiller']//a");
    if (pagerLinks != null)
    {
        var maxPageString = pagerLinks.Last().InnerText.Trim(' ');
        if (!int.TryParse(maxPageString, out maxPageNumber))
        {
            return NotFound("Could not get data from the page.");
        }
    }

    for (int p = 1; p <= maxPageNumber; p++)
    {
        // Page 1 is already loaded for the pager lookup; only later pages need a download.
        if (p > 1)
        {
            doc = await web.LoadFromWebAsync(PageUrl(p));
            if (doc == null)
            {
                return NotFound("Could not get data from the page.");
            }
        }

        var htmlRankingRankingRowDataAll = doc.DocumentNode.SelectNodes("//tr[@class='rankingRow']//td");
        if (htmlRankingRankingRowDataAll == null)
        {
            return NotFound("Could not get data from the page.");
        }

        var rankingRowList = new List<string>();
        foreach (var item in htmlRankingRankingRowDataAll)
        {
            rankingRowList.Add(item.InnerText.Replace("\r", "").Replace("\n", "").Replace("\t", ""));
        }

        // Only complete groups of cells are consumed; a trailing partial group used to
        // index past the end of the list.
        for (int i = 0; i + cellsPerGame <= rankingRowList.Count; i += cellsPerGame)
        {
            var gameDetail = new GameDetail()
            {
                Id = p * 100 + (i / cellsPerGame),
                Date = rankingRowList[i + 1],
                GameType = rankingRowList[i + 2],
                Map = rankingRowList[i + 3],
                Allies = rankingRowList[i + 6],
                Opponents = rankingRowList[i + 8],
                GameLength = rankingRowList[i + 9],
                Result = rankingRowList[i + 10],
            };
            gameDetailsList.Add(gameDetail);
        }
    }

    // All wanted games are collected here; storing them in the database (or showing them
    // through a page model) is still open.
    return Json(gameDetailsList);
}
/// <summary>
/// Timer handler: downloads the configured source page, extracts the currently playing
/// song, artist and cover image, and raises <c>OnNowPlayingChanged</c> with the result
/// (or with <c>null</c> when nothing could be extracted).
/// </summary>
/// <param name="sender">The timer raising the event (unused).</param>
/// <param name="args">Elapsed event data (unused).</param>
private async void TimerCallback(object sender, ElapsedEventArgs args)
{
    NowPlayingInfo nowPlayingInfo = null;

    // Bounds a single poll to five polling intervals.
    var timeoutCancellationTokenSource = new CancellationTokenSource(_configuration.PollingInterval * 5);
    CancellationTokenSource linkedCancellationTokenSource = null;

    try
    {
        // Read the field once so a concurrent reset cannot slip in between the null check
        // and the use. Linking happens inside the try because this is an async void
        // method: an exception escaping it would be unhandled.
        var outerCancellationTokenSource = _cancellationTokenSource;
        var cancellationToken = timeoutCancellationTokenSource.Token;
        if (outerCancellationTokenSource != null)
        {
            linkedCancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
                outerCancellationTokenSource.Token,
                timeoutCancellationTokenSource.Token);
            cancellationToken = linkedCancellationTokenSource.Token;
        }

        var web = new HtmlWeb();
        var doc = await web.LoadFromWebAsync(
            _configuration.SourceUrl,
            Encoding.UTF8,
            cancellationToken);

        var infoDiv = doc.DocumentNode.FirstDescendantWithClass("div", "card horizontal");
        if (infoDiv != null)
        {
            var imgUrl = infoDiv.Descendants("img").FirstOrDefault()?.GetAttributeValue("src", default(string));
            var content = infoDiv.FirstDescendantWithClass("div", "card-content")?.InnerText;
            if (content != null)
            {
                var lines = content
                    .Split(new[] { "\n", "\r\n" }, StringSplitOptions.RemoveEmptyEntries)
                    .Select(x => x.Trim())
                    .Take(2)
                    .ToList();

                // The first line is used as the song and the second as the artist;
                // with fewer than two lines there is nothing to report.
                if (lines.Count == 2)
                {
                    var song = lines[0];
                    var artist = lines[1];
                    nowPlayingInfo = new NowPlayingInfo(artist, song, imgUrl);
                }
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Ignore - these are expected during timeouts or while quitting.
        // OperationCanceledException is the base of TaskCanceledException, so both are covered.
    }
    catch (Exception ex)
    {
        _logger?.Log(ex);
    }
    finally
    {
        linkedCancellationTokenSource?.Dispose();
        timeoutCancellationTokenSource.Dispose();
    }

    OnNowPlayingChanged(nowPlayingInfo);
}
/// <summary>
/// Downloads a thread page and extracts the game's labels, name, developer, tags,
/// cover image, overview and preview images from it.
/// </summary>
/// <param name="url">Address of the page to load; also stored as <c>F95Link</c>.</param>
/// <param name="logger">Logger passed through to the node lookup helpers.</param>
/// <returns>
/// The populated <see cref="F95ZoneGame"/>, or <c>null</c> when the document, body,
/// header, title or content node cannot be found.
/// </returns>
public static async Task<F95ZoneGame> LoadGame(string url, ILogger logger)
{
    var web = new HtmlWeb();
    var document = await web.LoadFromWebAsync(url);
    if (document == null)
    {
        return null;
    }

    var game = new F95ZoneGame { F95Link = url };
    var node = document.DocumentNode;

    var bodyNode = node.SelectSingleNode("//div[@class='uix_contentWrapper']/div[@class='p-body-main ']/div[@class='p-body-content']");
    if (bodyNode.IsNull(logger, "Body", url))
    {
        return null;
    }

    var headerNode = bodyNode.SelectSingleNode("//div[@class='pageContent']/div[@class='uix_headerInner']");
    if (headerNode.IsNull(logger, "Header", url))
    {
        return null;
    }

    var labels = headerNode.SelectNodes("div[@class='p-title ']/h1[@class='p-title-value']/a[@class='labelLink']");
    if (!labels.IsNullOrEmpty(logger, "Labels", url))
    {
        game.LabelList = labels
            .Select(x => !x.TryGetInnerText("span", logger, "Label", url, out var label) ? null : label)
            .NotNull()
            .ToList();
    }

    if (!headerNode.TryGetInnerText("div[@class='p-title ']/h1[@class='p-title-value']", logger, "Title", url, out var id))
    {
        return null;
    }

    if (game.LabelList == null)
    {
        game.Name = id;
    }
    else
    {
        // The title text still contains the label texts: strip each label from the title
        // and drop the surrounding brackets from the label itself.
        game.LabelList = game.LabelList.Select(label =>
        {
            if (id.Contains(label))
            {
                id = id.Replace(label, "");
            }

            if (label.StartsWith("["))
            {
                label = label.Substring(1, label.Length - 1);
            }

            if (label.EndsWith("]"))
            {
                label = label.Substring(0, label.Length - 1);
            }

            return label;
        }).ToList();

        id = id.Trim();
        var lastStartingBracket = id.LastIndexOf('[');
        var lastClosingBracket = id.LastIndexOf(']');

        // The last "[...]" group of the remaining title is taken as the developer.
        // Without a '[' the whole title is the name; Substring(0, -1) would throw.
        if (lastStartingBracket != -1)
        {
            // Only read the developer when ']' really closes the last '[';
            // otherwise the computed length would be negative.
            if (lastClosingBracket > lastStartingBracket)
            {
                game.Developer = id.Substring(lastStartingBracket + 1, lastClosingBracket - lastStartingBracket - 1);
            }

            id = id.Substring(0, lastStartingBracket).Trim();
        }

        game.Name = id;
    }

    var tags = headerNode.SelectNodes("div[@class='p-description']/ul/li[@class='groupedTags']/a[@class='tagItem']");
    if (!tags.IsNullOrEmpty(logger, "Tags", id))
    {
        // Tags are stored as decoded text without case normalisation (a title-casing
        // variant with special handling for "2dcg"/"3dcg" was previously disabled here).
        game.Genres = tags.Select(x =>
        {
            var innerText = x.DecodeInnerText();
            return innerText.IsEmpty(logger, "Tag", id) ? null : innerText;
        }).NotNull().ToList();
    }

    var contentNode = bodyNode.SelectSingleNode("//div[@class='message-inner']/div[@class='message-cell message-cell--main']/div[@class='message-main uix_messageContent js-quickEditTarget']/div/div/article[@class='message-body js-selectToQuote']/div[@class='bbWrapper']");
    if (contentNode.IsNull(logger, "Content", id))
    {
        return null;
    }

    var topNode = contentNode.SelectSingleNode("div");
    if (!topNode.IsNull(logger, "Top", id))
    {
        var coverImageNode = topNode.SelectSingleNode("a");
        if (!coverImageNode.IsNull(logger, "Cover Image", id))
        {
            var href = coverImageNode.GetValue("href");
            if (!href.IsEmpty(logger, "Cover Image", id))
            {
                game.CoverImageURL = href;
            }

            // Remove the cover link so it does not end up in the overview HTML.
            // This must stay inside the null check: RemoveChild throws on a null child.
            topNode.RemoveChild(coverImageNode);
        }

        game.Overview = HttpUtility.HtmlDecode(topNode.InnerHtml);
    }

    var previewImages = contentNode.SelectNodes("//img[@class='bbImage ']");
    if (!previewImages.IsNullOrEmpty(logger, "Preview Images", id))
    {
        // The link is read from each image's parent node.
        game.PreviewImageURLs = previewImages.Select(x =>
        {
            var a = x.ParentNode;
            var href = a.GetValue("href");
            return href.IsEmpty(logger, "Preview Image href", id) ? null : href;
        }).NotNull().ToList();
    }

    return game;
}
/// <summary>
/// Downloads the page at <c>subtitleUrl</c>, reads the entries listed under the
/// <c>new-link</c> element and fills <c>Subtitles</c> with one download item per entry.
/// Errors are reported through <c>ShowError</c>.
/// </summary>
private async void GetSubtitle()
{
    progress.IsActive = true;
    listView.Visibility = Visibility.Collapsed;
    CloseError();

    if (!GeneralHelper.IsNetworkAvailable())
    {
        ShowError(Constants.InternetIsNotAvailable, Constants.InternetIsNotAvailableTitle);

        // Nothing will be loaded, so stop the progress indicator instead of leaving it running.
        progress.IsActive = false;
        return;
    }

    try
    {
        var web = new HtmlWeb();
        var doc = await web.LoadFromWebAsync(subtitleUrl);
        var items = doc.DocumentNode.SelectNodes(@"//div[@id='new-link']/ul/li");
        if (items == null)
        {
            ShowError(Constants.NotFoundOrExist);
        }
        else
        {
            Subtitles?.Clear();
            foreach (var node in items)
            {
                var nameNode = node.SelectSingleNode(".//div[@class='new-link-1']");
                var statusNode = node.SelectSingleNode(".//div[@class='new-link-2']");

                // Skip entries that lack the expected markup instead of aborting the
                // whole list through a NullReferenceException.
                if (nameNode == null || statusNode == null)
                {
                    continue;
                }

                var link = node.SelectSingleNode(".//a")?.Attributes["href"]?.Value;
                var status = statusNode.InnerText.Replace(" ", "").Trim();
                var displayName = nameNode.InnerText.Trim() + " - " + status;

                Subtitles?.Add(new DownloadModel { DisplayName = displayName, DownloadLink = link });
            }
        }
    }
    catch (Exception ex) when (ex is ArgumentOutOfRangeException || ex is ArgumentNullException || ex is NullReferenceException)
    {
        // Deliberately best-effort: a malformed page leaves the entries collected so far
        // in place rather than crashing this async void method.
    }
    catch (WebException ex)
    {
        if (!string.IsNullOrEmpty(ex.Message))
        {
            ShowError(ex.Message);
        }
    }
    catch (HttpRequestException hx)
    {
        if (!string.IsNullOrEmpty(hx.Message))
        {
            ShowError(hx.Message);
        }
    }
    finally
    {
        // Single place that restores the UI after a load attempt, successful or not.
        progress.IsActive = false;
        listView.Visibility = Visibility.Visible;
    }
}
/// <summary>
/// Checks that <c>HtmlWeb.LoadFromWebAsync</c> returns a document for a basic web request.
/// </summary>
/// <returns>A task the test runner can await to observe the assertion result.</returns>
public async Task TestHtmlWebBasicCall()
{
    // Returns Task instead of void: an async void test cannot be awaited, so the runner
    // could finish before the assertion executes and miss any failure or exception.
    var html = new HtmlWeb();
    var doc = await html.LoadFromWebAsync("http://www.google.com");
    Assert.IsNotNull(doc);
}