/// <summary>
/// Extracts the numeric characters of the price found in an HTML page.
/// </summary>
/// <param name="html">HTML markup to parse.</param>
/// <returns>
/// The characters for which <see cref="char.IsNumber(char)"/> is true, taken in order from the
/// text of the first <c>span</c> inside the first element with class <c>price</c>.
/// An empty string when the page has no such element or that element has no <c>span</c>.
/// </returns>
public string Scrap(string html)
{
    HtmlParser parser = new HtmlParser();
    IHtmlDocument doc = parser.Parse(html);

    IElement priceElement = doc.GetElementsByClassName("price").FirstOrDefault();
    if (priceElement == null)
    {
        return "";
    }

    // A price element without a nested span is reported the same way as a missing price,
    // instead of letting First() throw InvalidOperationException.
    IElement priceSpan = priceElement.GetElementsByTagName("span").FirstOrDefault();
    if (priceSpan == null)
    {
        return "";
    }

    // Drop currency symbols, separators and whitespace; keep only numeric characters.
    return string.Concat(priceSpan.TextContent.Where(c => char.IsNumber(c)));
}
/// <summary>
/// Program entry point. Loads the page at the URL given as the single command-line argument,
/// reads the viewer options embedded in its <c>jQuery.extend</c> script, downloads the page
/// list from the data URL, downloads every page and finally calls <c>BuildDocument</c>.
/// </summary>
/// <param name="args">Command-line arguments; exactly one is expected: the page URL.</param>
/// <remarks>
/// Any exception raised while processing is caught and written to the console.
/// Missing data is reported with a short console message and ends the run early.
/// </remarks>
static void Main(string[] args)
{
    if (args.Length != 1)
    {
        Console.WriteLine("Usage: grab <url>");
        return;
    }

    try
    {
        // Settings; the second argument is presumably the fallback value - confirm in ConfigurationUtility.
        _sleep = ConfigurationUtility.GetInt32("sleep", 3000);
        Console.WriteLine($"Sleep={_sleep}ms");
        _scale = ConfigurationUtility.GetInt32("scale", 4);
        Console.WriteLine($"Scale={_scale}x");

        _pageUrl = Url.Create(args[0]);

        // Output file name: last segment of the URL path with its extension replaced by ".pdf".
        _pdfFileName = Path.ChangeExtension(
            Path.GetFileNameWithoutExtension(_pageUrl.Path),
            ".pdf");

        _browsingConfiguration = Configuration.Default.WithDefaultLoader();
        _browsingContext = BrowsingContext.New(_browsingConfiguration);

        // Main is synchronous, so the asynchronous load is awaited by blocking on the task.
        IDocument document = _browsingContext.OpenAsync(_pageUrl).Result;
        Console.WriteLine("HTML downloaded");

        string selector = "script";
        IHtmlCollection<IElement> scripts = document.QuerySelectorAll(selector);
        Console.WriteLine($"Scripts: {scripts.Length}");

        // The viewer options live in the inline script that starts with "jQuery.extend".
        // Ordinal comparison: this is a code token, not linguistic text.
        IElement scriptElement = scripts.FirstOrDefault(
            e => e.InnerHtml.StartsWith("jQuery.extend", StringComparison.Ordinal));
        if (scriptElement == null)
        {
            Console.WriteLine("No jQuery.extend script");
            return;
        }

        // Cut the JSON object out of the script: from the first '{' to the last '}'.
        string json = scriptElement.InnerHtml;
        int offset = json.IndexOf('{');
        int end = json.LastIndexOf('}');
        if (offset < 0 || end < offset)
        {
            Console.WriteLine("No JSON in script");
            return;
        }

        json = json.Substring(offset, end - offset + 1);
        JObject root = JObject.Parse(json);

        JValue token = (JValue)root.SelectToken("$.diva.1.options.objectData");
        if (ReferenceEquals(token, null))
        {
            Console.WriteLine("Not a book");
            return;
        }

        string dataUrl = (string)token.Value;
        Console.WriteLine($"Data URL={dataUrl}");

        token = (JValue)root.SelectToken("$.diva.1.options.iipServerURL");
        if (ReferenceEquals(token, null))
        {
            Console.WriteLine("No ServerUrl");
            return;
        }

        _serverUrl = (string)token.Value;
        Console.WriteLine($"Server URL={_serverUrl}");

        token = (JValue)root.SelectToken("$.diva.1.options.imageDir");
        if (ReferenceEquals(token, null))
        {
            Console.WriteLine("No imageDir");
            return;
        }

        _imageDir = (string)token.Value;
        Console.WriteLine($"Image dir={_imageDir}");

        // The client is kept in a field; NOTE(review): presumably reused by DownloadPage - confirm.
        _webClient = new WebClient();
        json = _webClient.DownloadString(dataUrl);
        root = JObject.Parse(json);

        // "pgs" holds the page descriptors; report its absence instead of failing on a null array.
        JArray array = root.SelectToken("pgs") as JArray;
        if (array == null)
        {
            Console.WriteLine("No pages");
            return;
        }

        _pages = array.ToObject<Page[]>();
        Console.WriteLine($"Total pages={_pages.Length}");

        // Page numbers are 1-based and advance for every page, whatever DownloadPage returns.
        int pageNumber = 1;
        foreach (Page page in _pages)
        {
            // Pause only after DownloadPage returns true.
            if (DownloadPage(pageNumber, page))
            {
                Thread.Sleep(_sleep);
            }

            pageNumber++;
        }

        BuildDocument();
    }
    catch (Exception exception)
    {
        Console.WriteLine(exception);
    }
}
/// <summary>
/// Tells whether the given node list consists of exactly one node whose text is the
/// fixed "list is empty" notice.
/// </summary>
/// <param name="torrentNodes">Nodes to inspect.</param>
/// <returns>
/// <c>true</c> when there is exactly one node and its text content equals the notice;
/// otherwise <c>false</c>.
/// </returns>
private bool NoHitAndRun(IHtmlCollection<IElement> torrentNodes)
{
    // Compared verbatim against the page text (Hungarian); roughly: "You have seeded back
    // everything you downloaded according to the rules, so your list is empty."
    const string emptyListNotice = "Az általad letöltött anyagokat a szabályoknak megfelelően visszaosztottad, a listád ennek köszönhetően üres.";

    if (torrentNodes.Length != 1)
    {
        return false;
    }

    string onlyNodeText = torrentNodes.First().TextContent;
    return onlyNodeText == emptyListNotice;
}
/// <summary>
/// Tries to read an integer rank from the first <c>span</c> inside the first node.
/// </summary>
/// <param name="nodes">Nodes whose first element is searched for a <c>span</c>.</param>
/// <param name="rank">
/// The parsed rank on success; 0 when parsing fails or no node/span is available.
/// </param>
/// <returns>
/// <c>true</c> when the span's inner HTML, with trailing dots removed, parses as an integer;
/// otherwise <c>false</c>. Never throws for an empty collection or a missing span.
/// </returns>
private static bool TryParseRank(IHtmlCollection<IElement> nodes, out int rank) =>
    // A missing node or span yields null, which int.TryParse rejects (false, rank = 0),
    // so the method keeps the no-throw contract its Try prefix promises.
    int.TryParse(
        nodes?.FirstOrDefault()?.QuerySelector("span")?.InnerHtml.TrimEnd('.'),
        out rank);