/// <summary>
/// Creates an instance of the graph type resolved from <paramref name="uri"/> and fills
/// every property marked with <see cref="GraphElementAttribute"/> from the HTML document
/// loaded from that URI.
/// </summary>
/// <param name="uri">Address of the page to load; also used to resolve the graph type.</param>
/// <returns>The populated object, cast to <see cref="GraphObject"/>.</returns>
public async Task<GraphObject> ReadAsync(Uri uri)
{
    Type targetType = uri.ToGraphType();
    object instance = Activator.CreateInstance(targetType);

    // Only properties explicitly tagged as graph elements are populated.
    PropertyInfo[] graphProperties = Array.FindAll(
        targetType.GetProperties(),
        candidate => Attribute.IsDefined(candidate, typeof(GraphElementAttribute)));

    HtmlDocument page = await HtmlClient.LoadDocumentAsync(uri);

    foreach (PropertyInfo property in graphProperties)
    {
        Type propertyType = property.PropertyType;

        // Class-typed properties declared in a module whose name contains "Graphlax"
        // are resolved through GetSubPropValue; everything else through GetPropValue.
        bool usesSubPropLookup = propertyType.IsClass
            && propertyType.Module.Name.Contains("Graphlax");

        object value;
        if (usesSubPropLookup)
        {
            value = GetSubPropValue(propertyType, page);
        }
        else
        {
            value = GetPropValue(property, page);
        }

        property.SetValue(instance, value);
    }

    GraphObject result = (GraphObject)instance;
    result.Url = uri.ToString();
    result.Site.Name = uri.Host;
    // NOTE(review): the IP is a fixed literal, not resolved from the host.
    result.Site.IP = "127.0.0.1";
    return result;
}
/// <summary>
/// Downloads <paramref name="url"/> through an <see cref="HtmlTextReader"/>, parses it into
/// an <see cref="XmlDocument"/>, logs the reader's encoding before and after parsing, and
/// asserts that the document title equals <paramref name="expectedTitle"/>.
/// </summary>
/// <param name="url">Address of the page to download; also used as the parser base URL.</param>
/// <param name="expectedTitle">Title text the parsed document is expected to contain.</param>
/// <param name="options">Client options to use; when null, <c>HtmlClient.Options</c> is used.</param>
public async Task TestGetTextReaderAsync_ForEncoding(string url, string expectedTitle, ClientOptions options = null)
{
    ClientOptions optionsToUse = options ?? HtmlClient.Options;
    XmlDocument doc1 = new XmlDocument();
    System.Text.Encoding initialEncoding = null;
    EncodingConfidence initialConfidence = EncodingConfidence.Tentative;
    System.Text.Encoding finalEncoding = null;
    EncodingConfidence finalConfidence = EncodingConfidence.Tentative;

    // Get the Html asynchronously and parse it into an Xml Document,
    // capturing the reader's encoding state on both sides of the parse.
    using (HtmlTextReader textReader = await HtmlClient.GetHtmlTextReaderAsync(url, optionsToUse))
    {
        initialEncoding = textReader.CurrentEncoding;
        initialConfidence = textReader.CurrentEncodingConfidence;

        HtmlParser.DefaultParser.Parse(doc1, textReader, new ParserOptions { BaseUrl = url });

        finalEncoding = textReader.CurrentEncoding;
        finalConfidence = textReader.CurrentEncodingConfidence;
    }

    // Fail with a descriptive message (rather than a NullReferenceException)
    // when the parsed document has no title text at all.
    XmlNode titleNode = doc1.SelectSingleNode("//title/text()");
    Assert.IsNotNull(titleNode, "No <title> text node was found in the document parsed from " + url);
    string title1 = titleNode.InnerText;

    Console.WriteLine(
        "Crawled: {0}, title: {1}, default: {2} (detect={3}), initial: {4} ({5}), final: {6} ({7})",
        url,
        title1,
        optionsToUse.DefaultEncoding.WebName,
        optionsToUse.DetectEncoding,
        initialEncoding.WebName,
        initialConfidence,
        finalEncoding.WebName,
        finalConfidence);

    // Compare the parsed title with the expected one to confirm the text was decoded correctly.
    Assert.AreEqual(expectedTitle, title1);
}
/// <summary>
/// Loads page 1 for <c>CurrentCity</c> and reads the number of pages from the node
/// selected by <c>TargetWebsite.PagesCountXPath</c>.
/// </summary>
/// <returns>The node's inner HTML converted to an integer.</returns>
protected virtual async Task<int> GetPagesCount()
{
    var firstPageUrl = _urlGenerator.GeneratePageUrl(CurrentCity, 1);
    var document = await HtmlClient.LoadFromWebAsync(firstPageUrl);

    // NOTE(review): no null check here - presumably the XPath always matches; confirm.
    HtmlNode countNode = document.DocumentNode.SelectSingleNode(TargetWebsite.PagesCountXPath);
    var rawCount = countNode.InnerHtml;

    return Convert.ToInt32(rawCount);
}
/// <summary>
/// Downloads the HTML at <paramref name="url"/> and parses it into a new
/// <see cref="XmlDocument"/> using this instance's <c>Parser</c>.
/// </summary>
/// <param name="url">Address of the page to download; also passed to the parser as its base URL.</param>
/// <returns>The document populated by the parser.</returns>
public async Task<XmlDocument> LoadXHtmlDocAsync(string url)
{
    var parsedDocument = new XmlDocument();

    // Fetch the HTML asynchronously; the reader is disposed as soon as parsing finishes.
    using (HtmlTextReader reader = await HtmlClient.GetHtmlTextReaderAsync(url))
    {
        this.Parser.Parse(parsedDocument, reader, new ParserOptions { BaseUrl = url });
    }

    return parsedDocument;
}
/// <summary>
/// Downloads the HTML at <paramref name="url"/> and stream-parses it into
/// <paramref name="doc"/> through an <c>XmlDomBuilder</c>.
/// </summary>
/// <param name="doc">Document that receives the parsed nodes.</param>
/// <param name="url">Address of the page to download.</param>
/// <param name="options">
/// Loader options; when null a new <c>LoaderOptions</c> is created. If the options'
/// parser base URL is null or empty it is set to <paramref name="url"/> on the supplied instance.
/// </param>
internal static async Task LoadWebPageAsync(XmlDocument doc, string url, LoaderOptions options)
{
    LoaderOptions effectiveOptions = options ?? new LoaderOptions();

    // Fall back to the page address when no explicit base URL was configured.
    if (string.IsNullOrEmpty(effectiveOptions.ParserOptions.BaseUrl))
    {
        effectiveOptions.ParserOptions.BaseUrl = url;
    }

    var domBuilder = new XmlDomBuilder(doc);
    var streamParser = new HtmlStreamParser<XmlNode>();

    // Fetch the HTML asynchronously and feed it to the parser; the reader is disposed afterwards.
    using (HtmlTextReader reader = await HtmlClient.GetHtmlTextReaderAsync(url, effectiveOptions))
    {
        streamParser.Parse(domBuilder, reader, effectiveOptions.ParserOptions);
    }
}
/// <summary>
/// Opens an <see cref="HtmlTextReader"/> for <paramref name="url"/> and reads its
/// <c>BaseReader</c> one character at a time until a negative value is returned.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The number of characters read.</returns>
public async Task<int> DownloadPageUsingGetAsTextReaderAsync(string url)
{
    using (HtmlTextReader textReader = await HtmlClient.GetHtmlTextReaderAsync(url))
    {
        int total = 0;

        // A negative result from Read() ends the loop.
        while (textReader.BaseReader.Read() >= 0)
        {
            total++;
        }

        return total;
    }
}
/// <summary>
/// Clears the stored offers for <paramref name="city"/> on the target website, then loads
/// each listing page, extracts offers from the nodes selected by
/// <c>TargetWebsite.DocumentNodeXPath</c>, and stores them in the repository.
/// </summary>
/// <param name="city">City to collect offers for; also stored in <c>CurrentCity</c>.</param>
public virtual async Task CollectData(City city)
{
    const int delayBetweenPagesMs = 4000;

    CurrentCity = city;
    await _repository.ClearOffers(CurrentCity, TargetWebsite.Name);

    // Resolve the page count once: evaluating it in the loop condition would
    // re-download the page-count page before every single iteration.
    int pagesCount = await GetPagesCount();

    // NOTE(review): pages are requested with indices 0..pagesCount-1, while GetPagesCount
    // requests page 1 - confirm whether the site's page numbering is zero- or one-based.
    for (int i = 0; i < pagesCount; i++)
    {
        var url = _urlGenerator.GeneratePageUrl(city, i);
        var htmlDoc = await HtmlClient.LoadFromWebAsync(url);

        // The node selector may return null when nothing matches (HtmlAgilityPack does);
        // treat that as a page without offers instead of failing on ToList().
        var matchedNodes = htmlDoc.DocumentNode.SelectNodes(TargetWebsite.DocumentNodeXPath);
        if (matchedNodes != null)
        {
            var offers = GetDetails(matchedNodes.ToList());
            await _repository.AddMany(offers);
        }

        //todo find better anti-ban solution
        // Asynchronous delay: throttles requests without blocking the calling thread.
        await Task.Delay(delayBetweenPagesMs);
    }
}
/// <summary>
/// Downloads the page at <c>teaser.Url</c> and completes the given offer from it:
/// private-offer flag, description (with the contact list HTML appended) and picture URLs.
/// </summary>
/// <param name="teaser">Offer to complete; it is modified in place.</param>
/// <returns>The same <paramref name="teaser"/> instance with <c>Teaser</c> set to false.</returns>
private Offer GetFullOffer(Offer teaser)
{
    var client = new HtmlClient();
    var html = client.GetHtml(teaser.Url);
    var dom = new CQ(html);

    // The offer is flagged as private when the title header contains "PRYWATNA".
    // The [0] indexers below assume the element exists on the page.
    var header = dom[".wspolny_naglowek_tytul"][0].InnerHTML;
    if (header.Contains("PRYWATNA"))
    {
        teaser.PrivateOffer = true;
    }

    // RenderSelection() renders the whole selection in one call, so a single assignment
    // is enough; the description is left untouched when nothing matched.
    var fullDescription = dom[".pokaz_ogloszenie_tresc"];
    if (fullDescription.Length > 0)
    {
        teaser.Description = TextHelper.CleanText(fullDescription.RenderSelection());
    }

    // Append the contact list markup with any embedded script blocks stripped.
    // Replace is a no-op when the pattern does not match, so no prior Match is needed.
    var contactHtml = dom["ul.pokaz_ogloszenie"][0].OuterHTML;
    contactHtml = Regex.Replace(contactHtml, "<script.+script>", "", RegexOptions.Singleline);
    teaser.Description += contactHtml;

    // Each picture URL is built from the href of the image's parent node.
    var pictureEls = dom["img.pokaz_ogloszenie_obrazek"];
    var pictures = new List<string>();
    foreach (var picture in pictureEls)
    {
        var pictureUrl = "http://ogloszenia.przemysl.pl/" + picture.ParentNode.Attributes["href"];
        pictures.Add(pictureUrl);
    }

    // Pictures is only overwritten when at least one image was found.
    if (pictures.Count > 0)
    {
        teaser.Pictures = pictures;
    }

    teaser.Teaser = false;
    return teaser;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and extracts the teaser URLs and
/// teasers from it.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>A result holding the URLs from <c>GetUrls</c> and the teasers parsed from the page.</returns>
private TeaserCrawrlResult GetTeasers(string url)
{
    var crawlResult = new TeaserCrawrlResult();

    // Fetch the page and wrap its markup in a queryable document.
    CQ document = new CQ(new HtmlClient().GetHtml(url));

    crawlResult.TeaserUrls = GetUrls(document).ToList();
    crawlResult.Teasers = GetTeasers(document);

    return crawlResult;
}