/// <summary>
/// Fills <c>References</c> with the distinct <c>href</c> values of the anchor
/// elements selected from <c>_doc</c> by the XPath <c>//a[@href]</c>.
/// </summary>
private void GetReferences()
{
    HtmlNodeCollection anchors = _doc.DocumentNode.SelectNodes("//a[@href]");
    if (!anchors.IsNull())
    {
        // The XPath only selects anchors that carry an href attribute,
        // so the attribute lookup below is made on nodes that have one.
        References = anchors
            .Select(anchor => anchor.Attributes["href"].Value)
            .Distinct()
            .ToArray();
        return;
    }

    // No node collection was returned: expose an empty array.
    References = new string[0];
}
/// <summary>
/// Gets the colour and size data of a product by reading it online. This method is
/// mainly used for listing products automatically, without manual intervention.
/// </summary>
/// <param name="requestModelbase">Request model whose values are copied into a new <c>BanggoRequestModel</c>.</param>
/// <param name="doc">Parsed product page that contains the colour list element.</param>
/// <returns>
/// One <c>ProductColor</c> per <c>li/a</c> node of the colour list, or <c>null</c> when
/// the colour list element or its anchors cannot be found.
/// </returns>
public List<ProductColor> GetProductColorByOnline(RequestModel requestModelbase, HtmlDocument doc)
{
    BanggoRequestModel banggoRequest = new BanggoRequestModel();
    Util.CopyModel(requestModelbase, banggoRequest);

    // Locate the colour list, then the anchors below it; either may be missing.
    HtmlNode colorListNode = doc.GetElementbyId(Resource.SysConfig_ColorListId);
    HtmlNodeCollection colorAnchors = colorListNode.IsNull()
        ? null
        : colorListNode.SelectNodes("li/a");
    if (colorAnchors.IsNull())
    {
        return null;
    }

    List<ProductColor> result = new List<ProductColor>();
    foreach (HtmlNode anchor in colorAnchors)
    {
        // The colour description is read from the anchor's onclick attribute.
        ProductColor color = CreateProductColor(anchor.Attributes["onclick"].Value);

        // The shared request model is re-targeted at the current colour before
        // the available sizes are requested for it.
        banggoRequest.ColorCode = color.ColorCode;
        color.SizeList = GetAvailableSize(banggoRequest);

        // Accumulate the per-size availability into the colour's total.
        foreach (var size in color.SizeList)
        {
            color.AvlNumForColor += size.AvlNum;
        }

        result.Add(color);
    }

    return result;
}
/// <summary>
/// Fills <c>Links</c> with the distinct results of <c>ParseLink</c> for the
/// <c>background</c>, <c>href</c>, <c>src</c>, <c>lowsrc</c> and <c>action</c>
/// attributes of every element in <c>_doc</c> that has at least one of them.
/// </summary>
private void GetLinks()
{
    HtmlNodeCollection linkNodes = _doc.DocumentNode.SelectNodes("//*[@background or @lowsrc or @src or @href or @action]");
    if (!linkNodes.IsNull())
    {
        // Attribute names are probed in this fixed order for every node.
        string[] linkAttributes = { "background", "href", "src", "lowsrc", "action" };

        Links = linkNodes
            .SelectMany(node => linkAttributes.Select(attribute => ParseLink(node, attribute)))
            .Distinct()
            .ToArray();
        return;
    }

    // No node collection was returned: expose an empty array.
    Links = new string[0];
}
/// <summary>
/// Parses the HTML response held in <paramref name="propertyBag"/>, stores the parsed
/// document, title, meta data and text on the bag, and passes every normalized link
/// found in the document to <paramref name="crawler"/>.
/// </summary>
/// <param name="crawler">Crawler that receives each normalized link through <c>Crawl</c>.</param>
/// <param name="propertyBag">Response data that is read from and populated.</param>
/// <returns>A completed task whose result is <c>true</c> on every path.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, nameof(crawler))
        .NotNull(propertyBag, nameof(propertyBag));

    // Only responses with status OK and HTML content are processed.
    bool isProcessable = propertyBag.StatusCode == HttpStatusCode.OK
        && IsHtmlContent(propertyBag.ContentType);
    if (!isProcessable)
    {
        return Task.FromResult(true);
    }

    HtmlDocument document = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream responseStream = new MemoryStream(propertyBag.Response))
    {
        Encoding detectedEncoding = document.DetectEncoding(responseStream);

        // Rewind: encoding detection has already read from the stream.
        responseStream.Seek(0, SeekOrigin.Begin);
        if (detectedEncoding.IsNull())
        {
            document.Load(responseStream, true);
        }
        else
        {
            document.Load(responseStream, detectedEncoding, true);
        }
    }

    // Keep the markup as first parsed; it is reused for link extraction below.
    string originalContent = document.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string rewritten = Substitute(StripText(originalContent), propertyBag.Step);
        using (TextReader rewrittenReader = new StringReader(rewritten))
        {
            document.Load(rewrittenReader);
        }
    }

    propertyBag["HtmlDoc"].Value = document;

    // Extract Title
    HtmlNodeCollection titleNodes = document.DocumentNode.SelectNodes("//title");
    if (!titleNodes.IsNull())
    {
        string[] titles = titleNodes.Select(node => node.InnerText).ToArray();
        propertyBag.Title = string.Join(";", titles).Trim();
    }

    // Extract Meta Data
    HtmlNodeCollection metaNodes = document.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!metaNodes.IsNull())
    {
        propertyBag["Meta"].Value = metaNodes
            .Select(entry => new { Name = entry.Attributes["name"], Content = entry.Attributes["content"] })
            .Where(meta => !meta.Name.IsNull()
                && !meta.Name.Value.IsNullOrEmpty()
                && !meta.Content.IsNull()
                && !meta.Content.Value.IsNullOrEmpty())
            .Select(meta => $"{meta.Name.Value}: {meta.Content.Value}")
            .ToArray();
    }

    // Extract text
    propertyBag.Text = document.ExtractText().Trim();

    // Before links are extracted, reload the document from the original markup
    // after it has been passed through StripLinks.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string linkContent = StripLinks(originalContent);
        using (TextReader linkReader = new StringReader(linkContent))
        {
            document.Load(linkReader);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract Head Base: the first usable <base href> wins; the response URL
    // computed above is appended as the last candidate.
    HtmlNodeCollection baseNodes = document.DocumentNode.SelectNodes("//head/base[@href]");
    if (!baseNodes.IsNull())
    {
        baseUrl = baseNodes
            .Select(entry => entry.Attributes["href"])
            .Where(href => !href.IsNull()
                && !href.Value.IsNullOrEmpty()
                && Uri.IsWellFormedUriString(href.Value, UriKind.RelativeOrAbsolute))
            .Select(href => Uri.IsWellFormedUriString(href.Value, UriKind.Relative)
                // A relative base is prefixed with the scheme and server of the response.
                ? propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + href.Value
                : href.Value)
            .AddToEnd(baseUrl)
            .FirstOrDefault();
    }

    // Extract Links
    DocumentWithLinks documentLinks = document.GetLinks();
    foreach (string rawLink in documentLinks.Links.Union(documentLinks.References))
    {
        if (rawLink.IsNullOrEmpty())
        {
            continue;
        }

        string normalized = NormalizeLink(baseUrl, ExtendedHtmlUtility.HtmlEntityDecode(rawLink));
        if (!normalized.IsNullOrEmpty())
        {
            crawler.Crawl(new Uri(normalized), propertyBag);
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Parses the HTML response held in <paramref name="propertyBag"/>, stores title,
/// meta data and text on the bag, and adds one crawl step to
/// <paramref name="crawler"/> for every normalized link found in the document.
/// </summary>
/// <param name="crawler">Crawler that receives the new steps through <c>AddStep</c>.</param>
/// <param name="propertyBag">Response data that is read from and populated.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, "crawler")
        .NotNull(propertyBag, "propertyBag");

    // Only responses with status OK and HTML content are processed.
    bool isProcessable = propertyBag.StatusCode == HttpStatusCode.OK
        && IsHtmlContent(propertyBag.ContentType);
    if (!isProcessable)
    {
        return;
    }

    HtmlDocument document = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream responseStream = propertyBag.GetResponseStream())
    {
        Encoding detectedEncoding = document.DetectEncoding(responseStream);

        // Rewind: encoding detection has already read from the stream.
        responseStream.Seek(0, SeekOrigin.Begin);
        if (detectedEncoding.IsNull())
        {
            document.Load(responseStream, true);
        }
        else
        {
            document.Load(responseStream, detectedEncoding, true);
        }
    }

    // Keep the markup as first parsed; it is reused for link extraction below.
    string originalContent = document.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string rewritten = Substitute(StripText(originalContent), propertyBag.Step);
        using (TextReader rewrittenReader = new StringReader(rewritten))
        {
            document.Load(rewrittenReader);
        }
    }

    // Extract Title
    HtmlNodeCollection titleNodes = document.DocumentNode.SelectNodes("//title");
    if (!titleNodes.IsNull())
    {
        string[] titles = titleNodes.Select(node => node.InnerText).ToArray();
        propertyBag.Title = string.Join(";", titles).Trim();
    }

    // Extract Meta Data
    HtmlNodeCollection metaNodes = document.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!metaNodes.IsNull())
    {
        propertyBag["Meta"].Value = metaNodes
            .Select(entry => new { Name = entry.Attributes["name"], Content = entry.Attributes["content"] })
            .Where(meta => !meta.Name.IsNull()
                && !meta.Name.Value.IsNullOrEmpty()
                && !meta.Content.IsNull()
                && !meta.Content.Value.IsNullOrEmpty())
            .Select(meta => meta.Name.Value + ": " + meta.Content.Value)
            .ToArray();
    }

    // Extract text
    propertyBag.Text = document.ExtractText().Trim();

    // Before links are extracted, reload the document from the original markup
    // after it has been passed through StripLinks.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string linkContent = StripLinks(originalContent);
        using (TextReader linkReader = new StringReader(linkContent))
        {
            document.Load(linkReader);
        }
    }

    // Extract Links
    DocumentWithLinks documentLinks = document.GetLinks();
    foreach (string rawLink in documentLinks.Links.Union(documentLinks.References))
    {
        if (rawLink.IsNullOrEmpty())
        {
            continue;
        }

        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        string normalized = NormalizeLink(baseUrl, ExtendedHtmlUtility.HtmlEntityDecode(rawLink));
        if (normalized.IsNullOrEmpty())
        {
            continue;
        }

        // The new step is one level deeper than the current one and records
        // the link as found plus the URL of the page it was found on.
        Uri target = new Uri(normalized);
        crawler.AddStep(
            target,
            propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, rawLink },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            });
    }
}
/// <summary>
/// Parses the HTML response held in <paramref name="propertyBag"/>, stores the parsed
/// document, title, meta data and text on the bag, and registers every normalized
/// link found in the document through <c>AddStepToCrawler</c>.
/// </summary>
/// <param name="crawler">Crawler that the discovered links are added to.</param>
/// <param name="propertyBag">Response data that is read from and populated.</param>
public override void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    // Only responses with status OK and HTML content are processed.
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return;
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (Stream reader = propertyBag.GetResponse())
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(reader);

        // Rewind: encoding detection has already read from the stream.
        reader.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(reader, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(reader, true);
        }
    }

    // Keep the markup as first parsed; it is reused for link extraction below.
    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.
            Select(n => n.InnerText).
            ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select name.Value + ": " + content.Value).ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();

    // Before links are extracted, reload the document from the original markup
    // after it has been passed through StripLinks.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract Head Base: the first usable <base href> wins; the response URL
    // computed above is appended as the last candidate.
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        baseUrl = nodes.
            Select(entry => new { entry, href = entry.Attributes["href"] }).
            // Restored predicate: the attribute must exist, be non-empty and be a well-formed URI.
            Where(t => !t.href.IsNull() && !t.href.Value.IsNullOrEmpty() && Uri.IsWellFormedUriString(t.href.Value, UriKind.RelativeOrAbsolute)).
            Select(t => t.href.Value).
            AddToEnd(baseUrl).
            FirstOrDefault();
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        try
        {
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            AddStepToCrawler(crawler, propertyBag, normalizedLink, link);
        }
        catch (UriFormatException)
        {
            // When the link is not properly formatted, it must be ignored.
        }
    }
}