/// <summary>
/// Records the data carried by a single <c>&lt;meta&gt;</c> tag on <paramref name="result"/>.
/// </summary>
/// <remarks>
/// A tag with a non-blank <c>property</c> attribute is cached under its sanitised property name;
/// <c>og_title</c> and <c>og_description</c> additionally overwrite the title / description.
/// Any other tag falls back to its <c>name</c> attribute, where only <c>description</c> is handled
/// and never replaces an existing cache entry.
/// </remarks>
/// <param name="reader">Reader positioned on the meta tag whose attributes are read.</param>
/// <param name="result">Result whose property cache, title and description are updated.</param>
void ProcessMetaTag(HtmlReader reader, WorkspaceResult result)
{
    // The attribute may be absent (null); lower-case only when there is a value so that
    // name-only meta tags reach the fallback branch instead of throwing.
    var rawProperty = reader.GetAttribute("property");
    var propertyName = rawProperty == null ? null : rawProperty.ToLowerInvariant();

    if (!string.IsNullOrWhiteSpace(propertyName))
    {
        propertyName = SanitiseProperty(propertyName);
        var rawContent = reader.GetAttribute("content");
        var content = SanitiseAttribute(rawContent);

        // First occurrence of a property wins. The unsanitised content is what gets cached
        // under the property's own name (unchanged from the previous behaviour).
        if (!result.PropertyCache.ContainsKey(propertyName))
        {
            result.PropertyCache.Add(propertyName, rawContent);
        }

        switch (propertyName)
        {
            case "og_title": // og:title
                // Indexer assignment adds the key when missing and overwrites it otherwise.
                result.PropertyCache["title"] = content;
                result.Title = content;
                break;

            case "og_description": // og:description
                result.Description = content;
                result.PropertyCache["description"] = content;
                break;

            default:
                break;
        }
    }
    else
    {
        // other random historical meta tags here
        var rawName = reader.GetAttribute("name");
        var metaName = rawName == null ? null : rawName.ToLowerInvariant();

        switch (metaName)
        {
            case "description":
                // Only a fallback: an existing description entry is left untouched.
                var value = SanitiseAttribute(reader.GetAttribute("content"));
                if (!result.PropertyCache.ContainsKey("description"))
                {
                    result.PropertyCache.Add("description", value);
                }
                break;
        }
    }
}
/// <summary>
/// Verifies that a named character reference inside a quoted attribute value is decoded:
/// <c>&amp;lt;</c> must come back from <c>GetAttribute</c> as a literal <c>&lt;</c>,
/// with no parse errors recorded.
/// </summary>
public void AttributeValueNamedCharacterReference()
{
    // The markup has to contain the reference itself ("&lt;"); a bare '<' in the attribute
    // value would produce the same expected string without exercising any decoding.
    reader = HtmlReaderFactory.FromString("<a title=\"&lt;\">", parseErrors);

    Assert.IsTrue(reader.Read());
    Assert.AreEqual("<", reader.GetAttribute("title"));

    // The single tag is the only token in the input.
    Assert.IsFalse(reader.Read());
    Assert.AreEqual(0, parseErrors.Count);
}
/// <summary>
/// Verifies that asking a tag for an attribute it does not carry yields <c>null</c>
/// and records no parse errors.
/// </summary>
public void TagMissingAttribute()
{
    reader = HtmlReaderFactory.FromString("<a>", parseErrors);

    // The only token is the opening <a> tag.
    var readTag = reader.Read();
    Assert.IsTrue(readTag);

    var kind = reader.TokenKind;
    Assert.AreEqual(HtmlTokenKind.Tag, kind);

    var tagName = reader.Name;
    Assert.AreEqual("a", tagName);

    // "href" was never specified on the tag.
    var missingHref = reader.GetAttribute("href");
    Assert.IsNull(missingHref);

    var selfClosing = reader.SelfClosingElement;
    Assert.IsFalse(selfClosing);

    // Nothing follows the tag.
    var readMore = reader.Read();
    Assert.IsFalse(readMore);

    Assert.AreEqual(0, parseErrors.Count);
}
/// <summary>
/// Verifies that an unquoted attribute value is read in full (here <c>javascript:;</c>)
/// and that doing so records no parse errors.
/// </summary>
public void TagUnquotedAttributeValue()
{
    reader = HtmlReaderFactory.FromString("<a href=javascript:;>", parseErrors);

    // The only token is the opening <a> tag.
    var readTag = reader.Read();
    Assert.IsTrue(readTag);

    var kind = reader.TokenKind;
    Assert.AreEqual(HtmlTokenKind.Tag, kind);

    var tagName = reader.Name;
    Assert.AreEqual("a", tagName);

    // The value runs up to the closing '>' even though it is not quoted.
    var href = reader.GetAttribute("href");
    Assert.AreEqual("javascript:;", href);

    var selfClosing = reader.SelfClosingElement;
    Assert.IsFalse(selfClosing);

    // Nothing follows the tag.
    var readMore = reader.Read();
    Assert.IsFalse(readMore);

    Assert.AreEqual(0, parseErrors.Count);
}
/// <summary>
/// Verifies that when a tag repeats an attribute, both occurrences are counted
/// but <c>GetAttribute</c> returns the value of the first one.
/// </summary>
public void GetAttributeReturnsFirstAttributeValue()
{
    reader = HtmlReaderFactory.FromString("<img src=\"a\" src=\"b\" />", parseErrors);

    // The only token is the <img> tag.
    var readTag = reader.Read();
    Assert.IsTrue(readTag);

    var kind = reader.TokenKind;
    Assert.AreEqual(HtmlTokenKind.Tag, kind);

    var tagName = reader.Name;
    Assert.AreEqual("img", tagName);

    // Both "src" occurrences are kept on the tag...
    var attributeCount = reader.AttributeCount;
    Assert.AreEqual(2, attributeCount);

    // ...but lookup by name resolves to the first.
    var src = reader.GetAttribute("src");
    Assert.AreEqual("a", src);

    // Nothing follows the tag.
    var readMore = reader.Read();
    Assert.IsFalse(readMore);

    Assert.AreEqual(0, parseErrors.Count);
}
/// <summary>
/// Collects the <c>href</c> value of every <c>a</c> tag found in the underlying stream.
/// </summary>
/// <returns>
/// The <c>href</c> values in document order; tags without an <c>href</c> attribute are skipped.
/// </returns>
public List<string> ExtractLinks()
{
    // Always parse from the start of the stream, whatever its current position.
    stream.Seek(0, SeekOrigin.Begin);

    var tokens = new HtmlReader(new StreamReader(stream));
    var hrefs = new List<string>();

    while (tokens.Read())
    {
        var isAnchorTag = tokens.TokenKind == HtmlTokenKind.Tag && tokens.Name == "a";
        if (!isAnchorTag)
        {
            continue;
        }

        var href = tokens.GetAttribute("href");
        if (href == null)
        {
            continue;
        }

        hrefs.Add(href);
    }

    return hrefs;
}
/// <summary>
/// Buffers the response body and, when the content type is unknown or its media type is
/// <c>text</c>, scans it for <c>script</c> start tags, writing each non-blank <c>src</c>
/// attribute to the console.
/// </summary>
/// <param name="dataStream">Response body; the method does nothing when it is <c>null</c>.</param>
/// <returns>A task that completes once the stream has been buffered and scanned.</returns>
public override async Task ProcessResponseStream(Stream dataStream)
{
    if (dataStream == null)
    {
        return;
    }

    // The buffer is owned by this method, so release it on every path.
    using (var ms = new MemoryStream())
    {
        // Copy asynchronously: the method is declared async, so it should not block on I/O.
        await dataStream.CopyToAsync(ms);
        ms.Seek(0, SeekOrigin.Begin);

        if (ContentType.IsUnknownOrNull(ContentType))
        {
            ContentType = ContentType.Guess(ms);
        }

        // Rewind again in case guessing the content type consumed part of the buffer.
        ms.Seek(0, SeekOrigin.Begin);

        if (ContentType.IsUnknownOrNull(ContentType) || ContentType.MediaType == "text")
        {
            using (var p = new HtmlReader(ms))
            {
                while (!p.EOF)
                {
                    var node = p.NextNode();
                    if (node.Type == HtmlTokenType.StartTag && node.Value == "script")
                    {
                        string src = p.GetAttribute("src");
                        if (!string.IsNullOrWhiteSpace(src))
                        {
                            Console.WriteLine(src);
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Advances the reader until a tag yields a link and returns that link.
/// </summary>
/// <remarks>
/// <c>a</c> and <c>link</c> tags contribute their <c>href</c> attribute and <c>script</c> tags
/// their <c>src</c> attribute; the value is passed to <c>Utility.GetLinks</c> together with
/// <c>CurrentSourceUri</c> and the first link produced is returned. A <c>base</c> tag never
/// produces a result itself; it replaces <c>CurrentSourceUri</c> for the tags that follow.
/// </remarks>
/// <returns>The first link found, or <c>null</c> when the reader reaches its end first.</returns>
public Uri NextUri()
{
    Uri uri = null;

    while (!reader.EOF && uri == null)
    {
        var node = reader.NextNode();
        var tagName = node.Value;

        if (tagName == "base")
        {
            var baseHref = reader.GetAttribute("href");
            if (!string.IsNullOrWhiteSpace(baseHref))
            {
                // sometimes the links are bogus?! -- keep the current source URI
                // unless the href actually produced a link.
                var newSource = Utility.GetLinks(null, baseHref).FirstOrDefault();
                if (newSource != null)
                {
                    CurrentSourceUri = newSource;
                }
            }

            continue;
        }

        // Pick the attribute that carries the link for this kind of tag.
        string linkAttribute;
        if (tagName == "a" || tagName == "link")
        {
            linkAttribute = "href";
        }
        else if (tagName == "script")
        {
            linkAttribute = "src";
        }
        else
        {
            continue;
        }

        var target = reader.GetAttribute(linkAttribute);
        if (string.IsNullOrWhiteSpace(target))
        {
            continue;
        }

        // FirstOrDefault enumerates the result once; a null result keeps the loop going.
        uri = Utility.GetLinks(CurrentSourceUri, target).FirstOrDefault();
    }

    return uri;
}