Ejemplo n.º 1
0
        /// <summary>
        /// Copies the metadata carried by the current meta tag into <paramref name="result"/>.
        /// Tags with a non-blank <c>property</c> attribute are cached under their sanitised
        /// property name, and the <c>og_title</c> / <c>og_description</c> properties additionally
        /// update the result's title and description. Tags without a usable <c>property</c>
        /// attribute fall back to the <c>name</c> attribute, where only <c>description</c>
        /// is recognised.
        /// </summary>
        /// <param name="reader">Reader positioned on the meta tag to inspect.</param>
        /// <param name="result">Result object whose property cache, title and description are updated.</param>
        void ProcessMetaTag(HtmlReader reader, WorkspaceResult result)
        {
            // The attribute may be missing altogether (plain <meta name=...> tags), so it is
            // null-checked before being lower-cased; otherwise the fallback branch below
            // could never be reached for such tags.
            var rawProperty = reader.GetAttribute("property");

            if (!string.IsNullOrWhiteSpace(rawProperty))
            {
                // Invariant casing keeps the switch keys stable regardless of the current culture.
                var propertyName = SanitiseProperty(rawProperty.ToLowerInvariant());
                var content = SanitiseAttribute(reader.GetAttribute("content"));

                // First occurrence wins for the generic cache entry; it stores the raw,
                // unsanitised content value.
                if (!result.PropertyCache.ContainsKey(propertyName))
                {
                    result.PropertyCache.Add(propertyName, reader.GetAttribute("content"));
                }

                switch (propertyName)
                {
                case "og_title":     // og:title
                    // Open Graph values always overwrite whatever was cached before.
                    result.PropertyCache["title"] = content;
                    result.Title = content;
                    break;

                case "og_description":     // og:description
                    result.Description = content;
                    result.PropertyCache["description"] = content;
                    break;

                default:
                    break;
                }
            }
            else
            {
                // other random historical meta tags here
                var metaName = reader.GetAttribute("name");
                if (metaName == null)
                {
                    // Neither "property" nor "name" is present: nothing to record.
                    return;
                }

                switch (metaName.ToLowerInvariant())
                {
                case "description":
                    var value = SanitiseAttribute(reader.GetAttribute("content"));
                    // Only used as a fallback: never overwrites an existing description.
                    if (!result.PropertyCache.ContainsKey("description"))
                    {
                        result.PropertyCache.Add("description", value);
                    }
                    break;
                }
            }
        }
Ejemplo n.º 2
0
        public void AttributeValueNamedCharacterReference()
        {
            // The title attribute value is written as the named character reference &lt;.
            const string markup = "<a title=\"&lt;\">";
            reader = HtmlReaderFactory.FromString(markup, parseErrors);

            // The first read yields the tag, whose attribute value is expected decoded.
            var tagWasRead = reader.Read();
            Assert.IsTrue(tagWasRead);
            Assert.AreEqual("<", reader.GetAttribute("title"));

            // Nothing follows the tag and no parse errors are expected.
            var moreInput = reader.Read();
            Assert.IsFalse(moreInput);
            Assert.AreEqual(0, parseErrors.Count);
        }
Ejemplo n.º 3
0
        public void TagMissingAttribute()
        {
            // A bare anchor tag with no attributes at all.
            const string markup = "<a>";
            reader = HtmlReaderFactory.FromString(markup, parseErrors);

            var tagWasRead = reader.Read();
            Assert.IsTrue(tagWasRead);
            Assert.AreEqual(HtmlTokenKind.Tag, reader.TokenKind);
            Assert.AreEqual("a", reader.Name);

            // Asking for an attribute the tag does not have is expected to give null.
            Assert.IsNull(reader.GetAttribute("href"));
            Assert.IsFalse(reader.SelfClosingElement);

            // Nothing follows the tag and no parse errors are expected.
            var moreInput = reader.Read();
            Assert.IsFalse(moreInput);
            Assert.AreEqual(0, parseErrors.Count);
        }
Ejemplo n.º 4
0
        public void TagUnquotedAttributeValue()
        {
            // The href value is written without surrounding quotes.
            const string markup = "<a href=javascript:;>";
            reader = HtmlReaderFactory.FromString(markup, parseErrors);

            var tagWasRead = reader.Read();
            Assert.IsTrue(tagWasRead);
            Assert.AreEqual(HtmlTokenKind.Tag, reader.TokenKind);
            Assert.AreEqual("a", reader.Name);

            // The whole unquoted text up to the closing bracket is the expected value.
            Assert.AreEqual("javascript:;", reader.GetAttribute("href"));
            Assert.IsFalse(reader.SelfClosingElement);

            // Nothing follows the tag and no parse errors are expected.
            var moreInput = reader.Read();
            Assert.IsFalse(moreInput);
            Assert.AreEqual(0, parseErrors.Count);
        }
Ejemplo n.º 5
0
        public void GetAttributeReturnsFirstAttributeValue()
        {
            // The same attribute name appears twice with different values.
            const string markup = "<img src=\"a\" src=\"b\" />";
            reader = HtmlReaderFactory.FromString(markup, parseErrors);

            var tagWasRead = reader.Read();
            Assert.IsTrue(tagWasRead);
            Assert.AreEqual(HtmlTokenKind.Tag, reader.TokenKind);
            Assert.AreEqual("img", reader.Name);

            // Both occurrences are counted, but a lookup by name is expected to give the first value.
            Assert.AreEqual(2, reader.AttributeCount);
            Assert.AreEqual("a", reader.GetAttribute("src"));

            // Nothing follows the tag and no parse errors are expected.
            var moreInput = reader.Read();
            Assert.IsFalse(moreInput);
            Assert.AreEqual(0, parseErrors.Count);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Rewinds the underlying stream and collects the <c>href</c> attribute value of
        /// every <c>a</c> tag found in it, in document order.
        /// </summary>
        /// <returns>The non-null <c>href</c> values; empty when there are none.</returns>
        public List <string> ExtractLinks()
        {
            // Always start from the beginning of the stream.
            stream.Seek(0, SeekOrigin.Begin);

            var tokens = new HtmlReader(new StreamReader(stream));
            var hrefs  = new List <string>();

            while (tokens.Read())
            {
                // Skip everything that is not an anchor tag.
                if (tokens.TokenKind != HtmlTokenKind.Tag || tokens.Name != "a")
                {
                    continue;
                }

                var target = tokens.GetAttribute("href");
                if (target == null)
                {
                    // Anchor without an href: nothing to collect.
                    continue;
                }

                hrefs.Add(target);
            }

            return hrefs;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Buffers the response stream in memory, guesses the content type when it is not
        /// yet known, and, for unknown or text content, writes the <c>src</c> attribute of
        /// every <c>script</c> start tag to the console.
        /// </summary>
        /// <param name="dataStream">Response body to process; a null stream is ignored.</param>
        /// <returns>A task that completes once the stream has been processed.</returns>
        public override async Task ProcessResponseStream(Stream dataStream)
        {
            if (dataStream == null)
            {
                return;
            }

            using (var ms = new MemoryStream())
            {
                // Copy asynchronously so the caller's thread is not blocked on response I/O.
                await dataStream.CopyToAsync(ms);
                ms.Seek(0, SeekOrigin.Begin);
                if (ContentType.IsUnknownOrNull(ContentType))
                {
                    ContentType = ContentType.Guess(ms);
                }

                // Rewind again: the buffer was positioned for the guess above and must be
                // read from the start by the HTML reader.
                ms.Seek(0, SeekOrigin.Begin);

                if (ContentType.IsUnknownOrNull(ContentType) || ContentType.MediaType == "text")
                {
                    using (var p = new HtmlReader(ms))
                    {
                        while (!p.EOF)
                        {
                            var node = p.NextNode();

                            // Only script start tags are of interest.
                            if (node.Type == HtmlTokenType.StartTag && node.Value == "script")
                            {
                                string src = p.GetAttribute("src");
                                if (!string.IsNullOrWhiteSpace(src))
                                {
                                    Console.WriteLine(src);
                                }
                            }
                        }
                    }
                }
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Advances the reader until the next link-bearing node yields a URI, or the input
        /// is exhausted. <c>a</c> and <c>link</c> nodes are resolved through their
        /// <c>href</c> attribute and <c>script</c> nodes through <c>src</c>, all against
        /// <c>CurrentSourceUri</c>. A <c>base</c> node replaces <c>CurrentSourceUri</c>
        /// instead of producing a result.
        /// </summary>
        /// <returns>The first URI found, or null when the reader reaches the end.</returns>
        public Uri NextUri()
        {
            Uri found = null;

            while (!reader.EOF && found == null)
            {
                var token = reader.NextNode();

                // NOTE(review): only the node value is inspected here, not the node type.
                string attributeName;
                switch (token.Value)
                {
                case "a":
                case "link":
                    attributeName = "href";
                    break;

                case "script":
                    attributeName = "src";
                    break;

                case "base":
                    var baseHref = reader.GetAttribute("href");
                    if (!string.IsNullOrWhiteSpace(baseHref))
                    {
                        // sometimes the links are bogus?!
                        var baseLinks = Utility.GetLinks(null, baseHref);
                        if (baseLinks.Any())
                        {
                            CurrentSourceUri = baseLinks.First();
                        }
                    }
                    continue;

                default:
                    continue;
                }

                var target = reader.GetAttribute(attributeName);
                if (string.IsNullOrWhiteSpace(target))
                {
                    continue;
                }

                // Take the first link the target resolves to, if it resolves to any.
                var candidates = Utility.GetLinks(CurrentSourceUri, target);
                if (candidates.Any())
                {
                    found = candidates.First();
                }
            }

            return found;
        }