예제 #1
0
        // Parses the supplied HTML and verifies that it yields exactly one top-level node,
        // that this node is an XElement, and that its default string form equals expectedXml.
        public void Parse_HtmlWithComment(string html, string expectedXml)
        {
            // Single() throws unless the parser produced exactly one node.
            var onlyNode = new HtmlParser().Parse(html).Single();

            Assert.That(onlyNode is XElement);

            // ToString() without options uses the default (formatted) XML serialisation.
            Assert.That(onlyNode.ToString(), Is.EqualTo(expectedXml));
        }
예제 #2
0
        // Parses the supplied HTML and verifies that it yields exactly one top-level node,
        // that this node is an XElement, and that its unformatted XML equals expectedXml.
        public void Parse_HtmlWithEntity(string html, string expectedXml)
        {
            // Single() throws unless the parser produced exactly one node.
            var onlyNode = new HtmlParser().Parse(html).Single();

            Assert.That(onlyNode is XElement);

            // Formatting is disabled so the comparison is made against the unindented serialisation.
            Assert.That(onlyNode.ToString(SaveOptions.DisableFormatting), Is.EqualTo(expectedXml));
        }
예제 #3
0
        // Parses the supplied HTML and verifies the single resulting node: it must be an XElement
        // whose name, concatenated text value and direct child-node count match the expectations.
        public void Parse_SimpleHtml(string html, string expectedRoot, string expectedText, int expectedChildCount)
        {
            // Single() throws unless the parser produced exactly one node.
            var topLevelNode = new HtmlParser().Parse(html).Single();

            Assert.That(topLevelNode is XElement);

            // The type was asserted above, so one cast serves all remaining checks.
            var rootElement = (XElement)topLevelNode;

            // The string is implicitly converted to an XName for the comparison.
            Assert.That(rootElement.Name == expectedRoot);
            Assert.That(rootElement.Value == expectedText);
            Assert.That(rootElement.Nodes().Count(), Is.EqualTo(expectedChildCount));
        }
예제 #4
0
        /// <summary>
        /// Downloads <paramref name="uri"/>, extracts tokens from the response body according to its
        /// MIME type (XML, HTML or plain text) and passes them to <paramref name="tokenProcessor"/>.
        /// When the processor returns <c>true</c>, the links produced by <c>_linkExtractor</c> are
        /// handed to <c>FollowLinksAndProcess</c>.
        /// </summary>
        /// <param name="uri">Address of the document to fetch.</param>
        /// <param name="tokenProcessor">
        /// Callback receiving the URI together with its tokens; returning <c>false</c> suppresses
        /// link following for this document.
        /// </param>
        /// <param name="targetElement">
        /// Optional selector applied to each root element before token extraction. May be <c>null</c>,
        /// in which case the parsed roots are used as they are. Elements for which it returns
        /// <c>null</c> are skipped.
        /// </param>
        /// <param name="depth">
        /// Depth value compared against <c>_maxDepth</c>; nothing is fetched once it is exceeded.
        /// It is passed on unchanged to <c>FollowLinksAndProcess</c>.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// The resolved MIME type is neither XML, HTML nor plain text.
        /// </exception>
        protected async Task Read(Uri uri, Func<Tuple<Uri, IEnumerable<IToken>>, bool> tokenProcessor, Func<XElement, XElement> targetElement, int depth)
        {
            if (depth > _maxDepth || _visited.Count > MaxVisited) return;

            var mimeType = _mimeType;

            // Disposing the response also releases its content and the underlying stream,
            // so all processing happens inside this scope.
            using (var response = await _client.GetAsync(uri))
            {
                if (mimeType == TextMimeType.Default)
                {
                    IEnumerable<string> mimes;

                    // Content-Type is a content header: it lives on response.Content.Headers.
                    // Looking it up on response.Headers never succeeds, which made the
                    // detection below dead code.
                    if (response.Content.Headers.TryGetValues("Content-Type", out mimes))
                    {
                        mimeType = Parse(mimes.First());
                    }
                }

                // Neither configured nor announced by the server: treat the document as HTML.
                if (mimeType == TextMimeType.Default) mimeType = TextMimeType.Html;

                using (var stream = await response.Content.ReadAsStreamAsync())
                {
                    if (mimeType == TextMimeType.Xml)
                    {
                        var doc = XDocument.Load(stream);

                        XElement root = targetElement == null ? doc.Root : targetElement(doc.Root);

                        if (root != null && tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, ExtractTokens(root))))
                        {
                            // Links are taken from the whole document, not only from the selected element.
                            await FollowLinksAndProcess(uri, _linkExtractor(doc.Root), tokenProcessor, targetElement, depth);
                        }
                    }
                    else
                    {
                        using (var reader = new StreamReader(stream, _encoding))
                        {
                            if (mimeType == TextMimeType.Html)
                            {
                                var parser = new HtmlParser();

                                var elements = parser.Parse(reader);

                                // Materialise once. The sequence is consumed twice below (tokens, then
                                // links); a deferred query would invoke targetElement twice per element
                                // and might re-enumerate the parser result over an already-consumed reader.
                                // Null selections are dropped up front, mirroring the null check of the
                                // XML branch, so ExtractTokens never receives null.
                                var selected = (targetElement != null
                                        ? elements.Where(node => node.NodeType == System.Xml.XmlNodeType.Element).Select(node => targetElement((XElement)node))
                                        : elements)
                                    .Where(node => node != null)
                                    .ToList();

                                if (tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, selected.SelectMany(node => ExtractTokens(node)))))
                                {
                                    // Only element nodes can be handed to the link extractor.
                                    foreach (var element in selected.Where(node => node.NodeType == System.Xml.XmlNodeType.Element).Cast<XElement>())
                                    {
                                        await FollowLinksAndProcess(uri, _linkExtractor(element), tokenProcessor, targetElement, depth);
                                    }
                                }
                            }
                            else if (mimeType == TextMimeType.Plain)
                            {
                                var docContent = reader.ReadToEnd();

                                if (tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, _tokeniser.Tokenise(docContent))))
                                {
                                    // Plain text is wrapped in a synthetic element so the link extractor can be reused.
                                    await FollowLinksAndProcess(uri, _linkExtractor(new XElement("x", docContent)), tokenProcessor, targetElement, depth);
                                }
                            }
                            else
                            {
                                throw new NotSupportedException(mimeType.ToString());
                            }
                        }
                    }
                }
            }
        }
예제 #5
0
        // Fetches the document at the given URI, parses the response body with HtmlParser,
        // writes every resulting top-level node to the console and asserts that there is exactly one.
        public async Task Parse_Uri(string uri)
        {
            // The client, the response, the stream and the reader are all disposable; the stacked
            // using statements release them in reverse order even when an assertion fails.
            using (var http = new HttpClient())
            using (var response = await http.GetAsync(uri))
            using (var stream = await response.Content.ReadAsStreamAsync())
            using (var reader = new StreamReader(stream))
            {
                var parser = new HtmlParser();

                // Materialise once: the result is both printed and counted, and re-enumerating a
                // deferred sequence backed by the reader could yield different results each time.
                var elements = parser.Parse(reader).ToList();

                foreach (var e in elements)
                {
                    Console.WriteLine(e.ToString());
                }

                Assert.That(elements.Count, Is.EqualTo(1));
            }
        }