/// <summary>
/// Parses <paramref name="html"/>, requires the result to be a single top-level node that
/// is an <see cref="XElement"/>, and compares that node's default string form with
/// <paramref name="expectedXml"/>.
/// </summary>
/// <param name="html">Markup handed to the parser.</param>
/// <param name="expectedXml">Exact text the parsed root node must serialise to.</param>
public void Parse_HtmlWithComment(string html, string expectedXml)
{
    var parsedNodes = new HtmlParser().Parse(html);

    // Single() doubles as a check that exactly one top-level node was produced.
    var node = parsedNodes.Single();

    Assert.That(node is XElement);

    var actualXml = node.ToString();
    Assert.That(actualXml, Is.EqualTo(expectedXml));
}
/// <summary>
/// Parses <paramref name="html"/>, requires the result to be a single top-level node that
/// is an <see cref="XElement"/>, and compares that node's unformatted serialisation with
/// <paramref name="expectedXml"/>.
/// </summary>
/// <param name="html">Markup handed to the parser.</param>
/// <param name="expectedXml">Exact text the parsed root node must serialise to.</param>
public void Parse_HtmlWithEntity(string html, string expectedXml)
{
    var parsedNodes = new HtmlParser().Parse(html);

    // Single() doubles as a check that exactly one top-level node was produced.
    var node = parsedNodes.Single();

    Assert.That(node is XElement);

    // Formatting is disabled so the comparison is not affected by added indentation.
    var actualXml = node.ToString(SaveOptions.DisableFormatting);
    Assert.That(actualXml, Is.EqualTo(expectedXml));
}
/// <summary>
/// Parses <paramref name="html"/> and checks the single resulting root element's name,
/// text value and number of direct child nodes.
/// </summary>
/// <param name="html">Markup handed to the parser.</param>
/// <param name="expectedRoot">Expected name of the root element.</param>
/// <param name="expectedText">Expected <see cref="XElement.Value"/> of the root element.</param>
/// <param name="expectedChildCount">Expected number of nodes directly under the root.</param>
public void Parse_SimpleHtml(string html, string expectedRoot, string expectedText, int expectedChildCount)
{
    var parsedNodes = new HtmlParser().Parse(html);
    var node = parsedNodes.Single();

    Assert.That(node is XElement);

    // Cast only after the type assertion above, so a wrong node type is reported by the assert.
    var element = (XElement)node;
    Assert.That(element.Name == expectedRoot);
    Assert.That(element.Value == expectedText);

    var childCount = element.Nodes().Count();
    Assert.That(childCount, Is.EqualTo(expectedChildCount));
}
/// <summary>
/// Downloads <paramref name="uri"/>, extracts tokens from the response body according to its
/// MIME type, passes them to <paramref name="tokenProcessor"/> and, when the processor returns
/// <c>true</c>, follows the links extracted from the document.
/// </summary>
/// <param name="uri">Address of the document to fetch.</param>
/// <param name="tokenProcessor">
/// Callback that receives the source URI paired with the document's tokens. Links are only
/// followed when it returns <c>true</c>.
/// </param>
/// <param name="targetElement">
/// Optional selector mapping a parsed element to the element tokens are taken from. May be
/// <c>null</c>, in which case the parsed content is used as-is. Elements for which the
/// selector returns <c>null</c> are skipped.
/// </param>
/// <param name="depth">Current depth; nothing is fetched once it exceeds <c>_maxDepth</c>.</param>
/// <exception cref="NotSupportedException">
/// The resolved MIME type is neither XML, HTML nor plain text.
/// </exception>
protected async Task Read(Uri uri, Func<Tuple<Uri, IEnumerable<IToken>>, bool> tokenProcessor, Func<XElement, XElement> targetElement, int depth)
{
    // Stop when too deep or when the visited collection has grown beyond MaxVisited.
    if (depth > _maxDepth || _visited.Count > MaxVisited)
    {
        return;
    }

    // The response owns the connection and the content stream; dispose it deterministically.
    using (var response = await _client.GetAsync(uri))
    {
        var mimeType = _mimeType;
        if (mimeType == TextMimeType.Default)
        {
            // Content-Type is a *content* header, so it is exposed on Content.Headers;
            // a lookup on response.Headers never finds it.
            IEnumerable<string> mimes;
            if (response.Content.Headers.TryGetValues("Content-Type", out mimes))
            {
                mimeType = Parse(mimes.First());
            }
        }

        // Still undetermined: treat the document as HTML.
        if (mimeType == TextMimeType.Default)
        {
            mimeType = TextMimeType.Html;
        }

        using (var stream = await response.Content.ReadAsStreamAsync())
        {
            if (mimeType == TextMimeType.Xml)
            {
                var doc = XDocument.Load(stream);
                var root = targetElement == null ? doc.Root : targetElement(doc.Root);
                if (root != null && tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, ExtractTokens(root))))
                {
                    // Links are taken from the whole document, not only from the selected element.
                    await FollowLinksAndProcess(uri, _linkExtractor(doc.Root), tokenProcessor, targetElement, depth);
                }
            }
            else
            {
                using (var reader = new StreamReader(stream, _encoding))
                {
                    if (mimeType == TextMimeType.Html)
                    {
                        var parser = new HtmlParser();
                        var elements = parser.Parse(reader);

                        // Materialise once: the sequence is consumed both for token extraction and
                        // for link following, and re-enumerating would re-invoke targetElement (and
                        // possibly re-read the parser output). Null selections are dropped, matching
                        // the null check in the XML branch.
                        var selected = (targetElement != null
                                ? elements.Where(e => e.NodeType == System.Xml.XmlNodeType.Element).Select(e => targetElement((XElement)e))
                                : elements)
                            .Where(e => e != null)
                            .ToList();

                        var tokens = selected.SelectMany(e => ExtractTokens(e));
                        if (tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, tokens)))
                        {
                            foreach (var element in selected.OfType<XElement>())
                            {
                                await FollowLinksAndProcess(uri, _linkExtractor(element), tokenProcessor, targetElement, depth);
                            }
                        }
                    }
                    else if (mimeType == TextMimeType.Plain)
                    {
                        var docContent = await reader.ReadToEndAsync();
                        if (tokenProcessor(new Tuple<Uri, IEnumerable<IToken>>(uri, _tokeniser.Tokenise(docContent))))
                        {
                            // Wrap the text in a throwaway element so the link extractor can be reused.
                            await FollowLinksAndProcess(uri, _linkExtractor(new XElement("x", docContent)), tokenProcessor, targetElement, depth);
                        }
                    }
                    else
                    {
                        throw new NotSupportedException(mimeType.ToString());
                    }
                }
            }
        }
    }
}
/// <summary>
/// Downloads <paramref name="uri"/>, parses the response body as HTML, writes every parsed
/// top-level node to the console and asserts that exactly one top-level node was produced.
/// </summary>
/// <param name="uri">Address of the page to download and parse.</param>
public async Task Parse_Uri(string uri)
{
    // Client, response, stream and reader are all disposable; release them when the test ends.
    using (var http = new HttpClient())
    using (var response = await http.GetAsync(uri))
    using (var stream = await response.Content.ReadAsStreamAsync())
    using (var reader = new StreamReader(stream))
    {
        var parser = new HtmlParser();

        // Materialise once: the result is both printed and counted, and re-enumerating a
        // sequence that may be bound to the reader could yield different results.
        var elements = parser.Parse(reader).ToList();

        foreach (var e in elements)
        {
            Console.WriteLine(e.ToString());
        }

        Assert.That(elements.Count, Is.EqualTo(1));
    }
}