/// <summary>
/// Downloads the page at <paramref name="link"/> and returns the text of its article body.
/// </summary>
/// <param name="link">Absolute URL of the article page.</param>
/// <returns>
/// The text content of the first <c>div.b-posts-1-item__text</c> element with
/// whitespace-only lines removed, or an empty string when the page has no such element.
/// </returns>
public async Task<string> GetContentByLinkAsync(string link)
{
    const string selector = "div.b-posts-1-item__text";

    string html;
    // Dispose the client so the underlying handler is released once the download is done.
    using (var client = new HttpClient())
    {
        html = await client.GetStringAsync(new Uri(link));
    }

    // Parsing is CPU-bound, so it is kept off the calling context as before.
    var article = await Task.Run(() =>
    {
        var element = new HtmlParser().Parse(html).QuerySelector(selector);
        return element?.TextContent;
    });

    // A page without the article container yields no text instead of a NullReferenceException.
    if (string.IsNullOrEmpty(article))
    {
        return string.Empty;
    }

    return Regex.Replace(article, @"^\s+$[\r\n]*", "", RegexOptions.Multiline);
}
/// <summary>
/// Parses <paramref name="content"/> and collects the resolved target of every usable link in it.
/// </summary>
/// <param name="log">Receives an error entry for each link whose href is missing, empty or invalid.</param>
/// <param name="uri">Address of the examined document; relative hrefs are resolved against it and it is attached to log entries.</param>
/// <param name="content">HTML markup to examine.</param>
public AngleSharpExaminer(ILog log, Uri uri, string content)
{
    var parser = new HtmlParser();
    _document = parser.Parse(content);
    _links = new HashSet<Uri>();

    foreach (var link in _document.Links)
    {
        var href = link.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            log.Log(new LogMessage(LogType.Error, "A-tag has missing or empty href", uri));
            continue;
        }

        href = href.Trim();

        // Fragment-only links are skipped without logging.
        if (href[0] == '#')
        {
            continue;
        }

        if (!Uri.IsWellFormedUriString(href, UriKind.RelativeOrAbsolute))
        {
            log.Log(new LogMessage(LogType.Error, $"A-tag has invalid href ({href})", uri));
            // Skip only this link; returning here would silently drop every link after it.
            continue;
        }

        try
        {
            _links.Add(new Uri(uri, href));
        }
        catch (UriFormatException exception)
        {
            log.Log(new LogMessage(LogType.Error, $"A-tag has invalid href ({href})", exception, uri));
        }
    }
}
/// <summary>
/// Parses each input as HTML and, when the configured query selector matches,
/// returns a clone carrying the matched element's HTML under the configured metadata key.
/// </summary>
/// <param name="inputs">Documents to process.</param>
/// <param name="context">Execution context; its trace receives a warning when processing a document fails.</param>
/// <returns>One document per input: a clone with the extra metadata on a match, otherwise the input itself.</returns>
public IEnumerable<IDocument> Execute(IReadOnlyList<IDocument> inputs, IExecutionContext context)
{
    HtmlParser htmlParser = new HtmlParser();
    return inputs.AsParallel().Select(input =>
    {
        try
        {
            IHtmlDocument parsed;
            using (Stream stream = input.GetStream())
            {
                parsed = htmlParser.Parse(stream);
            }

            IElement match = parsed.QuerySelector(_querySelector);
            if (match == null)
            {
                // Nothing matched: pass the document through untouched.
                return input;
            }

            return input.Clone(new Dictionary<string, object>()
            {
                { _metadataKey, _outerHtml ? match.OuterHtml : match.InnerHtml }
            });
        }
        catch (Exception ex)
        {
            // Any failure is reported as a warning and the input is passed through.
            context.Trace.Warning("Exception while parsing HTML for {0}: {1}", input.Source, ex.Message);
            return input;
        }
    });
}
/// <summary>
/// Builds the system under test from a parsed <c>link</c> tag.
/// </summary>
/// <param name="baseUrl">Base URL handed to the source.</param>
/// <param name="path">Value placed in the link tag's <c>href</c> attribute.</param>
/// <returns>A <see cref="LinkTagCssSource"/> wrapping the first element of the parsed document's head.</returns>
private LinkTagCssSource CreateSUT(string baseUrl = "http://a.com", string path = "a.css")
{
    var markup = $"<link href=\"{path}\" />";
    var document = new HtmlParser().Parse(markup);
    return new LinkTagCssSource(document.Head.FirstElementChild, new Uri(baseUrl));
}
/// <summary>
/// Downloads the site's front page and extracts one <see cref="Data"/> entry per <c>.post</c> element.
/// </summary>
/// <returns>
/// The stories in page order. A field is left unset when the matching
/// element is absent from a post.
/// </returns>
public IEnumerable<Data> GetStories()
{
    string html;
    // WebClient is disposable; release it as soon as the download finishes.
    using (var client = new WebClient())
    {
        html = client.DownloadString("http://www.thelakewoodscoop.com/");
    }

    HtmlParser parser = new HtmlParser();
    var document = parser.Parse(html);
    var itemDetails = document.QuerySelectorAll(".post");

    List<Data> stories = new List<Data>();
    foreach (var itemDetail in itemDetails)
    {
        Data story = new Data();

        // Guarded like the excerpt and image below, so one post without a
        // title link no longer aborts the whole scrape.
        var anchor = itemDetail.QuerySelector("h2 a");
        if (anchor != null)
        {
            story.Title = anchor.TextContent;
            story.Url = anchor.GetAttribute("href");
        }

        var excerpt = itemDetail.QuerySelector("p");
        if (excerpt != null)
        {
            story.Blurb = excerpt.TextContent;
        }

        // The comments value is stored as the raw link text.
        var commentsLink = itemDetail.QuerySelector(".backtotop a");
        if (commentsLink != null)
        {
            story.Comments = commentsLink.TextContent;
        }

        var imageHolder = itemDetail.QuerySelector("p a img");
        if (imageHolder != null)
        {
            story.Image = imageHolder.GetAttribute("src");
        }

        stories.Add(story);
    }

    return stories;
}
/// <summary>
/// Parses a fixed sample page containing a form with a text input,
/// a radio group and a select element.
/// </summary>
/// <returns>The parsed sample document.</returns>
private static IHtmlDocument CreateSampleDocument()
{
    const String sampleMarkup = @" <html> <body> <form> <!-- text input --> <input type='text' name='user' id='user' value='X' /> <!-- radio input --> <input type='radio' name='userType' id='memberOption' value='Member' /> <input type='radio' name='userType' id='managerOption' value='Manager' checked='checked' /> <input type='radio' name='userType' id='guestOption' value='Guest' /> <!-- select --> <select name='city' id='city'> <option value='0' id='cityOption0'>Jerusalem</option> <option value='1' id='cityOption1' selected='selected'>New york</option> <option value='2' id='cityOption2'>London</option> </select> </form> </body> </html>";

    return new HtmlParser().Parse(sampleMarkup);
}
/// <summary>
/// Console loop: asks for a URL, starts the download, and once HTML is
/// available reports how many anchor elements the page contains.
/// </summary>
public static void Main()
{
    Console.WriteLine("Enter URL of website for which to print HTML: ");
    var url = Console.ReadLine();
    GetWebsiteHtmlAsync(url);

    while (true)
    {
        if (websiteHtml == null)
        {
            Console.WriteLine("Loading...");
            Thread.Sleep(100);
            continue;
        }

        var parser = new HtmlParser();
        var document = parser.Parse(websiteHtml);

        // Count every anchor in the document. The previous
        // QuerySelector("a").ChildElementCount counted the children of the first
        // anchor only and threw when the page had no anchor at all.
        var links = document.QuerySelectorAll("a").Length;
        Console.WriteLine($"Found: {links} links");

        Console.WriteLine("Enter URL of website for which to print HTML: ");
        url = Console.ReadLine();
        // NOTE(review): this loop never clears websiteHtml; presumably
        // GetWebsiteHtmlAsync resets it before downloading, otherwise the previous
        // page is counted again - confirm.
        GetWebsiteHtmlAsync(url);
    }
}
// Test: parses four single-table documents, pairs each table element with a
// StyleClass (background colour, width, height, background colour + width),
// runs StyleClassApplier.ApplyAllStyles over the dictionary and asserts the
// OuterHtml of each resulting key against the expected markup.
// NOTE(review): the expected-HTML literal for the third table straddles a
// physical line break in this file - confirm the exact literal before reformatting.
public void ApplyStylesToAllElements() { var elementDictionary = new Dictionary<IElement, StyleClass>(); var tableDomObject1 = new HtmlParser().Parse("<table id=\"tabletest1\" class=\"test1\" bgcolor=\"\"></table>"); var tableDomObject2 = new HtmlParser().Parse("<table id=\"tabletest2\" class=\"test2\" bgcolor=\"\" width=\"\"></table>"); var tableDomObject3 = new HtmlParser().Parse("<table id=\"tabletest3\" class=\"test3\" bgcolor=\"\" height=\"\"></table>"); var tableDomObject4 = new HtmlParser().Parse("<table id=\"tabletest4\" class=\"test4\" bgcolor=\"\" width=\"\"></table>"); var styleClassBgColor = new StyleClass(); styleClassBgColor.Attributes["background-color"] = CssAttribute.FromRule("background-color: #008001"); var styleClassWidth = new StyleClass(); styleClassWidth.Attributes["width"] = CssAttribute.FromRule("width: 10px"); var styleClassHeight = new StyleClass(); styleClassHeight.Attributes["height"] = CssAttribute.FromRule("height: 10px"); var styleClassBgAndWidth = new StyleClass(); styleClassBgAndWidth.Attributes["background-color"] = CssAttribute.FromRule("background-color: #008003"); styleClassBgAndWidth.Attributes["width"] = CssAttribute.FromRule("width: 10px"); elementDictionary.Add(tableDomObject1.Body.FirstElementChild, styleClassBgColor); elementDictionary.Add(tableDomObject2.Body.FirstElementChild, styleClassWidth); elementDictionary.Add(tableDomObject3.Body.FirstElementChild, styleClassHeight); elementDictionary.Add(tableDomObject4.Body.FirstElementChild, styleClassBgAndWidth); var result = StyleClassApplier.ApplyAllStyles(elementDictionary); Assert.AreEqual("<table id=\"tabletest1\" class=\"test1\" bgcolor=\"#008001\" style=\"background-color: #008001\"></table>", result.ElementAt(0).Key.OuterHtml); Assert.AreEqual("<table id=\"tabletest2\" class=\"test2\" bgcolor=\"\" width=\"10px\" style=\"width: 10px\"></table>", result.ElementAt(1).Key.OuterHtml); Assert.AreEqual("<table id=\"tabletest3\" class=\"test3\" bgcolor=\"\" height=\"10px\" 
style=\"height: 10px\"></table>", result.ElementAt(2).Key.OuterHtml); Assert.AreEqual("<table id=\"tabletest4\" class=\"test4\" bgcolor=\"#008003\" width=\"10px\" style=\"background-color: #008003;width: 10px\"></table>", result.ElementAt(3).Key.OuterHtml); }
/// <summary>
/// Initializes a new instance of the <see cref="DocumentWrapper"/> class
/// by parsing the supplied bytes as an HTML document.
/// </summary>
/// <param name="buffer">The raw bytes of the document.</param>
public DocumentWrapper(IEnumerable<byte> buffer)
{
    var htmlParser = new HtmlParser();
    byte[] bytes = buffer.ToArray();

    using (var input = new MemoryStream(bytes))
    {
        this.document = htmlParser.Parse(input);
    }
}
/// <summary>
/// Downloads the site's home page and lists the category links found under <c>.topmodel</c>.
/// </summary>
/// <returns>One pair per anchor: the key is its <c>href</c> attribute, the value is its text.</returns>
private List<KeyValuePair<string, string>> GetCategoryUrls()
{
    const string URL = "http://www.meizitu.com/";

    string html;
    // WebClient is disposable; release it once the page has been downloaded.
    using (var webClient = new WebClient())
    {
        html = webClient.DownloadString(URL);
    }

    var doc = new HtmlParser(html).Parse();

    // NodeValue is null for element nodes, so the label must come from TextContent.
    return doc.QuerySelectorAll(".topmodel a")
        .Select(a => new KeyValuePair<string, string>(a.GetAttribute("href"), a.TextContent))
        .ToList();
}
/// <summary>
/// Parses each input as HTML, evaluates the configured query selector and emits
/// one cloned document per matched element.
/// </summary>
/// <param name="inputs">Documents to process.</param>
/// <param name="context">Execution context; its trace receives a warning when parsing or processing fails.</param>
/// <returns>
/// For each input, either the clones built from its matched elements, or the input
/// itself when there is no selector, no match, or an exception occurred.
/// </returns>
public IEnumerable<IDocument> Execute(IReadOnlyList<IDocument> inputs, IExecutionContext context)
{
    HtmlParser htmlParser = new HtmlParser();
    return inputs.AsParallel().SelectMany(input =>
    {
        // Step 1: parse the HTML content.
        IHtmlDocument parsed;
        try
        {
            using (Stream stream = input.GetStream())
            {
                parsed = htmlParser.Parse(stream);
            }
        }
        catch (Exception ex)
        {
            context.Trace.Warning("Exception while parsing HTML for {0}: {1}", input.Source, ex.Message);
            return new[] { input };
        }

        // Step 2: evaluate the query selector and build the results.
        try
        {
            if (string.IsNullOrWhiteSpace(_querySelector))
            {
                return new[] { input };
            }

            IElement[] matches = _first
                ? new[] { parsed.QuerySelector(_querySelector) }
                : parsed.QuerySelectorAll(_querySelector).ToArray();

            // In first-only mode a miss shows up as a single null entry.
            if (matches.Length == 0 || matches[0] == null)
            {
                return new[] { input };
            }

            List<IDocument> results = new List<IDocument>();
            foreach (IElement match in matches)
            {
                // Collect the metadata for this element.
                Dictionary<string, object> metadata = new Dictionary<string, object>();
                foreach (Action<IElement, Dictionary<string, object>> populate in _metadataActions)
                {
                    populate(match, metadata);
                }

                // Clone the document, optionally replacing its content with the element's HTML.
                if (_outerHtmlContent.HasValue)
                {
                    string html = _outerHtmlContent.Value ? match.OuterHtml : match.InnerHtml;
                    results.Add(input.Clone(html, metadata.Count == 0 ? null : metadata));
                }
                else
                {
                    results.Add(input.Clone(metadata));
                }
            }

            return (IEnumerable<IDocument>)results;
        }
        catch (Exception ex)
        {
            context.Trace.Warning("Exception while processing HTML for {0}: {1}", input.Source, ex.Message);
            return new[] { input };
        }
    });
}
/// <summary>
/// Parses <paramref name="source"/> as HTML and prints how long the parse took.
/// </summary>
/// <param name="source">HTML markup to parse.</param>
/// <param name="title">Label used in the timing message.</param>
/// <returns>The parsed document.</returns>
static IDocument TestHtml(String source, String title = "HTML document")
{
    var htmlParser = new HtmlParser();

    var timer = Stopwatch.StartNew();
    var parsed = htmlParser.Parse(source);
    timer.Stop();

    Console.WriteLine("Parsing {0} took {1}ms", title, timer.ElapsedMilliseconds);
    return parsed;
}
/// <summary>
/// Initializes a new instance of the <see cref="AssertExtensionsTests"/> class,
/// wrapping the root element of a small fixed HTML page in the query under test.
/// </summary>
public AssertExtensionsTests()
{
    const string markup = @"<html><head></head><body><div id='testId' class='myClass' attribute1 attribute2='value2'>Test</div><div class='anotherClass'>Tes</div><span class='class'>some contents</span><span class='class'>This has contents</span></body></html>";

    var parsed = new HtmlParser().Parse(markup);
    this.query = new QueryWrapper(new[] { parsed.DocumentElement });
}
/// <summary>
/// Downloads the page at <paramref name="address"/> and stores its parsed document.
/// </summary>
/// <param name="address">Address of the page to download.</param>
/// <returns>A task that completes once the document has been parsed and stored.</returns>
public async Task ParseAddress(string address)
{
    // Alternative kept for reference: _document = await BrowsingContext.New().OpenAsync(address);
    using (var downloader = new WebClient())
    {
        var markup = await downloader.DownloadStringTaskAsync(address);
        _document = await new HtmlParser().ParseAsync(markup);
    }
}
// Test: a table with an empty bgcolor attribute and a style class that sets
// background-color should produce exactly one equivalent style.
public void FindEquivalentStyles()
{
    // Arrange
    const string markup = "<table id=\"tabletest\" class=\"test\" bgcolor=\"\"></table>";
    var document = new HtmlParser().Parse(markup);
    var table = (IElement)document.Body.FirstChild;

    var styleClass = new StyleClass();
    styleClass.Attributes["background-color"] = CssAttribute.FromRule("background-color: red");

    // Act
    var equivalents = CssStyleEquivalence.FindEquivalent(table, styleClass);

    // Assert
    Assert.AreEqual(1, equivalents.Count);
}
//Old methods
/// <summary>
/// Downloads the page at <paramref name="address"/> and parses it as HTML.
/// </summary>
/// <param name="address">Address of the page to request.</param>
/// <returns>The parsed document.</returns>
private IHtmlDocument GetHtmlDocument(string address)
{
    WebRequest request = WebRequest.Create(address);

    // The response, its stream and the reader all hold resources, so they are
    // disposed as soon as the body has been read.
    // The call still blocks on the async request, as before.
    using (WebResponse response = request.GetResponseAsync().Result)
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        string markup = reader.ReadToEnd();
        HtmlParser parser = new HtmlParser();
        return parser.Parse(markup);
    }
}
// Test: resolving styles for a table with an empty bgcolor attribute and a
// background-color style class should yield "style" followed by "bgcolor".
public void GetAllStylesForElement()
{
    // Arrange
    const string markup = "<table id=\"tabletest\" class=\"test\" bgcolor=\"\"></table>";
    var document = new HtmlParser().Parse(markup);
    var table = (IElement)document.Body.FirstChild;

    var styleClass = new StyleClass();
    styleClass.Attributes["background-color"] = CssAttribute.FromRule("background-color: red");

    // Act
    var styles = CssElementStyleResolver.GetAllStyles(table, styleClass);

    // Assert
    Assert.AreEqual(2, styles.Count());
    Assert.AreEqual("style", styles.ElementAt(0).AttributeName);
    Assert.AreEqual("bgcolor", styles.ElementAt(1).AttributeName);
}
/// <summary>
/// Builds a web form from the result of a web call.
/// </summary>
/// <param name="result">Web call result supplying the cookies, the request and response URLs, and the response markup.</param>
private WebForm(WebCallResult result)
{
    Cookies = result.Cookies;
    OriginalUrl = result.RequestUrl.OriginalString;

    _html = new HtmlParser().Parse(result.Response);

    // Base URL is scheme://host:port of the response address.
    var responseUri = result.ResponseUrl;
    _responseBaseUrl = $"{responseUri.Scheme}://{responseUri.Host}:{responseUri.Port}";

    // Runs last, after the parsed document and base URL have been assigned.
    _inputs = ParseInputs();
}
/// <summary>
/// Derives an HTML preview and a plain-text description from <paramref name="html"/>.
/// </summary>
/// <param name="html">Source markup; <c>null</c> yields <c>(null, null)</c>.</param>
/// <param name="descriptionLength">Maximum number of characters of body text kept in the description before "..." is appended.</param>
/// <param name="previewLength">Length limit passed to the text-node search that decides where the preview is cut.</param>
/// <returns>
/// The body's inner HTML after trimming, and the description
/// (<c>null</c> when the trimmed body has no non-whitespace text).
/// </returns>
public static (string preview, string description) MakePreviewAndDescription(string html, int descriptionLength, int previewLength)
{
    if (html == null)
    {
        return (null, null);
    }

    HtmlParser htmlParser = new AngleSharp.Parser.Html.HtmlParser();
    var doc = htmlParser.Parse(html);

    // Cut everything after the text node where the preview length is reached.
    int consumed = 0;
    var cutoff = (IText)FindTextNodePlus(doc.Body, ref consumed, previewLength);
    if (cutoff != null)
    {
        ClearNext(cutoff);
    }

    // The description is taken from the text that survived the cut above.
    string description = null;
    if (!string.IsNullOrWhiteSpace(doc.Body.TextContent))
    {
        int keep = Math.Min(descriptionLength, doc.Body.TextContent.Length);
        description = doc.Body.TextContent.Substring(0, keep) + "...";
    }

    var firstBigImage = FindFirstBigImage(doc);
    if (firstBigImage != null)
    {
        ClearNext(firstBigImage);
    }

    var firstFrame = FindFirstIFrame(doc);
    if (firstFrame != null)
    {
        ClearNext(firstFrame);
    }

    return (doc.Body.InnerHtml, description);
}
// Test: ParseAsync on an in-memory string is expected to return an already
// completed task whose document exposes the title and the single paragraph.
public async Task TestAsyncHtmlParsingFromString()
{
    const string markup = "<html><head><title>My test</title></head><body><p>Some text</p></body></html>";
    var htmlParser = new HtmlParser(Configuration.Default);

    using (var pending = htmlParser.ParseAsync(markup))
    {
        Assert.IsTrue(pending.IsCompleted);

        var document = await pending;

        Assert.AreEqual("My test", document.Title);
        Assert.AreEqual(1, document.Body.ChildElementCount);
        Assert.AreEqual("Some text", document.Body.Children[0].TextContent);
    }
}
// Test: starts an asynchronous parse, checks that the task is still running
// while a result object is already exposed, then waits and verifies the document.
// The statement order is significant and is kept as-is.
public void TestAsyncHtmlParsing()
{
    const string markup = "<html><head><title>My test</title></head><body><p>Some text</p></body></html>";
    var htmlParser = new HtmlParser(markup, Configuration.Default);
    var pending = htmlParser.ParseAsync();

    // Before waiting: not completed, but the result object already exists.
    Assert.IsFalse(pending.IsCompleted);
    Assert.IsNotNull(htmlParser.Result);
    Assert.IsFalse(pending.IsCompleted);

    pending.Wait();

    // After waiting: completed, with the fully parsed document available.
    Assert.IsTrue(pending.IsCompleted);
    Assert.IsNotNull(htmlParser.Result);
    Assert.AreEqual("My test", htmlParser.Result.Title);
    Assert.AreEqual(1, htmlParser.Result.Body.ChildElementCount);
    Assert.AreEqual("Some text", htmlParser.Result.Body.Children[0].TextContent);
}
// Test: with a fully tolerant CSS parser configuration, an inline style whose
// value embeds a conditional comment and a script tag is exposed through the
// style declaration with the expected (truncated) value.
public void ParsedCssCanHaveExtraWhitespace()
{
    var markup = "<div style=\"background-color: http://www.codeplex.com?url=<!--[if gte IE 4]><SCRIPT>alert('XSS');</SCRIPT><![endif]-->\">";
    var configuration = Configuration.Default.WithCss(css => css.Options = new CssParserOptions
    {
        IsIncludingUnknownDeclarations = true,
        IsIncludingUnknownRules = true,
        IsToleratingInvalidConstraints = true,
        IsToleratingInvalidValues = true
    });
    var htmlParser = new HtmlParser(configuration);

    var document = htmlParser.Parse(markup);
    var element = document.QuerySelector<IHtmlElement>("div");

    Assert.AreEqual("http://www.codeplex.com?url=<!--[if gte IE 4]><SCRIPT>alert(\"XSS\")", element.Style["background-color"]);
    Assert.AreEqual("background-color: http://www.codeplex.com?url=<!--[if gte IE 4]><SCRIPT>alert(\"XSS\")", element.Style.CssText);
}
/// <summary>
/// Starts an asynchronous parse of the W3C sample page and, every 15 ms until it
/// completes, prints the elapsed time and the number of elements parsed so far.
/// </summary>
/// <returns>A task that completes when parsing has finished.</returns>
static async Task TestAsync()
{
    Console.WriteLine("Starting async!");

    var htmlParser = new HtmlParser(HtmlFiles.W3C);
    var pending = htmlParser.ParseAsync();
    var timer = Stopwatch.StartNew();

    while (!pending.IsCompleted)
    {
        await Task.Delay(15);
        Console.WriteLine("{0} | {1} elements", timer.ElapsedMilliseconds, htmlParser.Result.All.Length);
    }

    timer.Stop();
    Console.WriteLine("Done!");
}
/// <summary>
/// Downloads the page at <paramref name="link"/> and returns the text of its article body.
/// </summary>
/// <param name="link">Address of the article page.</param>
/// <returns>
/// The text content of the first <c>div.b-posts-1-item__text</c> element with
/// whitespace-only lines removed, or an empty string when the page has no such element.
/// </returns>
public string GetContentByLink(string link)
{
    //TODO: Get one article by link to show it on the Second Page
    const string selector = "div.b-posts-1-item__text";

    string markup;
    WebRequest request = WebRequest.Create(link);

    // Dispose the response, its stream and the reader once the body has been read.
    // The call still blocks on the async request, as before.
    using (WebResponse response = request.GetResponseAsync().Result)
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        markup = reader.ReadToEnd();
    }

    HtmlParser parser = new HtmlParser();
    var element = parser.Parse(markup).QuerySelector(selector);

    // A page without the article container yields no text instead of a NullReferenceException.
    var article = element?.TextContent;
    if (string.IsNullOrEmpty(article))
    {
        return string.Empty;
    }

    return Regex.Replace(article, @"^\s+$[\r\n]*", "", RegexOptions.Multiline);
}
/// <summary>
/// Crawls the site given in <c>args[0]</c>, staying on the same host. Each page
/// has its same-host links rewritten through <c>ToLink</c> and is saved under the
/// <c>result</c> directory.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the start URL.</param>
/// <returns>A task that completes when no pages are left to visit.</returns>
static async Task MainAsync(string[] args)
{
    var targetUri = new Uri(args[0]);
    string targetHost = targetUri.Host;
    Queue<Uri> pending = new Queue<Uri>();
    ISet<Uri> completed = new HashSet<Uri>();
    pending.Enqueue(targetUri);
    var parser = new HtmlParser();

    // One client for the whole crawl instead of a new one per request.
    using (var client = new HttpClient { BaseAddress = targetUri })
    {
        while (pending.Count > 0)
        {
            Uri current = pending.Dequeue();

            // A URI can be queued by several pages before its first visit;
            // skip it if it has already been processed.
            if (!completed.Add(current))
            {
                continue;
            }

            Console.WriteLine(current + " " + pending.Count);

            string content;
            using (var response = await client.GetAsync(current))
            {
                if (!response.IsSuccessStatusCode)
                {
                    Console.WriteLine(current + " " + response.StatusCode);
                    continue;
                }

                content = await response.Content.ReadAsStringAsync();
            }

            IHtmlDocument parsedHtml = await parser.ParseAsync(content);

            // Resolve every link once and keep only those on the target host.
            var sameHostLinks = parsedHtml.Links
                .Select(s => new { s, u = new Uri(current, s.Attributes["href"].Value) })
                .Where(s => s.u.Host == targetHost)
                .ToList();

            List<Uri> todo = sameHostLinks
                .Select(s => s.u)
                .Except(completed)
                .ToList();

            // Compute all replacement hrefs before mutating the document;
            // fragment-only links are left as they are.
            var rewrites = sameHostLinks
                .Where(s => !s.s.Attributes["href"].Value.StartsWith("#", StringComparison.Ordinal))
                .Select(s => new { s.s, s.u, f = ToLink(s.u) })
                .ToList();
            foreach (var a in rewrites)
            {
                a.s.Attributes["href"].Value = a.f;
            }

            string targetFile = "result" + ToFileName(current.AbsolutePath, current.Query);
            Directory.CreateDirectory(Directory.GetParent(targetFile).FullName);
            File.WriteAllText(targetFile, parsedHtml.ToHtml());
            Console.WriteLine(targetFile);

            foreach (var a in todo)
            {
                pending.Enqueue(a);
            }
        }
    }
}
// Test: registers an OnCreated callback that records the text position of each
// created element and expects 15 recorded elements for the sample article.
public void ObtainElementPositionsFromHtml()
{
    var markup = @"<article class=""grid-item large""> <div class=""grid-image""><a href=""/News/Page/298/cpp-mva-course""><img src=""/img/news/maxresdefault700x240.png"" alt=""Icon"" title=""C++ MVA Course"" /></a></div> <div class=""grid-title""><a href=""/News/Page/298/cpp-mva-course"">C++ MVA Course</a></div> <div class=""grid-abstract"">My Microsoft Virtual Academy course about modern C++ is now available.</div> <div class=""grid-date"">6/5/2015</div> <div class=""grid-admin""> <a href=""/Page/Delete/298"">Delete</a> | <a href=""/Page/Edit/298"">Edit</a> | <a href=""/Page/Create?parentId=1"">Create New</a> </div> </article>";
    var recorded = new Dictionary<IElement, TextPosition>();
    var options = new HtmlParserOptions
    {
        OnCreated = (element, position) => recorded[element] = position
    };
    var htmlParser = new HtmlParser(options);

    var document = htmlParser.Parse(markup);

    Assert.AreEqual(15, recorded.Count);
}
// Test: with invalid values tolerated, an inline background-image using a
// javascript: URL is kept as the single declaration and returned normalized.
public void ParseInlineStyleWithToleratedInvalidValueShouldReturnThatValue()
{
    // Arrange
    var markup = "<div style=\"background-image: url(javascript:alert(1))\"></div>";
    var tolerantOptions = new CssParserOptions
    {
        IsIncludingUnknownDeclarations = true,
        IsIncludingUnknownRules = true,
        IsToleratingInvalidConstraints = true,
        IsToleratingInvalidValues = true
    };
    var htmlParser = new HtmlParser(Configuration.Default.WithCss(css => css.Options = tolerantOptions));

    // Act
    var document = htmlParser.Parse(markup);
    var element = document.QuerySelector<IHtmlElement>("div");

    // Assert
    Assert.AreEqual(1, element.Style.Length);
    Assert.AreEqual("background-image", element.Style[0]);
    Assert.AreEqual("url(\"javascript:alert(1)\")", element.Style.BackgroundImage);
}
// Test: an inline background declaration parsed under the tolerant
// configuration can be removed again, leaving an empty style.
public void ParseInlineStyleWithUnknownDeclarationShouldBeAbleToRemoveThatDeclaration()
{
    // Arrange
    var markup = @"<DIV STYLE='background: url(""javascript:alert(foo)"")'>";
    var tolerantOptions = new CssParserOptions
    {
        IsIncludingUnknownDeclarations = true,
        IsIncludingUnknownRules = true,
        IsToleratingInvalidConstraints = true,
        IsToleratingInvalidValues = true
    };
    var htmlParser = new HtmlParser(Configuration.Default.WithCss(css => css.Options = tolerantOptions));

    // Act
    var document = htmlParser.Parse(markup);
    var element = document.QuerySelector<IHtmlElement>("div");

    // Assert: the declaration is present...
    Assert.AreEqual(1, element.Style.Length);
    Assert.AreEqual("background", element.Style[0]);

    // ...and is gone after removal.
    element.Style.RemoveProperty("background");
    Assert.AreEqual(0, element.Style.Length);
}
/// <summary>
/// Creates a new document fragment by parsing the given HTML in fragment
/// mode and adopting the resulting nodes as children.
/// </summary>
/// <param name="html">The HTML source to use.</param>
/// <param name="context">The context element for the fragment mode.</param>
internal DocumentFragment(String html, Element context)
    : this()
{
    var owner = context.Owner;
    var hasOwner = owner != null;

    // Work on a copy of the owner's options (or the defaults) with scripting off.
    var configuration = Configuration.Clone(hasOwner ? owner.Options : Configuration.Default);
    configuration.IsScripting = false;
    configuration.UseQuirksMode = hasOwner && owner.QuirksMode != QuirksMode.Off;

    var fragmentParser = new HtmlParser(html, configuration);
    fragmentParser.SwitchToFragment(context);
    fragmentParser.Parse();

    // Move every parsed child from the temporary root into this fragment.
    var root = fragmentParser.Result.DocumentElement;
    while (root.HasChilds)
    {
        var node = root.FirstChild;
        root.RemoveChild(node);
        DefaultAppendChild(node);
    }
}
/// <summary>
/// Creates the provider from its collaborators.
/// </summary>
/// <param name="parser">HTML parser to store.</param>
/// <param name="htmlProvider">HTML provider to store.</param>
/// <param name="configService">Parser configuration provider to store.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public ConcertProvider(
    AngleSharpParser.HtmlParser parser,
    IHtmlProvider htmlProvider,
    IParserConfigProvider configService)
{
    // Validated in the same order as before: parser, htmlProvider, configService.
    this.parser = parser ?? throw new ArgumentNullException(nameof(parser));
    this.htmlProvider = htmlProvider ?? throw new ArgumentNullException(nameof(htmlProvider));
    this.configService = configService ?? throw new ArgumentNullException(nameof(configService));
}
/// <summary>
/// Posts the configured credentials to the login URL and configures the indexer
/// when the response contains a logout link.
/// </summary>
/// <returns>A task that completes when the login attempt has been evaluated.</returns>
/// <exception cref="ExceptionWithConfigData">
/// Thrown from the failure callback with the warning text from the login page, or a
/// generic message when the page has no warning element.
/// </exception>
private async Task DoLogin()
{
    var pairs = new Dictionary<string, string>
    {
        { "username", configData.Username.Value },
        { "password", configData.Password.Value },
        { "keeplogged", "on" },
        { "login", "Login" }
    };

    CookieHeader = string.Empty;
    var response = await RequestLoginAndFollowRedirect(LoginUrl, pairs, CookieHeader, true, null, LoginUrl);

    await ConfigureIfOK(response.Cookies, response.Content != null && response.Content.Contains("logout.php"), () =>
    {
        // The success check above treats null content as a failure, so the
        // content may be null here.
        var parser = new HtmlParser();
        var document = parser.Parse(response.Content ?? string.Empty);

        // The warning span may be absent; fall back to a generic message rather than
        // letting a NullReferenceException hide the login failure.
        var messageEl = document.QuerySelector("form > span[class='warning']");
        var errorMessage = messageEl?.TextContent.Trim() ?? "Login failed";
        throw new ExceptionWithConfigData(errorMessage, configData);
    });
}
/// <summary>
/// Parses a XAML string and renders its object tree into an HTML document.
/// </summary>
/// <param name="xaml">XAML markup to convert.</param>
/// <param name="assemblies">Assemblies registered with the XAML type finder.</param>
/// <returns>The outer HTML of the generated document's root element.</returns>
public static string ParseXamlToHtml(string xaml, Assembly[] assemblies)
{
    var xamlSettings = new XamlParserSettings();
    foreach (var registered in assemblies)
    {
        xamlSettings.TypeFinder.RegisterAssembly(registered);
    }

    var htmlConfig = AngleSharp.Configuration.Default.WithCss(css => css.Options = new CssParserOptions()
    {
        IsIncludingUnknownDeclarations = true
    });

    using (var reader = XmlReader.Create(new StringReader(xaml)))
    {
        var xamlDocument = XamlParser.Parse(reader, xamlSettings);

        // Start from an empty HTML document and fill it from the XAML root.
        var target = new HtmlParser(htmlConfig).Parse("");
        ParseObject(xamlDocument.RootElement, target, target.DocumentElement);

        return target.DocumentElement.OuterHtml;
    }
}
/// <summary>
/// Creates the provider from its collaborators.
/// </summary>
/// <param name="parser">HTML parser to store.</param>
/// <param name="htmlProvider">HTML provider to store.</param>
/// <param name="configProvider">Parser configuration provider to store.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public SpectacleProvider(
    AngleSharpParser.HtmlParser parser,
    IHtmlProvider htmlProvider,
    IParserConfigProvider configProvider)
{
    // Validated in the same order as before: htmlProvider, parser, configProvider.
    this.htmlProvider = htmlProvider ?? throw new ArgumentNullException(nameof(htmlProvider));
    this.parser = parser ?? throw new ArgumentNullException(nameof(parser));
    this.configProvider = configProvider ?? throw new ArgumentNullException(nameof(configProvider));
}