/// <summary>
/// Verifies the <c>InnerText</c> reported for the body element, for the div
/// whose id is "two", and for a <c>text()</c> node selected from the body.
/// </summary>
public void TestInnerText()
{
    XmlLightDocument parsed = new HtmlLightDocument(document);
    XmlLightElement bodyNode = parsed.SelectSingleNode("/html/body");

    // Text of the whole body, compared after passing through Normalize.
    Assert.AreEqual("this is > cdata! Hi, this is content.", Normalize(bodyNode.InnerText));

    // Text of a single nested element, compared as-is.
    XmlLightElement divTwo = bodyNode.SelectSingleNode(".//div[@id='two']");
    Assert.AreEqual("Hi", divTwo.InnerText);

    // Text of the node matched by text() relative to the body.
    XmlLightElement bodyTextNode = bodyNode.SelectSingleNode("text()");
    Assert.AreEqual("this is > cdata!", Normalize(bodyTextNode.InnerText));
}
/// <summary>
/// Builds an html/body tree by hand on an empty <c>HtmlLightDocument</c> and
/// checks the normalized <c>InnerXml</c> of the document.
/// </summary>
public void TestDocToXml()
{
    HtmlLightDocument emptyDoc = new HtmlLightDocument();

    // Each element is constructed with its parent as the first argument.
    XmlLightElement htmlNode = new XmlLightElement(emptyDoc, "html");
    XmlLightElement bodyNode = new XmlLightElement(htmlNode, "body");
    bodyNode.IsEmpty = false;
    bodyNode.Attributes.Add("id", "bdy");

    Assert.AreEqual("<html> <body id=\"bdy\"> </body> </html>", Normalize(emptyDoc.InnerXml));
}
/// <summary>
/// Checks that the sibling following the head element is a comment node and
/// that its <c>InnerXml</c> is the full comment markup.
/// </summary>
public void TestComments()
{
    XmlLightDocument parsed = new HtmlLightDocument(document);
    XmlLightElement headNode = parsed.SelectSingleNode("/html/head");

    XmlLightElement afterHead = headNode.NextSibling;

    Assert.IsTrue(afterHead.IsComment);
    Assert.AreEqual("<!-- comments included -->", afterHead.InnerXml);
}
/// <summary>
/// Assembles one HTML help page for all visible commands, saves it as
/// "HttpClone.Help.html" in the temp directory and passes that path to
/// <c>System.Diagnostics.Process.Start</c>.
/// </summary>
/// <param name="_ci">Interpreter to document; it is cast to <c>CommandInterpreter</c>.</param>
public void HtmlHelp(ICommandInterpreter _ci)
{
    CommandInterpreter interpreter = (CommandInterpreter)_ci;

    // Take the "help" page and remove the second <h1> of the body plus everything after it.
    HtmlLightDocument helpPage = new HtmlLightDocument(interpreter.GetHtmlHelp("help"));
    XmlLightElement cutPoint = helpPage.SelectRequiredNode("/html/body/h1[2]");
    XmlLightElement body = cutPoint.Parent;
    int cutIndex = body.Children.IndexOf(cutPoint);
    body.Children.RemoveRange(cutIndex, body.Children.Count - cutIndex);

    StringWriter generated = new StringWriter();

    // Command index
    generated.WriteLine("<html><body>");
    generated.WriteLine("<h1>All Commands:</h1>");
    generated.WriteLine("<blockquote><ul>");

    // Visible commands grouped by category; commands without one go under "Unk".
    ILookup<string, ICommand> categories = interpreter.Commands
        .Where(c => c.Visible)
        .ToLookup(c => c.Category ?? "Unk");
    var categoriesByName = categories.OrderBy(g => g.Key);

    foreach (IGrouping<string, ICommand> category in categoriesByName)
    {
        generated.WriteLine("<li><a href=\"#{0}\">{0}</a></li>", category.Key);
        generated.WriteLine("<ul>");
        foreach (ICommand command in category)
        {
            generated.WriteLine(
                "<li><a href=\"#{0}\">{0}</a> - {1}</li>",
                command.DisplayName,
                HttpUtility.HtmlEncode(command.Description));
        }
        generated.WriteLine("</ul>");
    }
    generated.WriteLine("</ul></blockquote>");

    // Command Help
    foreach (IGrouping<string, ICommand> category in categoriesByName)
    {
        generated.WriteLine("<h2><a name=\"{0}\"></a>{0} Commands:</h2>", category.Key);
        generated.WriteLine("<blockquote>");
        foreach (ICommand command in category)
        {
            // From the command's own help page, copy the <h3> and the node two siblings after it.
            XmlLightElement heading = new HtmlLightDocument(interpreter.GetHtmlHelp(command.DisplayName))
                .SelectRequiredNode("/html/body/h3");
            generated.WriteLine("<a name=\"{0}\"></a>", command.DisplayName);
            generated.WriteLine(heading.InnerXml);
            generated.WriteLine(heading.NextSibling.NextSibling.InnerXml);
        }
        generated.WriteLine("</blockquote>");
    }

    // Parse the generated markup and append its body children to the trimmed help page.
    XmlLightElement generatedBody = new HtmlLightDocument(generated.ToString())
        .SelectRequiredNode("/html/body");
    body.Children.AddRange(generatedBody.Children);

    string html = body.Parent.InnerXml;
    string path = Path.Combine(Path.GetTempPath(), "HttpClone.Help.html");
    File.WriteAllText(path, html);
    System.Diagnostics.Process.Start(path);
}
/// <summary>
/// Writes the parsed HTML fixture to a temp file as XML, validates the file
/// against the XHTML 1.0 Transitional specification, re-reads it as an
/// <c>XmlLightDocument</c> and compares the <c>InnerXml</c> of both documents.
/// </summary>
public void TestParseDocument()
{
    XmlLightDocument original = new HtmlLightDocument(document);

    using (TempFile scratch = new TempFile())
    {
        // The writer is closed before validation reads the file.
        using (TextWriter output = new StreamWriter(scratch.Open()))
        {
            original.WriteXml(output);
        }

        XhtmlValidation validator = new XhtmlValidation(XhtmlDTDSpecification.XhtmlTransitional_10);
        validator.Validate(scratch.TempPath);

        XmlLightDocument reloaded = new XmlLightDocument(scratch.ReadAllText());
        Assert.AreEqual(original.InnerXml, reloaded.InnerXml);
    }
}
/// <summary>
/// Produces a combined HTML help document: the interpreter's "help" page,
/// cut off at the second h1 of its body, followed by a generated command
/// index and the help of every visible command. The result is written to
/// "HttpClone.Help.html" under the temp path and handed to
/// <c>System.Diagnostics.Process.Start</c>.
/// </summary>
/// <param name="_ci">Command interpreter; cast to <c>CommandInterpreter</c> before use.</param>
public void HtmlHelp(ICommandInterpreter _ci)
{
    CommandInterpreter ci = (CommandInterpreter)_ci;

    HtmlLightDocument baseDoc = new HtmlLightDocument(ci.GetHtmlHelp("help"));
    XmlLightElement secondHeading = baseDoc.SelectRequiredNode("/html/body/h1[2]");
    XmlLightElement body = secondHeading.Parent;

    // Remove the second <h1> and every sibling that follows it.
    int firstRemoved = body.Children.IndexOf(secondHeading);
    body.Children.RemoveRange(firstRemoved, body.Children.Count - firstRemoved);

    StringWriter markup = new StringWriter();

    // Command index
    markup.WriteLine("<html><body>");
    markup.WriteLine("<h1>All Commands:</h1>");
    markup.WriteLine("<blockquote><ul>");

    ILookup<string, ICommand> categories =
        ci.Commands.Where(c => c.Visible).ToLookup(c => c.Category ?? "Unk");
    var orderedGroups = categories.OrderBy(g => g.Key);

    foreach (IGrouping<string, ICommand> grp in orderedGroups)
    {
        markup.WriteLine("<li><a href=\"#{0}\">{0}</a></li>", grp.Key);
        markup.WriteLine("<ul>");
        foreach (ICommand entry in grp)
        {
            markup.WriteLine("<li><a href=\"#{0}\">{0}</a> - {1}</li>", entry.DisplayName, HttpUtility.HtmlEncode(entry.Description));
        }
        markup.WriteLine("</ul>");
    }
    markup.WriteLine("</ul></blockquote>");

    // Command Help
    foreach (IGrouping<string, ICommand> grp in orderedGroups)
    {
        markup.WriteLine("<h2><a name=\"{0}\"></a>{0} Commands:</h2>", grp.Key);
        markup.WriteLine("<blockquote>");
        foreach (ICommand entry in grp)
        {
            HtmlLightDocument entryDoc = new HtmlLightDocument(ci.GetHtmlHelp(entry.DisplayName));
            XmlLightElement entryHeading = entryDoc.SelectRequiredNode("/html/body/h3");

            markup.WriteLine("<a name=\"{0}\"></a>", entry.DisplayName);
            markup.WriteLine(entryHeading.InnerXml);
            // The node two siblings after the <h3> is emitted right after the heading.
            markup.WriteLine(entryHeading.NextSibling.NextSibling.InnerXml);
        }
        markup.WriteLine("</blockquote>");
    }

    // Re-parse the generated markup so its body children can be appended to the base page.
    HtmlLightDocument generatedDoc = new HtmlLightDocument(markup.ToString());
    XmlLightElement generatedBody = generatedDoc.SelectRequiredNode("/html/body");
    body.Children.AddRange(generatedBody.Children);

    string html = body.Parent.InnerXml;
    string path = Path.Combine(Path.GetTempPath(), "HttpClone.Help.html");
    File.WriteAllText(path, html);
    System.Diagnostics.Process.Start(path);
}
/// <summary>
/// Fetches the target link and looks for a <c>link rel="pingback"</c> element
/// in the head of the returned HTML.
/// </summary>
/// <param name="pingbackApi">
/// Receives the result of parsing the link's href as an absolute URI, or
/// null when the lookup does not get that far.
/// </param>
/// <returns>
/// The result of <c>Uri.TryCreate</c> on the href when one is found;
/// otherwise false. Each failure is reported through <c>LogError</c>.
/// </returns>
public bool TryGetPingbackFromHtml(out Uri pingbackApi)
{
    pingbackApi = null;

    HttpRequestUtil http = new HttpRequestUtil(_targetLink);
    if (http.Get(_targetLink.PathAndQuery) != System.Net.HttpStatusCode.OK)
    {
        LogError(String.Format("GET {0}: {1}/{2}", _targetLink, (int)http.StatusCode, http.StatusCode));
        return false;
    }

    if (!http.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
    {
        LogError("Invalid content-type, expected text/html, found: " + http.ContentType);
        return false;
    }

    try
    {
        // The response body is decoded as UTF-8 before parsing.
        HtmlLightDocument page = new HtmlLightDocument(Encoding.UTF8.GetString(http.Content));
        XmlLightElement pingbackLink = page.SelectSingleNode("/html/head/link[@rel='pingback']");
        if (pingbackLink == null)
        {
            LogError("Unable to locate <link rel=\"pingback\" ... in header.");
            return false;
        }

        string href;
        if (!pingbackLink.Attributes.TryGetValue("href", out href))
        {
            LogError("Link for rel=pingback is missing the href attribute.");
            return false;
        }

        LogInfo("Found rel=pingback: " + href);
        return Uri.TryCreate(href, UriKind.Absolute, out pingbackApi);
    }
    catch (Exception ex)
    {
        // Any failure while parsing is logged and treated as "not found".
        LogError(ex.Message);
    }

    pingbackApi = null;
    return false;
}
/// <summary>
/// Turns the supplied HTML into the search template: adds a stylesheet link
/// for "search.css" to the head, empties the text inputs of the configured
/// form, and calls <c>InsertTag</c> for the configured terms and result XPaths.
/// </summary>
/// <param name="html">HTML text to parse and modify.</param>
/// <returns>The modified document as written by <c>WriteUnformatted</c>.</returns>
private string CreateTemplate(string html)
{
    HtmlLightDocument template = new HtmlLightDocument(html);

    //Add css link:
    XmlLightElement head = template.SelectRequiredNode("/html/head");
    XmlLightElement stylesheet = new XmlLightElement(head, "link");
    stylesheet.Attributes["type"] = "text/css";
    stylesheet.Attributes["rel"] = "stylesheet";
    stylesheet.Attributes["href"] = new Uri(_baseUri, "search.css").AbsoluteUri;

    var search = _config.Searching;

    // All remaining XPaths are evaluated relative to this node.
    XmlLightElement scope = template.Root;
    if (search.XPathBase != null)
    {
        scope = scope.SelectRequiredNode(search.XPathBase.XPath);
    }

    if (search.FormXPath != null)
    {
        // Blank the value of every text input inside the search form.
        XmlLightElement searchForm = scope.SelectRequiredNode(search.FormXPath.XPath);
        foreach (XmlLightElement input in searchForm.Select(".//input[@type='text']"))
        {
            input.Attributes["value"] = String.Empty;
        }
    }

    if (search.TermsXPath != null)
    {
        InsertTag(scope, search.TermsXPath.XPath, search.TermsXPath.ReplaceOption, "search-terms");
    }

    if (search.ResultXPath != null)
    {
        InsertTag(scope, search.ResultXPath.XPath, search.ResultXPath.ReplaceOption, "search-result");
    }

    using (StringWriter output = new StringWriter())
    {
        template.WriteUnformatted(output);
        return output.ToString();
    }
}
/// <summary>
/// Evaluates the same XPath against a <c>System.Xml.XmlDocument</c> loaded
/// from the light document's XML and against the light document's own
/// navigator, then checks that element selection by tag name distinguishes
/// "Head" from "head".
/// </summary>
public void TestXPath()
{
    string testpath = "/html/body[@id='one' and @class='cls']/../body/div[@id='two' and text() = 'Hi']/@id";

    XmlDocument reference = new XmlDocument();
    XmlLightDocument light = new HtmlLightDocument(document);

    // The framework's own DOM must be able to resolve the path.
    reference.LoadXml(light.CreateNavigator().InnerXml);
    Assert.IsNotNull(reference.SelectSingleNode(testpath));

    // The light navigator must resolve it to the id attribute.
    XPathNavigator attribute = light.CreateNavigator().SelectSingleNode(testpath);
    Assert.IsNotNull(attribute);
    Assert.IsTrue(attribute.NodeType == XPathNodeType.Attribute);
    Assert.AreEqual("id", attribute.Name);
    Assert.AreEqual("two", attribute.Value);

    XmlLightElement wrongCase = light.SelectSingleNode("/html/Head");
    Assert.IsNull(wrongCase);

    XmlLightElement rightCase = light.SelectSingleNode("/html/head");
    Assert.IsNotNull(rightCase);
}
/// <summary>
/// Exercises sibling movement, namespace movement, name-table and base-URI
/// access, and <c>MoveToId</c> on the navigator created by an
/// <c>HtmlLightDocument</c>.
/// </summary>
public void TestXmlNavigator()
{
    XPathNavigator cursor = new HtmlLightDocument(document)
        .CreateNavigator()
        .SelectSingleNode("/html/body//p[@class='1']");
    XPathNavigator origin = cursor.Clone();

    // Sibling movement: no previous sibling at the start; forward then back returns to the origin.
    Assert.IsFalse(cursor.MoveToPrevious());
    Assert.IsTrue(cursor.MoveToNext());
    Assert.IsTrue(cursor.MoveToPrevious());
    Assert.IsTrue(cursor.IsSamePosition(origin));

    // Namespace movement is expected to report false.
    Assert.IsFalse(cursor.MoveToFirstNamespace());
    Assert.IsFalse(cursor.MoveToNextNamespace());

    // The clone shares the name table instance and the base URI.
    Assert.IsTrue(Object.ReferenceEquals(cursor.NameTable, origin.NameTable));
    Assert.IsNotNull(cursor.BaseURI);
    Assert.AreEqual(cursor.BaseURI, origin.BaseURI);

    // A failed MoveToId leaves the navigator on the body element.
    Assert.IsTrue(cursor.MoveToId("one"));
    Assert.AreEqual("body", cursor.Name);
    Assert.IsFalse(cursor.MoveToId("none-exists"));
    Assert.AreEqual("body", cursor.Name);
}
/// <summary>
/// Parses the <c>html</c> literal with <c>HtmlLightDocument</c>, writes it through an
/// <c>XmlWriter</c> configured with no indentation, no new-line handling and no XML
/// declaration, and asserts that the produced text equals <c>expect</c>.
/// The expected text is built with <c>(Char)160</c> spliced into the attribute value.
/// </summary>
// NOTE(review): the two literals below look as if entity references were already decoded
// and some characters were replaced by U+FFFD before this text reached the file (for
// example "g;" in the input versus "&#x00fg;" in the expectation). Presumably an encoding
// or extraction step altered them - confirm against version control before relying on this test.
public void TestHtmlEntityRef() { string html = @"<html> <body attrib=""this & that ><  !""> this char '<' and this one '>' and this one '&' should be encoded. We encoded ' ' and à and ' ' and ' ' all by ourselves. This in not valid xml �, nor is �, but we still allow it. This entity name will pass-through &unknown; this will not &whateverthatmeans; and nor will these &; &#; &h; &l t; &1two; &234; g; &#-123;. </body> </html>"; string expect = @"<html><body attrib=""this & that ><" + (Char)160 + @" !""> this char '<' and this one '>' and this one '&' should be encoded. We encoded ' ' and à and ' ' and ' ' all by ourselves. This in not valid xml �, nor is �, but we still allow it. This entity name will pass-through &unknown; this will not &whateverthatmeans; and nor will these &; &#; &h; &l t; &1two; &234; &#x00fg; &#-123;. </body></html>"; XmlLightDocument doc = new HtmlLightDocument(html); XmlWriterSettings settings = new XmlWriterSettings() { CheckCharacters = true, Indent = false, IndentChars = "", NewLineChars = "", NewLineHandling = NewLineHandling.None, OmitXmlDeclaration = true, CloseOutput = false }; StringWriter sw = new StringWriter(); XmlWriter wtr = XmlWriter.Create(sw, settings); doc.WriteXml(wtr); wtr.Flush(); string xml = sw.ToString(); Assert.AreEqual(expect, xml); }
/// <summary>
/// Checks sibling links at the document boundaries, the tag name and siblings
/// of an element selected by class, and how a tag name is split into
/// <c>Namespace</c> and <c>LocalName</c>.
/// </summary>
public void TestXmlElement()
{
    XmlLightDocument parsed = new HtmlLightDocument(document);

    // Neither the document nor its first/last child has a sibling beyond the boundary.
    Assert.IsNull(parsed.PrevSibling);
    Assert.IsNull(parsed.Children[0].PrevSibling);
    Assert.IsNull(parsed.NextSibling);
    Assert.IsNull(parsed.Children[parsed.Children.Count - 1].NextSibling);

    XmlLightElement paragraph = parsed.SelectSingleNode("/html/body//*[@class='2']");
    Assert.IsNotNull(paragraph);
    Assert.AreEqual("p", paragraph.TagName);
    Assert.IsNotNull(paragraph.PrevSibling);
    Assert.AreEqual("p", paragraph.PrevSibling.TagName);
    Assert.AreEqual("", paragraph.Namespace);
    Assert.AreEqual("p", paragraph.LocalName);

    // A prefixed tag name is split at the colon.
    XmlLightElement prefixed = new XmlLightElement(null, "a:b");
    Assert.AreEqual("a", prefixed.Namespace);
    Assert.AreEqual("b", prefixed.LocalName);
}
/// <summary>
/// Feeds the same malformed markup to <c>XmlLightParser</c> twice, once into
/// an <c>HtmlLightDocument</c> and once into an <c>XmlLightDocument</c> using
/// <c>AttributeFormat.Xml</c>, and checks the resulting root element and
/// attributes in each case.
/// </summary>
public void TestParsers()
{
    string notxml = "<html id=a ><body foo='bar' bar=\"foo\" />";

    // HTML parsing: the unquoted id attribute is read and body becomes the only child of html.
    HtmlLightDocument html = new HtmlLightDocument();
    XmlLightParser.Parse(notxml, html);

    Assert.AreEqual("html", html.Root.TagName);
    Assert.AreEqual(1, html.Root.Attributes.Count);
    Assert.AreEqual("a", html.Root.Attributes["id"]);
    Assert.AreEqual(1, html.Root.Children.Count);

    var htmlBody = html.Root.Children[0];
    Assert.AreEqual("body", htmlBody.TagName);
    Assert.AreEqual("foo", htmlBody.Attributes["bar"]);
    Assert.AreEqual("bar", htmlBody.Attributes["foo"]);

    // XML attribute format: the root ends up being body, carrying the two quoted attributes.
    XmlLightDocument xml = new XmlLightDocument();
    XmlLightParser.Parse(notxml, XmlLightParser.AttributeFormat.Xml, xml);

    Assert.AreEqual(2, xml.Root.Attributes.Count);
    //Not recognized: xml.Root.Attributes["id"]
    Assert.AreEqual("body", xml.Root.TagName);
    Assert.AreEqual("foo", xml.Root.Attributes["bar"]);
    Assert.AreEqual("bar", xml.Root.Attributes["foo"]);
}
/// <summary>
/// Requests the target link over HTTP and, when the response is an HTML page,
/// searches its head for a <c>link</c> element with <c>rel='pingback'</c>.
/// </summary>
/// <param name="pingbackApi">
/// Set by <c>Uri.TryCreate</c> from the link's href when the link is found;
/// null on every other path.
/// </param>
/// <returns>
/// Whatever <c>Uri.TryCreate</c> returns for the href, or false when the
/// request, content type, link lookup or href lookup fails. Failures are
/// written with <c>LogError</c>.
/// </returns>
public bool TryGetPingbackFromHtml(out Uri pingbackApi)
{
    HttpRequestUtil request = new HttpRequestUtil(_targetLink);

    bool fetched = request.Get(_targetLink.PathAndQuery) == System.Net.HttpStatusCode.OK;
    if (!fetched)
    {
        LogError(String.Format("GET {0}: {1}/{2}", _targetLink, (int)request.StatusCode, request.StatusCode));
        pingbackApi = null;
        return false;
    }

    bool isHtml = request.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
    if (!isHtml)
    {
        LogError("Invalid content-type, expected text/html, found: " + request.ContentType);
        pingbackApi = null;
        return false;
    }

    try
    {
        string markup = Encoding.UTF8.GetString(request.Content);
        HtmlLightDocument htmlDoc = new HtmlLightDocument(markup);
        XmlLightElement link = htmlDoc.SelectSingleNode("/html/head/link[@rel='pingback']");

        string pingback;
        if (link == null)
        {
            LogError("Unable to locate <link rel=\"pingback\" ... in header.");
        }
        else if (!link.Attributes.TryGetValue("href", out pingback))
        {
            LogError("Link for rel=pingback is missing the href attribute.");
        }
        else
        {
            LogInfo("Found rel=pingback: " + pingback);
            return Uri.TryCreate(pingback, UriKind.Absolute, out pingbackApi);
        }
    }
    catch (Exception error)
    {
        // Exceptions raised while parsing are logged; the method then reports failure.
        LogError(error.Message);
    }

    pingbackApi = null;
    return false;
}
/// <summary>
/// Builds the search template from an HTML page. A "search.css" stylesheet
/// link is added to the head, text inputs of the configured form get an empty
/// value, and <c>InsertTag</c> is invoked with "search-terms" and
/// "search-result" for the configured XPaths.
/// </summary>
/// <param name="html">Source HTML of the page to convert.</param>
/// <returns>The document text produced by <c>WriteUnformatted</c>.</returns>
private string CreateTemplate(string html)
{
    HtmlLightDocument doc = new HtmlLightDocument(html);

    //Add css link:
    XmlLightElement cssLink = new XmlLightElement(doc.SelectRequiredNode("/html/head"), "link");
    cssLink.Attributes["type"] = "text/css";
    cssLink.Attributes["rel"] = "stylesheet";
    Uri cssUri = new Uri(_baseUri, "search.css");
    cssLink.Attributes["href"] = cssUri.AbsoluteUri;

    // Root of the document unless a base XPath narrows the working node.
    XmlLightElement workingNode = doc.Root;
    if (_config.Searching.XPathBase != null)
    {
        workingNode = workingNode.SelectRequiredNode(_config.Searching.XPathBase.XPath);
    }

    if (_config.Searching.FormXPath != null)
    {
        XmlLightElement form = workingNode.SelectRequiredNode(_config.Searching.FormXPath.XPath);
        foreach (XmlLightElement textInput in form.Select(".//input[@type='text']"))
        {
            textInput.Attributes["value"] = String.Empty;
        }
    }

    if (_config.Searching.TermsXPath != null)
    {
        InsertTag(
            workingNode,
            _config.Searching.TermsXPath.XPath,
            _config.Searching.TermsXPath.ReplaceOption,
            "search-terms");
    }

    if (_config.Searching.ResultXPath != null)
    {
        InsertTag(
            workingNode,
            _config.Searching.ResultXPath.XPath,
            _config.Searching.ResultXPath.ReplaceOption,
            "search-result");
    }

    using (StringWriter buffer = new StringWriter())
    {
        doc.WriteUnformatted(buffer);
        return buffer.ToString();
    }
}
/// <summary>
/// Walks every content record and passes each indexable HTML page to <c>AddToIndex</c>
/// together with its date, title, blurb and the text to index.
/// </summary>
/// <remarks>
/// A record is skipped when its key equals <c>SearchTemplate.SearchPath</c>,
/// <c>SearchTemplate.TemplatePath</c> or <c>_config.TemplateUri</c>; when it has no content-store
/// id; when its MIME type is not marked as indexed HTML; when a record with the same content hash
/// was already seen; when any configured condition XPath matches; or when no title is found.
/// The date is formatted with the invariant culture so the "yyyy-MM-dd HH:mm:ss" text does not
/// vary with the calendar or separators of the current thread culture.
/// </remarks>
/// <exception cref="InvalidOperationException">The search configuration is null.</exception>
/// <exception cref="FormatException">The text found by the date XPath cannot be parsed.</exception>
public void BuildIndex()
{
    if (_config == null)
    {
        throw new InvalidOperationException("The <search> element is missing from the configuration.");
    }

    // Content hashes seen so far; records sharing a hash are indexed only once.
    Dictionary<string, string> hashes = new Dictionary<string, string>(StringComparer.Ordinal);
    foreach (KeyValuePair<string, ContentRecord> item in _content)
    {
        if (item.Key == SearchTemplate.SearchPath || item.Key == SearchTemplate.TemplatePath || item.Key == _config.TemplateUri)
        {
            continue;
        }
        if (item.Value.HasContentStoreId == false)
        {
            continue;
        }
        if (!_mimeInfo[item.Value.MimeType].Indexed || _mimeInfo[item.Value.MimeType].Type != ContentFormat.Html)
        {
            continue;
        }
        if (item.Value.HasHashContents)
        {
            if (hashes.ContainsKey(item.Value.HashContents))
            {
                continue;
            }
            hashes.Add(item.Value.HashContents, item.Key);
        }

        string title = null, blurb = null;
        string content = Encoding.UTF8.GetString(_content.ReadContent(item.Value, true));
        HtmlLightDocument xdoc = new HtmlLightDocument(content);
        XmlLightElement found, selectFrom = _config.XPathBase == null
            ? xdoc.Root
            : xdoc.SelectRequiredNode(_config.XPathBase.XPath);

        // Any matching condition XPath excludes the page from the index.
        bool ignore = false;
        foreach (var xpath in _config.Conditions.SafeEnumeration())
        {
            if (null != selectFrom.SelectSingleNode(xpath.XPath))
            {
                ignore = true;
                break;
            }
        }
        if (ignore)
        {
            continue;
        }

        // Title: from the configured XPath when there is one, otherwise from the MIME helper.
        if (_config.TitlePath != null && selectFrom.TrySelectNode(_config.TitlePath.XPath, out found))
        {
            title = found.InnerText.Trim();
        }
        else if (_config.TitlePath == null && false == _mimeInfo.TryGetTitle(item.Value.MimeType, content, out title))
        {
            title = null;
        }
        if (String.IsNullOrEmpty(title))
        {
            continue;
        }

        if (_config.BlubXPath != null)
        {
            // Concatenate the text nodes of every match; fall back to InnerText when that yields nothing.
            StringBuilder tmp = new StringBuilder();
            foreach (XmlLightElement e in selectFrom.Select(_config.BlubXPath.XPath))
            {
                if (e.IsText)
                {
                    tmp.Append(e.Value);
                }
                else
                {
                    foreach (XmlLightElement txt in e.Select(".//text()"))
                    {
                        tmp.Append(txt.Value);
                    }
                }
            }
            if (tmp.Length == 0)
            {
                tmp.Append(selectFrom.SelectRequiredNode(_config.BlubXPath.XPath).InnerText);
            }
            blurb = tmp.ToString();
        }

        // Date: the record's creation date unless the page supplies one through the date XPath.
        DateTime dtvalue = item.Value.DateCreated;
        if (_config.DateXPath != null && selectFrom.TrySelectNode(_config.DateXPath.XPath, out found))
        {
            DateTime contentDate;
            string dtText = found.InnerText.Trim();
            // Drops the last two characters of every DateTimeClean match
            // (presumably ordinal suffixes such as "st"/"th" - confirm against the pattern).
            dtText = DateTimeClean.Replace(dtText, m => m.Value.Substring(0, m.Length - 2));
            if (!String.IsNullOrEmpty(_config.DateXPath.DateFormat))
            {
                if (DateTime.TryParseExact(dtText, _config.DateXPath.DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.AllowWhiteSpaces, out contentDate))
                {
                    dtvalue = contentDate;
                }
                else
                {
                    throw new FormatException("Unable to parse date/time: " + dtText);
                }
            }
            else if (DateTime.TryParse(dtText, CultureInfo.InvariantCulture, DateTimeStyles.AllowWhiteSpaces, out contentDate))
            {
                dtvalue = contentDate;
            }
            else
            {
                throw new FormatException("Unable to parse date/time: " + dtText);
            }
        }

        // Invariant culture keeps the Gregorian calendar and ':' separators regardless of thread culture.
        string date = dtvalue.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture);

        using (StringWriter indexed = new StringWriter())
        {
            indexed.WriteLine(title);
            foreach (var xpath in _config.Indexed.SafeEnumeration())
            {
                foreach (var indexItem in selectFrom.Select(xpath.XPath))
                {
                    string innerText = indexItem.InnerText;
                    indexed.WriteLine(innerText);
                    indexed.WriteLine(NonAlphaNum.Replace(innerText, " "));//again, removing all special characters.
                }
            }

            if (String.IsNullOrEmpty(blurb))
            {
                // No blurb configured or found: use the indexed text that follows the title.
                blurb = indexed.ToString().Substring(title.Length).Trim();
            }

            title = WhiteSpaces.Replace(TrimString(title, _config.TitlePath != null ? (uint)_config.TitlePath.MaxLength : BlurbLength), " ");
            blurb = WhiteSpaces.Replace(TrimString(blurb, _config.BlubXPath != null ? (uint)_config.BlubXPath.MaxLength : BlurbLength), " ");

            string text = indexed.ToString();
            using (TextReader rdr = new StringReader(text))
            {
                AddToIndex(item.Key, date, title, blurb, rdr);
            }
        }
    }
}
/// <summary>
/// Iterates over all content records and calls <c>AddToIndex</c> for each HTML page that
/// qualifies, supplying the page's date, title, blurb and searchable text.
/// </summary>
/// <remarks>
/// Records are skipped when their key is <c>SearchTemplate.SearchPath</c>,
/// <c>SearchTemplate.TemplatePath</c> or <c>_config.TemplateUri</c>, when they have no
/// content-store id, when their MIME type is not marked as indexed HTML, when their content hash
/// has already been seen, when a configured condition XPath matches, or when no title is found.
/// The date string is produced with the invariant culture, so the "yyyy-MM-dd HH:mm:ss"
/// output is independent of the current thread culture's calendar and separators.
/// </remarks>
/// <exception cref="InvalidOperationException">The search configuration is null.</exception>
/// <exception cref="FormatException">The text found by the date XPath cannot be parsed.</exception>
public void BuildIndex()
{
    if (_config == null)
    {
        throw new InvalidOperationException("The <search> element is missing from the configuration.");
    }

    // Maps each content hash already processed to the key of the record that introduced it.
    Dictionary<string, string> hashes = new Dictionary<string, string>(StringComparer.Ordinal);
    foreach (KeyValuePair<string, ContentRecord> item in _content)
    {
        if (item.Key == SearchTemplate.SearchPath || item.Key == SearchTemplate.TemplatePath || item.Key == _config.TemplateUri)
        {
            continue;
        }
        if (item.Value.HasContentStoreId == false)
        {
            continue;
        }
        if (!_mimeInfo[item.Value.MimeType].Indexed || _mimeInfo[item.Value.MimeType].Type != ContentFormat.Html)
        {
            continue;
        }
        if (item.Value.HasHashContents)
        {
            if (hashes.ContainsKey(item.Value.HashContents))
            {
                continue;
            }
            hashes.Add(item.Value.HashContents, item.Key);
        }

        string title = null, blurb = null;
        string content = Encoding.UTF8.GetString(_content.ReadContent(item.Value, true));
        HtmlLightDocument xdoc = new HtmlLightDocument(content);
        XmlLightElement found, selectFrom = _config.XPathBase == null
            ? xdoc.Root
            : xdoc.SelectRequiredNode(_config.XPathBase.XPath);

        // A page is excluded as soon as one condition XPath selects a node.
        bool ignore = false;
        foreach (var xpath in _config.Conditions.SafeEnumeration())
        {
            if (null != selectFrom.SelectSingleNode(xpath.XPath))
            {
                ignore = true;
                break;
            }
        }
        if (ignore)
        {
            continue;
        }

        // Use the configured title XPath when present; otherwise ask the MIME helper for a title.
        if (_config.TitlePath != null && selectFrom.TrySelectNode(_config.TitlePath.XPath, out found))
        {
            title = found.InnerText.Trim();
        }
        else if (_config.TitlePath == null && false == _mimeInfo.TryGetTitle(item.Value.MimeType, content, out title))
        {
            title = null;
        }
        if (String.IsNullOrEmpty(title))
        {
            continue;
        }

        if (_config.BlubXPath != null)
        {
            // Gather text nodes from each match; if none produce text, take the first match's InnerText.
            StringBuilder tmp = new StringBuilder();
            foreach (XmlLightElement e in selectFrom.Select(_config.BlubXPath.XPath))
            {
                if (e.IsText)
                {
                    tmp.Append(e.Value);
                }
                else
                {
                    foreach (XmlLightElement txt in e.Select(".//text()"))
                    {
                        tmp.Append(txt.Value);
                    }
                }
            }
            if (tmp.Length == 0)
            {
                tmp.Append(selectFrom.SelectRequiredNode(_config.BlubXPath.XPath).InnerText);
            }
            blurb = tmp.ToString();
        }

        // Start from the record's creation date and override it when the page carries its own date.
        DateTime dtvalue = item.Value.DateCreated;
        if (_config.DateXPath != null && selectFrom.TrySelectNode(_config.DateXPath.XPath, out found))
        {
            DateTime contentDate;
            string dtText = found.InnerText.Trim();
            // Each DateTimeClean match loses its final two characters
            // (NOTE(review): looks like ordinal-suffix stripping - verify against the regex).
            dtText = DateTimeClean.Replace(dtText, m => m.Value.Substring(0, m.Length - 2));
            if (!String.IsNullOrEmpty(_config.DateXPath.DateFormat))
            {
                if (DateTime.TryParseExact(dtText, _config.DateXPath.DateFormat, CultureInfo.InvariantCulture, DateTimeStyles.AllowWhiteSpaces, out contentDate))
                {
                    dtvalue = contentDate;
                }
                else
                {
                    throw new FormatException("Unable to parse date/time: " + dtText);
                }
            }
            else if (DateTime.TryParse(dtText, CultureInfo.InvariantCulture, DateTimeStyles.AllowWhiteSpaces, out contentDate))
            {
                dtvalue = contentDate;
            }
            else
            {
                throw new FormatException("Unable to parse date/time: " + dtText);
            }
        }

        // Formatting with the invariant culture matches the invariant parsing above.
        string date = dtvalue.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture);

        using (StringWriter indexed = new StringWriter())
        {
            indexed.WriteLine(title);
            foreach (var xpath in _config.Indexed.SafeEnumeration())
            {
                foreach (var indexItem in selectFrom.Select(xpath.XPath))
                {
                    string innerText = indexItem.InnerText;
                    indexed.WriteLine(innerText);
                    indexed.WriteLine(NonAlphaNum.Replace(innerText, " "));//again, removing all special characters.
                }
            }

            if (String.IsNullOrEmpty(blurb))
            {
                // Without a blurb, fall back to everything written after the title.
                blurb = indexed.ToString().Substring(title.Length).Trim();
            }

            title = WhiteSpaces.Replace(TrimString(title, _config.TitlePath != null ? (uint)_config.TitlePath.MaxLength : BlurbLength), " ");
            blurb = WhiteSpaces.Replace(TrimString(blurb, _config.BlubXPath != null ? (uint)_config.BlubXPath.MaxLength : BlurbLength), " ");

            string text = indexed.ToString();
            using (TextReader rdr = new StringReader(text))
            {
                AddToIndex(item.Key, date, title, blurb, rdr);
            }
        }
    }
}