/// <summary>Invokes the CSS parser on the contents of a <c>style</c> element.</summary>
/// <param name="node">The <see cref="HtmlNode"/> to scan.</param>
/// <param name="parentNode">The parent of the node.</param>
/// <returns><see langword="true"/> if the style sheet was processed; <see langword="false"/> if scanning
/// raised a <see cref="ScanException"/> or <see cref="ParseException"/>, in which case the node has been
/// removed from <paramref name="parentNode"/>.</returns>
private bool ProcessStyleTag(HtmlNode node, HtmlNode parentNode)
{
    if (node.FirstChild == null)
    {
        // A style tag with no content has no child node to scan; dereferencing FirstChild would throw a
        // NullReferenceException and abort the whole scan. There is nothing to validate, so only apply
        // the same empty-tag protection used below.
        node.InnerHtml = EMPTY_CSS_COMMENT;
        return true;
    }

    var styleScanner = new CssScanner(Policy);

    try
    {
        CleanResults cleanStyleSheet = styleScanner.ScanStyleSheet(node.FirstChild.InnerHtml);
        errorMessages.AddRange(cleanStyleSheet.GetErrorMessages());

        /*
         * If IE gets an empty style tag, i.e. <style/> it will break all CSS on the page. I wish I
         * was kidding. So, if after validation no CSS properties are left, we would normally be left
         * with an empty style tag and break all CSS. To prevent that, we have this check.
         */
        string cleanHtml = cleanStyleSheet.GetCleanHtml();
        node.FirstChild.InnerHtml = string.IsNullOrEmpty(cleanHtml) ? EMPTY_CSS_COMMENT : cleanHtml;
    }
    catch (Exception exc) when (exc is ScanException || exc is ParseException)
    {
        // Malformed CSS: report it and drop the whole style tag. Any other exception type propagates.
        AddError(Constants.ERROR_CSS_TAG_MALFORMED, HtmlEntityEncoder.HtmlEntityEncode(node.FirstChild.InnerHtml));
        parentNode.RemoveChild(node);
        return false;
    }

    return true;
}
/// <summary>
/// Scans an unknown tag under every culture in <c>Constants.SUPPORTED_LANGUAGES</c>, plus "en-US" and
/// "es-UY", and expects an error message to be produced for each of them.
/// </summary>
public void TestMessageInSupportedCulture()
{
    var culturesToCheck = Constants.SUPPORTED_LANGUAGES.Union(new List<string> { "en-US", "es-UY" });

    foreach (string cultureName in culturesToCheck)
    {
        string firstErrorMessage = null;

        try
        {
            policy.Should().NotBeNull();
            antisamy.SetCulture(cultureName);

            CleanResults scanResults = antisamy.Scan("<unknowntag>", policy);
            scanResults.GetNumberOfErrors().Should().Be(1);

            firstErrorMessage = scanResults.GetErrorMessages().First();
        }
        catch
        {
            // Intentionally swallowed: any failure leaves the message null,
            // and the assertion below then reports it for this culture.
        }

        firstErrorMessage.Should().NotBeNull(because: $"\"{cultureName}\" should be a valid culture and have an associated message.");
    }
}
/// <summary> Main parsing engine </summary>
/// <param name="html">A String whose contents we want to scan.</param>
/// <param name="inputEncoding">Not read by this method.</param>
/// <param name="outputEncoding">Not read by this method.</param>
/// <returns> A <code>CleanResults</code> object with an <code>XMLDocumentFragment</code>
/// object and its String representation, as well as some scan statistics.
/// </returns>
/// <exception cref="ScanException">Thrown when <paramref name="html"/> is null or longer than the
/// maximum input size.</exception>
public virtual CleanResults scan(string html, string inputEncoding, string outputEncoding)
{
    if (html == null)
    {
        throw new ScanException("No input (null)");
    }

    //had problems with the &nbsp; entity getting double encoded, so this converts it to a literal
    //non-breaking space. Only the entity is replaced; ordinary spaces must be left untouched.
    //this may need to be changed.
    html = html.Replace("&nbsp;", "\u00a0");

    //We have to replace any invalid XML characters
    html = stripNonValidXMLCharacters(html);

    //holds the maximum input size for the incoming fragment
    int maxInputSize = Policy.DEFAULT_MAX_INPUT_SIZE;

    //grab the size specified in the config file. TryParse keeps the default for any value that is not
    //a valid int, including a null or out-of-range one, which int.Parse would have thrown on.
    string maxInputSizeDirective = policy.getDirective("maxInputSize");
    int configuredMaxInputSize;
    if (int.TryParse(maxInputSizeDirective, out configuredMaxInputSize))
    {
        maxInputSize = configuredMaxInputSize;
    }
    else if (maxInputSizeDirective != null)
    {
        Console.WriteLine("Format Exception: maxInputSize directive [" + maxInputSizeDirective + "] is not a valid integer");
    }

    //ensure our input is less than the max
    if (maxInputSize < html.Length)
    {
        throw new ScanException("File size [" + html.Length + "] is larger than maximum [" + maxInputSize + "]");
    }

    //grab start time (to be put in the result set along with end time)
    DateTime start = DateTime.Now;

    //fixes some weirdness in HTML agility
    if (!HtmlNode.ElementsFlags.Contains("iframe"))
    {
        HtmlNode.ElementsFlags.Add("iframe", HtmlElementFlag.Empty);
    }
    HtmlNode.ElementsFlags.Remove("form");

    //Let's parse the incoming HTML
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    //NOTE(review): both options are assigned after LoadHtml; confirm whether OptionAutoCloseOnEnd is
    //expected to influence parsing as well as output.
    //add closing tags
    doc.OptionAutoCloseOnEnd = true;

    //enforces XML rules, encodes big 5
    doc.OptionOutputAsXml = true;

    //loop through every node now, and enforce the rules held in the policy object
    for (int i = 0; i < doc.DocumentNode.ChildNodes.Count; i++)
    {
        //grab current node
        HtmlNode tmp = doc.DocumentNode.ChildNodes[i];

        //this node can hold other nodes, so recursively validate
        recursiveValidateTag(tmp);

        //the node was detached during validation, so the next sibling now sits at the same index
        if (tmp.ParentNode == null)
        {
            i--;
        }
    }

    //all the cleaned HTML
    string finalCleanHTML = doc.DocumentNode.InnerHtml;

    //grab end time (to be put in the result set along with start time)
    DateTime end = DateTime.Now;

    results = new CleanResults(start, end, finalCleanHTML, dom, errorMessages);
    return results;
}
/// <summary> Main parsing engine </summary>
/// <param name="html">A string whose contents we want to scan.</param>
/// <returns>A <see cref="CleanResults"/> object built from the scan start and end times, the cleaned
/// HTML and the collected error messages.</returns>
/// <exception cref="ScanException">Thrown when <paramref name="html"/> is <see langword="null"/>, when it
/// is longer than <c>Policy.MaxInputSize</c>, or when any other exception is raised while scanning (that
/// exception is passed along to the <see cref="ScanException"/> constructor).</exception>
public CleanResults Scan(string html)
{
    if (html == null)
    {
        throw new ScanException("No input (null).");
    }

    if (Results != null)
    {
        InitBlock(); // There was a scan before on the same instance
    }

    // Ensure our input is less than the max
    if (Policy.MaxInputSize < html.Length)
    {
        AddError(Constants.ERROR_SIZE_TOOLARGE, html.Length, Policy.MaxInputSize);
        throw new ScanException(errorMessages.First());
    }

    // Had problems with the &nbsp; entity getting double encoded, so this converts it to a literal
    // non-breaking space. Only the entity is replaced; ordinary spaces must be left untouched.
    // This may need to be changed.
    html = html.Replace("&nbsp;", "\u00a0");

    // We have to replace any invalid XML characters
    html = StripNonValidXmlCharacters(html);

    // Fixes some weirdness in HTML agility
    if (!HtmlNode.ElementsFlags.ContainsKey("iframe"))
    {
        HtmlNode.ElementsFlags.Add("iframe", HtmlElementFlag.Empty);
    }
    HtmlNode.ElementsFlags.Remove("form");

    var htmlDocument = new HtmlDocument
    {
        OptionAutoCloseOnEnd = true, // Add closing tags
        OptionMaxNestedChildNodes = Constants.MAX_NESTED_TAGS, // TODO: Add directive for this like in MaxInputSize?
        OptionOutputAsXml = Policy.UsesXhtml, // Enforces XML rules, encodes big 5
        OptionXmlForceOriginalComment = true // Fix provided by the library for weird added spaces in HTML comments
    };

    // Grab start time (to be put in the result set along with end time)
    var start = DateTime.Now;

    try
    {
        // Let's parse the incoming HTML
        htmlDocument.LoadHtml(html);

        // Loop through every node now, and enforce the rules held in the policy object
        ProcessChildren(htmlDocument.DocumentNode);
    }
    catch (Exception exc) when (!(exc is ScanException))
    {
        // A ScanException propagates untouched; anything else is wrapped so callers see one exception type.
        throw new ScanException("There was an error while performing the scan.", exc);
    }

    // All the cleaned HTML
    string finalCleanHTML = Policy.PreservesSpace
        ? htmlDocument.DocumentNode.InnerHtml
        : htmlDocument.DocumentNode.InnerHtml.Trim();

    // Encode special/international characters if stated by policy
    if (Policy.EntityEncodesInternationalCharacters)
    {
        finalCleanHTML = SpecialCharactersEncoder.Encode(finalCleanHTML);
    }

    // NOTE(review): the XHTML doctype is prepended when the policy does NOT use XHTML; confirm this is intended.
    if (!Policy.UsesXhtml && !Policy.OmitsDoctypeDeclaration)
    {
        finalCleanHTML = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
            + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" + finalCleanHTML;
    }

    if (Policy.UsesXhtml && !Policy.OmitsXmlDeclaration)
    {
        finalCleanHTML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + finalCleanHTML;
    }

    // Grab end time (to be put in the result set along with start time)
    var end = DateTime.Now;

    Results = new CleanResults(start, end, finalCleanHTML, errorMessages);
    return Results;
}
/// <summary>Validates every attribute of a node against the policy definition of its tag.</summary>
/// <param name="node">The <see cref="HtmlNode"/> whose attributes are validated.</param>
/// <param name="tag">The policy <see cref="Tag"/> used to look up the attribute definitions.</param>
/// <returns><see langword="true"/> if all attributes were processed; <see langword="false"/> if an invalid
/// attribute caused the tag itself to be removed, filtered or encoded.</returns>
private bool ProcessAttributes(HtmlNode node, Tag tag)
{
    string tagName = tag.Name;
    int currentAttributeIndex = 0;

    // The index only advances when the current attribute is kept. Rejected attributes are removed by
    // position, so exactly one attribute leaves the collection and the next one slides into this index.
    // Removing by name could remove several attributes (duplicates) or none (policy name differs from
    // the HTML name), which would make the loop skip unvalidated attributes or never advance.
    while (currentAttributeIndex < node.Attributes.Count)
    {
        HtmlAttribute htmlAttribute = node.Attributes[currentAttributeIndex];
        string name = htmlAttribute.Name;
        string value = htmlAttribute.Value;

        Attribute attribute = tag.GetAttributeByName(name) ?? Policy.GetGlobalAttributeByName(name);
        if (attribute == null && Policy.AllowsDynamicAttributes)
        {
            // Not a global attribute, perhaps it is a dynamic attribute, if allowed.
            attribute = Policy.GetDynamicAttributeByName(name);
        }

        if (attribute == null)
        {
            AddError(Constants.ERROR_ATTRIBUTE_NOT_IN_POLICY,
                HtmlEntityEncoder.HtmlEntityEncode(tagName),
                HtmlEntityEncoder.HtmlEntityEncode(name),
                HtmlEntityEncoder.HtmlEntityEncode(value));
            node.Attributes.RemoveAt(currentAttributeIndex);
            continue;
        }

        if (name.ToLowerInvariant() == "style")
        {
            var styleScanner = new CssScanner(Policy);
            try
            {
                CleanResults cleanInlineStyle = styleScanner.ScanInlineStyle(value, tagName);
                htmlAttribute.Value = cleanInlineStyle.GetCleanHtml();
                errorMessages.AddRange(cleanInlineStyle.GetErrorMessages());
            }
            catch (Exception exc) when (exc is ScanException || exc is ParseException)
            {
                // Malformed inline CSS: report it and drop the attribute. Other exceptions propagate.
                AddError(Constants.ERROR_CSS_ATTRIBUTE_MALFORMED,
                    HtmlEntityEncoder.HtmlEntityEncode(value),
                    HtmlEntityEncoder.HtmlEntityEncode(tagName));
                node.Attributes.RemoveAt(currentAttributeIndex);
                continue;
            }

            currentAttributeIndex++;
            continue;
        }

        value = HtmlEntity.DeEntitize(value);
        string lowerCaseValue = value.ToLowerInvariant();

        // The expression is wrapped in a non-capturing group so the anchors apply to the whole pattern;
        // without it a top-level alternation such as "a|b" would only be anchored on one side each.
        bool isAttributeValid =
            attribute.AllowedValues.Any(v => v != null && v.ToLowerInvariant() == lowerCaseValue)
            || attribute.AllowedRegExp.Any(r => r != null && Regex.IsMatch(value, "^(?:" + r + ")$"));

        if (isAttributeValid)
        {
            currentAttributeIndex++;
            continue;
        }

        string encodedTagName = HtmlEntityEncoder.HtmlEntityEncode(tagName);
        string encodedName = HtmlEntityEncoder.HtmlEntityEncode(name);
        string encodedValue = HtmlEntityEncoder.HtmlEntityEncode(value);

        switch (attribute.OnInvalid)
        {
            case "removeTag":
                RemoveNode(node);
                AddError(Constants.ERROR_ATTRIBUTE_INVALID_REMOVED, encodedTagName, encodedName, encodedValue);
                return false; // Can't process any more once the tag is removed

            case "filterTag":
                // Remove the node and move up the rest that was inside the tag after processing
                ProcessChildren(node);
                PromoteChildren(node);
                AddError(Constants.ERROR_ATTRIBUTE_CAUSE_FILTER, encodedTagName, encodedName, encodedValue);
                return false; // Can't process any more once the tag is filtered

            case "encodeTag":
                // Encode the node and move up the rest that was inside the tag after processing
                ProcessChildren(node);
                EncodeAndPromoteChildren(node);
                AddError(Constants.ERROR_ATTRIBUTE_CAUSE_ENCODE, encodedTagName, encodedName, encodedValue);
                return false; // Can't process any more once the tag is encoded

            default:
                // Just remove the attribute
                node.Attributes.RemoveAt(currentAttributeIndex);
                AddError(Constants.ERROR_ATTRIBUTE_INVALID, encodedTagName, encodedName, encodedValue);
                break;
        }
    }

    return true;
}
/// <summary> Main parsing engine </summary>
/// <param name="html">A String whose contents we want to scan.</param>
/// <param name="inputEncoding">Not read by this method.</param>
/// <param name="outputEncoding">Not read by this method.</param>
/// <returns> A <code>CleanResults</code> object with an <code>XMLDocumentFragment</code>
/// object and its String representation, as well as some scan statistics.
/// </returns>
/// <exception cref="ScanException">Thrown when <paramref name="html"/> is null or longer than the
/// maximum input size.</exception>
public virtual CleanResults scan(string html, string inputEncoding, string outputEncoding)
{
    if (html == null)
    {
        throw new ScanException("No input (null)");
    }

    //had problems with the &nbsp; entity getting double encoded, so this converts it to a literal
    //non-breaking space. Only the entity is replaced; ordinary spaces must be left untouched.
    //this may need to be changed.
    html = html.Replace("&nbsp;", "\u00a0");

    //We have to replace any invalid XML characters
    html = stripNonValidXMLCharacters(html);

    //holds the maximum input size for the incoming fragment
    int maxInputSize = Policy.DEFAULT_MAX_INPUT_SIZE;

    //grab the size specified in the config file. TryParse keeps the default for any value that is not
    //a valid int, including a null or out-of-range one, which int.Parse would have thrown on.
    string maxInputSizeDirective = policy.getDirective("maxInputSize");
    int configuredMaxInputSize;
    if (int.TryParse(maxInputSizeDirective, out configuredMaxInputSize))
    {
        maxInputSize = configuredMaxInputSize;
    }
    else if (maxInputSizeDirective != null)
    {
        Console.WriteLine("Format Exception: maxInputSize directive [" + maxInputSizeDirective + "] is not a valid integer");
    }

    //ensure our input is less than the max
    if (maxInputSize < html.Length)
    {
        throw new ScanException("File size [" + html.Length + "] is larger than maximum [" + maxInputSize + "]");
    }

    //grab start time (to be put in the result set along with end time)
    DateTime start = DateTime.Now;

    //fixes some weirdness in HTML agility
    if (!HtmlNode.ElementsFlags.Contains("iframe"))
    {
        HtmlNode.ElementsFlags.Add("iframe", HtmlElementFlag.Empty);
    }
    HtmlNode.ElementsFlags.Remove("form");

    //Let's parse the incoming HTML
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    //NOTE(review): both options are assigned after LoadHtml; confirm whether OptionAutoCloseOnEnd is
    //expected to influence parsing as well as output.
    //add closing tags
    doc.OptionAutoCloseOnEnd = true;

    //enforces XML rules, encodes big 5
    doc.OptionOutputAsXml = true;

    //loop through every node now, and enforce the rules held in the policy object
    for (int i = 0; i < doc.DocumentNode.ChildNodes.Count; i++)
    {
        //grab current node
        HtmlNode tmp = doc.DocumentNode.ChildNodes[i];

        //this node can hold other nodes, so recursively validate
        recursiveValidateTag(tmp);

        //the node was detached during validation, so the next sibling now sits at the same index
        if (tmp.ParentNode == null)
        {
            i--;
        }
    }

    //all the cleaned HTML
    string finalCleanHTML = doc.DocumentNode.InnerHtml;

    //grab end time (to be put in the result set along with start time)
    DateTime end = DateTime.Now;

    results = new CleanResults(start, end, finalCleanHTML, dom, errorMessages);
    return(results);
}
/// <summary>
/// Validates a node against the policy and recurses into its children. Depending on the action the
/// policy declares for the tag, the node is filtered (children promoted), validated (style sheet,
/// attributes and children checked), truncated (attributes and non-text/non-comment children dropped)
/// or removed. Messages describing each change are appended to <c>errorMessages</c>.
/// </summary>
/// <param name="node">The node to validate; text nodes are returned untouched.</param>
private void recursiveValidateTag(HtmlNode node)
{
    //TryParse falls back to the default for any value int.Parse would have thrown on
    int maxinputsize;
    if (!int.TryParse(policy.getDirective("maxInputSize"), out maxinputsize))
    {
        maxinputsize = Policy.DEFAULT_MAX_INPUT_SIZE;
    }

    num++;

    HtmlNode parentNode = node.ParentNode;
    string tagName = node.Name;
    string lowerTagName = tagName.ToLowerInvariant();

    //check this out
    //might not be robust enough
    if (lowerTagName.Equals("#text")) // || lowerTagName.Equals("#comment"))
    {
        return;
    }

    Tag tag = policy.getTagByName(lowerTagName);

    if (tag == null || "filter".Equals(tag.Action))
    {
        StringBuilder errBuff = new StringBuilder();
        if (tagName.Trim().Equals(""))
        {
            errBuff.Append("An unprocessable ");
        }
        else
        {
            errBuff.Append("The <b>" + HTMLEntityEncoder.htmlEntityEncode(lowerTagName) + "</b> ");
        }
        errBuff.Append("tag has been filtered for security reasons. The contents of the tag will ");
        errBuff.Append("remain in place.");
        errorMessages.Add(errBuff.ToString());

        recursiveValidateChildren(node);
        promoteChildren(node);
        return;
    }
    else if ("validate".Equals(tag.Action))
    {
        if ("style".Equals(lowerTagName) && policy.getTagByName("style") != null)
        {
            if (node.FirstChild == null)
            {
                //an empty style tag has no child to scan; only apply the empty-tag protection below
                node.InnerHtml = "/* */";
            }
            else
            {
                CssScanner styleScanner = new CssScanner(policy);
                try
                {
                    CleanResults cr = styleScanner.scanStyleSheet(node.FirstChild.InnerHtml, maxinputsize);
                    foreach (string msg in cr.getErrorMessages())
                    {
                        errorMessages.Add(msg);
                    }

                    /*
                     * If IE gets an empty style tag, i.e. <style/>
                     * it will break all CSS on the page. I wish I
                     * was kidding. So, if after validation no CSS
                     * properties are left, we would normally be left
                     * with an empty style tag and break all CSS. To
                     * prevent that, we have this check.
                     */
                    string cleanCss = cr.getCleanHTML();
                    node.FirstChild.InnerHtml = string.IsNullOrEmpty(cleanCss) ? "/* */" : cleanCss;
                }
                catch (ScanException e)
                {
                    Console.WriteLine("Scan Exception: " + e.Message);
                    parentNode.RemoveChild(node);
                    //the tag is gone, so there is nothing left to validate on it
                    return;
                }
            }
        }

        //The index only advances when the current attribute is kept. Rejected attributes are removed
        //by position so exactly one attribute leaves the collection and the loop always makes progress.
        int currentAttributeIndex = 0;
        while (currentAttributeIndex < node.Attributes.Count)
        {
            HtmlAttribute attribute = node.Attributes[currentAttributeIndex];
            string name = attribute.Name;
            string _value = attribute.Value;

            Attribute attr = tag.getAttributeByName(name);
            if (attr == null)
            {
                attr = policy.getGlobalAttributeByName(name);
            }

            if (attr == null)
            {
                //attribute is not in the policy file
                StringBuilder errBuff = new StringBuilder();
                errBuff.Append("The <b>" + HTMLEntityEncoder.htmlEntityEncode(name));
                errBuff.Append("</b> attribute of the <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag has been removed for security reasons. ");
                errBuff.Append("This removal should not affect the display of the HTML submitted.");
                errorMessages.Add(errBuff.ToString());
                node.Attributes.RemoveAt(currentAttributeIndex);
                continue;
            }

            if ("style".Equals(name.ToLowerInvariant()))
            {
                CssScanner styleScanner = new CssScanner(policy);
                bool styleAccepted = true;
                try
                {
                    CleanResults cr = styleScanner.scanInlineStyle(_value, tagName, maxinputsize);
                    attribute.Value = cr.getCleanHTML();
                    foreach (string msg in cr.getErrorMessages())
                    {
                        errorMessages.Add(msg);
                    }
                }
                catch (ScanException ex)
                {
                    Console.WriteLine(ex.Message);
                    styleAccepted = false;
                }

                if (styleAccepted)
                {
                    currentAttributeIndex++;
                }
                else
                {
                    //the style could not be scanned: drop the attribute. Leaving it in place while
                    //stepping the index back would rescan the same attribute forever.
                    node.Attributes.RemoveAt(currentAttributeIndex);
                }
                continue;
            }

            //try to find out how robust this is - do I need to do this in a loop?
            _value = HtmlEntity.DeEntitize(_value);

            bool isAttributeValid = false;
            foreach (string allowedValue in attr.AllowedValues)
            {
                if (allowedValue != null && string.Equals(allowedValue, _value, StringComparison.OrdinalIgnoreCase))
                {
                    isAttributeValid = true;
                    break;
                }
            }

            if (!isAttributeValid)
            {
                foreach (string ptn in attr.AllowedRegExp)
                {
                    //wrapped in a non-capturing group so the anchors apply to the whole pattern,
                    //even when it contains a top-level alternation
                    if (Regex.IsMatch(_value, "^(?:" + ptn + ")$"))
                    {
                        isAttributeValid = true;
                        break;
                    }
                }
            }

            if (isAttributeValid)
            {
                currentAttributeIndex++;
                continue;
            }

            string onInvalidAction = attr.OnInvalid;
            StringBuilder invalidBuff = new StringBuilder();
            invalidBuff.Append("The <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag contained an attribute that we couldn't process. ");
            invalidBuff.Append("The <b>" + HTMLEntityEncoder.htmlEntityEncode(name) + "</b> attribute had a value of <u>" + HTMLEntityEncoder.htmlEntityEncode(_value) + "</u>. ");
            invalidBuff.Append("This value could not be accepted for security reasons. We have chosen to ");

            if ("removeTag".Equals(onInvalidAction))
            {
                parentNode.RemoveChild(node);
                invalidBuff.Append("remove the <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag and its contents in order to process this input. ");
                errorMessages.Add(invalidBuff.ToString());
                return; // can't process any more if we remove the tag
            }
            else if ("filterTag".Equals(onInvalidAction))
            {
                recursiveValidateChildren(node);
                promoteChildren(node);
                invalidBuff.Append("filter the <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag and leave its contents in place so that we could process this input.");
                errorMessages.Add(invalidBuff.ToString());
                return; // can't process any more if we filter the tag
            }
            else
            {
                node.Attributes.RemoveAt(currentAttributeIndex);
                invalidBuff.Append("remove the <b>" + HTMLEntityEncoder.htmlEntityEncode(name) + "</b> attribute from the tag and leave everything else in place so that we could process this input.");
                errorMessages.Add(invalidBuff.ToString());
            }
        } // end while loop through attributes

        recursiveValidateChildren(node);
    }
    else if ("truncate".Equals(tag.Action))
    {
        Console.WriteLine("truncate");

        //strip every attribute, reporting each one
        HtmlAttributeCollection nnmap = node.Attributes;
        while (nnmap.Count > 0)
        {
            StringBuilder errBuff = new StringBuilder();
            errBuff.Append("The <b>" + HTMLEntityEncoder.htmlEntityEncode(nnmap[0].Name));
            errBuff.Append("</b> attribute of the <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag has been removed for security reasons. ");
            errBuff.Append("This removal should not affect the display of the HTML submitted.");
            nnmap.RemoveAt(0);
            errorMessages.Add(errBuff.ToString());
        }

        //keep only text and comment children; walking backwards keeps the indices of the
        //not-yet-visited children stable while removing
        HtmlNodeCollection cList = node.ChildNodes;
        for (int k = cList.Count - 1; k >= 0; k--)
        {
            HtmlNode candidate = cList[k];
            if (candidate.NodeType != HtmlNodeType.Text && candidate.NodeType != HtmlNodeType.Comment)
            {
                node.RemoveChild(candidate);
            }
        }
    }
    else
    {
        errorMessages.Add("The <b>" + HTMLEntityEncoder.htmlEntityEncode(tagName) + "</b> tag has been removed for security reasons.");
        parentNode.RemoveChild(node);
    }
}

/// <summary>
/// Runs <c>recursiveValidateTag</c> on every child of <paramref name="node"/>, re-reading the same
/// index whenever a child ends up detached from its parent by the validation.
/// </summary>
/// <param name="node">The node whose children are validated.</param>
private void recursiveValidateChildren(HtmlNode node)
{
    for (int i = 0; i < node.ChildNodes.Count; i++)
    {
        HtmlNode child = node.ChildNodes[i];
        recursiveValidateTag(child);
        if (child.ParentNode == null)
        {
            i--;
        }
    }
}