/// <summary>
/// Scans <paramref name="html"/> from left to right and appends the nodes it finds
/// (text, comments, instructions and elements) to <paramref name="parent"/>.
/// </summary>
/// <param name="parent">Element that receives the parsed nodes.</param>
/// <param name="html">Markup to scan.</param>
/// <param name="removeInstructions">
/// When true, fragments starting with '!' that are not comments are skipped instead of
/// being added as <c>HtmlInstruction</c> nodes.
/// </param>
private static void AddNodes(HtmlElement parent, string html, bool removeInstructions)
{
    int len = html.Length;

    // Text found between tags is buffered here; a StringBuilder avoids re-allocating
    // the whole string for every appended character.
    StringBuilder text = new StringBuilder();

    for (int i = 0; i < len; i++)
    {
        char current = html[i];
        if (current != '<')
        {
            text.Append(current);
            continue;
        }

        int start = i + 1;
        int stop = html.IndexOf(">", i, StringComparison.Ordinal);

        // No closing '>' anywhere after this '<': the '<' itself is dropped and
        // scanning continues with the next character.
        if (stop < i)
            continue;

        // Flush the buffered text, unless it is whitespace only.
        string pending = text.ToString();
        if (pending.Trim() != "")
            parent.Add(new HtmlText(pending));
        text.Length = 0;

        string fragment = html.Substring(start, stop - start);

        if (fragment.StartsWith("!--", StringComparison.Ordinal) && fragment.EndsWith("--", StringComparison.Ordinal))
        {
            // HTML comment: strip the leading "!--" and the trailing "--".
            // Degenerate forms such as "<!-->" and "<!--->" are shorter than the five
            // delimiter characters; treat them as empty comments instead of asking
            // Substring for a negative length.
            int commentLength = Math.Max(0, fragment.Length - 5);
            parent.Add(new HtmlComment(fragment.Substring(3, commentLength)));
            i = stop;
        }
        else if (fragment.StartsWith("!", StringComparison.Ordinal))
        {
            // HTML instruction: everything after the '!'.
            if (!removeInstructions)
                parent.Add(new HtmlInstruction(fragment.Substring(1)));
            i = stop;
        }
        else if (fragment.StartsWith("/", StringComparison.Ordinal))
        {
            // A closing tag met at this level has no matching opening tag; skip it.
            i = stop;
        }
        else
        {
            // NOTE(review): this CreateNode overload is defined elsewhere; it presumably
            // parses the element (and its content) and moves `stop` forward - confirm.
            HtmlElement node = CreateNode(html, start, ref stop);
            parent.Add(node);

            // For elements that are not closed the cursor is additionally advanced by
            // Name.Length + 2 (presumably to step over the closing tag - confirm).
            i = stop + (node.IsClosed ? 0 : (node.Name.Length + 2));
        }
    }

    // Flush whatever text follows the last tag.
    string tail = text.ToString();
    if (tail.Trim() != string.Empty)
        parent.Add(new HtmlText(tail));
}
/// <summary>
/// Returns the text of <paramref name="element"/> with the markup removed.
/// The text is gathered by the private StripHtml(StringBuilder, HtmlElement) overload
/// and then every match of the whitespace pattern is replaced by a single space.
/// </summary>
/// <param name="element">Root element whose text is extracted.</param>
/// <returns>The collected text after whitespace replacement.</returns>
public static string StripHtml(HtmlElement element)
{
    var buffer = new StringBuilder();
    StripHtml(buffer, element);

    // NOTE(review): the first alternative of the pattern is a single blank character;
    // it may originally have been a non-breaking space or an "&nbsp;" entity - verify.
    return Regex.Replace(buffer.ToString(), @" |\s+", " ");
}
/// <summary>
/// Tells whether <paramref name="element"/> carries any content and, while checking,
/// prunes its subtree: child elements without content, comments and instructions
/// are removed from <c>ChildNodes</c>.
/// </summary>
/// <param name="element">Element to inspect; its child list may be modified.</param>
/// <returns>
/// True for img, iframe and frame elements, for elements that keep at least one child
/// element after pruning, and for elements whose remaining text nodes pass HasText.
/// </returns>
private static bool HasContent(HtmlElement element)
{
    // Media elements count as content even though they have no children.
    bool isMedia = element.Name == "img" || element.Name == "iframe" || element.Name == "frame";
    if (isMedia)
    {
        return true;
    }

    // An element with neither containers nor text is empty.
    if (element.ChildNodes.Count == 0)
    {
        return false;
    }

    // Check the nested nodes and drop the empty ones; walk backwards so that
    // removing an item does not shift the indexes still to be visited.
    for (int index = element.ChildNodes.Count - 1; index >= 0; index--)
    {
        var node = element.ChildNodes[index];
        HtmlElement nested = node as HtmlElement;

        if (nested != null)
        {
            if (!HasContent(nested))
            {
                element.ChildNodes.RemoveAt(index);
            }
            continue;
        }

        if (node is HtmlComment || node is HtmlInstruction)
        {
            element.ChildNodes.RemoveAt(index);
        }
    }

    // Nothing survived the pruning (neither container elements nor text nodes).
    if (element.ChildNodes.Count == 0)
    {
        return false;
    }

    // A surviving child element already proved it has content; otherwise only
    // text nodes are left and the text itself has to be checked.
    bool hasNestedElement = element.ChildNodes.Any(child => child is HtmlElement);
    return hasNestedElement || HasText(element);
}
/// <summary>
/// Walks the subtree of <paramref name="element"/> and swaps media for placeholders:
/// img elements get their original source copied to "data-original" and their src,
/// style, width, height and class attributes overwritten; iframe and frame elements
/// are replaced by a placeholder img element. Other elements are processed recursively.
/// </summary>
/// <param name="element">Root of the subtree to rewrite in place.</param>
public static void ReplaceMedia(HtmlElement element)
{
    for (int i = 0; i < element.ChildNodes.Count; i++)
    {
        HtmlElement innerElement = element.ChildNodes[i] as HtmlElement;
        if (innerElement == null)
        {
            continue;
        }

        if (innerElement.Name == "img")
        {
            HtmlAttribute srcAttribute = innerElement.Attributes.FirstOrDefault(p => p.Name == "src");
            Uri address;
            if (srcAttribute != null
                && !string.IsNullOrEmpty(srcAttribute.Value)
                && Uri.TryCreate(srcAttribute.Value, UriKind.RelativeOrAbsolute, out address))
            {
                // Uri.PathAndQuery throws InvalidOperationException for relative URIs,
                // and site-local sources such as the smiley images are relative.
                // For those the original string already is the path and query.
                string path = address.IsAbsoluteUri ? address.PathAndQuery : address.OriginalString;

                // Editor smileys are left untouched.
                if (!path.StartsWith("/scripts/ckeditor/plugins/smiley/images/", StringComparison.Ordinal))
                {
                    // Keep the real source before src is pointed at the placeholder.
                    innerElement.Add("data-original", srcAttribute.Value);
                    innerElement.Add("src", "/images/gimp_5834.png");
                    innerElement.Add("style", "");
                    innerElement.Add("width", "");
                    innerElement.Add("height", "");
                    innerElement.Add("class", "art-img-thumb");
                }
            }
        }
        else if (innerElement.Name == "iframe" || innerElement.Name == "frame")
        {
            // Replace the frame with a placeholder image at the same position.
            element.RemoveChildAt(i);
            HtmlElement newImage = new HtmlElement("img", true);
            newImage.Add("src", "/images/youtube-icon.png");
            newImage.Add("class", "art-img-thumb-yt");
            element.Insert(i, newImage);
        }
        else
        {
            ReplaceMedia(innerElement);
        }
    }
}
// Element search (the original comment reads "image search"; the tag name is a parameter).
/// <summary>
/// Collects every element below <paramref name="root"/> whose <c>Name</c> equals
/// <paramref name="type"/>. A matching element is listed before the matches found
/// among its own descendants; <paramref name="root"/> itself is never included.
/// </summary>
/// <param name="root">Element whose descendants are searched.</param>
/// <param name="type">Tag name to look for (compared with ==).</param>
/// <returns>A new list with the matching elements; empty when there are none.</returns>
public static List<HtmlElement> FindElementsOfType(HtmlElement root, string type)
{
    var matches = new List<HtmlElement>();

    foreach (HtmlNode node in root.ChildNodes)
    {
        HtmlElement candidate = node as HtmlElement;
        if (candidate == null)
        {
            // Text, comments and other non-element nodes cannot match.
            continue;
        }

        if (candidate.Name == type)
        {
            matches.Add(candidate);
        }

        // Descend regardless of whether the candidate itself matched.
        matches.AddRange(FindElementsOfType(candidate, type));
    }

    return matches;
}
/// <summary>
/// Builds an <see cref="HtmlElement"/> from the inside of an opening tag, i.e. the tag
/// name optionally followed by space-separated attributes and a trailing '/'.
/// </summary>
/// <param name="htmlPart">Tag text without the surrounding angle brackets.</param>
/// <returns>
/// The element with its attributes added. It is created as closed when the text ends
/// with '/' (attribute form only) or when CanHaveContent reports that the tag cannot
/// have content.
/// </returns>
private static HtmlElement CreateNode(string htmlPart)
{
    string html = htmlPart.Trim();
    bool isClosed = html.EndsWith("/", StringComparison.Ordinal);
    int spaceIndex = html.IndexOf(" ", StringComparison.Ordinal);

    // Tag without attributes: the whole text (minus a trailing '/') is the name.
    if (spaceIndex < 0)
    {
        if (isClosed)
            html = html.Substring(0, html.Length - 1);

        // NOTE(review): unlike the attribute branch below, the trailing '/' does not
        // force "closed" here and the name is not lower-cased before CanHaveContent;
        // left as is because callers may rely on it - confirm.
        return new HtmlElement(html, !CanHaveContent(html));
    }

    string tagName = html.Substring(0, spaceIndex);
    html = html.Substring(spaceIndex + 1);
    if (isClosed)
        html = html.Substring(0, html.Length - 1);

    // Invariant lower-casing: a culture-sensitive ToLower() turns "IMG" into a name
    // with a dotless i under Turkish cultures, which would miss the lookup.
    if (!isClosed && !CanHaveContent(tagName.ToLowerInvariant()))
        isClosed = true;

    HtmlElement element = new HtmlElement(tagName, isClosed);

    int len = html.Length;
    string attrName = string.Empty;
    string attrVal = string.Empty;
    bool isName = true;      // currently reading an attribute name
    bool isValue = false;    // currently reading an attribute value
    bool hasQuotes = false;  // the current value started with a quote character

    for (int i = 0; i < len; i++)
    {
        char c = html[i];

        if (c == '=' && !isValue)
        {
            // The first '=' switches from name to value; later ones belong to the value.
            isName = false;
            isValue = true;

            // Peek at the next character, if there is one: a trailing '=' used to
            // make Substring throw ArgumentOutOfRangeException.
            char next = i + 1 < len ? html[i + 1] : '\0';
            hasQuotes = next == '"' || next == '\'';
        }
        else if (c == ' ' && isName)
        {
            // Attribute without a value: it is stored with its own name as value.
            // Leading or repeated spaces leave the name empty; nothing is added then.
            if (attrName.Length > 0)
                element.Add(attrName, attrName);

            attrName = string.Empty;
            attrVal = string.Empty;
        }
        else if (c == ' ' && attrVal.Length > 0)
        {
            // An unquoted value ends at the first space. A quoted value ends at a
            // space only once it holds both quotes; a lone opening quote (length 1)
            // must not be mistaken for a complete value.
            bool valueComplete = !hasQuotes
                || (attrVal.Length > 1 && attrVal[0] == attrVal[attrVal.Length - 1]);

            if (valueComplete)
            {
                isValue = false;
                isName = true;
                element.Add(attrName, FixAttributeValue(attrVal));
                attrName = string.Empty;
                attrVal = string.Empty;
            }
            else if (isValue)
            {
                // Space inside a quoted value.
                attrVal += c;
            }
        }
        else if (isName)
        {
            attrName += c;
        }
        else if (isValue)
        {
            attrVal += c;
        }
    }

    // Flush the attribute that was still being read when the text ended.
    if (isName && attrName != string.Empty && attrVal == string.Empty)
        attrVal = attrName;

    if (attrName != string.Empty && attrVal != string.Empty)
    {
        element.Add(attrName, FixAttributeValue(attrVal));
    }

    return element;
}
/// <summary>
/// Wraps <paramref name="element"/> and caches its first attribute named "src"
/// (null when the element has no such attribute).
/// </summary>
/// <param name="element">The element to wrap.</param>
public HtmlImageInfo(HtmlElement element)
{
    this.Element = element;
    this._src = element.Attributes.FirstOrDefault(attribute => attribute.Name == "src");
}
/// <summary>
/// Removes from <paramref name="element"/> every attribute and child node that
/// IsUnSafe reports, as well as all instruction and comment nodes, and then
/// sanitizes the remaining child elements recursively.
/// </summary>
/// <param name="element">Element to clean in place.</param>
public void Sanitize(HtmlElement element)
{
    // Materialize the query before handing it over: it reads element.Attributes
    // lazily, and RemoveAll presumably removes from that same collection, which
    // would otherwise be modified while it is being enumerated.
    IEnumerable<HtmlAttribute> attributesToRemove =
        (from attribute in element.Attributes
         where IsUnSafe(attribute)
         select attribute).ToList();
    element.RemoveAll(attributesToRemove);

    // Walk backwards so that removing a node does not shift the indexes still to visit.
    for (var i = element.ChildNodes.Count - 1; i >= 0; i--)
    {
        HtmlNode childNode = element.ChildNodes[i];
        if (IsUnSafe(childNode) || childNode is HtmlInstruction || childNode is HtmlComment)
        {
            element.ChildNodes.RemoveAt(i);
        }
    }

    // The recursive call only changes the child's own collections, so iterating
    // this element's child list here is safe.
    foreach (HtmlNode node in element.ChildNodes)
    {
        HtmlElement child = node as HtmlElement;
        if (child != null)
        {
            Sanitize(child);
        }
    }
}
/// <summary>
/// Removes empty tags from <paramref name="document"/> by running HasContent on it.
/// </summary>
/// <param name="document">Root element to prune in place.</param>
public static void RemoveEmptyTags(HtmlElement document)
{
    // HasContent prunes empty children as a side effect; whether the root itself
    // has content is not needed here, so the result is discarded.
    HasContent(document);
}
/// <summary>
/// Appends the text found below <paramref name="element"/> to <paramref name="builder"/>.
/// Text nodes are appended as they are; an element listed in SKIP_TAGS contributes only
/// whitespace; an element listed in NEW_LINE_TAGS contributes whitespace followed by
/// its own text (recursively).
/// </summary>
/// <param name="builder">Receives the collected text.</param>
/// <param name="element">Element whose child nodes are visited in order.</param>
private static void StripHtml(StringBuilder builder, HtmlElement element)
{
    foreach (HtmlNode node in element.ChildNodes)
    {
        if (node is HtmlText)
        {
            builder.Append(node.ToString());
            continue;
        }

        HtmlElement child = node as HtmlElement;
        if (child == null)
        {
            // Comments, instructions and any other node kinds add nothing.
            continue;
        }

        if (SKIP_TAGS.Contains(child.Name))
        {
            // The content of skipped tags is dropped; only a separator is kept.
            AddWhitespace(builder);
            continue;
        }

        // NOTE(review): elements that are in neither list are ignored together with
        // all the text inside them - confirm that this is intended.
        if (NEW_LINE_TAGS.Contains(child.Name))
        {
            AddWhitespace(builder);
            StripHtml(builder, child);
        }
    }
}
/// <summary>
/// Tells whether the direct text nodes of <paramref name="element"/> contain anything
/// besides whitespace. Text inside nested elements is not considered.
/// </summary>
/// <param name="element">Element whose immediate text children are examined.</param>
/// <returns>True when something is left after all pattern matches are removed.</returns>
private static bool HasText(HtmlElement element)
{
    var collected = new StringBuilder();

    foreach (HtmlNode node in element.ChildNodes)
    {
        if (node is HtmlText)
        {
            collected.Append(node.ToString().ToLower());
        }
    }

    // NOTE(review): the first alternative is a single blank character; it may
    // originally have been a non-breaking space or an "&nbsp;" entity - verify.
    string blankPattern = @" |\s+|\t|\n|\r|" + Environment.NewLine;
    string remainder = Regex.Replace(collected.ToString(), blankPattern, string.Empty);

    return remainder.Length > 0;
}
/// <summary>
/// Finds every "img" element below <paramref name="root"/> and wraps each one in an
/// <c>HtmlImageInfo</c>, in the order FindElementsOfType returns them.
/// </summary>
/// <param name="root">Element whose descendants are searched.</param>
/// <returns>A new list with one entry per image element found.</returns>
public static List<HtmlImageInfo> FindImages(HtmlElement root)
{
    var images = new List<HtmlImageInfo>();

    foreach (HtmlElement imageElement in FindElementsOfType(root, "img"))
    {
        images.Add(new HtmlImageInfo(imageElement));
    }

    return images;
}