/// <summary>
/// Walks the source HTML and builds the cleaned output, delegating every start tag and
/// end tag that is found to <c>ProcessStartTag</c> / <c>ProcessEndTag</c>.
/// </summary>
/// <remarks>
/// When javascript is not allowed, the original HTML is first passed through
/// <c>DeleteScriptTags</c>; otherwise it is scanned unchanged.
/// </remarks>
/// <returns>The accumulated contents of the clean-HTML buffer as a string.</returns>
private string Clean()
{
    // Choose the text to scan: the raw input, or the input with script tags removed.
    this._dirtyHtml = this._allowJavascript
        ? this._originalHtml
        : DeleteScriptTags(this._originalHtml);

    bool seenStartTag = false;
    int position = 0;

    // The length is re-read on every pass rather than cached.
    // NOTE(review): the tag processors are not visible here and may replace _dirtyHtml - confirm.
    while (position < this._dirtyHtml.Length)
    {
        // NOTE(review): whether these matches are anchored at 'position' depends on the
        // _startTag/_endTag patterns, which are not visible here - confirm.
        Match opening = _startTag.Match(this._dirtyHtml, position);
        if (opening.Success)
        {
            seenStartTag = true;

            // Hand the processor the index just past the tag; it returns where scanning resumes.
            position = this.ProcessStartTag(opening, opening.Index + opening.Length);
        }
        else
        {
            Match closing = _endTag.Match(this._dirtyHtml, position);
            if (closing.Success)
            {
                // Same contract as for start tags: pass the index just past the tag.
                position = this.ProcessEndTag(closing, closing.Index + closing.Length);
            }
            else
            {
                // No tag matched. Until the first start tag has been seen, the current
                // character is copied straight to the output; afterwards it is skipped here.
                if (!seenStartTag)
                {
                    string currentCharacter = this._dirtyHtml.Substring(position, 1);
                    this._cleanHtml.Append(currentCharacter);
                }

                position++;
            }
        }
    }

    return this._cleanHtml.ToString();
}
/// <summary>
/// Returns the outer text of an element: the substring of <paramref name="html"/> that runs
/// from the first character of the given start tag through the last character of the first
/// following end tag whose tag name equals the start tag's name.
/// </summary>
/// <remarks>
/// Nesting is not balanced: the first end tag with an equal name ends the search, even if
/// another element of the same name was opened in between.
/// </remarks>
/// <param name="html">The markup to search.</param>
/// <param name="startTagMatch">The match describing the element's start tag within <paramref name="html"/>.</param>
/// <returns>The start tag, everything after it, and the matching end tag, as one string.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="html"/> or <paramref name="startTagMatch"/> is <c>null</c>.
/// </exception>
/// <exception cref="ApplicationException">
/// No end tag with the same name is found after the start tag.
/// </exception>
public static string GetOutterText(string html, Match startTagMatch)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    if (startTagMatch == null)
    {
        throw new ArgumentNullException("startTagMatch");
    }

    string startTagName = GetTagName(startTagMatch);

    // Built once for the whole scan; constructing a regex object per iteration is wasted work.
    EndTagRegex endTagRegex = new EndTagRegex();

    // Start searching immediately after the start tag.
    int index = startTagMatch.Index + startTagMatch.Length;

    while (index < html.Length)
    {
        Match endTagMatch = endTagRegex.Match(html, index);

        if (!endTagMatch.Success)
        {
            // NOTE(review): EndTagRegex's pattern is not visible here. The one-character
            // advance is kept in case the pattern is anchored at the start index (\G);
            // if it is not, a failed match could end the search outright - confirm.
            index++;
            continue;
        }

        int endIndex = endTagMatch.Index + endTagMatch.Length;

        if (GetTagName(endTagMatch) == startTagName)
        {
            // Matching end tag: return everything from the start tag through this end tag.
            return html.Substring(startTagMatch.Index, endIndex - startTagMatch.Index);
        }

        // An end tag for some other element: resume the search right after it.
        index = endIndex;
    }

    throw new ApplicationException(
        string.Format("No end tag match found for tag {0} at position {1}",
            startTagMatch.Value, startTagMatch.Index));
}