/// <summary>
/// Builds an excerpt made of the leading <c>p</c> elements of the sanitized HTML.
/// </summary>
/// <param name="context">
/// Supplies the HTML to summarize (<c>Html</c>), the white list passed to <c>SanitizeHtml</c>
/// (<c>HtmlWhiteList</c>) and the maximum number of paragraphs to keep (<c>BoundaryCount</c>).
/// </param>
/// <returns>The outer HTML of the selected paragraphs, joined with CR/LF.</returns>
public string Generate(SummarizeContext context)
{
    var sanitized = context.Html.SanitizeHtml(context.HtmlWhiteList);

    var parsed = new HtmlDocument();
    parsed.LoadHtml(sanitized);

    // NOTE(review): Elements("p") presumably yields only the direct <p> children of the
    // document node (HtmlAgilityPack semantics) - confirm if nested paragraphs matter.
    var leadingParagraphs = parsed.DocumentNode
        .Elements("p")
        .Take(context.BoundaryCount)
        .Select(node => node.OuterHtml);

    return string.Join("\r\n", leadingParagraphs);
}
/// <summary>
/// Produces a plain-text excerpt: markup is stripped from <c>context.Html</c> and, when the
/// text contains more than <c>context.BoundaryCount</c> whitespace-terminated words, only the
/// first <c>BoundaryCount</c> of them are kept and <c>context.Ellipses</c> is appended.
/// </summary>
/// <param name="context">Supplies the HTML, the word limit and the ellipsis text.</param>
/// <returns>
/// The stripped text unchanged when it is empty or within the limit; otherwise the trimmed
/// leading words followed by the ellipsis.
/// </returns>
public string Generate(SummarizeContext context)
{
    // Remove anything that looks like a tag (non-greedy, may span line breaks).
    var plainText = Regex.Replace(context.Html, "<(.|\n)+?>", String.Empty, RegexOptions.IgnoreCase);
    if (String.IsNullOrEmpty(plainText))
    {
        return plainText;
    }

    // A "word" here is a run of non-whitespace followed by at least one whitespace
    // character, so a final word with no trailing whitespace is not counted.
    var wordMatches = Regex.Matches(plainText, @"(\S+\s+)");
    if (wordMatches.Count <= context.BoundaryCount)
    {
        return plainText;
    }

    var leadingWords = wordMatches
        .Cast<Match>()
        .Take(context.BoundaryCount)
        .Select(match => match.Value);

    return string.Concat(leadingWords).Trim() + context.Ellipses;
}
/// <summary>
/// Builds an HTML excerpt containing at most <c>context.BoundaryCount</c> text characters.
/// </summary>
/// <remarks>
/// Only characters outside tags and outside entities (<c>&amp;...;</c>) count towards the limit.
/// If the cut would fall inside a run of ASCII letters, the excerpt is shortened to the start
/// of that run (unless the run starts at index 0). When the input was truncated, trailing
/// whitespace is removed. Finally, if the last tag seen was not a closing tag and a tag name
/// was actually parsed, a closing tag for that name is appended. Only that single tag is
/// closed; outer elements that are still open are not.
/// </remarks>
/// <param name="context">
/// Supplies the HTML (<c>Html</c>), the white list passed to <c>SanitizeHtml</c>
/// (<c>HtmlWhiteList</c>) and the character limit (<c>BoundaryCount</c>).
/// </param>
/// <returns>The truncated HTML fragment.</returns>
public string Generate(SummarizeContext context)
{
    var html = context.Html.SanitizeHtml(context.HtmlWhiteList);
    var max = context.BoundaryCount;

    var index = 0;           // position of the next character to examine
    var count = 0;           // text characters consumed so far
    var isTag = false;       // currently between '<' and '>'
    var isEndTag = false;    // the current/last tag contained '/' outside its attributes
    var isEntity = false;    // currently between '&' and ';'
    var lastTagName = "";    // name of the most recently completed tag
    var tagName = "";        // name of the tag being read
    var isWord = false;      // currently inside a run of ASCII letters
    var lastWordIndex = 0;   // index where the current/last letter run started
    var isAttribute = false; // past the first space inside the current tag

    while (index < html.Length && count < max)
    {
        var character = html[index];

        if (!isTag && !isEntity)
        {
            switch (character)
            {
                case '<':
                    isTag = true;
                    isEndTag = false;
                    isWord = false;
                    isAttribute = false;
                    break;

                case '&':
                    isEntity = true;
                    isWord = false;
                    break;

                default:
                    count++;
                    if ((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z'))
                    {
                        if (!isWord)
                        {
                            isWord = true;
                            lastWordIndex = index;
                        }
                    }
                    else if (isWord)
                    {
                        isWord = false;
                    }
                    break;
            }
        }
        else if (isEntity)
        {
            // Entity characters are skipped without being counted.
            if (character == ';')
            {
                isEntity = false;
            }
        }
        else
        {
            switch (character)
            {
                case '/':
                    if (!isAttribute)
                    {
                        isEndTag = true;
                        lastTagName = tagName;
                        tagName = "";
                    }
                    break;

                case '>':
                    isTag = false;
                    isAttribute = false;
                    lastTagName = tagName;
                    tagName = "";
                    break;

                case ' ':
                    isAttribute = true;
                    break;

                default:
                    if (!isAttribute)
                    {
                        tagName += character;
                    }
                    break;
            }
        }

        index++;
    }

    // Avoid cutting a word in half: if the next unread character continues the current
    // letter run, drop the partial word entirely.
    var cutsThroughWord = isWord
        && lastWordIndex > 0
        && index < html.Length
        && ((html[index] >= 'a' && html[index] <= 'z') || (html[index] >= 'A' && html[index] <= 'Z'));

    var htmlExcerpt = cutsThroughWord
        ? html.Substring(0, lastWordIndex)
        : html.Substring(0, index);

    if (index < html.Length)
    {
        // The input was truncated: strip whitespace left dangling at the end.
        htmlExcerpt = Regex.Replace(htmlExcerpt, @"( |\s|\t)( |\s|\t)*$", string.Empty);
    }

    // Close the last tag only when a tag name was really parsed; otherwise input
    // without any markup would end up with a meaningless "</>" suffix.
    if (!isEndTag && !String.IsNullOrWhiteSpace(lastTagName))
    {
        htmlExcerpt += "</" + lastTagName + ">";
    }

    return htmlExcerpt;
}
/// <summary>
/// Resolves an <c>ISummarizer</c> from the helper's work context and returns the result of
/// calling its <c>Summarize</c> method with <paramref name="context"/>.
/// </summary>
/// <param name="htmlHelper">The helper whose work context is used to resolve the summarizer.</param>
/// <param name="context">The summarize request forwarded to the resolved summarizer.</param>
/// <returns>Whatever the resolved summarizer returns for <paramref name="context"/>.</returns>
public static string Summarize(this HtmlHelper htmlHelper, SummarizeContext context)
{
    return htmlHelper
        .GetWorkContext()
        .Resolve<ISummarizer>()
        .Summarize(context);
}
/// <summary>
/// Builds an HTML excerpt containing at most <c>context.BoundaryCount</c> words.
/// </summary>
/// <remarks>
/// A word is a run of ASCII letters or digits outside tags and entities (<c>&amp;...;</c>).
/// Scanning continues past the limit until the current word is complete. The single
/// character that ends the last word is kept in the excerpt, unless it would start a tag
/// or an entity, in which case the excerpt stops right before it. If the last tag seen was
/// not a closing tag and a tag name was parsed, a closing tag for that name is appended.
/// Only that single tag is closed; outer elements that are still open are not.
/// </remarks>
/// <param name="context">
/// Supplies the HTML (<c>Html</c>), the white list passed to <c>SanitizeHtml</c>
/// (<c>HtmlWhiteList</c>) and the word limit (<c>BoundaryCount</c>).
/// </param>
/// <returns>The truncated HTML fragment.</returns>
public string Generate(SummarizeContext context)
{
    var html = context.Html.SanitizeHtml(context.HtmlWhiteList);
    var max = context.BoundaryCount;

    var index = 0;           // position of the next character to examine
    var count = 0;           // words started so far
    var isTag = false;       // currently between '<' and '>'
    var isEndTag = false;    // the current/last tag contained '/' outside its attributes
    var isEntity = false;    // currently between '&' and ';'
    var lastTagName = "";    // name of the most recently completed tag
    var tagName = "";        // name of the tag being read
    var isWord = false;      // currently inside a run of letters/digits
    var isAttribute = false; // past the first space inside the current tag

    // Keep going after the limit is reached only to finish the word in progress.
    while (index < html.Length && (count < max || isWord))
    {
        var character = html[index];

        if (!isTag && !isEntity)
        {
            if (character == '<' || character == '&')
            {
                // Once the limit is reached the loop only runs to complete the last word.
                // A tag or entity starting here ends that word, so stop BEFORE consuming
                // it: otherwise the excerpt would end with a dangling '<' or '&' and the
                // end-tag state would be reset by a tag that is not part of the excerpt.
                if (count >= max)
                {
                    break;
                }

                isWord = false;
                if (character == '<')
                {
                    isTag = true;
                    isEndTag = false;
                    isAttribute = false;
                }
                else
                {
                    isEntity = true;
                }
            }
            else if ((character >= 'a' && character <= 'z')
                || (character >= 'A' && character <= 'Z')
                || (character >= '0' && character <= '9'))
            {
                if (!isWord)
                {
                    isWord = true;
                    count++;
                }
            }
            else
            {
                isWord = false;
            }
        }
        else if (isEntity)
        {
            // Entity characters are skipped. They must not leak into tagName, or an
            // entity followed by an opening tag would yield a bogus closing tag name.
            if (character == ';')
            {
                isEntity = false;
            }
        }
        else
        {
            switch (character)
            {
                case '/':
                    if (!isAttribute)
                    {
                        isEndTag = true;
                        lastTagName = tagName;
                        tagName = "";
                    }
                    break;

                case '>':
                    isTag = false;
                    isAttribute = false;
                    lastTagName = tagName;
                    tagName = "";
                    break;

                case ' ':
                    isAttribute = true;
                    break;

                default:
                    if (!isAttribute)
                    {
                        tagName += character;
                    }
                    break;
            }
        }

        index++;
    }

    var htmlExcerpt = html.Substring(0, index);

    // Close the last opened tag, but only when a tag name was actually parsed.
    if (!isEndTag && !String.IsNullOrWhiteSpace(lastTagName))
    {
        htmlExcerpt += "</" + lastTagName + ">";
    }

    return htmlExcerpt;
}