Example #1
0
        /// <summary>
        /// Builds an HTML excerpt from the leading paragraphs of the sanitized markup.
        /// </summary>
        /// <param name="context">
        /// Supplies the source HTML (<c>Html</c>), the white list handed to <c>SanitizeHtml</c>
        /// (<c>HtmlWhiteList</c>) and the maximum number of paragraphs to keep (<c>BoundaryCount</c>).
        /// </param>
        /// <returns>
        /// The outer HTML of at most <c>BoundaryCount</c> of the <c>p</c> elements returned by
        /// <c>DocumentNode.Elements("p")</c>, joined with CRLF.
        /// </returns>
        public string Generate(SummarizeContext context)
        {
            var sanitized = context.Html.SanitizeHtml(context.HtmlWhiteList);

            var parsed = new HtmlDocument();
            parsed.LoadHtml(sanitized);

            // Keep only the leading paragraphs and emit each one with its own markup.
            var excerptParts = parsed.DocumentNode
                                     .Elements("p")
                                     .Take(context.BoundaryCount)
                                     .Select(node => node.OuterHtml);

            return string.Join("\r\n", excerptParts);
        }
        /// <summary>
        /// Builds a plain-text excerpt: removes everything matching the tag pattern from
        /// <c>context.Html</c> and keeps at most <c>context.BoundaryCount</c> words.
        /// </summary>
        /// <param name="context">
        /// Supplies the source HTML (<c>Html</c>), the word limit (<c>BoundaryCount</c>) and the
        /// suffix appended to a truncated excerpt (<c>Ellipses</c>).
        /// </param>
        /// <returns>
        /// The tag-stripped text unchanged when it contains no more than <c>BoundaryCount</c> words;
        /// otherwise the first <c>BoundaryCount</c> words, trimmed, followed by <c>Ellipses</c>.
        /// </returns>
        public string Generate(SummarizeContext context)
        {
            var ingress = Regex.Replace(context.Html, "<(.|\n)+?>", String.Empty, RegexOptions.IgnoreCase);

            if (String.IsNullOrEmpty(ingress))
            {
                return ingress;
            }

            // A word is a run of non-whitespace plus any whitespace that follows it.
            // The trailing whitespace must be optional ("\s*", not "\s+"): otherwise the
            // last word of the text is never counted and a text holding exactly
            // BoundaryCount + 1 words would slip through untruncated.
            var words = Regex.Matches(ingress, @"\S+\s*");
            if (words.Count <= context.BoundaryCount)
            {
                return ingress;
            }

            var keptWords = words.Cast<Match>()
                                 .Take(context.BoundaryCount)
                                 .Select(match => match.Value);

            return string.Concat(keptWords).Trim() + context.Ellipses;
        }
        /// <summary>
        /// Builds an HTML excerpt limited to <c>context.BoundaryCount</c> text characters.
        /// Characters inside tags (<c>&lt;...&gt;</c>) and entities (<c>&amp;...;</c>) are copied
        /// but not counted.
        /// </summary>
        /// <param name="context">
        /// Supplies the source HTML (<c>Html</c>), the white list handed to <c>SanitizeHtml</c>
        /// (<c>HtmlWhiteList</c>) and the character limit (<c>BoundaryCount</c>).
        /// </param>
        /// <returns>
        /// The leading part of the sanitized HTML. If the cut falls inside a word made of ASCII
        /// letters, the excerpt is shortened to the start of that word. When the input was
        /// truncated, trailing whitespace and <c>&amp;nbsp;</c> are removed. If the last tag seen
        /// was not an end tag and has a name, a matching closing tag is appended.
        /// </returns>
        public string Generate(SummarizeContext context)
        {
            var html          = context.Html.SanitizeHtml(context.HtmlWhiteList);
            var max           = context.BoundaryCount;
            var index         = 0;
            var count         = 0;     // text characters seen so far (tags and entities excluded)
            var isTag         = false; // currently between '<' and '>'
            var isEndTag      = false; // the most recent tag contained a '/' outside its attributes
            var isEntity      = false; // currently between '&' and ';'
            var lastTagName   = "";    // name of the most recently completed tag
            var tagName       = "";    // name being accumulated for the current tag
            var isWord        = false; // currently inside a run of ASCII letters
            var lastWordIndex = 0;     // index at which the current/last word started
            var isAttribute   = false; // past the first space inside the current tag

            while (index < html.Length && count < max)
            {
                var character = html[index];

                if (!isTag && !isEntity)
                {
                    switch (character)
                    {
                    case '<':
                        isTag       = true;
                        isEndTag    = false;
                        isWord      = false;
                        isAttribute = false;
                        break;

                    case '&':
                        isEntity = true;
                        isWord   = false;
                        break;

                    default:
                        count++;
                        if ((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z'))
                        {
                            if (!isWord)
                            {
                                isWord        = true;
                                lastWordIndex = index;
                            }
                        }
                        else if (isWord)
                        {
                            isWord = false;
                        }
                        break;
                    }
                }
                else if (isEntity)
                {
                    // Skip the entity body; only the terminating ';' changes state.
                    switch (character)
                    {
                    case ';':
                        isEntity = false;
                        break;
                    }
                }
                else
                {
                    switch (character)
                    {
                    case '/':
                        // A slash inside an attribute value (e.g. a URL) does not mark an end tag.
                        if (!isAttribute)
                        {
                            isEndTag    = true;
                            lastTagName = tagName;
                            tagName     = "";
                        }
                        break;

                    case '>':
                        isTag       = false;
                        isAttribute = false;
                        lastTagName = tagName;
                        tagName     = "";
                        break;

                    case ' ':
                        isAttribute = true;
                        break;

                    default:
                        if (!isAttribute)
                        {
                            tagName += character;
                        }
                        break;
                    }
                }
                index++;
            }

            string htmlExcerpt;

            // If the limit was hit in the middle of a word (the next character is still a letter),
            // back up to where that word started rather than emitting a partial word.
            if (isWord && lastWordIndex > 0 && index < html.Length && ((html[index] >= 'a' && html[index] <= 'z') || (html[index] >= 'A' && html[index] <= 'Z')))
            {
                htmlExcerpt = html.Substring(0, lastWordIndex);
            }
            else
            {
                htmlExcerpt = html.Substring(0, index);
            }

            if (index < html.Length)
            {
                htmlExcerpt = Regex.Replace(htmlExcerpt, @"(&nbsp;|\s|\t)(&nbsp;|\s|\t)*$", string.Empty);
            }

            // Only synthesize a closing tag when a tag name was actually seen; without this
            // guard, input containing no tags would get a meaningless "</>" appended.
            if (!isEndTag && !string.IsNullOrWhiteSpace(lastTagName))
            {
                var endTag = "</" + lastTagName + ">";
                htmlExcerpt += endTag;
            }

            return(htmlExcerpt);
        }
Example #4
0
        /// <summary>
        /// Summarizes content by delegating to the <c>ISummarizer</c> resolved from the work
        /// context returned by <c>htmlHelper.GetWorkContext()</c>.
        /// </summary>
        /// <param name="htmlHelper">The helper whose work context is used to resolve the summarizer.</param>
        /// <param name="context">The summarization request, passed through unchanged.</param>
        /// <returns>Whatever the resolved summarizer's <c>Summarize</c> returns for <paramref name="context"/>.</returns>
        public static string Summarize(this HtmlHelper htmlHelper, SummarizeContext context)
        {
            return htmlHelper.GetWorkContext()
                             .Resolve<ISummarizer>()
                             .Summarize(context);
        }
Example #5
0
        /// <summary>
        /// Builds an HTML excerpt limited to <c>context.BoundaryCount</c> words. A word is a run of
        /// ASCII letters or digits in the text outside tags (<c>&lt;...&gt;</c>) and entities
        /// (<c>&amp;...;</c>). The word that reaches the limit is always kept whole.
        /// </summary>
        /// <param name="context">
        /// Supplies the source HTML (<c>Html</c>), the white list handed to <c>SanitizeHtml</c>
        /// (<c>HtmlWhiteList</c>) and the word limit (<c>BoundaryCount</c>).
        /// </param>
        /// <returns>
        /// The leading part of the sanitized HTML up to the end of the last counted word. If the
        /// last tag seen was not an end tag and has a name, a matching closing tag is appended.
        /// </returns>
        public string Generate(SummarizeContext context)
        {
            var html        = context.Html.SanitizeHtml(context.HtmlWhiteList);
            var max         = context.BoundaryCount;
            var index       = 0;
            var count       = 0;     // words started so far
            var isTag       = false; // currently between '<' and '>'
            var isEndTag    = false; // the most recent tag contained a '/' outside its attributes
            var isEntity    = false; // currently between '&' and ';'
            var lastTagName = "";    // name of the most recently completed tag
            var tagName     = "";    // name being accumulated for the current tag
            var isWord      = false; // currently inside a run of letters/digits
            var isAttribute = false; // past the first space inside the current tag

            // Keep going past the limit while still inside a word so the last word is not cut.
            while (index < html.Length && (count < max || isWord))
            {
                var character = html[index];

                if (!isTag && !isEntity)
                {
                    var startsMarkup = character == '<' || character == '&';

                    // The loop only runs with count >= max while finishing the final word. If that
                    // word ends at a tag or entity, stop BEFORE consuming its first character;
                    // otherwise the excerpt would end with a dangling '<' or '&'.
                    if (startsMarkup && count >= max)
                    {
                        break;
                    }

                    if (character == '<')
                    {
                        isTag       = true;
                        isEndTag    = false;
                        isWord      = false;
                        isAttribute = false;
                    }
                    else if (character == '&')
                    {
                        isEntity = true;
                        isWord   = false;
                    }
                    else if ((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') || (character >= '0' && character <= '9'))
                    {
                        if (!isWord)
                        {
                            isWord = true;
                            count++;
                        }
                    }
                    else
                    {
                        isWord = false;
                    }
                }
                else if (isEntity)
                {
                    // Skip the entity body. Its characters must NOT be added to tagName, or the
                    // entity name would leak into the next tag's name (e.g. "&amp;<b>" -> "ampb").
                    if (character == ';')
                    {
                        isEntity = false;
                    }
                }
                else
                {
                    switch (character)
                    {
                    case '/':
                        // A slash inside an attribute value (e.g. a URL) does not mark an end tag.
                        if (!isAttribute)
                        {
                            isEndTag    = true;
                            lastTagName = tagName;
                            tagName     = "";
                        }
                        break;

                    case '>':
                        isTag       = false;
                        isAttribute = false;
                        lastTagName = tagName;
                        tagName     = "";
                        break;

                    case ' ':
                        isAttribute = true;
                        break;

                    default:
                        if (!isAttribute)
                        {
                            tagName += character;
                        }
                        break;
                    }
                }
                index++;
            }

            var htmlExcerpt = html.Substring(0, index);

            // Close the last opened tag, but only when a tag name was actually seen.
            if (!isEndTag && !String.IsNullOrWhiteSpace(lastTagName))
            {
                var endTag = "</" + lastTagName + ">";
                htmlExcerpt += endTag;
            }

            return(htmlExcerpt);
        }