/// <summary>
/// Extracts tokens from the text of an HTML document.
/// The inner text of every node that has no child nodes is trimmed and collected,
/// each collected string is parsed with <c>_TextParser</c>, and the resulting tokens
/// are merged through <c>ParserCommon.AddToken</c>.
/// </summary>
/// <param name="data">HTML markup to tokenize.</param>
/// <returns>
/// The merged tokens ordered by descending <c>Count</c>; an empty list when
/// <paramref name="data"/> is null or empty.
/// </returns>
private List<Token> GetTokens(string data)
{
    if (String.IsNullOrEmpty(data))
    {
        return new List<Token>();
    }

    List<string> lines = new List<string>();

    // Using HtmlAgilityPack
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(data);

    HtmlNode root = doc.DocumentNode;
    if (root != null)
    {
        IEnumerable<HtmlNode> nodes = root.DescendantsAndSelf();
        if (nodes != null)
        {
            // Materialize once: the sequence is lazy, so a separate Count() followed by
            // ToList() would walk the whole document twice.
            foreach (HtmlNode node in nodes.ToList())
            {
                // Only leaf nodes contribute text; the InnerText of a parent would
                // repeat the text of its children.
                if (node.HasChildNodes)
                {
                    continue;
                }

                string text = node.InnerText;
                if (String.IsNullOrEmpty(text))
                {
                    continue;
                }

                text = text.Trim();
                if (text.Length > 0)
                {
                    lines.Add(text);
                }
            }
        }
    }

    List<Token> ret = new List<Token>();

    foreach (string line in lines)
    {
        ParseResult pr = _TextParser.ParseString(line);
        if (pr == null || pr.Tokens == null)
        {
            continue;
        }

        foreach (Token currToken in pr.Tokens)
        {
            ret = ParserCommon.AddToken(ret, currToken);
        }
    }

    // AddToken's return value is used as the new list, so keep the null guard before sorting.
    if (ret != null && ret.Count > 0)
    {
        ret = ret.OrderByDescending(u => u.Count).ToList();
    }

    return ret;
}
/// <summary>
/// Parses an HTML document into a <see cref="ParseResult"/>.
/// Populates the head metadata (title, meta description/keywords, Open Graph values),
/// the body (image URLs, links, content), the tokens of each section, and a combined
/// token list in which body token positions are offset to follow the head tokens.
/// </summary>
/// <param name="data">HTML markup to parse.</param>
/// <returns>
/// The populated result with <c>Success</c> set to true and <c>Time.End</c> set to the
/// current UTC time.
/// </returns>
private ParseResult ProcessSourceContent(string data)
{
    #region Load-Document

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(data);

    ParseResult ret = new ParseResult();
    ret.Html = new ParseResult.HtmlParseResult();

    #endregion

    #region Head

    ret.Html.Head.Title = GetTitle(data);
    ret.Html.Head.MetaDescription = GetMetaDescription(doc);
    ret.Html.Head.MetaKeywords = GetMetaKeywords(doc);
    ret.Html.Head.MetaImageOpengraph = GetMetaImageOpengraph(doc);
    ret.Html.Head.MetaDescriptionOpengraph = GetMetaDescriptionOpengraph(doc);
    ret.Html.Head.MetaVideoTagsOpengraph = GetMetaVideoTagsOpengraph(doc);

    // Head content is a single space followed by each available piece of metadata,
    // every piece prefixed with a space.
    StringBuilder head = new StringBuilder(" ");

    if (!String.IsNullOrEmpty(ret.Html.Head.Title))
    {
        head.Append(' ').Append(ret.Html.Head.Title);
    }

    if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescription))
    {
        head.Append(' ').Append(ret.Html.Head.MetaDescription);
    }

    if (ret.Html.Head.MetaKeywords != null && ret.Html.Head.MetaKeywords.Count > 0)
    {
        head.Append(' ').Append(String.Join(" ", ret.Html.Head.MetaKeywords));
    }

    if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescriptionOpengraph))
    {
        head.Append(' ').Append(ret.Html.Head.MetaDescriptionOpengraph);
    }

    if (ret.Html.Head.MetaVideoTagsOpengraph != null && ret.Html.Head.MetaVideoTagsOpengraph.Count > 0)
    {
        head.Append(' ').Append(String.Join(" ", ret.Html.Head.MetaVideoTagsOpengraph));
    }

    ret.Html.Head.Content = head.ToString();
    ret.Html.Head.Tokens = ParserCommon.GetTokens(ret.Html.Head.Content, _ParseOptions.Text);

    #endregion

    #region Body

    ret.Html.Body.ImageUrls = GetImageUrls(doc, data);
    ret.Html.Body.Links = GetLinks(doc);
    ret.Html.Body.Content = GetHtmlBody(doc);
    ret.Html.Body.Tokens = ParserCommon.GetTokens(ret.Html.Body.Content, _ParseOptions.Text);

    #endregion

    #region Data

    ret.Tokens = new List<Token>();

    // One past the highest position used by any head token; stays 0 when the head
    // produced no positioned tokens.
    long bodyStartingPosition = 0;

    if (ret.Html.Head.Tokens != null && ret.Html.Head.Tokens.Count > 0)
    {
        // NOTE(review): the same Token instances end up in both ret.Tokens and
        // ret.Html.Head.Tokens. If ParserCommon.AddToken mutates an existing entry
        // when merging, the head list would observe that change — confirm.
        ret.Tokens.AddRange(ret.Html.Head.Tokens);

        foreach (Token token in ret.Html.Head.Tokens)
        {
            if (token.Positions != null && token.Positions.Count > 0)
            {
                long maxPos = token.Positions.Max();
                if (maxPos >= bodyStartingPosition)
                {
                    bodyStartingPosition = maxPos + 1;
                }
            }
        }
    }

    // bodyStartingPosition + [body token position] will yield the correct position
    // across the entire set of tokens
    if (ret.Html.Body.Tokens != null && ret.Html.Body.Tokens.Count > 0)
    {
        List<Token> updatedTokens = new List<Token>();

        foreach (Token token in ret.Html.Body.Tokens)
        {
            Token updated = new Token();
            updated.Value = token.Value;
            updated.Count = token.Count;
            updated.Positions = new List<long>();

            if (token.Positions != null && token.Positions.Count > 0)
            {
                foreach (long tokenPos in token.Positions)
                {
                    updated.Positions.Add(bodyStartingPosition + tokenPos);
                }
            }

            updatedTokens.Add(updated);
        }

        // Swap in the offset copies only after the loop has finished, so the property
        // is not reassigned while it is being enumerated.
        ret.Html.Body.Tokens = updatedTokens;

        foreach (Token token in ret.Html.Body.Tokens)
        {
            ret.Tokens = ParserCommon.AddToken(ret.Tokens, token);
        }
    }

    ret.Schema = BuildSchema();

    #endregion

    ret.Success = true;
    ret.Time.End = DateTime.UtcNow;
    return ret;
}