コード例 #1
0
        /// <summary>
        /// Extracts tokens from the text found in an HTML string.
        /// The inner text of every node that has no child nodes is trimmed and, when
        /// non-empty, parsed with <c>_TextParser</c>; the resulting tokens are accumulated
        /// through <c>ParserCommon.AddToken</c>.
        /// </summary>
        /// <param name="data">HTML content to tokenize.</param>
        /// <returns>
        /// The accumulated tokens ordered by descending <c>Count</c>, or an empty list
        /// when <paramref name="data"/> is null or empty.
        /// </returns>
        private List<Token> GetTokens(string data)
        {
            if (String.IsNullOrEmpty(data))
            {
                return new List<Token>();
            }

            // Using HtmlAgilityPack
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(data);

            List<string> lines = new List<string>();

            HtmlNode root = doc.DocumentNode;
            if (root != null)
            {
                IEnumerable<HtmlNode> nodes = root.DescendantsAndSelf();
                if (nodes != null)
                {
                    // Walk the node sequence exactly once; the loop body only reads from
                    // the nodes, so no Count()/ToList() pre-pass or snapshot is needed.
                    foreach (HtmlNode node in nodes)
                    {
                        // Only nodes without children contribute text.
                        if (node.HasChildNodes)
                        {
                            continue;
                        }

                        string text = node.InnerText;
                        if (String.IsNullOrEmpty(text))
                        {
                            continue;
                        }

                        text = text.Trim();
                        if (text.Length > 0)
                        {
                            lines.Add(text);
                        }
                    }
                }
            }

            List<Token> ret = new List<Token>();

            foreach (string line in lines)
            {
                ParseResult pr = _TextParser.ParseString(line);
                if (pr == null || pr.Tokens == null)
                {
                    continue;
                }

                foreach (Token currToken in pr.Tokens)
                {
                    ret = ParserCommon.AddToken(ret, currToken);
                }
            }

            // ParserCommon.AddToken's return contract is not visible here, so the
            // null guard on its result is kept.
            if (ret != null && ret.Count > 0)
            {
                ret = ret.OrderByDescending(u => u.Count).ToList();
            }

            return ret;
        }
コード例 #2
0
        /// <summary>
        /// Parses an HTML string into a <c>ParseResult</c>: populates the head metadata,
        /// body content, links and image URLs, tokenizes head and body text, and merges
        /// both token sets into <c>ret.Tokens</c> with body positions shifted so they
        /// follow the head positions.
        /// </summary>
        /// <param name="data">HTML content to process. Passed directly to <c>HtmlDocument.LoadHtml</c>.</param>
        /// <returns>A <c>ParseResult</c> with <c>Success</c> set to true and <c>Time.End</c> set to the current UTC time.</returns>
        private ParseResult ProcessSourceContent(string data)
        {
            #region Load-Document

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(data);

            ParseResult ret = new ParseResult();
            // NOTE(review): the code below dereferences ret.Html.Head and ret.Html.Body
            // without creating them, so HtmlParseResult presumably initializes both - confirm.
            ret.Html = new ParseResult.HtmlParseResult();

            #endregion

            #region Head

            // The title is extracted from the raw string; the other head fields use the parsed document.
            ret.Html.Head.Title                    = GetTitle(data);
            ret.Html.Head.MetaDescription          = GetMetaDescription(doc);
            ret.Html.Head.MetaKeywords             = GetMetaKeywords(doc);
            ret.Html.Head.MetaImageOpengraph       = GetMetaImageOpengraph(doc);
            ret.Html.Head.MetaDescriptionOpengraph = GetMetaDescriptionOpengraph(doc);
            ret.Html.Head.MetaVideoTagsOpengraph   = GetMetaVideoTagsOpengraph(doc);

            // Head content is a single space followed by each available field, every
            // field preceded by one space.
            StringBuilder head = new StringBuilder(" ");

            if (!String.IsNullOrEmpty(ret.Html.Head.Title))
            {
                head.Append(" ").Append(ret.Html.Head.Title);
            }

            if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescription))
            {
                head.Append(" ").Append(ret.Html.Head.MetaDescription);
            }

            if (ret.Html.Head.MetaKeywords != null && ret.Html.Head.MetaKeywords.Count > 0)
            {
                head.Append(" ").Append(String.Join(" ", ret.Html.Head.MetaKeywords));
            }

            if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescriptionOpengraph))
            {
                head.Append(" ").Append(ret.Html.Head.MetaDescriptionOpengraph);
            }

            if (ret.Html.Head.MetaVideoTagsOpengraph != null && ret.Html.Head.MetaVideoTagsOpengraph.Count > 0)
            {
                head.Append(" ").Append(String.Join(" ", ret.Html.Head.MetaVideoTagsOpengraph));
            }

            ret.Html.Head.Content = head.ToString();
            ret.Html.Head.Tokens  = ParserCommon.GetTokens(ret.Html.Head.Content, _ParseOptions.Text);

            #endregion

            #region Body

            ret.Html.Body.ImageUrls = GetImageUrls(doc, data);
            ret.Html.Body.Links     = GetLinks(doc);
            ret.Html.Body.Content   = GetHtmlBody(doc);
            ret.Html.Body.Tokens    = ParserCommon.GetTokens(ret.Html.Body.Content, _ParseOptions.Text);

            #endregion

            #region Data

            ret.Tokens = new List<Token>();

            // One past the highest position used by any head token (0 when there are none).
            long bodyStartingPosition = 0;
            if (ret.Html.Head.Tokens != null && ret.Html.Head.Tokens.Count > 0)
            {
                // NOTE(review): the head Token instances are shared between
                // ret.Html.Head.Tokens and ret.Tokens; if ParserCommon.AddToken merges
                // into existing entries in place, head tokens would change too - verify.
                ret.Tokens.AddRange(ret.Html.Head.Tokens);

                foreach (Token token in ret.Html.Head.Tokens)
                {
                    if (token.Positions != null && token.Positions.Count > 0)
                    {
                        long maxPos = token.Positions.Max();

                        if (maxPos >= bodyStartingPosition)
                        {
                            bodyStartingPosition = maxPos + 1;
                        }
                    }
                }
            }

            // bodyStartingPosition + [body token position] will yield the correct position across the entire set of tokens

            if (ret.Html.Body.Tokens != null && ret.Html.Body.Tokens.Count > 0)
            {
                List<Token> updatedTokens = new List<Token>();

                foreach (Token token in ret.Html.Body.Tokens)
                {
                    // Copy each body token with its positions offset past the head.
                    Token updated = new Token();
                    updated.Value     = token.Value;
                    updated.Count     = token.Count;
                    updated.Positions = new List<long>();

                    if (token.Positions != null)
                    {
                        foreach (long tokenPos in token.Positions)
                        {
                            updated.Positions.Add(bodyStartingPosition + tokenPos);
                        }
                    }

                    updatedTokens.Add(updated);
                }

                // Swap in the shifted copies once, after the loop, instead of reassigning
                // the property being iterated on every pass.
                ret.Html.Body.Tokens = updatedTokens;

                foreach (Token token in ret.Html.Body.Tokens)
                {
                    ret.Tokens = ParserCommon.AddToken(ret.Tokens, token);
                }
            }

            ret.Schema = BuildSchema();

            #endregion

            ret.Success  = true;
            // NOTE(review): only Time.End is set here; Time.Start is presumably
            // initialized elsewhere (e.g. when ParseResult is constructed) - confirm.
            ret.Time.End = DateTime.UtcNow;
            return ret;
        }