예제 #1
0
        /// <summary>
        /// Scores the words of the page's meta description and meta keywords into
        /// <paramref name="dictionary"/>. Each word is lower-cased and stemmed, and the
        /// <c>Ranker</c> weight of its source is added to the stem's entry once per occurrence.
        /// </summary>
        /// <param name="dictionary">Stem-to-score map that is updated in place.</param>
        private void GetMeta(Dictionary <string, double> dictionary)
        {
            string   description = GetMetaDescription();
            IStemmer p           = new Porter2();

            // Each source is scored with its own Ranker entry (the two keys used to be swapped).
            // Stopwords are filtered for the description only; keywords are taken as given.
            AddMetaWords(dictionary, p, description, "description", true);
            AddMetaWords(dictionary, p, GetMetaKeywords(), "keywords", false);
        }

        /// <summary>
        /// Adds every word of at least two characters found in <paramref name="text"/> to
        /// <paramref name="dictionary"/>, weighted by <c>Ranker[rankerKey]</c>.
        /// </summary>
        /// <param name="dictionary">Stem-to-score map that is updated in place.</param>
        /// <param name="stemmer">Stemmer applied to each lower-cased word.</param>
        /// <param name="text">Text to scan; nothing happens when it is <c>null</c>.</param>
        /// <param name="rankerKey">Key of the <c>Ranker</c> entry that supplies the weight.</param>
        /// <param name="skipStopwords">When <c>true</c>, words contained in <c>stopwords</c> are ignored.</param>
        private void AddMetaWords(Dictionary <string, double> dictionary, IStemmer stemmer, string text, string rankerKey, bool skipStopwords)
        {
            if (text == null)
            {
                return;
            }

            foreach (Match m in Regex.Matches(text, @"\b\w{2,}\b", RegexOptions.Compiled))
            {
                // Lower-case first, as KeywordsVectorsFromText does, so the stopword check
                // and the resulting keys do not depend on how the author capitalised the word.
                string w = m.Value.ToLowerInvariant();

                if (skipStopwords && stopwords.Contains(w))
                {
                    continue;
                }

                string stemmed = stemmer.stem(w);

                // TryGetValue leaves 'current' at 0 for a new stem, so one path covers add and update.
                double current;
                dictionary.TryGetValue(stemmed, out current);
                dictionary[stemmed] = current + Ranker[rankerKey];
            }
        }
예제 #2
0
        /// <summary>
        /// Builds a stem-to-score map for the loaded document.
        /// </summary>
        /// <remarks>
        /// Steps, in order: meta words are added via <c>GetMeta</c>; for every <c>Ranker</c>
        /// entry the text of the matching elements is tokenised, stopword-filtered, stemmed and
        /// scored with that entry's weight; link words are added with a fixed weight; all scores
        /// are divided by the highest score and rounded to four decimals; finally the domain
        /// word is boosted. The processed elements are removed from <c>doc</c>, so calling this
        /// method mutates the document.
        /// </remarks>
        /// <returns>The map from stemmed word to its score.</returns>
        public Dictionary <string, double> KeywordsVectorsFromText()
        {
            const double linkWordWeight  = 25;
            const double domainWordBonus = 0.25;
            const double domainWordScore = 1;

            IStemmer p = new Porter2();

            Dictionary <string, double> dictionary = new Dictionary <string, double>();

            GetMeta(dictionary);

            foreach (var NodeType in Ranker)
            {
                // Materialise once: Descendants() is a deferred query and was previously
                // walked three times (Count, foreach, ToList) for every Ranker entry.
                var nodes = doc.DocumentNode.Descendants(NodeType.Key).ToList();

                foreach (var node in nodes)
                {
                    // Only leaf nodes contribute text, so nested markup is not counted twice.
                    string text = HttpUtility.HtmlDecode(
                        string.Join(" ", node.Descendants()
                                    .Where(n => !n.HasChildNodes && !string.IsNullOrWhiteSpace(n.InnerText))
                                    .Select(n => n.InnerText)));

                    foreach (Match match in Regex.Matches(text, @"\b\w{2,}\b", RegexOptions.Compiled))
                    {
                        string word = match.Value.ToLowerInvariant();

                        if (stopwords.Contains(word))
                        {
                            continue;
                        }

                        AddKeywordWeight(dictionary, p.stem(word), NodeType.Value);
                    }
                }

                // Detach the processed elements; presumably this keeps their text from being
                // scored again under a later Ranker entry.
                nodes.ForEach(n => n.Remove());
            }

            foreach (string LinkWord in GetLinkWords())
            {
                AddKeywordWeight(dictionary, p.stem(LinkWord), linkWordWeight);
            }

            if (dictionary.Count > 0)
            {
                double divider = dictionary.Values.Max();

                // Skip normalisation when the maximum is 0; dividing would yield NaN/Infinity.
                if (divider != 0)
                {
                    // Iterate over a copy of the keys because the values are rewritten in the loop.
                    foreach (var key in dictionary.Keys.ToList())
                    {
                        dictionary[key] = Math.Round(dictionary[key] / divider, 4);
                    }
                }
            }

            // The domain word is applied after normalisation, so its score may exceed 1.
            // NOTE(review): assumes GetDomainWord() can return null or empty; such a value is skipped.
            string domainWord = GetDomainWord();

            if (!string.IsNullOrEmpty(domainWord))
            {
                string domainStem = p.stem(domainWord);
                double existing;

                if (dictionary.TryGetValue(domainStem, out existing))
                {
                    dictionary[domainStem] = existing + domainWordBonus;
                }
                else
                {
                    dictionary[domainStem] = domainWordScore;
                }
            }

            return dictionary;
        }

        /// <summary>
        /// Adds <paramref name="weight"/> to the score stored for <paramref name="stem"/>,
        /// creating the entry when the stem has not been seen yet.
        /// </summary>
        /// <param name="scores">Stem-to-score map that is updated in place.</param>
        /// <param name="stem">Stemmed word used as the key.</param>
        /// <param name="weight">Amount to add to the stem's score.</param>
        private static void AddKeywordWeight(Dictionary <string, double> scores, string stem, double weight)
        {
            // TryGetValue leaves 'current' at 0 for a new stem, so one path covers add and update.
            double current;
            scores.TryGetValue(stem, out current);
            scores[stem] = current + weight;
        }