예제 #1
0
        /// <summary>
        /// Compares two tokenized sentences using bag-of-words count vectors.
        /// </summary>
        /// <remarks>
        /// Both vectors have one dimension per distinct word found across the two sentences.
        /// Words rejected by <c>_stopWordFilter</c> are never counted, so their dimensions stay at zero.
        /// </remarks>
        /// <param name="sentance1">Words of the first sentence.</param>
        /// <param name="sentance2">Words of the second sentence.</param>
        /// <returns>
        /// One minus the value <c>Utils.CosineSimilarity</c> reports for the two count vectors.
        /// </returns>
        private double SentanceSimilarity(IList<string> sentance1, IList<string> sentance2)
        {
            var allWords = sentance1.Concat(sentance2).Distinct().ToList();

            // Resolve each distinct word to its vector position once, instead of a linear
            // List.IndexOf scan per token. A Dictionary cannot hold a null key, so a null
            // token (if any) keeps its position in a separate variable.
            var positions = new Dictionary<string, int>(allWords.Count);
            var nullPosition = -1;
            for (var i = 0; i < allWords.Count; i++)
            {
                var distinctWord = allWords[i];
                if (distinctWord == null)
                {
                    nullPosition = i;
                }
                else
                {
                    positions.Add(distinctWord, i);
                }
            }

            var v1 = new DenseVector(allWords.Count);
            var v2 = new DenseVector(allWords.Count);

            AddWordCounts(sentance1, positions, nullPosition, v1);
            AddWordCounts(sentance2, positions, nullPosition, v2);

            return 1 - Utils.CosineSimilarity(v1, v2);
        }

        /// <summary>
        /// Adds one to the vector entry of every non-stop-word occurrence in <paramref name="words"/>.
        /// </summary>
        private void AddWordCounts(IEnumerable<string> words, Dictionary<string, int> positions, int nullPosition, DenseVector counts)
        {
            foreach (var word in words)
            {
                if (_stopWordFilter.IsStopWord(word))
                {
                    continue;
                }

                var position = word == null ? nullPosition : positions[word];
                counts[position] += 1;
            }
        }
예제 #2
0
        /// <summary>
        /// Creates a keyword extractor that uses the stop-word filter and stemmer
        /// which <paramref name="nlpServices"/> supplies for <paramref name="lang"/>.
        /// </summary>
        /// <param name="nlpServices">Provider queried for the stop-word filter and the stemmer.</param>
        /// <param name="lang">Language identifier passed to both provider lookups.</param>
        public KeywordExtractor(INlpServiceProvider nlpServices, string lang)
        {
            _stopWordFilter = nlpServices.GetStopWordFilter(lang);
            _wordStemmer = nlpServices.GetStemmer(lang);

            // Keeps a word only when the stop-word filter does not reject it.
            _filter = word => !_stopWordFilter.IsStopWord(word);

            // Maps a word to whatever the stemmer returns for it.
            _mapper = word => _wordStemmer.Stem(word);
        }
        /// <summary>
        /// Groups consecutive word tokens into space-separated phrases. A token that
        /// <c>_stopWords</c> reports as punctuation or as a stop word ends the phrase being
        /// built and is itself discarded.
        /// Side effect: the internal set of unique words is replaced with a new set holding
        /// every token that was kept.
        /// </summary>
        /// <param name="tokens">Tokens to scan, in order.</param>
        /// <returns>The phrases in the order they were completed.</returns>
        public string[] ToPhrases(string[] tokens)
        {
            _uniqueWords = new SortedSet<string>();

            var collected = new List<string>();
            var pending = string.Empty;

            foreach (var token in tokens)
            {
                var isBoundary = _stopWords.IsPunctuation(token) || _stopWords.IsStopWord(token);

                if (!isBoundary)
                {
                    // A regular word: record it and extend the phrase in progress.
                    _uniqueWords.Add(token);
                    pending = pending.Length == 0 ? token : pending + " " + token;
                    continue;
                }

                // A boundary token: close the phrase in progress, if there is one.
                if (pending.Length > 0)
                {
                    collected.Add(pending);
                    pending = string.Empty;
                }
            }

            // Emit the trailing phrase when the input did not end on a boundary token.
            if (pending.Length > 0)
            {
                collected.Add(pending);
            }

            return collected.ToArray();
        }