/// <summary>
/// Builds a word-count vector for each sentence over the combined vocabulary of
/// both sentences (stop words are not counted) and returns
/// <c>1 - Utils.CosineSimilarity(v1, v2)</c>.
/// </summary>
/// <remarks>
/// NOTE(review): despite the method name, the returned value is one minus whatever
/// <c>Utils.CosineSimilarity</c> yields; confirm callers expect that.
/// </remarks>
/// <param name="sentance1">Tokens of the first sentence.</param>
/// <param name="sentance2">Tokens of the second sentence.</param>
/// <returns>One minus the result of <c>Utils.CosineSimilarity</c> for the two count vectors.</returns>
private double SentanceSimilarity(IList<string> sentance1, IList<string> sentance2)
{
    // The vocabulary (stop words included) fixes the vector dimension and the
    // position of every word, exactly as before.
    var allWords = sentance1.Concat(sentance2).Distinct().ToList();

    // Resolve word -> position once so that counting is a hash lookup per token
    // rather than a linear scan of the vocabulary per token.
    var indexByWord = new Dictionary<string, int>(allWords.Count);
    for (var i = 0; i < allWords.Count; i++)
    {
        // A dictionary cannot hold a null key; null tokens use the fallback below.
        if (allWords[i] != null)
        {
            indexByWord.Add(allWords[i], i);
        }
    }

    var v1 = new DenseVector(allWords.Count);
    var v2 = new DenseVector(allWords.Count);

    CountSentanceWords(sentance1, allWords, indexByWord, v1);
    CountSentanceWords(sentance2, allWords, indexByWord, v2);

    return 1 - Utils.CosineSimilarity(v1, v2);
}

/// <summary>
/// Adds one to the vector component of every non-stop-word token in <paramref name="sentance"/>.
/// </summary>
private void CountSentanceWords(IList<string> sentance, List<string> allWords, Dictionary<string, int> indexByWord, DenseVector counts)
{
    foreach (var word in sentance)
    {
        if (_stopWordFilter.IsStopWord(word))
        {
            continue;
        }

        int index;
        if (word == null || !indexByWord.TryGetValue(word, out index))
        {
            // Only reachable for a null token: keep the original linear lookup
            // so such input behaves exactly as it did before.
            index = allWords.IndexOf(word);
        }

        counts[index] += 1;
    }
}
/// <summary>
/// Creates an extractor that uses the stop-word filter and stemmer which
/// <paramref name="nlpServices"/> supplies for <paramref name="lang"/>.
/// </summary>
/// <param name="nlpServices">Provider queried for the stop-word filter and the stemmer.</param>
/// <param name="lang">Language identifier passed to the provider.</param>
public KeywordExtractor(INlpServiceProvider nlpServices, string lang)
{
    _stopWordFilter = nlpServices.GetStopWordFilter(lang);
    _wordStemmer = nlpServices.GetStemmer(lang);

    // Keep a token only when the stop-word filter does not flag it.
    _filter = token => !_stopWordFilter.IsStopWord(token);

    // Map a token to its stem.
    _mapper = token => _wordStemmer.Stem(token);
}
/// <summary>
/// Groups consecutive tokens into space-separated phrases, ending the current
/// phrase at every punctuation token or stop word (those tokens are dropped).
/// Note: this method has side-effects. Besides returning the phrases, it replaces
/// the internal index of unique words with a fresh set holding every kept token.
/// </summary>
/// <param name="tokens">Tokens to group, in reading order.</param>
/// <returns>The phrases in the order they were completed.</returns>
public string[] ToPhrases(string[] tokens)
{
    _uniqueWords = new SortedSet<string>();

    var collected = new List<string>();
    string pending = string.Empty;

    foreach (string token in tokens)
    {
        bool isBoundary = _stopWords.IsPunctuation(token) || _stopWords.IsStopWord(token);

        if (!isBoundary)
        {
            _uniqueWords.Add(token);

            // Start a new phrase with the token, or extend the open one.
            pending = pending.Length == 0 ? token : pending + " " + token;
            continue;
        }

        // A boundary token is discarded; it only closes the open phrase, if any.
        if (pending.Length > 0)
        {
            collected.Add(pending);
            pending = string.Empty;
        }
    }

    // Flush the phrase still open when the tokens run out.
    if (pending.Length > 0)
    {
        collected.Add(pending);
    }

    return collected.ToArray();
}