/// <summary>
        /// Gets the relative priority of the meaningful words in a sentence.
        /// 1. Splits the sentence into words and removes question words and frequent (common) words.
        /// 2. Gives every remaining word the priority
        ///    (highest frequency in the document) - (frequency of that word in the document),
        ///    where a word that does not occur in the document counts as frequency 0.
        /// The result is ordered by ascending priority, so the words that are most frequent
        /// in the document come first.
        /// </summary>
        /// <param name="sentence">Sentence whose words are ranked.</param>
        /// <param name="docWordFrequency">
        /// Word frequencies of the document the sentence is ranked against. May be empty, in which
        /// case every word gets priority 0. It is enumerated exactly once.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="sentence"/> is null, empty or whitespace; otherwise the
        /// remaining words with their priorities, already materialized.
        /// </returns>
        public IEnumerable <WordPriority> GetWordsWithPriority(string sentence,
                                                               IEnumerable <WordFrequency> docWordFrequency)
        {
            if (string.IsNullOrWhiteSpace(sentence))
            {
                // Kept as null (not an empty sequence) so existing callers that test for null still work.
                return null;
            }

            var words            = StringProcessor.GetWordsLower(sentence);
            var nonQuestionWords = StringProcessor.RemoveWords(words, _questionWords);
            var meaningfulWords  = StringProcessor.RemoveWords(nonQuestionWords, _frequentWords);

            // Snapshot the document frequencies once: the incoming sequence may be lazy or expensive,
            // and it used to be re-scanned for every single word of the sentence.
            var frequencies = docWordFrequency.ToList();

            // Max() throws on an empty sequence, so an empty document is treated as "all frequencies are 0".
            var standardUpperLimit = frequencies.Count == 0 ? 0 : frequencies.Max(w => w.Frequency);

            // Group by word for constant-time access. Entries keep their original order inside a group,
            // so FirstOrDefault() below still picks the first matching entry, exactly as before.
            var frequencyByWord = frequencies.ToLookup(f => f.Word);

            // Lower priority value == more frequent in the document; unknown words end up last.
            return meaningfulWords
                   .Select(w => new WordPriority
                   {
                       Word     = w,
                       Priority = standardUpperLimit - (frequencyByWord[w].FirstOrDefault()?.Frequency ?? 0)
                   })
                   .OrderBy(wp => wp.Priority)
                   .ToList();
        }
        /// <summary>
        /// Checks that <c>StringProcessor.GetWordsLower</c> returns 16 words for a sentence that
        /// contains punctuation, an apostrophe and a non-ASCII letter.
        /// </summary>
        public void GetWordsLower_should_get_all_words()
        {
            var sentence = @"There are three species of zebras: the plains zebra, the Grévy's zebra and the mountain zebra.";

            var words = StringProcessor.GetWordsLower(sentence);

            // AreEqual (instead of IsTrue on a comparison) reports the expected and the actual count on failure.
            Assert.AreEqual(16, words.Count());
        }
        // Example #3 / 0 -- snippet separator that appears to be left over from extraction; not part of the C# code.
        /// <summary>
        /// Builds a <see cref="ContentIndex"/> for a paragraph: one <see cref="LineIndex"/> per sentence
        /// (numbered from 1 in the order the sentences are returned) plus the combined word
        /// frequencies of all sentences.
        /// </summary>
        /// <param name="paragraph">Text to index.</param>
        /// <returns>
        /// <c>null</c> when <paramref name="paragraph"/> is null, empty or whitespace; otherwise the populated index.
        /// </returns>
        public ContentIndex Build(string paragraph)
        {
            if (string.IsNullOrWhiteSpace(paragraph))
            {
                return null;
            }

            var index = new ContentIndex
            {
                Content  = paragraph,
                Lines    = new List <LineIndex>(),
                AllWords = new List <WordFrequency>()
            };

            // Sentence ids are 1-based and follow the order produced by GetSentences.
            var nextId = 1;

            foreach (var sentence in StringProcessor.GetSentences(paragraph))
            {
                var tokens = StringProcessor.GetWordsLower(sentence);

                var entry = new LineIndex
                {
                    Id        = nextId,
                    Line      = sentence,
                    WordIndex = GetWordFrequency(tokens).ToList()
                };

                index.Lines.Add(entry);
                nextId++;
            }

            // Merge the per-sentence word frequencies into one list for the whole paragraph.
            var perLineFrequencies = index.Lines.Select(l => l.WordIndex);
            index.AllWords = GetCombinedWordFrequency(perLineFrequencies).ToList();

            return index;
        }