/// <summary>
/// Gets the relative priority of the words in a sentence.
/// 1. Removes question words and common (frequent) words.
/// 2. Gives each remaining word a priority equal to the highest frequency found in
///    <paramref name="docWordFrequency"/> minus that word's own frequency there
///    (a word that is not listed counts as frequency 0).
/// </summary>
/// <param name="sentence">Sentence whose words are prioritised.</param>
/// <param name="docWordFrequency">
/// Word frequencies of the document. A null or empty sequence is treated as
/// "no word has been seen", so every remaining word gets priority 0.
/// </param>
/// <returns>
/// The remaining words ordered by ascending priority value (i.e. the words most
/// frequent in the document come first), or null when <paramref name="sentence"/>
/// is null, empty or whitespace.
/// </returns>
public IEnumerable<WordPriority> GetWordsWithPriority(string sentence, IEnumerable<WordFrequency> docWordFrequency)
{
    if (string.IsNullOrWhiteSpace(sentence))
    {
        // Kept as null (not an empty sequence) because callers may test for it.
        return null;
    }

    var words = StringProcessor.GetWordsLower(sentence);
    var nonQuestionWords = StringProcessor.RemoveWords(words, _questionWords);
    var meaningfulWords = StringProcessor.RemoveWords(nonQuestionWords, _frequentWords);

    // Snapshot the frequencies once: the parameter is an IEnumerable, so scanning it
    // for every word would re-run a lazy source and costs O(words * frequencies).
    var frequencies = (docWordFrequency ?? Enumerable.Empty<WordFrequency>()).ToList();

    // DefaultIfEmpty() makes Max() yield the default (zero) frequency instead of
    // throwing InvalidOperationException when there are no frequencies at all.
    var standardUpperLimit = frequencies.Select(f => f.Frequency).DefaultIfEmpty().Max();

    // First entry wins for a repeated word, matching the FirstOrDefault lookup
    // this dictionary replaces.
    var frequencyByWord = new Dictionary<string, WordFrequency>();
    foreach (var entry in frequencies)
    {
        if (entry.Word != null && !frequencyByWord.ContainsKey(entry.Word))
        {
            frequencyByWord.Add(entry.Word, entry);
        }
    }

    return meaningfulWords
        .Select(w =>
        {
            WordFrequency match;
            var found = w != null && frequencyByWord.TryGetValue(w, out match)
                ? match
                : null;

            return new WordPriority
            {
                Word = w,
                // Take 0 as the frequency when the word is not in the document.
                Priority = standardUpperLimit - (found?.Frequency ?? 0)
            };
        })
        .OrderBy(wp => wp.Priority);
}
/// <summary>
/// Checks that <c>StringProcessor.GetWordsLower</c> returns one entry for each of the
/// 16 words in a sentence that contains a colon, commas, an apostrophe and an
/// accented character.
/// </summary>
public void GetWordsLower_should_get_all_words()
{
    var sentence = @"There are three species of zebras: the plains zebra, the Grévy's zebra and the mountain zebra.";

    var words = StringProcessor.GetWordsLower(sentence);

    // AreEqual reports expected vs. actual on failure; IsTrue(count == 16) only says "false".
    Assert.AreEqual(16, words.Count());
}
/// <summary>
/// Builds a content index for a paragraph: one entry per sentence holding that
/// sentence's word frequencies, plus the word frequencies combined over all sentences.
/// </summary>
/// <param name="paragraph">Text to index.</param>
/// <returns>
/// The populated index, or null when <paramref name="paragraph"/> is null, empty
/// or whitespace.
/// </returns>
public ContentIndex Build(string paragraph)
{
    if (string.IsNullOrWhiteSpace(paragraph))
    {
        return null;
    }

    // Sentence ids are 1-based and follow the order in which sentences are returned.
    List<LineIndex> indexedLines = StringProcessor.GetSentences(paragraph)
        .Select((sentence, position) => new LineIndex
        {
            Id = position + 1,
            Line = sentence,
            WordIndex = GetWordFrequency(StringProcessor.GetWordsLower(sentence)).ToList()
        })
        .ToList();

    var perLineFrequencies = indexedLines.Select(entry => entry.WordIndex);
    List<WordFrequency> combinedFrequencies = GetCombinedWordFrequency(perLineFrequencies).ToList();

    return new ContentIndex
    {
        Content = paragraph,
        Lines = indexedLines,
        AllWords = combinedFrequencies
    };
}