/// <summary>
/// Feeds several identical sentence fragments to
/// <c>WordSequenceParser.CountWordOccurancesInSequences</c> and checks the count
/// reported for every returned word.
/// </summary>
public void CountWordOccurancesInSequences()
{
    const int fragmentCount = 5;

    // NOTE(review): five fragments are supplied but a count of 4 is expected;
    // the counting rule lives in WordSequenceParser and is not visible here — confirm.
    const int expectedCount = 4;

    var parser = new WordSequenceParser(new Stemmer(), 5);

    // Each Word is built from two strings; here a plural form and its singular form.
    var words = new List<Word>
    {
        new Word("cats", "cat"),
        new Word("puppies", "puppy"),
        new Word("pie", "pie"),
        new Word("women", "woman"),
        new Word("feet", "foot"),
    };

    // Every fragment is a distinct instance that shares the same word list
    // for both of its list arguments.
    var fragments = new List<SentenceFragment>();
    for (int i = 0; i < fragmentCount; i++)
    {
        fragments.Add(new SentenceFragment("keyword", words, words));
    }

    IEnumerable<CountedWord> counted = parser.CountWordOccurancesInSequences(fragments);

    foreach (CountedWord entry in counted)
    {
        Assert.AreEqual(expectedCount, entry.Count);
    }
}
/// <summary>
/// Loads the text segments matching <paramref name="keyword"/> between
/// <paramref name="start"/> and <paramref name="end"/>, extracts the sentence fragments
/// around the keyword, and returns the selected words together with example fragments.
/// </summary>
/// <param name="filepath">Passed to <c>SqliteDAO.GetText</c> as its first argument.</param>
/// <param name="keyword">Keyword used for the text lookup, tokenization and fragment search.</param>
/// <param name="start">Start of the date range passed to <c>SqliteDAO.GetText</c>.</param>
/// <param name="end">End of the date range passed to <c>SqliteDAO.GetText</c>.</param>
/// <param name="resultLimit">Maximum number of words and maximum number of fragments returned.</param>
/// <returns>
/// <c>Words</c>: up to <paramref name="resultLimit"/> words with a positive count that are not
/// stopwords and are longer than two characters, ordered by <c>CountedWordComparer</c>.
/// <c>SentenceFragment</c>: up to <paramref name="resultLimit"/> distinct fragments
/// (per <c>SentenceFragmentEqualityComparer</c>) for which <c>ContainsAnyStems</c> is true
/// for those words.
/// </returns>
static (List<Word> Words, List<SentenceFragment> SentenceFragment) Query(string filepath, string keyword, DateTime start, DateTime end, int resultLimit)
{
    // settings
    const int MaxWordsAround = 5;

    List<string> textSegments = SqliteDAO.GetText(filepath, start, end, keyword, SqlLimit);

    WordSequenceParser wordSequenceParser = new WordSequenceParser(new Stemmer(), MaxWordsAround);

    // Collect the fragments found in every text segment into a single list.
    List<SentenceFragment> sentenceFragments = new List<SentenceFragment>();
    foreach (string text in textSegments)
    {
        sentenceFragments.AddRange(
            wordSequenceParser.FindSentenceFragments(Tokenizer.GetTokens(text, keyword), keyword));
    }

    IEnumerable<CountedWord> countedStems = wordSequenceParser.CountWordOccurancesInSequences(sentenceFragments);

    // Filter before ordering so that only candidate words are sorted.
    List<Word> importantStems = countedStems
        .Where(x => x.Count > 0
                    && !WordService.IsStopword(x.Word.FullWord)
                    && x.Word.FullWord.Length > 2)
        .OrderBy(x => x, new CountedWordComparer())
        .Select(x => x.Word)
        .Take(resultLimit)
        .ToList();

    List<SentenceFragment> exampleFragments = sentenceFragments
        .Where(s => s.ContainsAnyStems(importantStems))
        .Distinct(new SentenceFragmentEqualityComparer())
        .Take(resultLimit)
        .ToList();

    return (importantStems, exampleFragments);
}
/// <summary>
/// Fetches text through <c>DAO.GetText</c>, extracts the sentence fragments around
/// <paramref name="keyword"/>, and returns the selected counted words together with
/// example fragments.
/// </summary>
/// <param name="DatabaseFilepath">Passed to <c>DAO.GetText</c> as its first argument.</param>
/// <param name="from">Start of the date range passed to <c>DAO.GetText</c>.</param>
/// <param name="to">End of the date range passed to <c>DAO.GetText</c>.</param>
/// <param name="classes">Class identifiers forwarded to <c>DAO.GetText</c>.</param>
/// <param name="keyword">Keyword used for the text lookup, tokenization and fragment search.</param>
/// <param name="selectLimit">Limit forwarded to <c>DAO.GetText</c>.</param>
/// <param name="useFilingDate">Flag forwarded to <c>DAO.GetText</c>.</param>
/// <param name="maxWordsAround">Second constructor argument of <c>WordSequenceParser</c>.</param>
/// <param name="resultsLimit">Maximum number of words and maximum number of fragments returned.</param>
/// <returns>
/// <c>ImportantWords</c>: up to <paramref name="resultsLimit"/> counted words with a positive
/// count that are not stopwords and are longer than two characters, ordered by
/// <c>CountedWordComparer</c>. <c>ExampleFragments</c>: up to <paramref name="resultsLimit"/>
/// distinct fragments (per <c>SentenceFragmentEqualityComparer</c>) for which
/// <c>ContainsAnyStems</c> is true for those words. Both lists are empty when no text is found.
/// </returns>
public static (List<CountedWord> ImportantWords, List<SentenceFragment> ExampleFragments) Search(string DatabaseFilepath, DateTime from, DateTime to, List<int> classes, string keyword, int selectLimit, bool useFilingDate, int maxWordsAround, int resultsLimit)
{
    string text = DAO.GetText(DatabaseFilepath, from, to, classes, keyword, selectLimit, useFilingDate);

    // Nothing to analyse: a null or empty text yields empty results instead of a
    // NullReferenceException on text.Length.
    if (string.IsNullOrEmpty(text))
    {
        return (new List<CountedWord>(), new List<SentenceFragment>());
    }

    WordSequenceParser wordSequenceParser = new WordSequenceParser(new Stemmer(), maxWordsAround);

    var words = Tokenizer.GetTokens(text, keyword);

    // JoinAllPhrases receives the token collection by reference and may replace it.
    PhraseParser phraseParser = new PhraseParser();
    phraseParser.JoinAllPhrases(ref words, 2, 1);

    List<SentenceFragment> sentenceFragments =
        wordSequenceParser.FindSentenceFragments(new List<string>(words), keyword);

    IEnumerable<CountedWord> countedStems = wordSequenceParser.CountWordOccurancesInSequences(sentenceFragments);

    List<CountedWord> importantWords = countedStems
        .Where(x => x.Count > 0
                    && !WordService.IsStopword(x.Word.FullWord)
                    && x.Word.FullWord.Length > 2)
        .OrderBy(x => x, new CountedWordComparer())
        .Take(resultsLimit)
        .ToList();

    // Project the words once up front rather than rebuilding the projection
    // for every sentence fragment inside the filter below.
    var importantStems = importantWords.Select(w => w.Word).ToList();

    List<SentenceFragment> exampleFragments = sentenceFragments
        .Where(s => s.ContainsAnyStems(importantStems))
        .Distinct(new SentenceFragmentEqualityComparer())
        .Take(resultsLimit)
        .ToList();

    return (importantWords, exampleFragments);
}