Exemple #1
0
        /// <summary>
        /// Feeds five sentence fragments, all built from the same five-word list,
        /// to <c>WordSequenceParser.CountWordOccurancesInSequences</c> and checks
        /// the count reported for every returned word.
        /// </summary>
        public void CountWordOccurancesInSequences()
        {
            const int fragmentCount = 5;

            var parser = new WordSequenceParser(new Stemmer(), 5);

            // A single word list is reused for both word-list arguments of every fragment.
            var words = new List<Word>
            {
                new Word("cats", "cat"),
                new Word("puppies", "puppy"),
                new Word("pie", "pie"),
                new Word("women", "woman"),
                new Word("feet", "foot"),
            };

            // Each fragment is its own instance; only the word list is shared.
            var fragments = new List<SentenceFragment>();
            for (int i = 0; i < fragmentCount; i++)
            {
                fragments.Add(new SentenceFragment("keyword", words, words));
            }

            IEnumerable<CountedWord> counts = parser.CountWordOccurancesInSequences(fragments);

            // Every counted word is expected to report exactly 4 occurrences.
            foreach (CountedWord counted in counts)
            {
                Assert.AreEqual(4, counted.Count);
            }
        }
        /// <summary>
        /// Loads text segments via <c>SqliteDAO.GetText</c>, extracts the sentence
        /// fragments found for <paramref name="keyword"/>, and returns the
        /// top-ranked words together with example fragments that contain any of them.
        /// </summary>
        /// <param name="filepath">Forwarded to <c>SqliteDAO.GetText</c> (presumably the database file path; confirm against the DAO).</param>
        /// <param name="keyword">Keyword forwarded to the DAO, the tokenizer and the fragment search.</param>
        /// <param name="start">Start of the date range forwarded to the DAO.</param>
        /// <param name="end">End of the date range forwarded to the DAO.</param>
        /// <param name="resultLimit">Maximum number of words and, separately, of example fragments returned.</param>
        /// <returns>
        /// A tuple of the selected words (at most <paramref name="resultLimit"/>) and
        /// the distinct example fragments (at most <paramref name="resultLimit"/>).
        /// </returns>
        static (List<Word> Words, List<SentenceFragment> SentenceFragment) Query(string filepath, string keyword, DateTime start, DateTime end, int resultLimit)
        {
            // Second constructor argument of WordSequenceParser
            // (presumably the context window around the keyword; confirm).
            const int MaxWordsAround = 5;

            // NOTE: the DAO limit is SqlLimit (declared outside this method), not resultLimit.
            List<string> textSegments = SqliteDAO.GetText(filepath, start, end, keyword, SqlLimit);
            var wordSequenceParser = new WordSequenceParser(new Stemmer(), MaxWordsAround);

            // Collect the fragments of every text segment, in segment order.
            var sentenceFragments = new List<SentenceFragment>();
            foreach (string text in textSegments)
            {
                sentenceFragments.AddRange(wordSequenceParser.FindSentenceFragments(Tokenizer.GetTokens(text, keyword), keyword));
            }

            IEnumerable<CountedWord> countedStems = wordSequenceParser.CountWordOccurancesInSequences(sentenceFragments);

            // Keep counted, non-stopword words longer than two characters, ordered by
            // CountedWordComparer. Declared at first assignment: the previous up-front
            // "new List" was overwritten here without ever being read.
            List<Word> importantStems = countedStems
                .Where(x => x.Count > 0 && !WordService.IsStopword(x.Word.FullWord) && x.Word.FullWord.Length > 2)
                .OrderBy(x => x, new CountedWordComparer())
                .Select(x => x.Word)
                .Take(resultLimit)
                .ToList();

            // Examples: distinct fragments (per SentenceFragmentEqualityComparer)
            // that contain at least one of the selected words.
            List<SentenceFragment> exampleFragments = sentenceFragments
                .Where(s => s.ContainsAnyStems(importantStems))
                .Distinct(new SentenceFragmentEqualityComparer())
                .Take(resultLimit)
                .ToList();

            return (importantStems, exampleFragments);
        }
Exemple #3
0
        /// <summary>
        /// Tokenizes a sample text for the keyword "jews" and checks that
        /// <c>WordSequenceParser.FindSentenceFragments</c> returns two fragments.
        /// </summary>
        public void FindSentenceFragments()
        {
            const string keyword    = "jews";
            const string sampleText = "Association services, namely, promoting the interests of local, national and international Jewish communities, namely, Jewish chaplains, Jews in the armed forces and military veterans; Charitable services, namely, providing assistance, food, spiritual guidance and training to Jews in the armed forces and military veterans";

            IEnumerable<string> tokens = Tokenizer.GetTokens(sampleText, keyword);
            var parser = new WordSequenceParser(new Stemmer(), 5);

            // The parser is handed a List<string> copy of the token sequence.
            var tokenList = new List<string>(tokens);
            List<SentenceFragment> fragments = parser.FindSentenceFragments(tokenList, keyword);

            Assert.AreEqual(2, fragments.Count);
        }
        /// <summary>
        /// Loads text via <c>DAO.GetText</c>, joins phrases in its tokens, extracts
        /// the sentence fragments found for <paramref name="keyword"/>, and returns
        /// the top-ranked counted words together with example fragments that
        /// contain any of them.
        /// </summary>
        /// <param name="DatabaseFilepath">Forwarded to <c>DAO.GetText</c> (presumably the database file path; confirm against the DAO).</param>
        /// <param name="from">Start of the date range forwarded to the DAO.</param>
        /// <param name="to">End of the date range forwarded to the DAO.</param>
        /// <param name="classes">Class filter forwarded unchanged to the DAO.</param>
        /// <param name="keyword">Keyword forwarded to the DAO, the tokenizer and the fragment search.</param>
        /// <param name="selectLimit">Limit forwarded to the DAO.</param>
        /// <param name="useFilingDate">Flag forwarded unchanged to the DAO.</param>
        /// <param name="maxWordsAround">Second constructor argument of <c>WordSequenceParser</c>.</param>
        /// <param name="resultsLimit">Maximum number of words and, separately, of example fragments returned.</param>
        /// <returns>
        /// A tuple of the selected counted words and the distinct example fragments.
        /// Both lists are empty (never null) when the DAO yields no text.
        /// </returns>
        public static (List<CountedWord> ImportantWords, List<SentenceFragment> ExampleFragments) Search(
            string DatabaseFilepath,
            DateTime from,
            DateTime to,
            List<int> classes,
            string keyword,
            int selectLimit,
            bool useFilingDate,
            int maxWordsAround,
            int resultsLimit)
        {
            string text = DAO.GetText(DatabaseFilepath, from, to, classes, keyword, selectLimit, useFilingDate);

            // Nothing to analyse. IsNullOrEmpty also covers a null result, on which
            // the former "text.Length > 0" check would throw NullReferenceException.
            if (string.IsNullOrEmpty(text))
            {
                return (new List<CountedWord>(), new List<SentenceFragment>());
            }

            var wordSequenceParser = new WordSequenceParser(new Stemmer(), maxWordsAround);
            var words = Tokenizer.GetTokens(text, keyword);

            // JoinAllPhrases may replace "words" (passed by ref); the arguments 2 and 1
            // are handed over as-is and their meaning is defined by PhraseParser.
            var phraseParser = new PhraseParser();
            phraseParser.JoinAllPhrases(ref words, 2, 1);

            List<SentenceFragment> sentenceFragments = wordSequenceParser.FindSentenceFragments(new List<string>(words), keyword);
            IEnumerable<CountedWord> countedStems = wordSequenceParser.CountWordOccurancesInSequences(sentenceFragments);

            // Keep counted, non-stopword words longer than two characters, ordered by
            // CountedWordComparer, capped at resultsLimit.
            List<CountedWord> importantWords = countedStems
                .Where(x => x.Count > 0 && !WordService.IsStopword(x.Word.FullWord) && x.Word.FullWord.Length > 2)
                .OrderBy(x => x, new CountedWordComparer())
                .Take(resultsLimit)
                .ToList();

            // Project the words once instead of rebuilding the projection for every
            // fragment inside the filter below. The static type stays IEnumerable<Word>
            // so the same ContainsAnyStems overload is selected as before.
            IEnumerable<Word> importantStems = importantWords.Select(w => w.Word).ToList();

            // Examples: distinct fragments (per SentenceFragmentEqualityComparer)
            // that contain at least one of the selected words.
            List<SentenceFragment> exampleFragments = sentenceFragments
                .Where(s => s.ContainsAnyStems(importantStems))
                .Distinct(new SentenceFragmentEqualityComparer())
                .Take(resultsLimit)
                .ToList();

            return (importantWords, exampleFragments);
        }