Exemple #1
0
        /// <summary>
        /// Checks a handful of known word/stem pairs against <c>PorterStemmer.Stem</c>.
        /// </summary>
        public void testStemming()
        {
            var porter = new PorterStemmer();

            // Each row is { input word, expected stem }; rows are checked in order.
            var expectations = new[]
            {
                new[] { "deny", "deni" },
                new[] { "declining", "declin" },
                new[] { "diversity", "divers" },
                new[] { "divers", "diver" },
                new[] { "dental", "dental" },
            };

            foreach (var row in expectations)
            {
                string word         = row[0];
                string expectedStem = row[1];

                Assert.AreEqual(expectedStem, porter.Stem(word));
            }
        }
        /// <summary>
        /// Advances to the next term. A term pulled from <c>input</c> is copied into
        /// <c>orginal</c> and then run through <c>stemmer</c>; if a copy is still held
        /// when the method is called again, that copy is emitted instead of reading
        /// more input.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the term attribute holds a term to consume;
        /// <c>false</c> when <c>input</c> has no more tokens.
        /// </returns>
        public override bool IncrementToken()
        {
            // A copy saved by the previous call is emitted first, then forgotten.
            if (orginal != null)
            {
                termAtt.SetTermBuffer(orginal, 0, orginal.Length);
                orginal = null;
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }

            // Snapshot the incoming term before the stemmer gets to see it.
            int    termLength = termAtt.TermLength();
            char[] termChars  = termAtt.TermBuffer();

            orginal = new char[termLength];
            Array.Copy(termChars, 0, orginal, 0, termLength);

            // NOTE(review): when Stem returns false the saved copy is kept, so the same
            // term is emitted again on the next call - confirm this is intended.
            bool stemmed = stemmer.Stem(termChars, 0, termLength);
            if (!stemmed)
            {
                return true;
            }

            termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);

            // Drop the saved copy when the helper reports it equal to the new buffer.
            // The buffer is re-read here because SetTermBuffer has just replaced its contents.
            if (CharArrayIsEqual(orginal, termAtt.TermBuffer()))
            {
                orginal = null;
            }

            return true;
        }
        /// <summary>
        /// Runs the stemmer over every "word stem" pair listed in the embedded
        /// <c>StemmerTestCases.txt</c> resource and checks each result.
        /// </summary>
        public void StemWordTest()
        {
            var porter      = new PorterStemmer();
            var scratch     = new StringBuilder();
            var separators  = new[] { ' ' };
            var fixtureType = typeof(PorterStemmerTests);

            using (var resource = fixtureType.Assembly.GetManifestResourceStream(fixtureType, "StemmerTestCases.txt"))
            using (var lines = new StreamReader(resource))
            {
                for (string line = lines.ReadLine(); line != null; line = lines.ReadLine())
                {
                    // Every line must hold exactly two space-separated columns.
                    string[] columns = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                    if (columns.Length != 2)
                    {
                        throw new Exception("Expected an array of two - word, stemmed word");
                    }

                    string word         = columns[0];
                    string expectedStem = columns[1];

                    // The stemmer works in place on the builder, so reset it for each word.
                    scratch.Clear();
                    scratch.Append(word);
                    porter.Stem(scratch);

                    scratch.ToString().Should().Be(expectedStem, because: "Stemming {0}", word);
                }
            }
        }
        /// <summary>
        /// Asserts that a fresh <c>PorterStemmer</c> stems <paramref name="word"/>
        /// to <paramref name="wordExpected"/>.
        /// </summary>
        /// <param name="word">Word handed to the stemmer.</param>
        /// <param name="wordExpected">Stem the test expects back.</param>
        public void Stem(string word, string wordExpected)
        {
            Assert.AreEqual(wordExpected, new PorterStemmer().Stem(word));
        }
Exemple #5
0
        /// <summary>
        /// Stems every key of <c>dic</c> with a stemmer constructed for "en" and
        /// compares the result with the value stored under that key.
        /// </summary>
        public void TestFromPorter()
        {
            var englishStemmer = new PorterStemmer("en");

            foreach (var expectation in dic)
            {
                // Key = word to stem, Value = expected stem.
                Assert.AreEqual(expectation.Value, englishStemmer.Stem(expectation.Key));
            }
        }
Exemple #6
0
        /// <summary>
        /// Finds the documents that best match a free-text query.
        /// </summary>
        /// <remarks>
        /// The query is split on spaces, stripped of every non-letter character,
        /// lower-cased, filtered against <c>BLL.Constants.stopwords</c> and stemmed.
        /// Each document containing at least one of the resulting stems is scored as
        /// (sum of counts for the matching words) / (sum of counts for all of the
        /// document's words), and the ten highest-scoring documents are loaded.
        /// </remarks>
        /// <param name="input">Raw query text. Null or blank input yields an empty list.</param>
        /// <returns>
        /// At most ten <see cref="DocumentResult"/> items; empty when nothing matches.
        /// </returns>
        public static List <DocumentResult> Search(string input)
        {
            if (string.IsNullOrWhiteSpace(input))
            {
                return(new List <DocumentResult>());
            }

            // Keep only the letters of each token, lower-case it, then drop tokens that
            // ended up empty (pure digits/symbols) as well as stop words and duplicates.
            var tokens = input.Split(" ", StringSplitOptions.RemoveEmptyEntries)
                         .Select(x => new string(x.Where(c => char.IsLetter(c)).ToArray()).ToLower())
                         .Where(x => x.Length > 0 && !BLL.Constants.stopwords.Contains(x))
                         .Distinct();

            // Stem every remaining token.
            PorterStemmer stem          = new PorterStemmer();
            var           stemmedTokens = new List <string>();

            foreach (var word in tokens)
            {
                stem.SetCurrent(word);
                stem.Stem();
                stemmedTokens.Add(stem.Current);
            }

            // Different words can share a stem, so de-duplicate again.
            var words = stemmedTokens.Distinct().ToList();
            if (words.Count == 0)
            {
                return(new List <DocumentResult>());
            }

            // Resolve the stems to word ids.
            var wordIds = _context.Word.Where(x => words.Contains(x.Word1)).Select(x => x.WordId).ToList();
            if (wordIds.Count == 0)
            {
                return(new List <DocumentResult>());
            }

            // Load the document/word table once and reuse it for both the per-document
            // totals and the per-document matches.
            var documentWords = _context.DocumentWord.ToList();
            var totalcounts   = documentWords.GroupBy(g => g.DocumentId).ToDictionary(x => x.Key, x => x.Sum(z => z.Count));
            var matches       = documentWords.Where(x => wordIds.Contains(x.WordId));

            // The ratio is computed in floating point: with integer counts the division
            // would truncate to 0 for every document that is not a 100% match.
            var counts = matches.GroupBy(g => g.DocumentId).Select(y => new
            {
                DocumentId = y.Key,
                Counts     = totalcounts[y.Key] != 0 ? (double)y.Sum(z => z.Count) / (double)totalcounts[y.Key] : 0.0
            });

            var documentIds = counts.OrderByDescending(c => c.Counts).Take(10).Select(x => x.DocumentId).ToList();
            if (documentIds.Count == 0)
            {
                return(new List <DocumentResult>());
            }

            // Subtitle data is looked up in memory while projecting the results.
            var subtitles = _context.Subtitle.Select(x => new { x.SubtitleId, x.SubtitleName, x.SubtitleNumber }).ToDictionary(x => x.SubtitleId, x => new { x.SubtitleName, x.SubtitleNumber });

            // NOTE(review): the final query has no ordering, so the results come back in
            // whatever order the database returns them, not in score order - confirm
            // whether callers expect ranked output.
            List <DocumentResult> documents = _context.Document.Where(x => documentIds.Contains(x.DocumentId)).Include(j => j.Title).Select(y => new DocumentResult
            {
                DocumentText   = y.DocumentText,
                SubtitleName   = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleName : "",
                SubtitleNumber = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleNumber : "",
                TitleName      = y.Title.TitleName,
                TitleNumber    = y.Title.TitleNumber,
                Citation       = y.UniversalCitation,
                DocumentHeader = y.DocumentHeader
            }).ToList();

            return(documents);
        }
Exemple #7
0
        /// <summary>
        /// Stems <paramref name="word"/> with <c>pluralStemmer</c> and then adjusts the
        /// stem of words that end in "ing".
        /// </summary>
        /// <remarks>
        /// When the word ends in "ing" and the stem ends in "e" and is exactly two
        /// characters shorter than the word (e.g. "gaming" stemmed to "game"), the
        /// trailing "e" is removed from the stem.
        /// NOTE(review): presumably this undoes an "e" the stemmer appended - confirm.
        /// </remarks>
        /// <param name="word">Word to stem; passed to the stemmer as-is, even when null.</param>
        /// <returns>The stem, possibly with its trailing "e" removed.</returns>
        private string ConvertFromPluralToSingular(string word)
        {
            string result = pluralStemmer.Stem(word);

            // Nothing to adjust without both the word and its stem.
            if (word == null || result == null)
            {
                return(result);
            }

            // Suffix checks are plain character comparisons, so use ordinal matching
            // rather than the culture-sensitive default.
            bool endsWithIng = word.EndsWith("ing", System.StringComparison.Ordinal);

            if (endsWithIng &&
                result.EndsWith("e", System.StringComparison.Ordinal) &&
                word.Length - result.Length == 2)
            {
                result = result.Substring(0, result.Length - 1);
            }

            return(result);
        }