/// <summary>
/// Checks a handful of fixed words against the stems the Porter stemmer is expected to produce.
/// </summary>
public void testStemming()
{
    var stemmer = new PorterStemmer();

    // Each entry is { input word, expected stem }; order matches the assertions one by one.
    var cases = new[]
    {
        new[] { "deny", "deni" },
        new[] { "declining", "declin" },
        new[] { "diversity", "divers" },
        new[] { "divers", "diver" },
        new[] { "dental", "dental" },
    };

    foreach (var entry in cases)
    {
        Assert.AreEqual(entry[1], stemmer.Stem(entry[0]));
    }
}
/// <summary>
/// Advances the token stream. Each input term is replaced by its stem; when the
/// stem differs from the input, the unstemmed original is kept aside and emitted
/// as its own token on the following call.
/// </summary>
/// <returns>
/// <c>true</c> if a token was produced; <c>false</c> once the wrapped input is exhausted.
/// </returns>
public override bool IncrementToken()
{
    // A pending original from the previous call is emitted before reading more input.
    if (orginal != null)
    {
        termAtt.SetTermBuffer(orginal, 0, orginal.Length);
        orginal = null;
        return true;
    }

    if (!input.IncrementToken())
    {
        return false;
    }

    // Snapshot the term before the stemmer gets a chance to overwrite it.
    int length = termAtt.TermLength();
    orginal = new char[length];
    Array.Copy(termAtt.TermBuffer(), 0, orginal, 0, length);

    if (!stemmer.Stem(termAtt.TermBuffer(), 0, length))
    {
        // Assumes Stem returns false when the term is left unchanged (as Lucene's
        // PorterStemmer does). Without clearing the snapshot here, the identical
        // term would be emitted a second time on the next call.
        orginal = null;
        return true;
    }

    termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);

    // Nothing to replay if the stem ended up identical to the original.
    if (CharArrayIsEqual(orginal, termAtt.TermBuffer()))
    {
        orginal = null;
    }

    return true;
}
/// <summary>
/// Runs the stemmer over every "word stem" pair listed in the embedded
/// StemmerTestCases.txt resource and checks each result.
/// </summary>
public void StemWordTest()
{
    var porter = new PorterStemmer();
    var buffer = new StringBuilder();
    var separators = new[] { ' ' };
    var testType = typeof(PorterStemmerTests);

    using (var resource = testType.Assembly.GetManifestResourceStream(testType, "StemmerTestCases.txt"))
    using (var lines = new StreamReader(resource))
    {
        for (string line = lines.ReadLine(); line != null; line = lines.ReadLine())
        {
            // Every line must hold exactly two space-separated fields.
            string[] parts = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 2)
            {
                throw new Exception("Expected an array of two - word, stemmed word");
            }

            string word = parts[0];
            string expectedStem = parts[1];

            // The stemmer works in place on the builder, so reset it for each word.
            buffer.Length = 0;
            buffer.Append(word);
            porter.Stem(buffer);

            buffer.ToString().Should().Be(expectedStem, because: "Stemming {0}", word);
        }
    }
}
/// <summary>
/// Parameterised check: stemming <paramref name="word"/> must yield <paramref name="wordExpected"/>.
/// </summary>
/// <param name="word">The word handed to the stemmer.</param>
/// <param name="wordExpected">The stem the stemmer is expected to return.</param>
public void Stem(string word, string wordExpected)
{
    // A fresh stemmer is created for every case.
    Assert.AreEqual(wordExpected, new PorterStemmer().Stem(word));
}
/// <summary>
/// Stems every key of <c>dic</c> with an "en" stemmer and checks the result
/// against the value stored for that key.
/// </summary>
public void TestFromPorter()
{
    var englishStemmer = new PorterStemmer("en");

    foreach (var entry in dic)
    {
        var actual = englishStemmer.Stem(entry.Key);
        var expected = entry.Value;

        Assert.AreEqual(expected, actual);
    }
}
/// <summary>
/// Finds the documents that best match a free-text query.
/// The query is split on spaces, stripped of non-letter characters, lower-cased,
/// filtered against the stop-word list and stemmed. Documents are then scored by
/// the share of their total word count that the query's stems account for, and
/// the ten highest-scoring documents are returned.
/// </summary>
/// <param name="input">The raw query text. Null, empty or whitespace-only input yields no results.</param>
/// <returns>Up to ten matching documents; an empty list when nothing matches.</returns>
public static List<DocumentResult> Search(string input)
{
    if (string.IsNullOrWhiteSpace(input))
    {
        return new List<DocumentResult>();
    }

    // Keep only the letters of each word, lower-case it, and drop stop words as well as
    // tokens that became empty because they held no letters at all (e.g. "123").
    var tokens = input.Split(" ", StringSplitOptions.RemoveEmptyEntries)
        .Select(x => new string(x.Where(c => char.IsLetter(c)).ToArray()).ToLower())
        .Where(x => x.Length > 0 && !BLL.Constants.stopwords.Contains(x))
        .Distinct();

    // Stem every remaining token.
    PorterStemmer stem = new PorterStemmer();
    var stemmedTokens = new List<string>();
    foreach (var word in tokens)
    {
        stem.SetCurrent(word);
        stem.Stem();
        stemmedTokens.Add(stem.Current);
    }

    // Different words can share a stem, so de-duplicate again after stemming.
    var words = stemmedTokens.Distinct().ToList();

    // Ids of the known words that match the query stems.
    var wordIds = _context.Word.Where(x => words.Contains(x.Word1)).Select(x => x.WordId).ToList();
    if (wordIds.Count == 0)
    {
        // No known word matches, so no document can; skip loading the word counts.
        return new List<DocumentResult>();
    }

    // Load the per-document word counts once and reuse them for both totals and matches.
    var documentWords = _context.DocumentWord.ToList();
    var totalCounts = documentWords
        .GroupBy(g => g.DocumentId)
        .ToDictionary(x => x.Key, x => x.Sum(z => z.Count));

    // Score = matched count / total count. The casts force floating-point division;
    // integer division would collapse almost every score to 0.
    var documentIds = documentWords
        .Where(x => wordIds.Contains(x.WordId))
        .GroupBy(g => g.DocumentId)
        .Select(y => new
        {
            DocumentId = y.Key,
            Score = totalCounts[y.Key] != 0
                ? (double)y.Sum(z => z.Count) / (double)totalCounts[y.Key]
                : 0d
        })
        .OrderByDescending(c => c.Score)
        .Take(10)
        .Select(x => x.DocumentId)
        .ToList();

    var subtitles = _context.Subtitle
        .Select(x => new { x.SubtitleId, x.SubtitleName, x.SubtitleNumber })
        .ToDictionary(x => x.SubtitleId, x => new { x.SubtitleName, x.SubtitleNumber });

    // NOTE(review): this query does not re-apply the score ordering, so the returned
    // list is in whatever order the data store yields — confirm whether callers expect rank order.
    List<DocumentResult> documents = _context.Document
        .Where(x => documentIds.Contains(x.DocumentId))
        .Include(j => j.Title)
        .Select(y => new DocumentResult
        {
            DocumentText = y.DocumentText,
            SubtitleName = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleName : "",
            SubtitleNumber = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleNumber : "",
            TitleName = y.Title.TitleName,
            TitleNumber = y.Title.TitleNumber,
            Citation = y.UniversalCitation,
            DocumentHeader = y.DocumentHeader
        })
        .ToList();

    return documents;
}
/// <summary>
/// Stems <paramref name="word"/> with the plural stemmer and post-processes the
/// result for words ending in "ing": when the stem is exactly two characters
/// shorter than the word and ends in "e", that trailing "e" is removed.
/// </summary>
/// <param name="word">The word to convert.</param>
/// <returns>The stemmed word, without the trailing "e" in the "-ing" case described above.</returns>
private string ConvertFromPluralToSingular(string word)
{
    string result = pluralStemmer.Stem(word);

    // Only "-ing" words need the correction. Ordinal comparison is used because the
    // suffixes are fixed ASCII text and must not vary with the current culture.
    if (word == null || !word.EndsWith("ing", System.StringComparison.Ordinal))
    {
        return result;
    }

    // Example from the original note: "gaming" is stemmed to "game", i.e. "ing" was
    // replaced by "e", leaving the stem two characters shorter. Strip that "e" again.
    bool shortenedByTwo = word.Length - result.Length == 2;
    if (shortenedByTwo && result.EndsWith("e", System.StringComparison.Ordinal))
    {
        result = result.Substring(0, result.Length - 1);
    }

    return result;
}