Example #1
0
        /// <summary>
        /// Rebuilds <c>sample.words</c> so that it is keyed by the stemmed form of each
        /// original key. Values whose keys stem to the same string are added together.
        /// </summary>
        /// <param name="sample">Sample whose <c>words</c> collection is replaced.</param>
        static void Stem(Sample sample)
        {
            EnglishPorter2Stemmer    stemmer      = new EnglishPorter2Stemmer();
            Dictionary <string, int> stemmedWords = new Dictionary <string, int>();

            foreach (var word in sample.words)
            {
                var stemmedKey = stemmer.Stem(word.Key).Value;

                // One lookup instead of ContainsKey + GetValueOrDefault + Remove + Add:
                // a missing key leaves runningTotal at 0, so the indexer assignment
                // handles both the first occurrence and the merge case.
                stemmedWords.TryGetValue(stemmedKey, out var runningTotal);
                stemmedWords[stemmedKey] = runningTotal + word.Value;
            }

            // The previous collection instance is still emptied before being swapped out,
            // exactly as the original code did.
            sample.words.Clear();
            sample.words = stemmedWords;
        }
Example #2
0
        public void Stem_WithBatchData_StemsAllWordsCorrectly(string unstemmed, string expected)
        {
            // Data-driven check: stemming `unstemmed` must yield exactly `expected`.
            var result = new EnglishPorter2Stemmer().Stem(unstemmed);

            Assert.AreEqual(expected, result.Value);
        }
        public PatternsFinder(string patternsFileNameJSON)
        {
            // Create the stemmer and start every lookup structure empty.
            stemmer = new EnglishPorter2Stemmer();

            _wordsTypes = new Dictionary<string, HashSet<string>>();
            _patterns = new Dictionary<string, Dictionary<string, string>>();
            _cutWords = new Dictionary<string, LinkedList<string>>();
            _keyWords = new LinkedList<string>();

            // Presumably fills the structures above from the given file — verify
            // against LoadWordsStructures; it is kept as the final step.
            LoadWordsStructures(patternsFileNameJSON);
        }
Example #4
0
        public void MarkVowelsAsConsonants_WithInitialY_MarksYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "youth" into "Youth".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("youth");

            // Assert
            Assert.AreEqual("Youth", marked);
        }
Example #5
0
        public void MarkVowelsAsConsonants_WithYBetweenTwoVowels_MarksYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "boyish" into "boYish".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("boyish");

            // Assert
            Assert.AreEqual("boYish", marked);
        }
Example #6
0
        public void MarkVowelsAsConsonants_WithVowelOnlyFollowingY_DoesNotMarkYAsConsonant()
        {
            // Expects MarkYsAsConsonants to return "flying" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("flying");

            // Assert
            Assert.AreEqual("flying", marked);
        }
Example #7
0
        public void MarkVowelsAsConsonants_WithNoVowelsButY_DoesNotMarkAnyYAsConsonant()
        {
            // Expects MarkYsAsConsonants to return "syzygy" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("syzygy");

            // Assert
            Assert.AreEqual("syzygy", marked);
        }
Example #8
0
        public void MarkVowelsAsConsonants_WithDoubledY_MarksFirstButNotSecondYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "sayyid" into "saYyid".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("sayyid");

            // Assert
            Assert.AreEqual("saYyid", marked);
        }
        public void RemoveLySuffixes_EndingInInglyAndAtProceedsThat_ReplacesSuffixWithE()
        {
            // Expects Step1BRemoveLySuffixes to turn "luxuriated" into "luxuriate".
            // NOTE(review): the method name mentions "Ingly" but the input ends in "ed".
            const string input = "luxuriated";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("luxuriate", result);
        }
        public void RemoveSPluralSuffix_WithWordEndingInApostropheSApostrophe_RemovesSuffix()
        {
            // Expects Step0RemoveSPluralSuffix to turn "holy's'" into "holy".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step0RemoveSPluralSuffix("holy's'");

            // Assert
            Assert.AreEqual("holy", result);
        }
        public void RemoveOtherSPluralSuffix_EndingInSAndContainingAVowelRightBeforeAndEarlierInWord_DeletesTheS()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "kiwis" into "kiwi".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("kiwis");

            // Assert
            Assert.AreEqual("kiwi", result);
        }
        public void RemoveOtherSPluralSuffix_WithShortWordEndingInIed_ReplaceWithIe()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "tied" into "tie".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("tied");

            // Assert
            Assert.AreEqual("tie", result);
        }
        public void RemoveLySuffixes_EndingInEedAndInR1_ReplacesSuffixWithEe()
        {
            // Expects Step1BRemoveLySuffixes to turn "inbreed" into "inbree".
            const string input = "inbreed";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("inbree", result);
        }
        public void RemoveOtherSPluralSuffix_EndingInUs_LeavesWordAlone()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to return "consensus" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("consensus");

            // Assert
            Assert.AreEqual("consensus", result);
        }
        public void RemoveLySuffixes_EndingInIngAndIsShortWord_ReplacesSuffixWithE()
        {
            // Expects Step1BRemoveLySuffixes to turn "hoping" into "hope".
            const string input = "hoping";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("hope", result);
        }
Example #16
0
        public void ReplaceYSuffix_NotPreceededyConsonant_DoesNotReplaceSuffix()
        {
            // Expects the step 1c helper to return "say" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1CReplaceSuffixYWithIIfPreceededWithConsonant("say");

            // Assert
            Assert.AreEqual("say", result);
        }
        public void RemoveLySuffixes_EndingInIngAndDoubledLetterProceedsThat_RemovesDoubledLetter()
        {
            // Expects Step1BRemoveLySuffixes to turn "hopping" into "hop".
            const string input = "hopping";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("hop", result);
        }
Example #18
0
        public void ReplaceYSuffix_PreceededByConsonant_ReplacesSuffixWithI()
        {
            // Expects the step 1c helper to turn "cry" into "cri".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1CReplaceSuffixYWithIIfPreceededWithConsonant("cry");

            // Assert
            Assert.AreEqual("cri", result);
        }
        public void RemoveOtherSPluralSuffix_WithLongWordEndingInIes_ReplaceWithI()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "cries" into "cri".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("cries");

            // Assert
            Assert.AreEqual("cri", result);
        }
        public void RemoveLySuffixes_EndingInEdAndDoesNotContainVowel_LeavesWord()
        {
            // Expects Step1BRemoveLySuffixes to return "fred" unchanged.
            const string input = "fred";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("fred", result);
        }
        public void EndInShortSyllable_TestingDisturb_IsCountedAsShort()
        {
            // Expects EndsInShortSyllable("disturb") to be false.
            // NOTE(review): the method name says "IsCountedAsShort" while the assertion
            // expects false — the name and the assertion disagree; confirm which is intended.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var endsShort = sut.EndsInShortSyllable("disturb");

            // Assert
            Assert.IsFalse(endsShort);
        }
        public void GetRegion2_WithWordContainingRegion1AndRegion2_ProvidesCorrectRangeForRegion2()
        {
            // Expects GetRegion2("beautiful") to return 7.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region2 = sut.GetRegion2("beautiful");

            // Assert
            Assert.AreEqual(7, region2);
        }
        public void EndInShortSyllable_TestingEntrap_IsCountedAsShort()
        {
            // Expects EndsInShortSyllable("entrap") to be true.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var endsShort = sut.EndsInShortSyllable("entrap");

            // Assert
            Assert.IsTrue(endsShort);
        }
        public void EndInShortSyllable_TestingUproot_IsNotCountedAsShort()
        {
            // Expects EndsInShortSyllable("uproot") to be false.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var endsShort = sut.EndsInShortSyllable("uproot");

            // Assert
            Assert.IsFalse(endsShort);
        }
        public void GetRegion2_WithWordContainingOnlyRegion1_ProvidesRangeWithLength0()
        {
            // Expects GetRegion2("beauty") to equal the length of the word,
            // i.e. the difference between the two is 0.

            // Arrange
            const string input = "beauty";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region2 = sut.GetRegion2(input);

            // Assert
            Assert.AreEqual(0, region2 - input.Length);
        }
        public void IsShortWord_TestingBeds_IsNotCountedAsShort()
        {
            // Expects IsShortWord("beds") to be false.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var isShort = sut.IsShortWord("beds");

            // Assert
            Assert.IsFalse(isShort);
        }
        public void IsShortWord_TestingShred_IsCountedAsShort()
        {
            // Expects IsShortWord("shred") to be true.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var isShort = sut.IsShortWord("shred");

            // Assert
            Assert.IsTrue(isShort);
        }
        public void GetRegion1_WithWordContainingOnlyRegion1_ProvidesCorrectRangeForRegion1()
        {
            // Expects GetRegion1("beauty") to return 5.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1("beauty");

            // Assert
            Assert.AreEqual(5, region1);
        }
        public void EndInShortSyllable_TestingOn_IsCountedAsShort()
        {
            // Expects EndsInShortSyllable("on") to be true.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var endsShort = sut.EndsInShortSyllable("on");

            // Assert
            Assert.IsTrue(endsShort);
        }
Example #30
0
        public void Stem_WithBatchData_StemsAllWordsCorrectly()
        {
            // Arrange: column 0 of the current data row is the input word,
            // column 1 is the stem it must produce.
            var sut = new EnglishPorter2Stemmer();
            var dataRow = TestContext.DataRow;
            var input = dataRow[0].ToString();
            var wanted = dataRow[1].ToString();

            // Act
            var produced = sut.Stem(input).Value;

            // Assert
            Assert.AreEqual(wanted, produced);
        }
Example #31
0
        /// <summary>
        /// Normalizes a word for indexing by passing it, in order, through
        /// <c>FullWidthCharToHalfWidthChar</c>, <c>RemoveSpecialCharacters</c>,
        /// the English Porter2 stemmer, and finally lower-casing.
        /// </summary>
        /// <param name="input">The raw word; may be <c>null</c>.</param>
        /// <returns>The normalized word, or <c>null</c> when <paramref name="input"/> is <c>null</c>.</returns>
        public static string NormalizeIndexWord(string input)
        {
            if (input == null)
            {
                return(null);
            }

            var result = FullWidthCharToHalfWidthChar(input);

            result = RemoveSpecialCharacters(result);
            result = new EnglishPorter2Stemmer().Stem(result).Value;

            // Index keys must not depend on the machine's current culture (for example,
            // the Turkish dotless-i casing rules), so the invariant culture is used
            // instead of the culture-sensitive ToLower().
            result = result.ToLowerInvariant();

            return(result);
        }
Example #32
0
        /// <summary>
        /// Splits <paramref name="webcontent"/> on single spaces, stems every piece and
        /// joins the stems back together, each one preceded by a single space
        /// (so a non-empty result starts with a space).
        /// </summary>
        /// <param name="webcontent">Space-separated text to stem.</param>
        /// <returns>The stemmed pieces, each prefixed with one space.</returns>
        public string stemming(string webcontent)
        {
            // The stemmer implementation comes from an existing project that was added to the solution.
            EnglishPorter2Stemmer stem = new EnglishPorter2Stemmer();

            string[] words = webcontent.Split(' ');

            // A StringBuilder avoids re-copying the whole accumulated string for every word,
            // which repeated string concatenation would do.
            var stemmedwords = new System.Text.StringBuilder();

            foreach (var word in words)
            {
                stemmedwords.Append(" ").Append(stem.Stem(word).Value);
            }

            return(stemmedwords.ToString());
        }
        public void MarkVowelsAsConsonants_WithYBetweenTwoVowels_MarksYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "boyish" into "boYish".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("boyish");

            // Assert
            Assert.AreEqual("boYish", marked);
        }
        public void RemoveLySuffixes_EndingInEdAndDoesNotContainVowel_LeavesWord()
        {
            // Expects Step1BRemoveLySuffixes to return "fred" unchanged.
            const string input = "fred";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("fred", result);
        }
        public void GetRegion1_WithWordContainingOnlyRegion1_ProvidesCorrectRangeForRegion1()
        {
            // Expects GetRegion1("beauty") to return 5.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1("beauty");

            // Assert
            Assert.AreEqual(5, region1);
        }
        public void GetRegion2_WithWordContainingRegion1AndRegion2_ProvidesCorrectRangeForRegion2()
        {
            // Expects GetRegion2("beautiful") to return 7.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region2 = sut.GetRegion2("beautiful");

            // Assert
            Assert.AreEqual(7, region2);
        }
        public void IsShortWord_TestingShred_IsCountedAsShort()
        {
            // Expects IsShortWord("shred") to be true.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var isShort = sut.IsShortWord("shred");

            // Assert
            Assert.IsTrue(isShort);
        }
        public void MarkVowelsAsConsonants_WithInitialY_MarksYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "youth" into "Youth".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("youth");

            // Assert
            Assert.AreEqual("Youth", marked);
        }
        public void RemoveOtherSPluralSuffix_EndingInSAndContainingAVowelRightBefore_LeavesTheS()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to return "gas" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("gas");

            // Assert
            Assert.AreEqual("gas", result);
        }
        public void RemoveOtherSPluralSuffix_EndingInUs_LeavesWordAlone()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to return "consensus" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("consensus");

            // Assert
            Assert.AreEqual("consensus", result);
        }
        public void RemoveOtherSPluralSuffix_WithShortWordEndingInIed_ReplaceWithIe()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "tied" into "tie".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("tied");

            // Assert
            Assert.AreEqual("tie", result);
        }
        public void RemoveOtherSPluralSuffix_WithWordEndingInSses_ReplaceWithSs()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "assesses" into "assess".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("assesses");

            // Assert
            Assert.AreEqual("assess", result);
        }
        public void RemoveSPluralSuffix_WithWordEndingInApostrophe_RemovesSuffix()
        {
            // Expects Step0RemoveSPluralSuffix to turn "holy'" into "holy".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step0RemoveSPluralSuffix("holy'");

            // Assert
            Assert.AreEqual("holy", result);
        }
        public void ReplaceYSuffix_PreceededByConsonantAsFirstLetterOfWord_DoesNotReplaceSuffix()
        {
            // Expects the step 1c helper to return "by" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1CReplaceSuffixYWithIIfPreceededWithConsonant("by");

            // Assert
            Assert.AreEqual("by", result);
        }
        public void ReplaceYSuffix_PreceededByConsonant_ReplacesSuffixWithI()
        {
            // Expects the step 1c helper to turn "cry" into "cri".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1CReplaceSuffixYWithIIfPreceededWithConsonant("cry");

            // Assert
            Assert.AreEqual("cri", result);
        }
        public void MarkVowelsAsConsonants_WithYAfterConsonant_DoesNotMarkYAsConsonant()
        {
            // Expects MarkYsAsConsonants to return "fly" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("fly");

            // Assert
            Assert.AreEqual("fly", marked);
        }
        public void MarkVowelsAsConsonants_WithNoVowelsButY_DoesNotMarkAnyYAsConsonant()
        {
            // Expects MarkYsAsConsonants to return "syzygy" unchanged.
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("syzygy");

            // Assert
            Assert.AreEqual("syzygy", marked);
        }
        public void RemoveOtherSPluralSuffix_EndingInSAndContainingAVowelEarlierInWord_DeletesTheS()
        {
            // Expects Step1ARemoveOtherSPluralSuffixes to turn "gaps" into "gap".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var result = sut.Step1ARemoveOtherSPluralSuffixes("gaps");

            // Assert
            Assert.AreEqual("gap", result);
        }
        public void MarkVowelsAsConsonants_WithDoubledY_MarksFirstButNotSecondYAsConsonant()
        {
            // Expects MarkYsAsConsonants to turn "sayyid" into "saYyid".
            var sut = new EnglishPorter2Stemmer();

            // Act
            var marked = sut.MarkYsAsConsonants("sayyid");

            // Assert
            Assert.AreEqual("saYyid", marked);
        }
        public void RemoveLySuffixes_EndingInInglyAndAtProceedsThat_ReplacesSuffixWithE()
        {
            // Expects Step1BRemoveLySuffixes to turn "luxuriated" into "luxuriate".
            // NOTE(review): the method name mentions "Ingly" but the input ends in "ed".
            const string input = "luxuriated";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("luxuriate", result);
        }
        public void IsShortWord_TestingBeds_IsNotCountedAsShort()
        {
            // Expects IsShortWord("beds") to be false.

            // Arrange
            var sut = new EnglishPorter2Stemmer();

            // Act
            var isShort = sut.IsShortWord("beds");

            // Assert
            Assert.IsFalse(isShort);
        }
        public void RemoveLySuffixes_EndingInIngAndIsShortWord_ReplacesSuffixWithE()
        {
            // Expects Step1BRemoveLySuffixes to turn "hoping" into "hope".
            const string input = "hoping";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("hope", result);
        }
        public void GetRegion2_WithWordContainingOnlyRegion1_ProvidesRangeWithLength0()
        {
            // Expects GetRegion2("beauty") to equal the length of the word,
            // i.e. the difference between the two is 0.

            // Arrange
            const string input = "beauty";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region2 = sut.GetRegion2(input);

            // Assert
            Assert.AreEqual(0, region2 - input.Length);
        }
        public void RemoveLySuffixes_EndingInIngAndDoubledLetterProceedsThat_RemovesDoubledLetter()
        {
            // Expects Step1BRemoveLySuffixes to turn "hopping" into "hop".
            const string input = "hopping";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("hop", result);
        }
        public void Stem_WithBatchData_StemsAllWordsCorrectly()
        {
            // Arrange: column 0 of the current data row is the input word,
            // column 1 is the stem it must produce.
            var sut = new EnglishPorter2Stemmer();
            var dataRow = TestContext.DataRow;
            var input = dataRow[0].ToString();
            var wanted = dataRow[1].ToString();

            // Act
            var produced = sut.Stem(input).Value;

            // Assert
            Assert.AreEqual(wanted, produced);
        }
        public void RemoveLySuffixes_EndingInEedlyAndInR1_ReplacesSuffixWithEe()
        {
            // Expects Step1BRemoveLySuffixes to turn "inbreedly" into "inbree".
            const string input = "inbreedly";
            var sut = new EnglishPorter2Stemmer();

            // Act
            var region1 = sut.GetRegion1(input);
            var result = sut.Step1BRemoveLySuffixes(input, region1);

            // Assert
            Assert.AreEqual("inbree", result);
        }