Пример #1
0
        //Ignore punctuation and capitalisation
        //I started implementing this to make it more user friendly and get more accurate results. I was trying to avoid having to use regex because of the impact it has on performance.
        //It might be worth using regex (\w+) and strip out words rather than trying to remove Romans, Punctuation and some special characters
        /// <summary>
        /// Normalises the raw text of <paramref name="textBook"/> in place by running it through
        /// four passes: regex cleaning, lower-casing, punctuation removal and character stripping.
        /// </summary>
        /// <param name="textBook">
        /// Book whose <c>RawText</c> is read and overwritten; its <c>Pattern</c> and
        /// <c>StripCharacters</c> configure the first and last pass respectively.
        /// </param>
        private void CleanupText(TextBook textBook)
        {
            // Work on a local copy and write back once at the end, so a failure in any pass
            // does not leave RawText half-cleaned.
            string text = textBook.RawText;

            // Regex pass first, while the original casing is still intact.
            // NOTE(review): per the original comment the pattern targets modern roman numerals
            // case-insensitively - confirm against the value assigned to textBook.Pattern.
            text = myWordCountUtils.RegexCleaner(text, textBook.Pattern);

            // Lower-case with the invariant culture so the result does not depend on the
            // machine's current culture (e.g. the Turkish dotless-i mapping of "I").
            text = text.ToLowerInvariant();

            // Remove punctuation from the text.
            text = myWordCountUtils.RemovePunctuation(text);

            // Strip the configured characters.
            // NOTE(review): the original comment says these are new line, tab and digits -
            // verify against textBook.StripCharacters.
            text = myWordCountUtils.CleanDistinctCharacters(text, textBook.StripCharacters);

            textBook.RawText = text;
        }
Пример #2
0
        public void Remove_Specific_Characters()
        {
            // Arrange: the utility under test, the tokens to strip (caret, every decimal digit,
            // line feed, tab, carriage return) and the text fixtures.
            var myWordCountUtils = new WordCountUtils();
            string[] charactersToStrip = { "^", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "\n", "\t", "\r" };

            const string rawText = "CHILDREN\r\n\r\nBy E   ^   \r\n\r\n\r\n  I    The beginning of things\r\n";

            // The expected text shows a space wherever the raw text had a stripped token.
            const string expectedText = "CHILDREN    By E               I    The beginning of things  ";

            // Act
            string actualText = myWordCountUtils.CleanDistinctCharacters(rawText, charactersToStrip);

            // Assert
            Assert.AreEqual(expectedText, actualText);
        }