/// <summary>
/// Normalises <c>textBook.RawText</c> in place so that later processing can ignore
/// punctuation and capitalisation.
/// </summary>
/// <remarks>
/// The clean-up was introduced to make the tool more user friendly and the results more accurate.
/// Regex use is kept to a minimum because of its performance impact. It might still be worth
/// matching words directly with a regex (<c>\w+</c>) instead of removing roman numerals,
/// punctuation and special characters one pass at a time.
/// NOTE(review): the helper behaviour described below is taken from the original comments;
/// the helpers themselves are not visible here - confirm against WordCountUtils.
/// </remarks>
/// <param name="textBook">
/// Book whose <c>RawText</c> is rewritten after every step; its <c>Pattern</c> and
/// <c>StripCharacters</c> are handed to the cleaning helpers.
/// </param>
private void CleanupText(TextBook textBook)
{
    // Remove modern roman numerals (NOTE: case insensitive). This runs before lower-casing,
    // so the pattern sees the text in its original casing.
    textBook.RawText = myWordCountUtils.RegexCleaner(textBook.RawText, textBook.Pattern);

    // Fold capitalisation to lower case. The invariant culture is used so the result does not
    // depend on the machine's current culture (e.g. Turkish 'I' -> dotless 'ı' under tr-TR).
    textBook.RawText = textBook.RawText.ToLowerInvariant();

    // Remove punctuation from the text.
    textBook.RawText = myWordCountUtils.RemovePunctuation(textBook.RawText);

    // Strip the characters listed in StripCharacters (per the original comment: new line, tab and numbers).
    textBook.RawText = myWordCountUtils.CleanDistinctCharacters(textBook.RawText, textBook.StripCharacters);
}
// Checks that CleanDistinctCharacters turns the sample input into the expected text
// when given a caret, the digits 0-9, new line, tab and carriage return as strip characters.
public void Remove_Specific_Characters()
{
    // Arrange
    var utils = new WordCountUtils();
    string[] charactersToStrip =
    {
        "^",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "\n", "\t", "\r"
    };
    string source = "CHILDREN\r\n\r\nBy E ^ \r\n\r\n\r\n I The beginning of things\r\n";
    string expected = "CHILDREN By E I The beginning of things ";

    // Act
    string actual = utils.CleanDistinctCharacters(source, charactersToStrip);

    // Assert
    Assert.AreEqual(expected, actual);
}