/// <summary>
/// Normalizes free text into a lower-case, single-space-separated sequence of
/// ASCII alphanumeric tokens.
/// </summary>
/// <param name="text">The raw text to clean. Must not be <c>null</c>.</param>
/// <param name="removeDigits">
/// When equal to <see cref="PreprocessorCleanUpComponentEnum.Yes"/>, tokens that
/// consist solely of digits are dropped; digits embedded in a longer token
/// (e.g. "abc123") are kept. Any other value leaves digits untouched.
/// </param>
/// <returns>
/// The cleaned text with no leading or trailing whitespace, or an empty string
/// when nothing survives the clean-up.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public static string Process(string text, PreprocessorCleanUpComponentEnum removeDigits)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    // Replace every character that is not an ASCII letter, an ASCII digit or a
    // space with a space. This also turns tabs and line breaks into spaces.
    string cleanText = Regex.Replace(text, "[^A-Za-z0-9 ]", " ");

    if (removeDigits == PreprocessorCleanUpComponentEnum.Yes)
    {
        // Remove stand-alone numbers only: \b anchors the match to whole tokens.
        cleanText = Regex.Replace(cleanText, @"\b[0-9]+\b", "");
    }

    // Collapse runs of spaces by splitting and re-joining rather than using a
    // regex. Only ' ' can occur as whitespace at this point, because the first
    // replacement already mapped every other whitespace character to a space.
    string[] parts = cleanText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Use the invariant culture so the result does not depend on the current
    // thread culture (e.g. 'I' would become a dotless 'ı' under tr-TR).
    return string.Join(" ", parts).ToLowerInvariant();
}
/// <summary>
/// Normalizes free text into a lower-case, single-space-separated sequence of
/// ASCII alphanumeric tokens.
/// </summary>
/// <param name="text">The raw text to clean. Must not be <c>null</c>.</param>
/// <param name="removeDigits">
/// When equal to <see cref="PreprocessorCleanUpComponentEnum.Yes"/>, tokens that
/// consist solely of digits are dropped; digits embedded in a longer token
/// (e.g. "abc123") are kept. Any other value leaves digits untouched.
/// </param>
/// <returns>
/// The cleaned text with no leading or trailing whitespace, or an empty string
/// when nothing survives the clean-up.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
public static string Process(string text, PreprocessorCleanUpComponentEnum removeDigits)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    // Replace every character that is not an ASCII letter, an ASCII digit or a
    // space with a space. This also turns tabs and line breaks into spaces.
    string cleanText = Regex.Replace(text, "[^A-Za-z0-9 ]", " ");

    if (removeDigits == PreprocessorCleanUpComponentEnum.Yes)
    {
        // Remove stand-alone numbers only: \b anchors the match to whole tokens.
        cleanText = Regex.Replace(cleanText, @"\b[0-9]+\b", "");
    }

    // Collapse runs of spaces by splitting and re-joining rather than using a
    // regex. Only ' ' can occur as whitespace at this point, because the first
    // replacement already mapped every other whitespace character to a space.
    string[] parts = cleanText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Use the invariant culture so the result does not depend on the current
    // thread culture (e.g. 'I' would become a dotless 'ı' under tr-TR).
    return string.Join(" ", parts).ToLowerInvariant();
}