Exemplo n.º 1
0
        /// <summary>
        /// Normalizes free text into lower-case ASCII tokens separated by single spaces.
        /// </summary>
        /// <param name="text">The raw text to clean up. Must not be null.</param>
        /// <param name="removeDigits">
        /// When equal to <see cref="PreprocessorCleanUpComponentEnum.Yes"/>, tokens that
        /// consist solely of digits are removed. Digits embedded in a longer alphanumeric
        /// token (for example "abc123") are kept.
        /// </param>
        /// <returns>
        /// The cleaned text: only the characters a-z, 0-9 and single separating spaces,
        /// with no leading or trailing whitespace. Empty when no token survives.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public static string Process(string text, PreprocessorCleanUpComponentEnum removeDigits)
        {
            // Fail with the caller's parameter name instead of Regex.Replace's internal "input".
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Every character that is not an ASCII letter, an ASCII digit or a space becomes a
            // space. Note that this also discards non-ASCII letters such as accented characters.
            string cleanText = Regex.Replace(text, "[^A-Za-z0-9 ]", " ");

            if (removeDigits == PreprocessorCleanUpComponentEnum.Yes)
            {
                // \b...\b only matches whole tokens made of digits, so "2020" goes, "abc123" stays.
                cleanText = Regex.Replace(cleanText, @"\b[0-9]+\b", "");
            }

            // Collapse runs of whitespace by splitting and re-joining; the original author noted
            // this is faster than a whitespace regex on large inputs.
            string[] parts = cleanText.Split(new char[] { ' ', '\n', '\t', '\r', '\f', '\v' }, StringSplitOptions.RemoveEmptyEntries);

            // Joining with a single space leaves no leading/trailing separator, so no Trim is needed.
            // ToLowerInvariant keeps the output ASCII regardless of the current culture
            // (culture-sensitive ToLower maps 'I' to dotless 'ı' under Turkish cultures).
            return string.Join(" ", parts).ToLowerInvariant();
        }
        /// <summary>
        /// Cleans up a piece of text so that it contains only lower-case ASCII letters and
        /// digits, with exactly one space between consecutive tokens.
        /// </summary>
        /// <param name="text">The text to process. Must not be null.</param>
        /// <param name="removeDigits">
        /// Pass <see cref="PreprocessorCleanUpComponentEnum.Yes"/> to strip tokens made up
        /// entirely of digits; any other value leaves numeric tokens in place.
        /// </param>
        /// <returns>
        /// The normalized text without leading or trailing whitespace; an empty string when
        /// the input contains no letters or digits that survive the clean-up.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public static string Process(string text, PreprocessorCleanUpComponentEnum removeDigits)
        {
            // Report a null argument under this method's own parameter name.
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // Step 1: replace anything outside [A-Za-z0-9 ] (punctuation, control characters,
            // non-ASCII letters) with a space so that it acts as a token separator.
            string alphanumericOnly = Regex.Replace(text, "[^A-Za-z0-9 ]", " ");

            // Step 2 (optional): drop stand-alone numbers. The word boundaries ensure that
            // digits which are part of a mixed token such as "x86" are not touched.
            if (removeDigits == PreprocessorCleanUpComponentEnum.Yes)
            {
                alphanumericOnly = Regex.Replace(alphanumericOnly, @"\b[0-9]+\b", "");
            }

            // Step 3: squeeze repeated whitespace. Split + Join was chosen by the original
            // author over a whitespace regex for speed on significantly larger inputs.
            char[] separators = new char[] { ' ', '\n', '\t', '\r', '\f', '\v' };
            string[] tokens = alphanumericOnly.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            string joined = string.Join(" ", tokens);

            // Step 4: lower-case with the invariant culture. The content is pure ASCII at this
            // point, and a culture-sensitive ToLower() would turn 'I' into 'ı' under Turkish
            // cultures, leaking a non-ASCII character into the result.
            return joined.ToLowerInvariant();
        }