// Here was a subtle, ASCII-optimized version of the cleaning code, and I was
// very proud of it until benchmarking showed it was an order of magnitude slower
// than the UTF8 version. Micro-optimizing sometimes isn't such a good idea.

/// <summary>
/// Splits <paramref name="text"/> into terms with a small state machine and writes each
/// term to an output buffer through <c>CopyTerm</c>, inserting <paramref name="separator"/>
/// between terms.
/// </summary>
/// <remarks>
/// Does NOT support surrogate pairs: surrogate code units are ignored by the state machine
/// (they never start, end or otherwise affect a term).
/// NOTE(review): a surrogate located inside a term still lies within the range handed to
/// <c>CopyTerm</c>; whether it reaches the output depends on <c>CopyTerm</c> - confirm.
/// </remarks>
/// <param name="text">The text to clean.</param>
/// <param name="caseType">The clean type; only the bits selected by <c>CleanStringType.CaseMask</c> are used.</param>
/// <param name="separator">The separator written between terms; <c>char.MinValue</c> means "no separator".</param>
/// <param name="culture">The culture name forwarded to <c>CopyTerm</c>; <c>null</c> is treated as an empty string.</param>
/// <param name="config">The configuration providing <c>IsTerm</c>, <c>BreakTermsOnUpper</c>, <c>CutAcronymOnNonUpper</c> and <c>GreedyAcronyms</c>.</param>
/// <returns>The string built from the copied terms and separators.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The state machine reached an unknown state.</exception>
internal string CleanCodeString(string text, CleanStringType caseType, char separator, string culture, DefaultShortStringHelperConfig.Config config)
{
    // fail with a meaningful exception instead of a NullReferenceException on input.Length
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    int opos = 0, ipos = 0;
    var state = StateBreak;

    culture = culture ?? string.Empty;
    caseType &= CleanStringType.CaseMask;

    // if we apply global ToUpper or ToLower to text here
    // then we cannot break words on uppercase chars
    var input = text;

    // it's faster to use an array than a StringBuilder
    var ilen = input.Length;
    var output = new char[ilen * 2]; // twice the length should be OK in all cases

    for (var i = 0; i < ilen; i++)
    {
        var c = input[i];

        // leading as long as StateBreak and ipos still zero
        var leading = state == StateBreak && ipos == 0;
        var isTerm = config.IsTerm(c, leading);
        var isUpper = char.IsUpper(c); // false for digits, symbols...

        // surrogates (e.g. emojis) are not supported at the moment:
        // skip them without touching the current state
        if (char.IsSurrogate(c))
        {
            continue;
        }

        switch (state)
        {
            // within a break
            case StateBreak:
                // begin a new term if char is a term char
                // (IsTerm already knows whether the char is in leading position)
                if (isTerm)
                {
                    ipos = i;

                    // only separate from a previous term, never lead with a separator
                    if (opos > 0 && separator != char.MinValue)
                    {
                        output[opos++] = separator;
                    }

                    state = isUpper ? StateUp : StateWord;
                }

                break;

            // within a term / word
            case StateWord:
                // end a term if char is not a term char,
                // or ( it's uppercase and we break terms on uppercase )
                if (isTerm == false || (config.BreakTermsOnUpper && isUpper))
                {
                    CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, false);
                    ipos = i;

                    // a term char here is necessarily the uppercase char that broke the word
                    state = isTerm ? StateUp : StateBreak;
                    if (state != StateBreak && separator != char.MinValue)
                    {
                        output[opos++] = separator;
                    }
                }

                break;

            // within a term / acronym
            case StateAcronym:
                // end an acronym if char is not a term char,
                // or if it's not uppercase / config
                if (isTerm == false || (config.CutAcronymOnNonUpper && isUpper == false))
                {
                    // whether it's part of the acronym depends on whether we're greedy
                    if (isTerm && config.GreedyAcronyms == false)
                    {
                        i -= 1; // handle that char again, in another state - not part of the acronym
                    }

                    if (i - ipos > 1) // single-char can't be an acronym
                    {
                        CopyTerm(input, ipos, output, ref opos, i - ipos, caseType, culture, true);
                        ipos = i;
                        state = isTerm ? StateWord : StateBreak;
                        if (state != StateBreak && separator != char.MinValue)
                        {
                            output[opos++] = separator;
                        }
                    }
                    else if (isTerm)
                    {
                        // too short for an acronym: carry on as a plain word
                        state = StateWord;
                    }
                }
                else if (isUpper == false) // isTerm == true
                {
                    // it's a term char and we don't cut...
                    // keep moving forward as a word
                    state = StateWord;
                }

                break;

            // within a term / uppercase = could be a word or an acronym
            case StateUp:
                if (isTerm)
                {
                    // add that char to the term and pick word or acronym
                    state = isUpper ? StateAcronym : StateWord;
                }
                else
                {
                    // single char, copy then break
                    CopyTerm(input, ipos, output, ref opos, 1, caseType, culture, false);
                    state = StateBreak;
                }

                break;

            default:
                throw new InvalidOperationException("Invalid state.");
        }
    }

    // flush the term that was still open when the input ended
    switch (state)
    {
        case StateBreak:
            break;

        case StateWord:
            CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, false);
            break;

        case StateAcronym:
        case StateUp:
            CopyTerm(input, ipos, output, ref opos, input.Length - ipos, caseType, culture, true);
            break;

        default:
            throw new InvalidOperationException("Invalid state.");
    }

    return new string(output, 0, opos);
}
/// <summary>
/// Cleans <paramref name="text"/>: resolves the configuration for the requested string type
/// and culture, applies the optional pre-filter, recodes the text according to the code type,
/// runs <c>CleanCodeString</c> and finally applies the optional post-filter.
/// </summary>
/// <param name="text">The text to clean.</param>
/// <param name="stringType">The requested clean type; missing case / code flags default to CamelCase / Ascii.</param>
/// <param name="culture">The culture name; <c>null</c> is treated as an empty string.</param>
/// <param name="separator">The separator; when <c>null</c> the configured separator is used.</param>
/// <returns>The cleaned text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
protected virtual string CleanString(string text, CleanStringType stringType, string? culture, char? separator)
{
    // be safe
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    culture ??= string.Empty;

    // resolve the configuration, and let it extend the requested type
    var settings = _config.For(stringType, culture);
    stringType = settings.StringTypeExtend(stringType);

    // default to camel case when no case flag was requested
    var requestedCase = stringType & CleanStringType.CaseMask;
    if (requestedCase == CleanStringType.None)
    {
        stringType |= CleanStringType.CamelCase;
    }

    // default to ascii when no code flag was requested
    var requestedCode = stringType & CleanStringType.CodeMask;
    if (requestedCode == CleanStringType.None)
    {
        stringType |= CleanStringType.Ascii;
    }

    // an explicit separator wins over the configured one
    separator ??= settings.Separator;

    // pre-filter, if any
    var preFilter = settings.PreFilter;
    if (preFilter != null)
    {
        text = preFilter(text);
    }

    // (the "replacements" step is currently disabled)

    // recode
    var encoding = stringType & CleanStringType.CodeMask;
    if (encoding == CleanStringType.Ascii)
    {
        text = Utf8ToAsciiConverter.ToAsciiString(text);
    }
    else if (encoding == CleanStringType.TryAscii)
    {
        // convert using the escape char as fail marker, and keep the converted
        // text only when the marker does not show up in the result
        // NOTE(review): on fallback the original text is kept as-is, i.e. without
        // the RemoveSurrogatePairs pass of the default branch - confirm this is intended
        const char failMarker = (char)27;
        var asciiCandidate = Utf8ToAsciiConverter.ToAsciiString(text, failMarker);
        if (!asciiCandidate.Contains(failMarker))
        {
            text = asciiCandidate;
        }
    }
    else
    {
        text = RemoveSurrogatePairs(text);
    }

    // clean
    text = CleanCodeString(text, stringType, separator.Value, culture, settings);

    // post-filter, if any
    var postFilter = settings.PostFilter;
    if (postFilter != null)
    {
        text = postFilter(text);
    }

    return text;
}