/// <summary>
/// Compares a substring of one string with a substring of another using the specified comparison rules,
/// and returns an integer that indicates their relative position in the sort order.
/// </summary>
/// <param name="strA">The first string to use in the comparison.</param>
/// <param name="indexA">The zero-based starting character position of the substring within <paramref name="strA"/>.</param>
/// <param name="lengthA">The number of characters constituting the substring from <paramref name="strA"/>.</param>
/// <param name="strB">The second string to use in the comparison.</param>
/// <param name="indexB">The zero-based starting character position of the substring within <paramref name="strB"/>.</param>
/// <param name="lengthB">The number of characters constituting the substring from <paramref name="strB"/>.</param>
/// <param name="comparisonType">One of the enumeration values that specifies the rules to use in the comparison.</param>
/// <returns>
/// A signed integer that indicates the lexical relationship between the two comparands.
/// <list type="table">
/// <listheader>
/// <term>Value</term>
/// <term>Condition</term>
/// </listheader>
/// <item>
/// <term>Less than zero</term>
/// <term>
/// The substring in <paramref name="strA"/> is less than the substring in <paramref name="strB"/>,
/// or <paramref name="strA"/> is <see langword="null"/> whilst <paramref name="strB"/> is not.
/// </term>
/// </item>
/// <item>
/// <term>Zero</term>
/// <term>
/// The substrings are equal, or <paramref name="lengthA"/> and <paramref name="lengthB"/> are both zero,
/// or both strings are <see langword="null"/>.
/// </term>
/// </item>
/// <item>
/// <term>Greater than zero</term>
/// <term>
/// The substring in <paramref name="strA"/> is greater than the substring in <paramref name="strB"/>,
/// or <paramref name="strB"/> is <see langword="null"/> whilst <paramref name="strA"/> is not.
/// </term>
/// </item>
/// </list>
/// </returns>
/// <remarks>
/// <para>
/// This method is similar to the <see cref="string.Compare(string, int, string, int, int, StringComparison)"/> method
/// in the .NET Framework Class Library, but allows different lengths to be specified for the two substrings.
/// It is implemented by calling the <see cref="CompareInfo.Compare(string, int, int, string, int, int, CompareOptions)"/> method
/// on the appropriate <see cref="CompareInfo"/> instance with the appropriate <see cref="CompareOptions"/> value
/// for each known value of <paramref name="comparisonType"/>.
/// For performance, substring instantiation is avoided, working with the start indexes and lengths instead.
/// </para>
/// <para>
/// When either string is <see langword="null"/>, the result is determined from the null checks alone;
/// the index and length arguments are not validated in that case.
/// </para>
/// <para>
/// The implementation of this method is adapted from the internal implementations for
/// <see cref="string.Compare(string, int, string, int, int, StringComparison)"/>
/// (<see href="https://referencesource.microsoft.com/#mscorlib/system/string.cs,1ae4d07b01230bb6">source</see>)
/// and <see cref="string.IndexOf(string, int, int, StringComparison)"/>
/// (<see href="https://referencesource.microsoft.com/#mscorlib/system/string.cs,ef82268cfee756fe">source</see>).
/// </para>
/// </remarks>
public static int Compare(
    string strA, int indexA, int lengthA,
    string strB, int indexB, int lengthB,
    StringComparison comparisonType)
{
    // The comparison type is checked before anything else, even when a string is null.
    ArgumentValidate.EnumDefined(comparisonType, nameof(comparisonType));

    // A null string sorts before any non-null string; two nulls are equal.
    bool isNullA = strA == null;
    bool isNullB = strB == null;
    if (isNullA || isNullB)
    {
        if (isNullA && isNullB)
        {
            return 0;
        }

        return isNullA ? -1 : 1;
    }

    // Both strings are non-null from here on, so their ranges can be validated.
    ArgumentValidate.StringIndexLength(strA, nameof(strA), indexA, nameof(indexA), lengthA, nameof(lengthA));
    ArgumentValidate.StringIndexLength(strB, nameof(strB), indexB, nameof(indexB), lengthB, nameof(lengthB));

    // Shortcuts that need no character comparison:
    // two empty substrings, or the very same range of the very same string instance.
    bool bothEmpty = lengthA == 0 && lengthB == 0;
    bool sameRange = string.ReferenceEquals(strA, strB) && indexA == indexB && lengthA == lengthB;
    if (bothEmpty || sameRange)
    {
        return 0;
    }

    return CompareInner(strA, indexA, lengthA, strB, indexB, lengthB, comparisonType);
}
/// <summary>
/// Reports the zero-based index and length of the first occurrence of the specified substring in the source string.
/// </summary>
/// <param name="source">The source string in which to search.</param>
/// <param name="substring">The substring to seek.</param>
/// <param name="searchIndex">The zero-based starting character position in <paramref name="source"/> to search from.</param>
/// <param name="searchLength">The number of character positions in <paramref name="source"/> to search through.</param>
/// <param name="comparisonType">One of the enumeration values that specifies the rules for the search.</param>
/// <param name="matchIndex">
/// When this method returns, contains the zero-based starting character position of the match, if found;
/// or -1 if no match is found.
/// If <paramref name="substring"/> is the empty string (<c>""</c>), the value will be <paramref name="searchIndex"/>.
/// </param>
/// <param name="matchLength">
/// When this method returns, contains the length (in characters) of the match, if found;
/// or -1 if no match is found.
/// If <paramref name="substring"/> is the empty string (<c>""</c>), the value will be 0.
/// </param>
/// <remarks>
/// <para>
/// This method builds upon the <see cref="string.IndexOf(string, int, int, StringComparison)"/> method
/// from the .NET Framework Class Library, but extends it to also return the <i>length</i> of the match,
/// allowing string manipulation operations to subsequently be performed correctly.
/// </para>
/// <para>
/// Culture-sensitive comparisons can result in a match whose length differs from that of the specified
/// <paramref name="substring"/> parameter.
/// For example, under the en-US culture, <c>"æ"</c> and <c>"ae"</c> are considered equal.
/// <c>"Encyclopædia".IndexOf("aedia")</c> evaluates to 8, indicating a match.
/// However, the length of the matched substring, <c>"ædia"</c>, is 4,
/// whilst the length of the searched-for parameter, <c>"aedia"</c>, is 5.
/// This can lead to subtle bugs.
/// Consider the following code for removing the first occurrence of a substring from a string,
/// taken from a <see href="http://stackoverflow.com/a/2201648/1149773">highly-upvoted answer</see> on Stack Overflow:
/// <code>
/// int index = sourceString.IndexOf(removeString);
/// string cleanPath = index &lt; 0 ? sourceString : sourceString.Remove(index, removeString.Length);
/// </code>
/// If one were to run the above code snippet with <c>sourceString = "Encyclopædia"</c> and <c>removeString = "aedia"</c>,
/// then it would throw an <see cref="ArgumentOutOfRangeException"/>.
/// On the other hand, one would get correct results by using the current extension method:
/// <code>
/// int index, length;
/// sourceString.Find(removeString, StringComparison.CurrentCulture, out index, out length);
/// string cleanPath = index &lt; 0 ? sourceString : sourceString.Remove(index, length);
/// </code>
/// </para>
/// <para>
/// There is no public functionality provided in the .NET Framework Class Library that performs such substring searches.
/// The current method first calls <see cref="string.IndexOf(string, int, int, StringComparison)"/> to get the
/// starting position of the match, then iteratively attempts to identify its length.
/// It begins with the most likely case (hot path) of the match having the same length as <paramref name="substring"/>,
/// verifying this through a call to <see cref="SubstringCompare.Compare(string, int, int, string, int, int, StringComparison)"/>.
/// If not equal, it would attempt to decrement and increment the length of the match by one character each time,
/// calling the aforementioned method until equality is confirmed.
/// </para>
/// <para>
/// The approach of iterating over the substring's length is endorsed by
/// <see href="http://stackoverflow.com/questions/15980310/how-can-i-perform-a-culture-sensitive-starts-with-operation-from-the-middle-of/16001302?noredirect=1#comment22956089_16062528">usr</see>:
/// </para>
/// <blockquote>
/// I have solved a similar problem once like this (search-string highlighting in HTML). I did it similarly.
/// You can tune the loop and search strategy in a way that makes it completed very quickly by checking the likely cases first.
/// The nice thing about this is that it seems to be totally correct and no Unicode details leak into your code.
/// </blockquote>
/// <para>
/// An alternative to this approach sacrifices portability for performance by executing a P/Invoke call to the
/// <see href="https://msdn.microsoft.com/en-us/library/dd318056%28v=vs.85%29.aspx"><c>FindNLSString</c></see> function
/// (or related), as is done internally within the <see cref="string"/> class implementation.
/// This approach is described under <see href="http://stackoverflow.com/a/20484094/1149773">this Stack Overflow answer</see>.
/// </para>
/// <para>
/// Another alternative approach involves subjecting the strings to Unicode normalization
/// (through the <see cref="string.Normalize(NormalizationForm)"/> method) before comparison,
/// as suggested in <see href="http://stackoverflow.com/a/16001302/1149773">this Stack Overflow answer</see>.
/// However, this approach is undesirable since the returned results would only apply to the <i>normalized</i> forms
/// of <paramref name="source"/> and <paramref name="substring"/>, requiring the original strings to be discarded
/// and replaced by their normalized forms for all subsequent processing and storage.
/// </para>
/// <para>
/// Furthermore, Unicode normalization would not always yield results consistent with <see cref="string.Compare(string, string)"/>
/// or <see cref="string.Equals(string, string, StringComparison)"/> under <see cref="StringComparison.CurrentCulture"/>.
/// As discussed under <see href="http://unicode.org/reports/tr15/">Unicode Normalization Forms</see>,
/// <see cref="NormalizationForm.FormC"/> and <see cref="NormalizationForm.FormD"/> only support <i>canonical</i> mappings,
/// such as between precomposed characters and combining character sequences – for example, <c>"é"</c> and <c>"e\u0301"</c>.
/// However, the said forms do not perform <i>compatibility</i> mappings, as is required for ligatures.
/// For example, <c>"æ"</c> is not decomposed to <c>"ae"</c>, nor <c>"ﬃ"</c> to <c>"ffi"</c>, even though
/// the said ligatures are considered to be equal to their corresponding character sequences under the en-US culture.
/// <see cref="NormalizationForm.FormKC"/> and <see cref="NormalizationForm.FormKD"/> handle compatibility mappings,
/// and can decompose some ligatures, such as <c>"ﬃ"</c>, but miss others, such as <c>"æ"</c>.
/// (A <see href="http://stackoverflow.com/a/15485970/1149773">Stack Overflow answer</see> mentions that
/// “Unicode 6.2 doesn't appear to contain a normative mapping from Æ to AE.”)
/// The issue is made worse by the discrepancies between cultures – <c>"æ"</c> is equal to <c>"ae"</c> under en-US,
/// but not under da-DK, as discussed under the MSDN documentation for
/// <see href="https://msdn.microsoft.com/en-us/library/system.string%28v=vs.110%29.aspx#comparison">string comparison</see>.
/// Thus, normalization (to any form) would not give results that are consistent with
/// <see cref="StringComparison.CurrentCulture"/> comparisons.
/// </para>
/// <para>
/// Yet another alternative involves iterating over the strings as a sequence of <i>text elements</i>,
/// rather than UTF-16 code units, using the <see cref="StringInfo.GetNextTextElement(string, int)"/> method,
/// as presented in <see href="http://stackoverflow.com/a/22513015/1149773">this Stack Overflow answer</see>.
/// Results would be similar to those obtained from Unicode normalization: canonical mappings are honored,
/// but compatibility mappings are not.
/// </para>
/// <list type="bullet">
/// <listheader>References</listheader>
/// <item><see href="http://stackoverflow.com/q/35485677/1149773">Get substring from string using culture-sensitive comparison</see>, <i>Stack Overflow</i></item>
/// <item><see href="http://stackoverflow.com/q/20480016/1149773">Length of substring matched by culture-sensitive String.IndexOf method</see>, <i>Stack Overflow</i></item>
/// <item><see href="http://stackoverflow.com/q/15980310/1149773">How can I perform a culture-sensitive “starts-with” operation from the middle of a string?</see> by Jon Skeet, <i>Stack Overflow</i></item>
/// <item><see href="http://stackoverflow.com/q/9376621/1149773">Folding/Normalizing Ligatures (e.g. Æ to ae) Using (Core)Foundation</see>, <i>Stack Overflow</i></item>
/// </list>
/// </remarks>
public static void Find(
    this string source,
    string substring,
    int searchIndex,
    int searchLength,
    StringComparison comparisonType,
    out int matchIndex,
    out int matchLength)
{
    // Argument checks run in a fixed order: both strings, then the search range, then the comparison type.
    ArgumentValidate.NotNull(source, nameof(source));
    ArgumentValidate.NotNull(substring, nameof(substring));
    ArgumentValidate.StringIndexLength(
        source, nameof(source),
        searchIndex, nameof(searchIndex),
        searchLength, nameof(searchLength));
    ArgumentValidate.EnumDefined(comparisonType, nameof(comparisonType));

    // The actual search is delegated; it assigns both out parameters.
    FindInner(source, substring, searchIndex, searchLength, comparisonType, out matchIndex, out matchLength);
}
/// <summary>
/// Reports the zero-based index and length of the first occurrence of the specified substring in the source string.
/// </summary>
/// <param name="source">The source string in which to search.</param>
/// <param name="searchValue">The substring to seek.</param>
/// <param name="searchIndex">The zero-based starting character position in <paramref name="source"/> to search from.</param>
/// <param name="searchLength">The number of character positions in <paramref name="source"/> to search through.</param>
/// <param name="comparisonType">One of the enumeration values that specifies the rules for the search.</param>
/// <param name="matchIndex">
/// When this method returns, contains the zero-based starting character position of the match, if found;
/// or -1 if no match is found.
/// If <paramref name="searchValue"/> is the empty string (<c>""</c>),
/// the value will be <paramref name="searchIndex"/>.
/// </param>
/// <param name="matchLength">
/// When this method returns, contains the length (in characters) of the match, if found;
/// or -1 if no match is found.
/// If <paramref name="searchValue"/> is the empty string (<c>""</c>), the value will be 0.
/// </param>
/// <returns>
/// <see langword="true"/> if a match for <paramref name="searchValue"/> is found in the source string;
/// otherwise, <see langword="false"/>.
/// </returns>
/// <remarks>
/// <para>
/// This method builds upon the <see cref="string.IndexOf(string, int, int, StringComparison)"/> method
/// from the .NET Framework Class Library, but extends it to also return the <i>length</i> of the match,
/// allowing string manipulation operations to subsequently be performed correctly.
/// </para>
/// <para>
/// Culture-sensitive comparisons can result in a match that has a different length
/// than the specified <paramref name="searchValue"/> argument.
/// For example, under the en-US culture, <c>"æ"</c> and <c>"ae"</c> are considered equal.
/// <c>"Encyclopædia".IndexOf("aedia")</c> evaluates to 8, indicating a match.
/// However, the length of the matched substring, <c>"ædia"</c>, is 4,
/// whilst the length of the searched-for parameter, <c>"aedia"</c>, is 5.
/// This can lead to subtle bugs.
/// Consider the following code for removing the first occurrence of a substring from a string,
/// taken from a <see href="https://stackoverflow.com/a/2201648/1149773">highly-upvoted answer</see> on Stack Overflow:
/// <code>
/// int index = sourceString.IndexOf(removeString);
/// string cleanPath = index &lt; 0 ? sourceString : sourceString.Remove(index, removeString.Length);
/// </code>
/// If one were to run the above code snippet with <c>sourceString = "Encyclopædia"</c> and <c>removeString = "aedia"</c>,
/// then it would throw an <see cref="ArgumentOutOfRangeException"/>.
/// On the other hand, one would get correct results by using the current extension method:
/// <code>
/// int index, length;
/// sourceString.Find(removeString, StringComparison.CurrentCulture, out index, out length);
/// string cleanPath = index &lt; 0 ? sourceString : sourceString.Remove(index, length);
/// </code>
/// </para>
/// <para>
/// There is no public functionality provided in the .NET Framework Class Library that performs such substring searches.
/// The current method first calls <see cref="string.IndexOf(string, int, int, StringComparison)"/> to get the
/// starting position of the match, then iteratively attempts to identify its length.
/// It begins with the most likely case (hot path) of the match having the same length as <paramref name="searchValue"/>,
/// verifying this through a call to <see cref="Substring.Compare(string, int, int, string, int, int, StringComparison)"/>.
/// If not equal, it would attempt to decrement and increment the length of the match by one character each time,
/// calling the aforementioned method until equality is confirmed.
/// </para>
/// <para>
/// The approach of iterating over the substring's length is endorsed by
/// <see href="https://stackoverflow.com/q/15980310/1149773#comment22956089_16062528">usr</see>:
/// </para>
/// <blockquote>
/// I have solved a similar problem once like this (search-string highlighting in HTML). I did it similarly.
/// You can tune the loop and search strategy in a way that makes it completed very quickly by checking the likely cases first.
/// The nice thing about this is that it seems to be totally correct and no Unicode details leak into your code.
/// </blockquote>
/// <para>
/// An alternative to this approach sacrifices portability for performance by executing a P/Invoke call to the
/// <see href="https://docs.microsoft.com/en-us/windows/desktop/api/winnls/nf-winnls-findnlsstring"><c>FindNLSString</c></see> function
/// (or related), as is done internally within the <see cref="string"/> class implementation.
/// This approach is described under <see href="https://stackoverflow.com/a/20484094/1149773">this Stack Overflow answer</see>.
/// </para>
/// <para>
/// Another alternative approach involves subjecting the strings to Unicode normalization
/// (through the <see cref="string.Normalize(NormalizationForm)"/> method) before comparison,
/// as suggested in <see href="https://stackoverflow.com/a/16001302/1149773">this Stack Overflow answer</see>.
/// However, this approach is undesirable since the returned results would only apply to the <i>normalized</i> forms
/// of <paramref name="source"/> and <paramref name="searchValue"/>, requiring the original strings to be discarded
/// and replaced by their normalized forms for all subsequent processing and storage.
/// </para>
/// <para>
/// Furthermore, Unicode normalization would not always yield results consistent with
/// culture-sensitive comparisons in .NET (such as <see cref="string.Compare(string, string)"/>
/// or <see cref="string.Equals(string, string, StringComparison)"/>
/// with <see cref="StringComparison.CurrentCulture"/>).
/// As mentioned in the <see href="https://unicode.org/reports/tr15/">Unicode Normalization Forms</see> annex,
/// <see cref="NormalizationForm.FormC"/> and <see cref="NormalizationForm.FormD"/> only support <i>canonical</i> mappings,
/// such as between precomposed characters and combining character sequences – for example, <c>"é"</c> and <c>"e\u0301"</c>.
/// However, the said forms do not perform <i>compatibility</i> mappings, as is required for ligatures.
/// For example, <c>"æ"</c> is not decomposed to <c>"ae"</c>, nor <c>"ﬃ"</c> to <c>"ffi"</c>, even though
/// the said ligatures are considered to be equal to their corresponding character sequences under the en-US culture.
/// <see cref="NormalizationForm.FormKC"/> and <see cref="NormalizationForm.FormKD"/> handle compatibility mappings,
/// and can decompose some ligatures, such as <c>"ﬃ"</c>, but miss others, such as <c>"æ"</c>.
/// (A <see href="https://stackoverflow.com/a/15485970/1149773">Stack Overflow answer</see> mentions that
/// “Unicode 6.2 doesn't appear to contain a normative mapping from Æ to AE.”)
/// The issue is made worse by the discrepancies between cultures – <c>"æ"</c> is equal to <c>"ae"</c> under en-US,
/// but not under da-DK, as discussed under the MSDN documentation for
/// <see href="https://docs.microsoft.com/en-us/dotnet/api/system.string?view=netframework-4.7#comparison">string comparison</see>.
/// Thus, normalization (to any form) would not give results that are consistent with
/// <see cref="StringComparison.CurrentCulture"/> comparisons.
/// </para>
/// <para>
/// Yet another alternative involves iterating over the strings as a sequence of <i>text elements</i>,
/// rather than UTF-16 code units, using the <see cref="StringInfo.GetNextTextElement(string, int)"/> method,
/// as presented in <see href="https://stackoverflow.com/a/22513015/1149773">this Stack Overflow answer</see>.
/// Results would be similar to those obtained from Unicode normalization: canonical mappings are honored,
/// but compatibility mappings are not.
/// </para>
/// <list type="bullet">
/// <listheader>References</listheader>
/// <item><see href="https://stackoverflow.com/q/35485677/1149773">Get substring from string using culture-sensitive comparison</see>, <i>Stack Overflow</i></item>
/// <item><see href="https://stackoverflow.com/q/20480016/1149773">Length of substring matched by culture-sensitive String.IndexOf method</see>, <i>Stack Overflow</i></item>
/// <item><see href="https://stackoverflow.com/q/15980310/1149773">How can I perform a culture-sensitive “starts-with” operation from the middle of a string?</see> by Jon Skeet, <i>Stack Overflow</i></item>
/// <item><see href="https://stackoverflow.com/q/9376621/1149773">Folding/Normalizing Ligatures (e.g. Æ to ae) Using (Core)Foundation</see>, <i>Stack Overflow</i></item>
/// </list>
/// </remarks>
public static bool Find(
    this string source,
    string searchValue,
    int searchIndex,
    int searchLength,
    StringComparison comparisonType,
    out int matchIndex,
    out int matchLength)
{
    // Argument checks run in a fixed order: both strings, then the search range, then the comparison type.
    ArgumentValidate.NotNull(source, nameof(source));
    ArgumentValidate.NotNull(searchValue, nameof(searchValue));
    ArgumentValidate.StringIndexLength(
        source, nameof(source),
        searchIndex, nameof(searchIndex),
        searchLength, nameof(searchLength));
    ArgumentValidate.EnumDefined(comparisonType, nameof(comparisonType));

    // The actual search is delegated; it assigns both out parameters and reports whether a match exists.
    bool found = FindInner(
        source, searchValue, searchIndex, searchLength, comparisonType, out matchIndex, out matchLength);
    return found;
}
/// <summary>
/// Runs <c>ArgumentValidate.StringIndexLength</c> on the given arguments through
/// <c>ExceptionAssert.Throws</c> for <typeparamref name="TException"/>, then checks that the
/// <see cref="ArgumentException.ParamName"/> of the resulting exception names the expected argument.
/// </summary>
/// <typeparam name="TException">The exception type passed to <c>ExceptionAssert.Throws</c>.</typeparam>
/// <param name="str">The string argument to validate.</param>
/// <param name="idx">The index argument to validate.</param>
/// <param name="len">The length argument to validate.</param>
/// <param name="strFail">Whether the exception is expected to name <paramref name="str"/>.</param>
/// <param name="idxFail">Whether the exception is expected to name <paramref name="idx"/>.</param>
/// <param name="lenFail">Whether the exception is expected to name <paramref name="len"/>.</param>
/// <remarks>
/// The flags are consulted in the order <paramref name="strFail"/>, <paramref name="idxFail"/>,
/// <paramref name="lenFail"/>; the first one set decides the expected name.
/// If none is set, the expected parameter name is <see langword="null"/>.
/// </remarks>
private static void StringIndexLengthFail<TException>(
    string str,
    int idx,
    int len,
    bool strFail = false,
    bool idxFail = false,
    bool lenFail = false)
    where TException : ArgumentException
{
    // Work out which parameter name the exception should carry.
    string expectedParamName;
    if (strFail)
    {
        expectedParamName = nameof(str);
    }
    else if (idxFail)
    {
        expectedParamName = nameof(idx);
    }
    else if (lenFail)
    {
        expectedParamName = nameof(len);
    }
    else
    {
        expectedParamName = null;
    }

    var thrown = ExceptionAssert.Throws<TException>(
        () => ArgumentValidate.StringIndexLength(str, nameof(str), idx, nameof(idx), len, nameof(len)));

    Assert.AreEqual(expectedParamName, thrown.ParamName);
}
/// <summary>
/// Exercises <c>ArgumentValidate.StringIndexLength</c> with index/length combinations that must be accepted,
/// followed by combinations that are expected to fail on a specific argument.
/// </summary>
public void StringIndexLength()
{
    // These locals are never assigned; they exist only so that nameof() can supply parameter names.
    string str;
    int idx;
    int len;

    // Accepted: the empty range of the empty string.
    ArgumentValidate.StringIndexLength("", nameof(str), 0, nameof(idx), 0, nameof(len));

    // Accepted: { index, length } pairs within "abc", including the empty range at the very end.
    int[][] acceptedRanges =
    {
        new[] { 0, 0 },
        new[] { 0, 2 },
        new[] { 0, 3 },
        new[] { 1, 2 },
        new[] { 2, 1 },
        new[] { 3, 0 },
    };

    foreach (int[] range in acceptedRanges)
    {
        ArgumentValidate.StringIndexLength("abc", nameof(str), range[0], nameof(idx), range[1], nameof(len));
    }

    // Rejected: null string.
    StringIndexLengthFail<ArgumentNullException>(null, 0, 0, strFail: true);

    // Rejected: index past the end of "abc".
    StringIndexLengthFail<ArgumentOutOfRangeException>("abc", 4, 0, idxFail: true);

    // Rejected: length running past the end of "abc".
    StringIndexLengthFail<ArgumentOutOfRangeException>("abc", 0, 4, lenFail: true);
    StringIndexLengthFail<ArgumentOutOfRangeException>("abc", 2, 2, lenFail: true);

    // Rejected: any non-zero index or length on the empty string.
    StringIndexLengthFail<ArgumentOutOfRangeException>("", 1, 0, idxFail: true);
    StringIndexLengthFail<ArgumentOutOfRangeException>("", 0, 1, lenFail: true);
}