/// <summary>
///  Debugging: Used to get length distribution to understand string sizes.
///  Each value length is rounded down to a multiple of four; element [i] of the
///  result is the number of values whose rounded length is (i * 4).
/// </summary>
/// <returns>
///  Count of values per four-byte length bucket, from bucket zero up to and
///  including the bucket of the longest value. Empty if there are no values.
/// </returns>
internal int[] CountByPaddedLength()
{
    // Nothing to count: keep returning an empty distribution
    if (_sortedExistingValues.Count == 0)
    {
        return new int[0];
    }

    int maxLength = 0;
    Dictionary<int, int> result = new Dictionary<int, int>();

    for (int i = 0; i < _sortedExistingValues.Count; ++i)
    {
        String8 current = _sortedExistingValues[i];

        // Round the length down to a multiple of four
        int length = current.Length & ~3;

        if (length > maxLength)
        {
            maxLength = length;
        }

        // TryGetValue leaves count at zero when the bucket has not been seen yet
        int count;
        result.TryGetValue(length, out count);
        result[length] = count + 1;
    }

    // Buckets run from 0 to maxLength inclusive, so (maxLength / 4) + 1 slots are
    // needed; sizing by (maxLength / 4) alone dropped the bucket of the longest values.
    int[] resultByFourByteLength = new int[(maxLength / 4) + 1];

    for (int i = 0; i < resultByFourByteLength.Length; ++i)
    {
        int count;
        result.TryGetValue(i * 4, out count);
        resultByFourByteLength[i] = count;
    }

    return resultByFourByteLength;
}
/// <summary>
///  Compare the values behind two identifiers.
///  Identifier zero (empty) sorts before everything else. Two negative identifiers
///  are compared within the added values (after negation), two positive identifiers
///  within the existing values, and a mixed pair by comparing the String8 values.
/// </summary>
/// <param name="leftIdentifier">Identifier of the left value</param>
/// <param name="rightIdentifier">Identifier of the right value</param>
/// <returns>Negative if left sorts first, zero if equal, positive if right sorts first</returns>
public int CompareValues(int leftIdentifier, int rightIdentifier)
{
    // Empty on the left: equal to empty, before anything else
    if (leftIdentifier == 0)
    {
        return (rightIdentifier == 0 ? 0 : -1);
    }

    // Empty on the right only: right is before left
    if (rightIdentifier == 0)
    {
        return 1;
    }

    bool leftIsAdded = (leftIdentifier < 0);
    bool rightIsAdded = (rightIdentifier < 0);

    // Both new: compare in AddedValues
    if (leftIsAdded && rightIsAdded)
    {
        return _addedValues.CompareValues(-leftIdentifier, -rightIdentifier);
    }

    // Both existing: compare in ExistingValues
    if (!leftIsAdded && !rightIsAdded)
    {
        return _existingValues.CompareValues(leftIdentifier, rightIdentifier);
    }

    // One of each: get and compare the string values themselves
    String8 leftValue = this[leftIdentifier];
    String8 rightValue = this[rightIdentifier];
    return leftValue.CompareTo(rightValue);
}
/// <summary>
///  Split a string on a given delimiter, recording the split positions in a
///  caller-provided array. Used to split strings without allocation when one
///  large position array is created and reused for many strings.
/// </summary>
/// <param name="value">String8 value to split</param>
/// <param name="delimiter">Delimiter to split on; must be a single byte character (below 128)</param>
/// <param name="positions">PartialArray&lt;int&gt; to contain split positions; cleared before use</param>
/// <returns>String8Set containing split value, or String8Set.Empty if value is empty</returns>
/// <exception cref="ArgumentException">delimiter is 128 or greater</exception>
public static String8Set Split(String8 value, byte delimiter, PartialArray<int> positions)
{
    // Ensure the delimiter is single byte
    if (delimiter >= 128)
    {
        throw new ArgumentException(String.Format(Resources.UnableToSupportMultibyteCharacter, delimiter));
    }

    if (value.IsEmpty())
    {
        return String8Set.Empty;
    }

    // Start fresh; the first part always begins at position zero
    positions.Clear();
    positions.Add(0);

    // Walk the underlying buffer directly rather than through the String8 indexer
    byte[] buffer = value._buffer;
    int firstIndex = value._index;
    int length = value._length;

    for (int offset = 0; offset < length; ++offset)
    {
        if (buffer[firstIndex + offset] == delimiter)
        {
            // The next part starts just after this delimiter
            positions.Add(offset + 1);
        }
    }

    // Closing position, one past the end as if a trailing delimiter followed the value
    positions.Add(value.Length + 1);

    return new String8Set(value, 1, positions);
}
/// <summary>
///  Narrow a case insensitive range down to the items which match 'value' exactly.
///  Only valid for Ranges holding different casings of a single value
///  [Ranges from TryFindString, but not TryGetRangeStartingWith].
/// </summary>
/// <param name="r">Range to restrict</param>
/// <param name="value">String8 casing of value to restrict to</param>
/// <returns>Range constrained to subset matching value casing</returns>
/// <exception cref="ArgumentOutOfRangeException">The last item in the range differs in length from value</exception>
private Range MakeCaseSensitive(Range r, String8 value)
{
    if (r.IsEmpty())
    {
        return r;
    }

    // Casing variations all share one length; a different length means a prefix Range was passed
    if (this[r.End].Length != value.Length)
    {
        throw new ArgumentOutOfRangeException();
    }

    // Advance past leading items which don't match value case-sensitive
    int first = r.Start;
    while (first <= r.End && value.CompareTo(this[first], false) != 0)
    {
        ++first;
    }

    // Back up past trailing items which don't match value case-sensitive
    int last = r.End;
    while (last > first && value.CompareTo(this[last], false) != 0)
    {
        --last;
    }

    return new Range(first, last);
}
/// <summary>
///  Split a CSV row into cells, removing the quotes around quoted cells and
///  collapsing doubled quotes in the same pass. The row's underlying buffer is
///  rewritten in place as part of this.
/// </summary>
/// <param name="row">String8 containing a CSV row</param>
/// <param name="positions">PartialArray&lt;int&gt; to contain split positions; cleared before use</param>
/// <returns>
///  String8Set containing unencoded cell values. If an unescaped quote is found
///  inside a quoted cell, the set is returned early without a closing position.
/// </returns>
public static String8Set SplitAndDecodeCsvCells(String8 row, PartialArray<int> positions)
{
    // An empty row has no cells
    if (row.IsEmpty())
    {
        return String8Set.Empty;
    }

    // Start fresh; the first cell always begins at the start of the (shifted) string
    positions.Clear();
    positions.Add(0);

    byte[] buffer = row._buffer;
    int rowStart = row._index;
    int rowEnd = rowStart + row._length;

    // 'read' is the next byte to examine. 'write' is where the next unescaped byte
    // goes; it falls behind 'read' as wrapping and doubled quotes are dropped.
    int read = rowStart;
    int write = rowStart;

    while (read < rowEnd)
    {
        if (buffer[read] == UTF8.Quote)
        {
            // Quoted cell. Step over the opening quote without copying it.
            read++;

            // Copy until the closing (undoubled) quote
            for (; read < rowEnd; ++read, ++write)
            {
                if (buffer[read] != UTF8.Quote)
                {
                    // Ordinary byte inside the cell
                    buffer[write] = buffer[read];
                    continue;
                }

                // Quote found: end of cell, escaped quote, or unescaped quote (error)?
                read++;

                // Closing quote was the last byte of the row
                if (read == rowEnd)
                {
                    break;
                }

                byte afterQuote = buffer[read];

                if (afterQuote == UTF8.Comma)
                {
                    // Closing quote followed by comma: record the split, copy the comma, end the cell
                    positions.Add(write - rowStart + 1);
                    buffer[write] = afterQuote;
                    read++;
                    write++;
                    break;
                }

                if (afterQuote != UTF8.Quote)
                {
                    // Unescaped quote. Abort; caller will see incomplete row and can throw
                    return new String8Set(row, 1, positions);
                }

                // Doubled quote: keep a single quote and continue the cell
                buffer[write] = afterQuote;
            }
        }
        else
        {
            // Unquoted cell. Copy bytes as-is (no unescaping) through the next comma.
            for (; read < rowEnd; ++read, ++write)
            {
                byte current = buffer[read];
                buffer[write] = current;

                if (current == UTF8.Comma)
                {
                    // Delimiter found: the next cell starts just after it
                    positions.Add(write - rowStart + 1);
                    read++;
                    write++;
                    break;
                }
            }
        }
    }

    // The last cell always ends at the end of the (shifted) string
    positions.Add(write - rowStart + 1);

    // Blank out the stale bytes left behind by shifting, to make bugs clearer
    for (; write < rowEnd; ++write)
    {
        buffer[write] = UTF8.Null;
    }

    return new String8Set(row, 1, positions);
}
/// <summary>
///  Build a String8Set from content and the positions at which it was split.
/// </summary>
/// <param name="content">String8 which was split</param>
/// <param name="delimiterWidth">Width of the delimiter between parts</param>
/// <param name="partPositions">Positions of the parts within content</param>
internal String8Set(String8 content, int delimiterWidth, PartialArray<int> partPositions)
{
    _content = content;
    _delimiterWidth = delimiterWidth;
    _partPositions = partPositions;
}
/// <summary>
///  Return the int[] length required for a buffer to split 'value'
///  by 'delimiter'. This may be an overestimate to perform better.
///  Used by callers to allocate a safe int[] for String8Set.Split.
/// </summary>
/// <param name="value">Value to Split</param>
/// <param name="delimiter">Delimiter to Split by; must be a single byte character (below 128)</param>
/// <returns>Length of int[] required to safely contain the split positions</returns>
/// <exception cref="ArgumentException">delimiter is not a single byte character</exception>
public static int GetLength(String8 value, char delimiter)
{
    // A bare (byte) cast silently truncates wider chars (U+0141 would become 'A'),
    // so the count would be for the wrong delimiter. Reject them up front instead.
    if (delimiter >= 128)
    {
        throw new ArgumentException("Delimiter must be a single byte character (below 128).", "delimiter");
    }

    return GetLength(value, (byte)delimiter);
}
/// <summary>
///  Split a string on a given delimiter, recording the split positions in a
///  caller-provided int[]. Used to split strings without allocation when one
///  large int[] is created and reused for many strings.
/// </summary>
/// <param name="value">String8 value to split</param>
/// <param name="delimiter">Delimiter to split on</param>
/// <param name="positionArray">int[] to contain split positions; must be long enough for all of them (see GetLength)</param>
/// <returns>String8Set containing split value</returns>
public static String8Set Split(String8 value, byte delimiter, int[] positionArray)
{
    // Wrap the raw array so the PartialArray overload can do the work
    PartialArray<int> positions = new PartialArray<int>(positionArray);
    return Split(value, delimiter, positions);
}
/// <summary>
///  Not implemented; always throws.
/// </summary>
/// <param name="prefix">Prefix to look for</param>
/// <param name="firstIdentifier">Not set; the method throws before assigning it</param>
/// <param name="lastIdentifier">Not set; the method throws before assigning it</param>
/// <returns>Never returns</returns>
/// <exception cref="NotImplementedException">Always</exception>
public bool TryGetRangeStartingWith(String8 prefix, out int firstIdentifier, out int lastIdentifier)
{
    throw new NotImplementedException();
}
/// <summary>
///  Look up the identifier for a value, using its string form as the key.
/// </summary>
/// <param name="value">String8 value to find</param>
/// <param name="identifier">Identifier of the value, if found</param>
/// <returns>True if the value was found, False otherwise</returns>
public bool TryFindString(String8 value, out int identifier)
{
    string key = value.ToString();
    return _valueToIdentifier.TryGetValue(key, out identifier);
}
/// <summary>
///  Find or add a String8 value by converting it to a string and
///  deferring to the string overload.
/// </summary>
/// <param name="value">String8 value to find or add</param>
/// <returns>Identifier returned by the string overload</returns>
public int FindOrAddString(String8 value)
{
    string converted = value.ToString();
    return FindOrAddString(converted);
}
/// <summary>
///  Get the value for an identifier as a String8. The stored value is
///  converted into a newly allocated byte[] on each call.
/// </summary>
/// <param name="identifier">Identifier of the value to get</param>
/// <returns>String8 copy of the stored value</returns>
public String8 this[int identifier]
{
    get
    {
        string text = _values[identifier];

        // Allocate exactly the space the converted value needs
        byte[] buffer = new byte[String8.GetLength(text)];
        return String8.Convert(text, buffer);
    }
}
/// <summary>
///  Create a concatenation of three String8s. Used to join values
///  with a delimiter in a memory efficient way: when 'first' is the last
///  value written to one of this instance's blocks, the delimiter and
///  'second' are appended in place if they fit.
/// </summary>
/// <param name="first">First Value</param>
/// <param name="delimiter">Delimiter to write between the values</param>
/// <param name="second">Second Value</param>
/// <returns>
///  String8 copy which will persist. If either first or second is empty, a copy
///  of the other is returned and the delimiter is not included.
/// </returns>
public String8 Concatenate(String8 first, String8 delimiter, String8 second)
{
    // If either string is empty, use only the other [if both empty, String8.Empty returned]
    if (first.IsEmpty())
    {
        return GetCopy(second);
    }

    if (second.IsEmpty())
    {
        return GetCopy(first);
    }

    // Search the blocks, last to first, for the one whose buffer holds "first" (if any)
    BlockPart hostBlock = null;
    int hostIndex = _blocks.Count - 1;

    while (hostIndex >= 0)
    {
        BlockPart candidate = _blocks[hostIndex];

        if (first._buffer == candidate.Block)
        {
            hostBlock = candidate;
            break;
        }

        --hostIndex;
    }

    // Is "first" the last thing written to its block?
    bool firstEndsBlock = (hostBlock != null && hostBlock.LengthUsed == first._index + first._length);

    if (firstEndsBlock)
    {
        // Enough room after it: append the delimiter and second in place
        if (hostBlock.Block.Length >= hostBlock.LengthUsed + delimiter.Length + second.Length)
        {
            hostBlock.LengthUsed += delimiter.WriteTo(hostBlock.Block, hostBlock.LengthUsed);
            hostBlock.LengthUsed += second.WriteTo(hostBlock.Block, hostBlock.LengthUsed);

            return new String8(first._buffer, first._index, hostBlock.LengthUsed - first._index);
        }

        // Not enough room: "remove" first from the block so the space can be recycled
        if (first._index == 0)
        {
            // first was alone in the block; drop the whole block
            _blocks.RemoveAt(hostIndex);
        }
        else
        {
            // Give back only the space first was using
            hostBlock.LengthUsed -= first.Length;
        }
    }

    // Find new room for the whole value, asking for 1.5x the length required
    int requiredLength = first.Length + delimiter.Length + second.Length;
    BlockPart destination = GetBlockForLength((int)(1.5 * requiredLength));

    // Write the three parts in order and return a reference to the new copy
    int startPosition = destination.LengthUsed;
    destination.LengthUsed += first.WriteTo(destination.Block, destination.LengthUsed);
    destination.LengthUsed += delimiter.WriteTo(destination.Block, destination.LengthUsed);
    destination.LengthUsed += second.WriteTo(destination.Block, destination.LengthUsed);

    return new String8(destination.Block, startPosition, destination.LengthUsed - startPosition);
}
/// <summary>
///  Find the matches for a string value by converting it to a String8
///  (in a newly allocated byte[]) and deferring to the String8 overload.
/// </summary>
/// <param name="value">String value to find</param>
/// <param name="matches">Range of matches, as set by the String8 overload</param>
/// <returns>Result of the String8 overload</returns>
public bool TryFindString(string value, out Range matches)
{
    byte[] buffer = new byte[String8.GetLength(value)];
    String8 converted = String8.Convert(value, buffer);
    return TryFindString(converted, out matches);
}