/// <summary>
/// Deserialization constructor: builds the feature's bins from a byte buffer.
/// </summary>
/// <param name="buffer">The byte array holding the serialized bins.</param>
/// <param name="position">Offset into <paramref name="buffer"/>; handed by reference to
/// <c>IntArray.New</c> (presumably advanced past the bytes it consumes - confirm against
/// that factory).</param>
protected Feature(byte[] buffer, ref int position)
{
    Bins = IntArray.New(buffer, ref position);
#if !NO_STORE
    // Only compiled in when the NO_STORE symbol is not defined.
    BinsCache = FileObjectStore<IntArrayFormatter>.GetDefaultInstance();
#endif
}
/// <summary>
/// Deserialization constructor: reads the length, the delta bytes and the value array,
/// in that order, from a byte buffer.
/// </summary>
/// <param name="buffer">The byte array holding the serialized representation.</param>
/// <param name="position">Offset into <paramref name="buffer"/>, passed by reference to
/// each reader in turn.</param>
public DeltaRepeatIntArray(byte[] buffer, ref int position)
{
    _length = buffer.ToInt(ref position);

    // Read one int starting at the current offset through a scratch copy, so that
    // 'position' itself is left untouched by this read.
    // NOTE(review): presumably this peeks the length prefix that ToByteArray consumes
    // next - confirm against the ToInt/ToByteArray helpers.
    int peekOffset = position;
    _deltasActualLength = buffer.ToInt(ref peekOffset);

    _deltas = buffer.ToByteArray(ref position);

    // 'as' leaves _values null if the factory returns something other than a DenseIntArray.
    IntArray storedValues = IntArray.New(buffer, ref position);
    _values = storedValues as DenseIntArray;
}
/// <summary>
/// Builds a sparse int array from (index, value) pairs.
/// </summary>
/// <param name="length">Total length of the array being constructed.</param>
/// <param name="bitsPerItem">Number of bits needed to store the values. Must not be
/// <c>Bits0</c>; anything up to <c>Bits8</c> is stored as <c>Bits8</c>.</param>
/// <param name="nonZeroValues">An ordered sequence of (index, value) pairs whose indices
/// must be strictly increasing.</param>
public DeltaSparseIntArray(int length, IntArrayBits bitsPerItem, IEnumerable<KeyValuePair<int, int>> nonZeroValues)
{
    using (Timer.Time(TimerEvent.SparseConstruction))
    {
        List<int> values = new List<int>();
        List<byte> deltas = new List<byte>();
        int cursor = 0;

        foreach (KeyValuePair<int, int> entry in nonZeroValues)
        {
            int index = entry.Key;

            // An index at or before the cursor is tolerated only for the very first
            // entry, and only if it is not negative (i.e. a first entry at index 0).
            bool notAdvancing = index <= cursor;
            if (notAdvancing && (index < 0 || values.Count > 0))
            {
                throw Contracts.Except("index {0} occurred after {1}", index, cursor);
            }

            // A delta is a single byte, so bridge larger gaps with zero-valued entries.
            for (; index - cursor > byte.MaxValue; cursor += byte.MaxValue)
            {
                deltas.Add(byte.MaxValue);
                values.Add(0);
            }

            deltas.Add((byte)(index - cursor));
            values.Add(entry.Value);
            cursor = index;
        }

        // Pad with zero-valued entries when the pairs stop well short of the end.
        for (; length - cursor > byte.MaxValue; cursor += byte.MaxValue)
        {
            deltas.Add(byte.MaxValue);
            values.Add(0);
        }

        if (cursor >= length && cursor > 0)
        {
            throw Contracts.Except("Index {0} inconsistent with length {1}", cursor, length);
        }

        _length = length;

        // A 4-bit dense array is deliberately avoided: it is slower, and the memory saved
        // is small because only the sparse values are stored.
        // TODO: Implement a special iterator for 4-bit arrays and switch this code to it,
        // which may be faster.
        if (bitsPerItem == IntArrayBits.Bits0)
        {
            throw Contracts.Except("Use dense arrays for 0 bits");
        }

        IntArrayBits storageBits = bitsPerItem <= IntArrayBits.Bits8 ? IntArrayBits.Bits8 : bitsPerItem;
        _values = IntArray.New(values.Count, IntArrayType.Dense, storageBits, values) as DenseIntArray;
        _deltas = deltas.ToArray();
    }
}
/// <summary>
/// Deserialization constructor: reads the length, the delta bytes and the value array,
/// in that order, from a byte buffer.
/// </summary>
/// <param name="buffer">The byte array holding the serialized representation.</param>
/// <param name="position">Offset into <paramref name="buffer"/>, passed by reference to
/// each reader in turn.</param>
public DeltaSparseIntArray(byte[] buffer, ref int position)
{
    _length = buffer.ToInt(ref position);

    // REVIEW: An earlier version ran two more statements here and they are intentionally
    // left out:
    //     _deltasActualLength = position;
    //     _deltasActualLength = buffer.ToInt(ref _deltasActualLength);
    // Their effect is to store the result of ToInt read at the current offset into
    // _deltasActualLength without moving 'position'. Whether that value was ever
    // meaningful is unclear - confirm before reinstating.

    _deltas = buffer.ToByteArray(ref position);

    // 'as' leaves _values null if the factory returns something other than a DenseIntArray.
    IntArray storedValues = IntArray.New(buffer, ref position);
    _values = storedValues as DenseIntArray;
}
/// <summary>
/// Builds a run-length style array from a sequence of values: each stored value is paired
/// with a byte counting how many consecutive items share it.
/// </summary>
/// <param name="length">Expected number of items; must equal the number of items that
/// <paramref name="values"/> yields.</param>
/// <param name="bitsPerItem">Number of bits needed to store the values. Must not be
/// <c>Bits0</c>; anything up to <c>Bits8</c> is stored as <c>Bits8</c>.</param>
/// <param name="values">The items of the array, in order.</param>
public DeltaRepeatIntArray(int length, IntArrayBits bitsPerItem, IEnumerable<int> values)
{
    using (Timer.Time(TimerEvent.SparseConstruction))
    {
        List<int> runValues = new List<int>();
        List<byte> runLengths = new List<byte>();

        _length = 0;
        byte runLength = 0;
        int previous = -1;

        foreach (int current in values)
        {
            // Start a new run on a value change, or when the byte counter is full.
            bool startsNewRun = current != previous || runLength == byte.MaxValue;
            if (startsNewRun)
            {
                runValues.Add(current);
                previous = current;

                // Close the preceding run, unless this is the very first item.
                if (_length != 0)
                {
                    runLengths.Add(runLength);
                }

                runLength = 0;
            }

            runLength++;
            _length++;
        }

        // Close the final run.
        if (runLength > 0)
        {
            runLengths.Add(runLength);
        }

        if (_length != length)
        {
            throw Contracts.Except("Length provided to repeat vector is inconsistent with value enumeration");
        }

        // A 4-bit dense array is deliberately avoided: it is slower, and the memory saved
        // is small because only the run values are stored.
        // TODO: Implement a special iterator for 4-bit arrays and switch this code to it,
        // which may be faster.
        if (bitsPerItem == IntArrayBits.Bits0)
        {
            throw Contracts.Except("Use dense arrays for 0 bits");
        }

        IntArrayBits storageBits = bitsPerItem <= IntArrayBits.Bits8 ? IntArrayBits.Bits8 : bitsPerItem;
        _values = IntArray.New(runValues.Count, IntArrayType.Dense, storageBits, runValues) as DenseIntArray;
        _deltas = runLengths.ToArray();
    }
}
/// <summary>
/// Splits this array into several sparse arrays, one per row of
/// <paramref name="assignment"/>.
/// </summary>
/// <param name="assignment">For each output part, the item indices of this array that the
/// part should contain, in output order.</param>
/// <returns>One sparse <c>IntArray</c> per entry of <paramref name="assignment"/>.</returns>
public override IntArray[] Split(int[][] assignment)
{
    int partCount = assignment.Length;
    IntArray[] result = new IntArray[partCount];

    for (int part = 0; part < partCount; part++)
    {
        int[] items = assignment[part];

        // A fresh indexer is obtained for every part.
        // NOTE(review): presumably a forward indexer expects ascending indices - confirm.
        IIntArrayForwardIndexer reader = GetIndexer();
        IEnumerable<int> selected = items.Select(item => reader[item]);

        result[part] = IntArray.New(items.Length, IntArrayType.Sparse, BitsPerItem, selected);
    }

    return result;
}
/// <summary>
/// Splits this array into several dense arrays, one per row of
/// <paramref name="assignment"/>.
/// </summary>
/// <param name="assignment">For each output part, the item indices of this array that the
/// part should contain, in output order.</param>
/// <returns>One dense <c>IntArray</c> per entry of <paramref name="assignment"/>.</returns>
public override IntArray[] Split(int[][] assignment)
{
    IntArray[] result = new IntArray[assignment.Length];

    for (int part = 0; part < result.Length; part++)
    {
        int[] items = assignment[part];

        // Items are fetched through this array's own indexer.
        IEnumerable<int> selected = items.Select(item => this[item]);
        result[part] = IntArray.New(items.Length, IntArrayType.Dense, BitsPerItem, selected);
    }

    return result;
}
/// <summary>
/// Creates a dense feature named "m:QueryId" from the query IDs of a dataset skeleton.
/// Each query ID is cast to <c>uint</c>; a document's bin is the rank of its query's
/// ID among the sorted distinct IDs.
/// </summary>
/// <param name="skel">The skeleton supplying <c>QueryIds</c>, <c>Boundaries</c> and
/// <c>NumDocs</c>.</param>
/// <returns>The feature, whose value map is the sorted distinct query IDs.</returns>
public static TsvFeature CreateFeatureFromQueryId(Dataset.DatasetSkeleton skel)
{
    // Rank every distinct (uint-cast) query ID by its position in ascending order.
    Dictionary<uint, int> rankByQueryId = new Dictionary<uint, int>();
    IEnumerable<uint> sortedDistinctIds = skel.QueryIds.Select(qid => (uint)qid).Distinct().OrderBy(x => x);
    foreach (uint id in sortedDistinctIds)
    {
        int rank = rankByQueryId.Count;
        rankByQueryId[id] = rank;
    }

    // Query i contributes Boundaries[i + 1] - Boundaries[i] copies of its rank.
    IntArray bins = IntArray.New(
        skel.NumDocs,
        IntArrayType.Dense,
        IntArray.NumBitsNeeded(rankByQueryId.Count),
        skel.QueryIds.SelectMany(
            (qid, i) => Enumerable.Repeat(rankByQueryId[(uint)qid], skel.Boundaries[i + 1] - skel.Boundaries[i])));

    uint[] valueMap = rankByQueryId.Keys.OrderBy(x => x).ToArray(rankByQueryId.Count);
    return new TsvFeature(bins, valueMap, "m:QueryId");
}
// Creates a feature named "m:Rating" from the labels. This is needed because freeform
// evaluations can use m:Rating as a feature, which requires an appropriate transformation.
//
// The bins hold the ratings themselves (widened to int, 8 bits per item). The value map
// has one entry per label from 0 through the largest rating, with entry i equal to i + 5;
// for labels 0 through 4 that gives:
//   4 -> 9
//   3 -> 8
//   2 -> 7
//   1 -> 6
//   0 -> 5
// NOTE(review): the original note also listed "invalid -> 0"; nothing in this method
// implements such a mapping - confirm whether it is handled elsewhere.
public static TsvFeature CreateFeatureFromRatings(short[] ratings)
{
    // An empty input is treated as if the largest label were 0.
    short highestLabel = 0;
    if (ratings.Length > 0)
    {
        highestLabel = ratings.Max();
    }

    IEnumerable<int> widened = ratings.Select(x => (int)x);
    IntArray ratingBins = IntArray.New(ratings.Length, IntArrayType.Dense, IntArrayBits.Bits8, widened);

    int mapSize = ((int)highestLabel) + 1;
    uint[] valueMap = Enumerable.Range(0, mapSize).Select(x => (uint)x + 5).ToArray();

    return new TsvFeature(ratingBins, valueMap, "m:Rating");
}
/// <summary>
/// Concatenates the bins of several features into one array expressed against a merged
/// value map.
/// </summary>
/// <param name="parts">The features whose bins are joined, in order.</param>
/// <param name="concatValueMap">The merged value map the resulting bins refer to.</param>
/// <returns>A <c>DeltaSparseIntArray</c> when the bit width is not <c>Bits0</c> and every
/// part's bins are a <c>DeltaSparseIntArray</c>; otherwise the dense array.</returns>
private static IntArray ConcatBins(TsvFeature[] parts, uint[] concatValueMap)
{
    using (Timer.Time(TimerEvent.ConcatBins))
    {
        int totalLength = parts.Sum(x => x.Length);
        IntArrayBits bitsPerItem = IntArray.NumBitsNeeded(concatValueMap.Length);
        DenseIntArray merged = (DenseIntArray)IntArray.New(totalLength, IntArrayType.Dense, bitsPerItem);

        int writeAt = 0;
        foreach (TsvFeature part in parts)
        {
            IntArray bins = part.Bins;

            if (concatValueMap.Length == part.ValueMap.Length)
            {
                // Same map size: bins are copied without remapping.
                // NOTE(review): presumably equal length implies identical maps because
                // the merged map is a superset of each part's map - confirm.
                foreach (int bin in bins)
                {
                    merged[writeAt] = bin;
                    writeAt++;
                }
            }
            else
            {
                // Translate each bin into the merged value map.
                int[] binMap = MakeBinMap(part._valueMap, concatValueMap);
                foreach (int bin in bins)
                {
                    merged[writeAt] = binMap[bin];
                    writeAt++;
                }
            }
        }

        bool returnSparse = bitsPerItem != IntArrayBits.Bits0 && parts.All(x => x.Bins is DeltaSparseIntArray);
        if (!returnSparse)
        {
            return merged;
        }

        return new DeltaSparseIntArray(totalLength, bitsPerItem, merged);
    }
}
/// <summary>
/// Concatenates an array of features into one long feature.
/// </summary>
/// <remarks>
/// With a single part no concatenation happens: the part itself is returned when its bins
/// already use the bit width its value map needs, otherwise its bins are re-encoded at
/// that width. With several parts the value maps are merged and the bins are joined
/// against the merged map; the result takes the name of the first part.
/// </remarks>
/// <param name="parts">An array of features.</param>
/// <returns>A concatenated feature.</returns>
public static TsvFeature Concat(TsvFeature[] parts)
{
    if (parts.Length == 1)
    {
        TsvFeature only = parts[0];
        IntArrayBits bitsPerItem = IntArray.NumBitsNeeded(only.ValueMap.Length);
        if (bitsPerItem == only.Bins.BitsPerItem)
        {
            return only;
        }

        // Same contents and array type, re-encoded at the width the value map needs.
        IntArray b = only.Bins;
        IntArray newBins = IntArray.New(b.Length, b.Type, bitsPerItem, b);
        return new TsvFeature(newBins, only.ValueMap, only._name);
    }

    uint[] concatValueMap = Algorithms.MergeSortedUniqued(parts.Select(x => x.ValueMap).ToArray());

    // The bit width for the merged map is derived inside ConcatBins, so it is not
    // recomputed here (the previous recomputation was never read).
    IntArray concatBins = ConcatBins(parts, concatValueMap);
    return new TsvFeature(concatBins, concatValueMap, parts[0]._name);
}
/// <summary>
/// Clones this array, either as another delta-sparse array or expanded into a dense one.
/// </summary>
/// <param name="bitsPerItem">Bit width for the cloned values on the sparse path, where
/// anything up to <c>Bits8</c> becomes <c>Bits8</c>.</param>
/// <param name="type"><c>Sparse</c> or <c>Current</c> produces a sparse clone; every
/// other value produces a dense array.</param>
/// <returns>The cloned array.</returns>
public override IntArray Clone(IntArrayBits bitsPerItem, IntArrayType type)
{
    bool staySparse = type == IntArrayType.Sparse || type == IntArrayType.Current;
    if (!staySparse)
    {
        // NOTE(review): this path sizes the result with this instance's BitsPerItem, not
        // the bitsPerItem argument - confirm whether the argument should be honoured.
        DenseIntArray expanded = IntArray.New(Length, IntArrayType.Dense, BitsPerItem) as DenseIntArray;

        // Walk the stored entries, accumulating deltas to recover each absolute index.
        int target = 0;
        for (int entry = 0; entry < _values.Length; entry++)
        {
            target += _deltas[entry];
            expanded[target] = _values[entry];
        }

        return expanded;
    }

    IntArrayBits storageBits = bitsPerItem <= IntArrayBits.Bits8 ? IntArrayBits.Bits8 : bitsPerItem;

    // Only the values are cloned; the delta array is passed on as-is.
    DenseIntArray clonedValues = _values.Clone(storageBits, IntArrayType.Dense) as DenseIntArray;
    return new DeltaSparseIntArray(clonedValues, _deltas, _length);
}
/// <summary>
/// Clones this array into a new array of the requested type and bit width, using this
/// array as the source of the items.
/// </summary>
/// <param name="bitsPerItem">Bit width passed to the <c>IntArray.New</c> factory.</param>
/// <param name="type">Array type passed to the <c>IntArray.New</c> factory.</param>
/// <returns>The newly created array.</returns>
public override IntArray Clone(IntArrayBits bitsPerItem, IntArrayType type)
{
    IntArray copy = IntArray.New(_length, type, bitsPerItem, this);
    return copy;
}
/// <summary>
/// Picks the cheapest representation of this array by estimated bit cost, giving the
/// dense form a little slack, and returns the array in that representation. The behavior
/// of this method depends upon the static value <see cref="CompatibilityLevel"/>.
/// </summary>
/// <param name="workarray">Should be non-null if segment arrays are to be considered.
/// When supplied, every item is also written into it.</param>
/// <returns>A more space efficient version of the array, or this instance when the chosen
/// type and bit width already match.</returns>
public IntArray Compress(uint[] workarray = null)
{
    int largest = 0;
    int zeros = 0;
    int repeats = 0;
    int previous = -1;
    int repeatOverflows = 0;
    int zeroRepeatOverflows = 0;

    // Repeats counted since the last overflow.
    // NOTE(review): this tally is not reset when the value changes, so it can span
    // several runs - confirm that this is intended.
    int repeatTally = 0;

    int len = Length;
    IIntArrayForwardIndexer reader = GetIndexer();
    for (int i = 0; i < len; ++i)
    {
        int val = reader[i];
        if (workarray != null)
        {
            workarray[i] = (uint)val;
        }

        if (val == 0)
        {
            zeros++;
        }
        else if (val > largest)
        {
            largest = val;
        }

        if (val == previous)
        {
            repeats++;
            repeatTally++;
            if (repeatTally > byte.MaxValue)
            {
                // More repeats than a single byte counter can hold.
                repeatOverflows++;
                if (val == 0)
                {
                    zeroRepeatOverflows++;
                }

                repeatTally = 0;
            }
        }

        previous = val;
    }

    // Estimate the cost, in bits, of each available representation.
    IntArrayBits classicBits = IntArray.NumBitsNeeded(largest + 1);
    long denseBits = (long)classicBits * (long)Length;
    long sparseBits = (long)(Math.Max((int)classicBits, 8) + 8) * (long)(Length - zeros + zeroRepeatOverflows);
    long rleBits = (long)(classicBits + 8) * (long)(Length - repeats + repeatOverflows);
    long segBits = long.MaxValue;
    int segTransitions = 0;

    // Segment arrays are only costed when a work array was supplied and the values fit
    // in at most 21 bits.
    if (workarray != null)
    {
        int bits = SegmentIntArray.BitsForValue((uint)largest);
        if (bits <= 21)
        {
            SegmentIntArray.SegmentFindOptimalPath(workarray, Length, bits, out segBits, out segTransitions);
        }
    }

    // The repeat representation is ruled out unless bit 0x4 of the compatibility level is set.
    if ((IntArray.CompatibilityLevel & 0x4) == 0)
    {
        rleBits = long.MaxValue;
    }

    long bestCost = denseBits;
    bestCost = Math.Min(bestCost, sparseBits);
    bestCost = Math.Min(bestCost, rleBits);
    bestCost = Math.Min(bestCost, segBits);

    IntArrayType bestType;
    if (bestCost >= denseBits * 98 / 100)
    {
        // Cut the dense bits a wee bit of slack: stay dense unless the saving exceeds 2%.
        bestType = IntArrayType.Dense;
    }
    else if (bestCost == sparseBits)
    {
        bestType = IntArrayType.Sparse;
    }
    else if (bestCost == rleBits)
    {
        bestType = IntArrayType.Repeat;
    }
    else
    {
        bestType = IntArrayType.Segmented;
    }

    // Already in the chosen form.
    if (bestType == Type && classicBits == BitsPerItem)
    {
        return this;
    }

    if (bestType == IntArrayType.Segmented)
    {
        return SegmentIntArray.FromWorkArray(workarray, Length, segBits, segTransitions);
    }

    return IntArray.New(Length, bestType, classicBits, this);
}
/// <summary>
/// Creates a sparse IntArray holding only the items selected by
/// <paramref name="itemIndices"/>.
/// </summary>
/// <param name="itemIndices">Indices of the items the clone should contain, in output
/// order.</param>
/// <returns>The cloned IntArray.</returns>
public override IntArray Clone(int[] itemIndices)
{
    // NOTE(review): presumably a forward indexer expects ascending indices - confirm.
    IIntArrayForwardIndexer reader = GetIndexer();
    IEnumerable<int> selected = itemIndices.Select(item => reader[item]);
    return IntArray.New(itemIndices.Length, IntArrayType.Sparse, BitsPerItem, selected);
}
/// <summary>
/// Rebuilds an IntArray from its byte array representation, reading from offset 0.
/// </summary>
/// <param name="buffer">The byte array object representation.</param>
/// <returns>The IntArray object.</returns>
private IntArray Deserialize(byte[] buffer)
{
    int offset = 0;

    // NOTE(review): the meaning of the trailing 'true' flag is defined by IntArray.New
    // and is not visible here - confirm.
    IntArray restored = IntArray.New(buffer, ref offset, true);
    return restored;
}
/// <summary>
/// Creates a dense IntArray holding only the items selected by
/// <paramref name="itemIndices"/>.
/// </summary>
/// <param name="itemIndices">Indices of the items the clone should contain, in output
/// order.</param>
/// <returns>The cloned IntArray.</returns>
public override IntArray Clone(int[] itemIndices)
{
    // Items are fetched through this array's own indexer.
    IEnumerable<int> selected = itemIndices.Select(item => this[item]);
    return IntArray.New(itemIndices.Length, IntArrayType.Dense, BitsPerItem, selected);
}