// REVIEW: The converting getter invokes a type conversion delegate on every call, so it's inherently
// slower than the 'direct' getter. We don't have good indication of this to the user, and the selection
// of affected types is pretty arbitrary (signed integers and bools, but not uints and floats).
//
// Builds an action that, for a given row object:
//   1. reads the VBuffer<TSrc> stored in column 'col' of 'input',
//   2. peeks the row object's current TDst[] (reused when its size already equals the vector length),
//   3. runs 'convert' over every slot of the vector (implicit ones included) into that array, and
//   4. pokes the array back into the row object.
// 'poke' and 'peek' must be a Poke<TRow, TDst[]> and a Peek<TRow, TDst[]> respectively; this is asserted.
private Action<TRow> CreateConvertingVBufferSetter<TSrc, TDst>(Row input, int col, Delegate poke, Delegate peek, Func<TSrc, TDst> convert)
{
    var getter = input.GetGetter<VBuffer<TSrc>>(col);
    var pokeArray = poke as Poke<TRow, TDst[]>;
    var peekArray = peek as Peek<TRow, TDst[]>;
    Contracts.AssertValue(pokeArray);
    Contracts.AssertValue(peekArray);

    // Both buffers are captured by the returned action and survive across invocations.
    VBuffer<TSrc> source = default;
    TDst[] target = null;

    void SetRow(TRow row)
    {
        getter(ref source);
        peekArray(row, Position, ref target);

        int length = source.Length;
        if (Utils.Size(target) != length)
        {
            // The peeked array cannot hold the vector; start from a fresh one.
            target = new TDst[length];
        }

        foreach (var item in source.Items(all: true))
        {
            target[item.Key] = convert(item.Value);
        }

        pokeArray(row, target);
    }

    return SetRow;
}
// Lazily enumerates the vector column at index 'col' of 'data', yielding one freshly allocated
// TOut[] per row whose length equals that row's vector length. Values are mapped through 'convert'.
// This is an iterator: the asserts and the cursor creation only run once enumeration starts, and
// the cursor is disposed when enumeration finishes or the enumerator is disposed.
private static IEnumerable<TOut[]> GetColumnArrayConvert<TOut, TData>(IDataView data, int col, Func<TData, TOut> convert)
{
    Contracts.AssertValue(data);
    Contracts.Assert(0 <= col && col < data.Schema.Count);

    var sourceColumn = data.Schema[col];
    using (var rows = data.GetRowCursor(sourceColumn))
    {
        var fetch = rows.GetGetter<VBuffer<TData>>(sourceColumn);
        var rowValue = default(VBuffer<TData>);

        while (rows.MoveNext())
        {
            fetch(ref rowValue);

            // REVIEW: should we introduce the 'reuse array' logic here?
            // For now it re-creates the array and densifies.
            var converted = new TOut[rowValue.Length];

            // NOTE(review): Items(all: false) presumably visits only the explicitly stored
            // entries, so any slot it skips keeps default(TOut) - confirm against VBuffer.
            foreach (var entry in rowValue.Items(all: false))
            {
                converted[entry.Key] = convert(entry.Value);
            }

            yield return converted;
        }
    }
}
// Returns a getter that exposes the slots of 'cursor' as a vector of Single label values.
//   - Single slots are returned directly from the cursor.
//   - Double and boolean slots go through GetVecGetterAs with a Single target type.
//   - Key slots are read as UInt64 and mapped to (key - 1); a key of 0 or one above the key
//     count becomes NaN. A key count of 0 is treated as "no upper bound" (ulong.MaxValue).
// Any other slot item type results in an exception.
internal static ValueGetter<VBuffer<Single>> GetLabelGetter(SlotCursor cursor)
{
    var itemType = cursor.GetSlotType().ItemType;

    if (itemType == NumberDataViewType.Single)
    {
        return cursor.GetGetter<Single>();
    }

    bool needsConversionOnly = itemType == NumberDataViewType.Double || itemType is BooleanDataViewType;
    if (needsConversionOnly)
    {
        return GetVecGetterAs<Single>(NumberDataViewType.Single, cursor);
    }

    var keyType = itemType as KeyType;
    if (keyType is null)
    {
        throw Contracts.Except("Only floating point number, boolean, and key type values can be used as label.");
    }

    Contracts.Assert(TestGetLabelGetter(itemType) == null);

    ulong keyCount = (ulong)keyType.Count;
    ulong keyMax = keyCount == 0 ? ulong.MaxValue : keyCount;

    var getKeys = RowCursorUtils.GetVecGetterAs<ulong>(NumberDataViewType.UInt64, cursor);
    VBuffer<ulong> keys = default;

    void GetLabels(ref VBuffer<Single> dst)
    {
        getKeys(ref keys);

        // Unfortunately defaults in one do not translate to defaults of the other,
        // so this will not be sparsity preserving. Assume a dense output.
        var editor = VBufferEditor.Create(ref dst, keys.Length);
        foreach (var slot in keys.Items(all: true))
        {
            ulong key = slot.Value;
            bool isValidKey = 0 < key && key <= keyMax;
            editor.Values[slot.Key] = isValidKey ? (Single)(key - 1) : Single.NaN;
        }

        dst = editor.Commit();
    }

    return GetLabels;
}
// Builds an action that copies the VBuffer<TDst> in column 'col' of 'input' into a dense TDst[]
// on a row object, without any element conversion. The row object's current array is obtained
// through 'peek' and is reused only when the vector is dense and the array already has the
// vector's length; in every other case a new array is allocated. The array is handed back
// through 'poke'. 'poke' and 'peek' must be a Poke<TRow, TDst[]> and a Peek<TRow, TDst[]>
// respectively; this is asserted.
private Action<TRow> CreateDirectVBufferSetter<TDst>(Row input, int col, Delegate poke, Delegate peek)
{
    var getter = input.GetGetter<VBuffer<TDst>>(col);
    var pokeArray = poke as Poke<TRow, TDst[]>;
    var peekArray = peek as Peek<TRow, TDst[]>;
    Contracts.AssertValue(pokeArray);
    Contracts.AssertValue(peekArray);

    // Both buffers are captured by the returned action and survive across invocations.
    VBuffer<TDst> source = default;
    TDst[] target = null;

    void SetRow(TRow row)
    {
        peekArray(row, Position, ref target);
        getter(ref source);

        if (!source.IsDense)
        {
            // Sparse input needs densifying: write every slot into a fresh array.
            target = new TDst[source.Length];
            foreach (var item in source.Items(true))
            {
                target[item.Key] = item.Value;
            }
        }
        else if (source.Length == Utils.Size(target))
        {
            // Dense input, and the array that came from the row object is already
            // the right size to represent the vector, so copy into it in place.
            source.CopyTo(target);
        }
        else
        {
            // Dense input, but the existing array has a different length than the values.
            target = new TDst[source.Length];
            source.GetValues().CopyTo(target);
        }

        pokeArray(row, target);
    }

    return SetRow;
}
// Writes 'data' to 'stream' as SVM-light text, one line per row:
//     <label> [qid:<group>] [cost:<weight>] <index>:<value> ...
//
// Only the configured label and feature columns, plus the optional group and weight columns,
// are written. 'cols' is used solely to warn about requested columns that will not be saved.
//
//   stream - destination; wrapped in a StreamWriter that is disposed on return, which
//            also closes the underlying stream.
//   data   - the data view to save; must contain the label and feature columns (and the
//            group/weight columns when those are configured).
//   cols   - optional column indices the caller asked to save; may be null.
//
// Throws through the channel if a required column is missing. All numbers are formatted with
// the invariant culture so the output does not depend on the current thread culture.
public void SaveData(Stream stream, IDataView data, params int[] cols)
{
    _host.CheckValue(stream, nameof(stream));
    _host.CheckValue(data, nameof(data));
    _host.CheckValueOrNull(cols);
    if (cols == null)
    {
        cols = new int[0];
    }

    // SVM-light is a machine-readable format: decimal separators and negative signs must not
    // vary with the thread culture, so every number is formatted with this provider.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (var ch = _host.Start("Saving"))
    {
        var labelCol = data.Schema.GetColumnOrNull(_labelCol);
        if (!labelCol.HasValue)
        {
            throw ch.Except($"Column {_labelCol} not found in data");
        }

        var featureCol = data.Schema.GetColumnOrNull(_featureCol);
        if (!featureCol.HasValue)
        {
            throw ch.Except($"Column {_featureCol} not found in data");
        }

        // Group and weight columns are optional: they are only looked up when a name is configured.
        bool wantsGroup = !string.IsNullOrWhiteSpace(_groupCol);
        var groupCol = wantsGroup ? data.Schema.GetColumnOrNull(_groupCol) : default;
        if (wantsGroup && !groupCol.HasValue)
        {
            throw ch.Except($"Column {_groupCol} not found in data");
        }

        bool wantsWeight = !string.IsNullOrWhiteSpace(_weightCol);
        var weightCol = wantsWeight ? data.Schema.GetColumnOrNull(_weightCol) : default;
        if (wantsWeight && !weightCol.HasValue)
        {
            throw ch.Except($"Column {_weightCol} not found in data");
        }

        foreach (var col in cols)
        {
            // Reject negative indices here as well, instead of letting the schema indexer fail.
            _host.Check(0 <= col && col < data.Schema.Count);
            var column = data.Schema[col];
            if (column.Name != _labelCol && column.Name != _featureCol && column.Name != _groupCol && column.Name != _weightCol)
            {
                ch.Warning($"Column {column.Name} will not be saved. SVM-light saver saves the label column, feature column, optional group column and optional weight column.");
            }
        }

        var columns = new List<DataViewSchema.Column>() { labelCol.Value, featureCol.Value };
        if (groupCol.HasValue)
        {
            columns.Add(groupCol.Value);
        }
        if (weightCol.HasValue)
        {
            columns.Add(weightCol.Value);
        }

        using (var writer = new StreamWriter(stream))
        using (var cursor = data.GetRowCursor(columns))
        {
            // Getting the getters will fail with type errors if the types are not correct,
            // so we rely on those messages.
            var labelGetter = cursor.GetGetter<float>(labelCol.Value);
            var featuresGetter = cursor.GetGetter<VBuffer<float>>(featureCol.Value);
            var groupGetter = groupCol.HasValue ? cursor.GetGetter<ulong>(groupCol.Value) : null;
            var weightGetter = weightCol.HasValue ? cursor.GetGetter<float>(weightCol.Value) : null;

            VBuffer<float> features = default;
            while (cursor.MoveNext())
            {
                float lab = default;
                labelGetter(ref lab);
                if (_binary)
                {
                    // Binary mode collapses the label to 0 (NaN), 1 (positive) or -1 (otherwise).
                    // Literals are written directly so the minus sign is always ASCII '-'.
                    writer.Write(float.IsNaN(lab) ? "0" : (lab > 0 ? "1" : "-1"));
                }
                else
                {
                    writer.Write(string.Format(invariant, "{0:R}", lab));
                }

                if (groupGetter != null)
                {
                    ulong groupId = default;
                    groupGetter(ref groupId);
                    // A group value of 0 writes no qid; other values are written minus one.
                    if (groupId > 0)
                    {
                        writer.Write(string.Format(invariant, " qid:{0}", groupId - 1));
                    }
                }

                if (weightGetter != null)
                {
                    float weight = default;
                    weightGetter(ref weight);
                    // A weight of exactly 1 is omitted from the output.
                    if (weight != 1)
                    {
                        writer.Write(string.Format(invariant, " cost:{0:R}", weight));
                    }
                }

                featuresGetter(ref features);
                bool any = false;
                foreach (var pair in features.Items())
                {
                    if (pair.Value == 0)
                    {
                        continue;
                    }

                    // Feature indices are written zero-based or one-based depending on _zero.
                    writer.Write(string.Format(invariant, " {0}:{1}", _zero ? pair.Key : (pair.Key + 1), pair.Value));
                    any = true;
                }

                // If there were no non-zero items, write a dummy item. Some parsers can handle
                // empty arrays correctly, but some assume there is at least one defined item.
                if (!any)
                {
                    writer.Write(_zero ? " 0:0" : " 1:0");
                }

                writer.WriteLine();
            }
        }
    }
}