Esempio n. 1
0
            /// <summary>
            /// Builds an action that, for a given row object, reads the vector stored in column
            /// <paramref name="col"/> of <paramref name="input"/>, runs every element through
            /// <paramref name="convert"/> and hands the resulting array to <paramref name="poke"/>.
            /// </summary>
            /// <param name="input">Row the source vector is read from.</param>
            /// <param name="col">Index of the source column in <paramref name="input"/>.</param>
            /// <param name="poke">Delegate expected to be a <c>Poke&lt;TRow, TDst[]&gt;</c>; writes the array to the row object.</param>
            /// <param name="peek">Delegate expected to be a <c>Peek&lt;TRow, TDst[]&gt;</c>; fetches the row object's current array.</param>
            /// <param name="convert">Element-wise conversion from <typeparamref name="TSrc"/> to <typeparamref name="TDst"/>.</param>
            /// <returns>The setter action. It keeps one source buffer and one array reference alive between calls.</returns>
            // REVIEW: The converting getter invokes a type conversion delegate on every call, so it's inherently slower
            // than the 'direct' getter. We don't have good indication of this to the user, and the selection
            // of affected types is pretty arbitrary (signed integers and bools, but not uints and floats).
            private Action<TRow> CreateConvertingVBufferSetter<TSrc, TDst>(Row input, int col, Delegate poke, Delegate peek, Func<TSrc, TDst> convert)
            {
                var sourceGetter = input.GetGetter<VBuffer<TSrc>>(col);

                var pokeArray = poke as Poke<TRow, TDst[]>;
                var peekArray = peek as Peek<TRow, TDst[]>;
                Contracts.AssertValue(pokeArray);
                Contracts.AssertValue(peekArray);

                // Both of these are captured by the lambda below and therefore shared across invocations.
                VBuffer<TSrc> source = default;
                TDst[] target = null;

                return row =>
                {
                    sourceGetter(ref source);
                    peekArray(row, Position, ref target);

                    // Keep the array obtained from the row object when it already has the right length.
                    int length = source.Length;
                    if (Utils.Size(target) != length)
                    {
                        target = new TDst[length];
                    }

                    // All slots are requested (all: true), so a reused array is presumably overwritten
                    // in full rather than keeping stale entries.
                    foreach (var item in source.Items(all: true))
                    {
                        target[item.Key] = convert(item.Value);
                    }

                    pokeArray(row, target);
                };
            }
Esempio n. 2
0
        /// <summary>
        /// Lazily walks the rows of <paramref name="data"/> and, for each row, yields the vector in column
        /// <paramref name="col"/> as a freshly allocated array of converted values.
        /// </summary>
        /// <param name="data">Data view to read from.</param>
        /// <param name="col">Index of the vector column; asserted to lie within the schema.</param>
        /// <param name="convert">Conversion applied to each enumerated element.</param>
        /// <returns>One new array per row. Because this is an iterator, nothing (including the asserts)
        /// runs until the result is enumerated.</returns>
        private static IEnumerable<TOut[]> GetColumnArrayConvert<TOut, TData>(IDataView data, int col, Func<TData, TOut> convert)
        {
            Contracts.AssertValue(data);
            Contracts.Assert(0 <= col && col < data.Schema.Count);

            var column = data.Schema[col];
            using (var cursor = data.GetRowCursor(column))
            {
                var vectorGetter = cursor.GetGetter<VBuffer<TData>>(column);
                VBuffer<TData> vector = default;

                while (cursor.MoveNext())
                {
                    vectorGetter(ref vector);

                    // REVIEW: should we introduce the 'reuse array' logic here?
                    // For now every row gets its own array, which densifies the vector.
                    var converted = new TOut[vector.Length];

                    // NOTE(review): only the items returned by Items(all: false) are converted; any slot
                    // that is not enumerated keeps default(TOut) instead of convert(default(TData)).
                    foreach (var item in vector.Items(all: false))
                    {
                        converted[item.Key] = convert(item.Value);
                    }

                    yield return converted;
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Creates a getter that exposes the slots of <paramref name="cursor"/> as a vector of
        /// <see cref="Single"/> label values.
        /// </summary>
        /// <param name="cursor">Slot cursor whose item type is Single, Double, boolean or a key type.</param>
        /// <returns>
        /// The cursor's own getter for Single items, a converting getter for Double and boolean items, and
        /// for key items a getter that maps key value <c>k</c> to <c>k - 1</c> and every out-of-range
        /// value (including 0) to <see cref="Single.NaN"/>.
        /// </returns>
        /// <exception>Thrown through <c>Contracts.Except</c> when the item type is none of the above.</exception>
        internal static ValueGetter<VBuffer<Single>> GetLabelGetter(SlotCursor cursor)
        {
            var itemType = cursor.GetSlotType().ItemType;

            // Already the requested representation: no conversion layer needed.
            if (itemType == NumberDataViewType.Single)
            {
                return cursor.GetGetter<Single>();
            }

            if (itemType == NumberDataViewType.Double || itemType is BooleanDataViewType)
            {
                return GetVecGetterAs<Single>(NumberDataViewType.Single, cursor);
            }

            if (itemType is KeyType keyType)
            {
                Contracts.Assert(TestGetLabelGetter(itemType) == null);

                // A count of zero is treated as "no upper bound" on the key value.
                ulong declaredCount = (ulong)keyType.Count;
                ulong maxKey = declaredCount == 0 ? ulong.MaxValue : declaredCount;

                var keyGetter = RowCursorUtils.GetVecGetterAs<ulong>(NumberDataViewType.UInt64, cursor);
                VBuffer<ulong> keys = default;

                return (ref VBuffer<Single> dst) =>
                {
                    keyGetter(ref keys);

                    // Unfortunately defaults in one do not translate to defaults of the other
                    // (a key of 0 becomes NaN, not 0), so this will not be sparsity preserving.
                    // Assume a dense output.
                    var editor = VBufferEditor.Create(ref dst, keys.Length);
                    foreach (var slot in keys.Items(all: true))
                    {
                        ulong key = slot.Value;
                        bool isValidKey = 0 < key && key <= maxKey;
                        editor.Values[slot.Key] = isValidKey ? (Single)(key - 1) : Single.NaN;
                    }

                    dst = editor.Commit();
                };
            }

            throw Contracts.Except("Only floating point number, boolean, and key type values can be used as label.");
        }
Esempio n. 4
0
            /// <summary>
            /// Builds an action that, for a given row object, copies the vector stored in column
            /// <paramref name="col"/> of <paramref name="input"/> into a dense array without any element
            /// conversion and hands that array to <paramref name="poke"/>.
            /// </summary>
            /// <param name="input">Row the source vector is read from.</param>
            /// <param name="col">Index of the source column in <paramref name="input"/>.</param>
            /// <param name="poke">Delegate expected to be a <c>Poke&lt;TRow, TDst[]&gt;</c>; writes the array to the row object.</param>
            /// <param name="peek">Delegate expected to be a <c>Peek&lt;TRow, TDst[]&gt;</c>; fetches the row object's current array.</param>
            /// <returns>The setter action. It keeps one source buffer and one array reference alive between calls.</returns>
            private Action<TRow> CreateDirectVBufferSetter<TDst>(Row input, int col, Delegate poke, Delegate peek)
            {
                var sourceGetter = input.GetGetter<VBuffer<TDst>>(col);

                var pokeArray = poke as Poke<TRow, TDst[]>;
                var peekArray = peek as Peek<TRow, TDst[]>;
                Contracts.AssertValue(pokeArray);
                Contracts.AssertValue(peekArray);

                // Both of these are captured by the lambda below and therefore shared across invocations.
                VBuffer<TDst> source = default;
                TDst[] target = null;

                return row =>
                {
                    peekArray(row, Position, ref target);
                    sourceGetter(ref source);

                    if (!source.IsDense)
                    {
                        // Sparse input always gets a fresh array, filled slot by slot.
                        target = new TDst[source.Length];
                        foreach (var item in source.Items(all: true))
                        {
                            target[item.Key] = item.Value;
                        }
                    }
                    else if (source.Length != Utils.Size(target))
                    {
                        // Dense input, but the array taken from the row object has the wrong length.
                        target = new TDst[source.Length];
                        source.GetValues().CopyTo(target);
                    }
                    else
                    {
                        // Dense input and the array taken from the row object already has the
                        // right size, so it is filled in place.
                        source.CopyTo(target);
                    }

                    pokeArray(row, target);
                };
            }
        /// <summary>
        /// Writes <paramref name="data"/> to <paramref name="stream"/> as SVM-light text: one line per row made of
        /// the label, an optional <c>qid:</c> group id, an optional <c>cost:</c> weight and the non-zero
        /// <c>index:value</c> feature pairs. All numbers are formatted with the invariant culture so the
        /// output does not depend on the current thread culture.
        /// </summary>
        /// <param name="stream">Destination stream. It is wrapped in a <see cref="StreamWriter"/> that is disposed
        /// before this method returns, which also closes the stream.</param>
        /// <param name="data">Data view that must contain the configured label and feature columns, plus the
        /// group and weight columns when those are configured.</param>
        /// <param name="cols">Column indices requested by the caller, or null. They are only used to warn about
        /// columns that this saver will not write; the columns actually saved are always the configured
        /// label, feature, group and weight columns.</param>
        public void SaveData(Stream stream, IDataView data, params int[] cols)
        {
            _host.CheckValue(stream, nameof(stream));
            _host.CheckValue(data, nameof(data));
            _host.CheckValueOrNull(cols);

            if (cols == null)
            {
                cols = new int[0];
            }

            // The output is a machine-readable format, so it must never pick up the current culture's
            // decimal separator or negative sign. TextWriter.Write(format, args) would use the writer's
            // FormatProvider (the current culture for a plain StreamWriter), hence the explicit
            // string.Format calls with the invariant culture below.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (var ch = _host.Start("Saving"))
            {
                var labelCol = data.Schema.GetColumnOrNull(_labelCol);
                if (!labelCol.HasValue)
                {
                    throw ch.Except($"Column {_labelCol} not found in data");
                }

                var featureCol = data.Schema.GetColumnOrNull(_featureCol);
                if (!featureCol.HasValue)
                {
                    throw ch.Except($"Column {_featureCol} not found in data");
                }

                // Group and weight columns are optional: they are only looked up (and required to exist)
                // when a non-blank name was configured.
                var groupCol = !string.IsNullOrWhiteSpace(_groupCol) ? data.Schema.GetColumnOrNull(_groupCol) : default;
                if (!string.IsNullOrWhiteSpace(_groupCol) && !groupCol.HasValue)
                {
                    throw ch.Except($"Column {_groupCol} not found in data");
                }

                var weightCol = !string.IsNullOrWhiteSpace(_weightCol) ? data.Schema.GetColumnOrNull(_weightCol) : default;
                if (!string.IsNullOrWhiteSpace(_weightCol) && !weightCol.HasValue)
                {
                    throw ch.Except($"Column {_weightCol} not found in data");
                }

                foreach (var col in cols)
                {
                    // Validate both bounds so a negative index fails this check instead of the schema indexer.
                    _host.Check(0 <= col && col < data.Schema.Count);
                    var column = data.Schema[col];
                    if (column.Name != _labelCol && column.Name != _featureCol && column.Name != _groupCol && column.Name != _weightCol)
                    {
                        ch.Warning($"Column {column.Name} will not be saved. SVM-light saver saves the label column, feature column, optional group column and optional weight column.");
                    }
                }

                var columns = new List<DataViewSchema.Column>()
                {
                    labelCol.Value, featureCol.Value
                };
                if (groupCol.HasValue)
                {
                    columns.Add(groupCol.Value);
                }
                if (weightCol.HasValue)
                {
                    columns.Add(weightCol.Value);
                }

                using (var writer = new StreamWriter(stream))
                using (var cursor = data.GetRowCursor(columns))
                {
                    // Getting the getters will fail with type errors if the types are not correct,
                    // so we rely on those messages.
                    var labelGetter = cursor.GetGetter<float>(labelCol.Value);
                    var featuresGetter = cursor.GetGetter<VBuffer<float>>(featureCol.Value);
                    var groupGetter = groupCol.HasValue ? cursor.GetGetter<ulong>(groupCol.Value) : null;
                    var weightGetter = weightCol.HasValue ? cursor.GetGetter<float>(weightCol.Value) : null;
                    VBuffer<float> features = default;

                    while (cursor.MoveNext())
                    {
                        float lab = default;
                        labelGetter(ref lab);
                        if (_binary)
                        {
                            // Binary mode collapses the label to 0 (NaN), 1 (positive) or -1 (otherwise).
                            // Literals are written so the minus sign is always ASCII '-'.
                            writer.Write(float.IsNaN(lab) ? "0" : (lab > 0 ? "1" : "-1"));
                        }
                        else
                        {
                            writer.Write(string.Format(invariant, "{0:R}", lab));
                        }

                        if (groupGetter != null)
                        {
                            ulong groupId = default;
                            groupGetter(ref groupId);
                            // A group id of 0 is skipped; other ids are written shifted down by one.
                            if (groupId > 0)
                            {
                                writer.Write(string.Format(invariant, " qid:{0}", groupId - 1));
                            }
                        }

                        if (weightGetter != null)
                        {
                            float weight = default;
                            weightGetter(ref weight);
                            // A weight of exactly 1 is not written.
                            if (weight != 1)
                            {
                                writer.Write(string.Format(invariant, " cost:{0:R}", weight));
                            }
                        }

                        featuresGetter(ref features);
                        bool any = false;
                        foreach (var pair in features.Items().Where(p => p.Value != 0))
                        {
                            // Feature indices are written zero-based when _zero is set, one-based otherwise.
                            writer.Write(string.Format(invariant, " {0}:{1}", _zero ? pair.Key : (pair.Key + 1), pair.Value));
                            any = true;
                        }

                        // If there were no non-zero items, write a dummy item. Some parsers can handle
                        // empty arrays correctly, but some assume there is at least one defined item.
                        if (!any)
                        {
                            writer.Write(string.Format(invariant, " {0}:0", _zero ? 0 : 1));
                        }

                        writer.WriteLine();
                    }
                }
            }
        }