Exemple #1
0
        /// <summary>
        /// Produces a view of <paramref name="data"/> restricted to the columns whose type
        /// reports itself as cachable. Shuffling may hand rows back out of order, so every value
        /// read has to be saved and later copied out; per the original note this is only supported
        /// for primitive and vector valued columns, and every other column is dropped.
        /// </summary>
        /// <param name="data">The view whose columns are inspected.</param>
        /// <param name="env">Host environment handed to the column-dropping transform.</param>
        /// <returns>
        /// <paramref name="data"/> itself when every column is cachable; otherwise a
        /// <c>ChooseColumnsByIndexTransform</c> over it that drops the non-cachable columns.
        /// </returns>
        private static IDataView SelectCachableColumns(IDataView data, IHostEnvironment env)
        {
            var inputSchema = data.Schema;
            var uncachable = new List<int>();

            for (int col = 0; col < inputSchema.ColumnCount; col++)
            {
                if (inputSchema.GetColumnType(col).IsCachable())
                {
                    continue;
                }
                uncachable.Add(col);
            }

            // Nothing to remove: hand back the input untouched rather than wrapping it.
            if (uncachable.Count == 0)
            {
                return data;
            }

            var dropArgs = new ChooseColumnsByIndexTransform.Arguments
            {
                Drop = true,
                Index = uncachable.ToArray()
            };
            return new ChooseColumnsByIndexTransform(env, dropArgs, data);
        }
        /// <summary>
        /// Appends the rows of several data views into a single view.
        /// Hidden columns are dropped from every input view. Key columns that carry key names in
        /// the first view are reconciled across all views and then converted back to values.
        /// Vector columns that fail <c>VerifyVectorColumnsMatch</c> against the first view that
        /// declared them are passed through <c>AddVarLengthColumn</c> and the original column is
        /// dropped, and a warning naming those columns is written to <paramref name="ch"/>.
        /// </summary>
        /// <param name="foldDataViews">The data views to append; enumerated exactly once, in order.</param>
        /// <param name="ch">Channel that receives the warning about mismatching vector columns.</param>
        /// <returns>An <c>AppendRowsDataView</c> built over the (possibly transformed) input views.</returns>
        private IDataView AppendPerInstanceDataViews(IEnumerable<IDataView> foldDataViews, IChannel ch)
        {
            // Make sure there are no variable size vector columns.
            // This is a dictionary from the column name to its vector size.
            var vectorSizes = new Dictionary<string, int>();
            // Slot names, key columns and vector key columns are recorded from the first view only
            // and serve as the baseline the remaining views are compared against.
            var firstDvSlotNames = new Dictionary<string, VBuffer<DvText>>();
            var firstDvKeyColumns = new List<string>();
            var firstDvVectorKeyColumns = new List<string>();
            var variableSizeVectorColumnNames = new List<string>();
            var list = new List<IDataView>();
            int dvNumber = 0;

            foreach (var dv in foldDataViews)
            {
                var hidden = new List<int>();
                for (int i = 0; i < dv.Schema.ColumnCount; i++)
                {
                    if (dv.Schema.IsHidden(i))
                    {
                        hidden.Add(i);
                        continue;
                    }

                    var type = dv.Schema.GetColumnType(i);
                    var name = dv.Schema.GetColumnName(i);
                    if (type.IsVector)
                    {
                        if (dvNumber == 0)
                        {
                            if (dv.Schema.HasKeyNames(i, type.ItemType.KeyCount))
                            {
                                firstDvVectorKeyColumns.Add(name);
                            }
                            // Store the slot names of the 1st idv and use them as baseline.
                            if (dv.Schema.HasSlotNames(i, type.VectorSize))
                            {
                                VBuffer<DvText> slotNames = default(VBuffer<DvText>);
                                dv.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, i, ref slotNames);
                                firstDvSlotNames.Add(name, slotNames);
                            }
                        }

                        int cachedSize;
                        if (vectorSizes.TryGetValue(name, out cachedSize))
                        {
                            VBuffer<DvText> slotNames;
                            // In the event that no slot names were recorded here, then slotNames will be
                            // the default, length 0 vector.
                            firstDvSlotNames.TryGetValue(name, out slotNames);
                            bool matches = VerifyVectorColumnsMatch(cachedSize, i, dv, type, ref slotNames);

                            // Record each mismatching column only once. With more than two views the
                            // same column can mismatch repeatedly; a duplicate entry would repeat the
                            // name in the warning and make the conversion below run a second time on
                            // a column that has already been dropped.
                            if (!matches && !variableSizeVectorColumnNames.Contains(name))
                            {
                                variableSizeVectorColumnNames.Add(name);
                            }
                        }
                        else
                        {
                            vectorSizes.Add(name, type.VectorSize);
                        }
                    }
                    else if (dvNumber == 0 && dv.Schema.HasKeyNames(i, type.KeyCount))
                    {
                        // The label column can be a key. Reconcile the key values, and wrap with a KeyToValue transform.
                        firstDvKeyColumns.Add(name);
                    }
                }

                // Strip the hidden columns of this view before it takes part in the append.
                var idv = dv;
                if (hidden.Count > 0)
                {
                    var args = new ChooseColumnsByIndexTransform.Arguments();
                    args.Drop = true;
                    args.Index = hidden.ToArray();
                    idv = new ChooseColumnsByIndexTransform(Host, args, idv);
                }
                list.Add(idv);
                dvNumber++;
            }

            // Fast path: nothing to reconcile or convert, append the views as they are.
            // NOTE(review): firstDvVectorKeyColumns is not consulted here, so views that only have
            // vector key columns take this path without reconciliation - confirm that is intended.
            if (variableSizeVectorColumnNames.Count == 0 && firstDvKeyColumns.Count == 0)
            {
                return AppendRowsDataView.Create(Host, null, list.ToArray());
            }

            var views = list.ToArray();

            foreach (var keyCol in firstDvKeyColumns)
            {
                EvaluateUtils.ReconcileKeyValues(Host, views, keyCol);
            }
            foreach (var vectorKeyCol in firstDvVectorKeyColumns)
            {
                EvaluateUtils.ReconcileVectorKeyValues(Host, views, vectorKeyCol);
            }

            // Converts every recorded key column (scalar and vector) back to its value and drops the
            // columns reported by FindHiddenColumns for that name. The index parameter is unused; it
            // only selects the indexed overload of Select below.
            Func<IDataView, int, IDataView> keyToValue =
                (idv, i) =>
                {
                    foreach (var keyCol in firstDvKeyColumns.Concat(firstDvVectorKeyColumns))
                    {
                        idv = new KeyToValueTransform(Host, new KeyToValueTransform.Arguments()
                        {
                            Column = new[] { new KeyToValueTransform.Column() { Name = keyCol }, }
                        }, idv);
                        var hidden = FindHiddenColumns(idv.Schema, keyCol);
                        idv = new ChooseColumnsByIndexTransform(Host, new ChooseColumnsByIndexTransform.Arguments()
                        {
                            Drop = true,
                            Index = hidden.ToArray()
                        }, idv);
                    }
                    return idv;
                };

            // For every mismatching vector column: run AddVarLengthColumn for the column's item
            // type, then drop the original column by name.
            Func<IDataView, IDataView> selectDropNonVarLenthCol =
                (idv) =>
                {
                    foreach (var variableSizeVectorColumnName in variableSizeVectorColumnNames)
                    {
                        // NOTE(review): the result of TryGetColumnIndex is ignored; if the column were
                        // missing from this view, index would be 0 and the wrong column type would be
                        // used - presumably every view exposes the column, verify against callers.
                        int index;
                        idv.Schema.TryGetColumnIndex(variableSizeVectorColumnName, out index);
                        var type = idv.Schema.GetColumnType(index);

                        idv = Utils.MarshalInvoke(AddVarLengthColumn<int>, type.ItemType.RawType, Host, idv,
                                                  variableSizeVectorColumnName, type);

                        // Drop the old column that does not have variable length.
                        idv = new DropColumnsTransform(Host, new DropColumnsTransform.Arguments()
                        {
                            Column = new[] { variableSizeVectorColumnName }
                        }, idv);
                    }
                    return idv;
                };

            if (variableSizeVectorColumnNames.Count > 0)
            {
                ch.Warning("Detected columns of variable length: {0}. Consider setting collateMetrics- for meaningful per-Folds results.", string.Join(", ", variableSizeVectorColumnNames));
            }
            return AppendRowsDataView.Create(Host, null, views.Select(keyToValue).Select(selectDropNonVarLenthCol).ToArray());
        }