/// <summary>
/// Shuffling may hand items back out of order, so every value that is read has to be
/// saved and later copied out again. Only primitive and vector valued columns can be
/// saved and copied that way; any column whose type is not cachable is dropped here.
/// </summary>
/// <param name="data">The data view whose columns are filtered.</param>
/// <param name="env">The host environment handed to the column-dropping transform.</param>
/// <returns>
/// <paramref name="data"/> itself when every column is cachable; otherwise a
/// <c>ChooseColumnsByIndexTransform</c> over it that drops the non-cachable columns.
/// </returns>
private static IDataView SelectCachableColumns(IDataView data, IHostEnvironment env)
{
    var schema = data.Schema;

    // Stays null until the first non-cachable column is found (Utils.Add creates it on demand).
    List<int> uncachable = null;
    for (int col = 0; col < schema.ColumnCount; ++col)
    {
        if (schema.GetColumnType(col).IsCachable())
        {
            continue;
        }

        Utils.Add(ref uncachable, col);
    }

    // Nothing to drop: return the input unchanged.
    if (Utils.Size(uncachable) == 0)
    {
        return data;
    }

    var dropArgs = new ChooseColumnsByIndexTransform.Arguments
    {
        Drop = true,
        Index = uncachable.ToArray()
    };
    return new ChooseColumnsByIndexTransform(env, dropArgs, data);
}
/// <summary>
/// Appends the per-instance data views of all folds into a single data view.
/// </summary>
/// <remarks>
/// For every fold view the hidden columns are dropped. While scanning the schemas this method
/// records, from the first view only, the key-typed columns (scalar and vector) and the slot
/// names of vector columns. Every later occurrence of a vector column is checked with
/// <c>VerifyVectorColumnsMatch</c> against the size cached from the first view in which that
/// column name appeared; columns that fail the check are replaced through
/// <c>AddVarLengthColumn</c> and the original column is dropped. Key columns are reconciled
/// across views and wrapped with a <c>KeyToValueTransform</c>.
/// </remarks>
/// <param name="foldDataViews">The per-fold data views to append, in enumeration order.</param>
/// <param name="ch">Channel that receives a warning when mismatching vector columns are detected.</param>
/// <returns>The result of <c>AppendRowsDataView.Create</c> over the (possibly transformed) fold views.</returns>
private IDataView AppendPerInstanceDataViews(IEnumerable<IDataView> foldDataViews, IChannel ch)
{
    // Make sure there are no variable size vector columns.
    // This is a dictionary from the column name to its vector size.
    var vectorSizes = new Dictionary<string, int>();
    var firstDvSlotNames = new Dictionary<string, VBuffer<DvText>>();
    var firstDvKeyColumns = new List<string>();
    var firstDvVectorKeyColumns = new List<string>();
    var variableSizeVectorColumnNames = new List<string>();
    var list = new List<IDataView>();
    int dvNumber = 0;
    foreach (var dv in foldDataViews)
    {
        var hidden = new List<int>();
        for (int i = 0; i < dv.Schema.ColumnCount; i++)
        {
            if (dv.Schema.IsHidden(i))
            {
                hidden.Add(i);
                continue;
            }

            var type = dv.Schema.GetColumnType(i);
            var name = dv.Schema.GetColumnName(i);
            if (type.IsVector)
            {
                if (dvNumber == 0)
                {
                    if (dv.Schema.HasKeyNames(i, type.ItemType.KeyCount))
                    {
                        firstDvVectorKeyColumns.Add(name);
                    }

                    // Store the slot names of the 1st idv and use them as baseline.
                    if (dv.Schema.HasSlotNames(i, type.VectorSize))
                    {
                        VBuffer<DvText> slotNames = default(VBuffer<DvText>);
                        dv.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, i, ref slotNames);
                        firstDvSlotNames.Add(name, slotNames);
                    }
                }

                int cachedSize;
                if (vectorSizes.TryGetValue(name, out cachedSize))
                {
                    VBuffer<DvText> slotNames;
                    // In the event that no slot names were recorded here, then slotNames will be
                    // the default, length 0 vector.
                    firstDvSlotNames.TryGetValue(name, out slotNames);

                    // Record each mismatching column only once. Several folds can mismatch on the
                    // same column, and a duplicate entry would make the conversion below look up a
                    // column that was already dropped (and repeat the name in the warning).
                    if (!VerifyVectorColumnsMatch(cachedSize, i, dv, type, ref slotNames)
                        && !variableSizeVectorColumnNames.Contains(name))
                    {
                        variableSizeVectorColumnNames.Add(name);
                    }
                }
                else
                {
                    vectorSizes.Add(name, type.VectorSize);
                }
            }
            else if (dvNumber == 0 && dv.Schema.HasKeyNames(i, type.KeyCount))
            {
                // The label column can be a key. Reconcile the key values, and wrap with a KeyToValue transform.
                firstDvKeyColumns.Add(name);
            }
        }

        // Drop the hidden columns of this view, if any, before it is appended.
        var idv = dv;
        if (hidden.Count > 0)
        {
            var args = new ChooseColumnsByIndexTransform.Arguments();
            args.Drop = true;
            args.Index = hidden.ToArray();
            idv = new ChooseColumnsByIndexTransform(Host, args, idv);
        }

        list.Add(idv);
        dvNumber++;
    }

    // Fast path: nothing to convert or reconcile, append the views as they are.
    // NOTE(review): firstDvVectorKeyColumns is not consulted here, so vector key columns on their
    // own do not trigger the reconciliation below - confirm this is intended.
    if (variableSizeVectorColumnNames.Count == 0 && firstDvKeyColumns.Count == 0)
    {
        return AppendRowsDataView.Create(Host, null, list.ToArray());
    }

    var views = list.ToArray();
    foreach (var keyCol in firstDvKeyColumns)
    {
        EvaluateUtils.ReconcileKeyValues(Host, views, keyCol);
    }

    foreach (var vectorKeyCol in firstDvVectorKeyColumns)
    {
        EvaluateUtils.ReconcileVectorKeyValues(Host, views, vectorKeyCol);
    }

    // Wraps every recorded key column with a KeyToValueTransform and then drops the columns that
    // FindHiddenColumns reports for that name. The int parameter is the position supplied by
    // Select and is not used.
    Func<IDataView, int, IDataView> keyToValue =
        (idv, i) =>
        {
            foreach (var keyCol in firstDvKeyColumns.Concat(firstDvVectorKeyColumns))
            {
                idv = new KeyToValueTransform(
                    Host,
                    new KeyToValueTransform.Arguments()
                    {
                        Column = new[] { new KeyToValueTransform.Column() { Name = keyCol }, }
                    },
                    idv);
                var hidden = FindHiddenColumns(idv.Schema, keyCol);
                idv = new ChooseColumnsByIndexTransform(
                    Host,
                    new ChooseColumnsByIndexTransform.Arguments() { Drop = true, Index = hidden.ToArray() },
                    idv);
            }

            return idv;
        };

    // Replaces every mismatching vector column through AddVarLengthColumn and drops the original.
    Func<IDataView, IDataView> selectDropNonVarLenthCol =
        (idv) =>
        {
            foreach (var variableSizeVectorColumnName in variableSizeVectorColumnNames)
            {
                int index;
                idv.Schema.TryGetColumnIndex(variableSizeVectorColumnName, out index);
                var type = idv.Schema.GetColumnType(index);

                idv = Utils.MarshalInvoke(AddVarLengthColumn<int>, type.ItemType.RawType, Host, idv,
                    variableSizeVectorColumnName, type);

                // Drop the old column that does not have variable length.
                idv = new DropColumnsTransform(
                    Host,
                    new DropColumnsTransform.Arguments() { Column = new[] { variableSizeVectorColumnName } },
                    idv);
            }

            return idv;
        };

    if (variableSizeVectorColumnNames.Count > 0)
    {
        ch.Warning("Detected columns of variable length: {0}. Consider setting collateMetrics- for meaningful per-Folds results.", string.Join(", ", variableSizeVectorColumnNames));
    }

    return AppendRowsDataView.Create(Host, null, views.Select(keyToValue).Select(selectDropNonVarLenthCol).ToArray());
}