예제 #1
0
        /// <summary>
        /// Returns the max value for the specified metadata kind.
        /// The metadata type should be a KeyType with raw type U4.
        /// colMax will be set to the first column that has the max value for the specified metadata.
        /// If no column has the specified metadata, colMax is set to -1 and the method returns zero.
        /// The filter function is called for each column, passing in the schema and the column index, and returns
        /// true if the column should be considered, false if the column should be skipped.
        /// </summary>
        public static uint GetMaxMetadataKind(this Schema schema, out int colMax, string metadataKind, Func <Schema, int, bool> filterFunc = null)
        {
            uint max = 0;

            colMax = -1;
            for (int col = 0; col < schema.Count; col++)
            {
                var columnType = schema.GetMetadataTypeOrNull(metadataKind, col);
                if (columnType == null || !columnType.IsKey || columnType.RawKind != DataKind.U4)
                {
                    continue;
                }
                if (filterFunc != null && !filterFunc(schema, col))
                {
                    continue;
                }
                uint value = 0;
                schema.GetMetadata(metadataKind, col, ref value);
                if (max < value)
                {
                    max    = value;
                    colMax = col;
                }
            }
            return(max);
        }
예제 #2
0
 public void GetMetadata <TValue>(string kind, int col, ref TValue value)
 {
     _ectx.Check(0 <= col && col < ColumnCount);
     if (IsPivot(col) && !ShouldPreserveMetadata(kind))
     {
         throw _ectx.ExceptGetMetadata();
     }
     _inputSchema.GetMetadata(kind, col, ref value);
 }
 public void GetMetadata <TValue>(string kind, int col, ref TValue value)
 {
     _ectx.CheckParam(col == 0, nameof(col));
     if (kind == MetadataUtils.Kinds.SlotNames && _hasSlotNames)
     {
         _parentSchema.GetMetadata(kind, _featureCol, ref value);
     }
     else
     {
         throw MetadataUtils.ExceptGetMetadata();
     }
 }
예제 #4
0
        /// <summary>
        /// Tries to get the metadata kind of the specified type for a column.
        /// </summary>
        /// <typeparam name="T">The raw type of the metadata, should match the PrimitiveType type</typeparam>
        /// <param name="schema">The schema</param>
        /// <param name="type">The type of the metadata</param>
        /// <param name="kind">The metadata kind</param>
        /// <param name="col">The column</param>
        /// <param name="value">The value to return, if successful</param>
        /// <returns>True if the metadata of the right type exists, false otherwise</returns>
        public static bool TryGetMetadata <T>(this Schema schema, PrimitiveType type, string kind, int col, ref T value)
        {
            Contracts.CheckValue(schema, nameof(schema));
            Contracts.CheckValue(type, nameof(type));

            var metadataType = schema.GetMetadataTypeOrNull(kind, col);

            if (!type.Equals(metadataType))
            {
                return(false);
            }
            schema.GetMetadata(kind, col, ref value);
            return(true);
        }
예제 #5
0
        /// <summary>
        /// The categoricalFeatures is a vector of the indices of categorical features slots.
        /// This vector should always have an even number of elements, and the elements should be parsed in groups of two consecutive numbers.
        /// So if its value is the range of numbers: 0,2,3,4,8,9
        /// look at it as [0,2],[3,4],[8,9].
        /// The way to interpret that is: feature with indices 0, 1, and 2 are one categorical
        /// Features with indices 3 and 4 are another categorical. Features 5 and 6 don't appear there, so they are not categoricals.
        /// </summary>
        public static bool TryGetCategoricalFeatureIndices(Schema schema, int colIndex, out int[] categoricalFeatures)
        {
            Contracts.CheckValue(schema, nameof(schema));
            Contracts.Check(colIndex >= 0, nameof(colIndex));

            bool isValid = false;

            categoricalFeatures = null;
            if (!(schema.GetColumnType(colIndex) is VectorType vecType && vecType.Size > 0))
            {
                return(isValid);
            }

            var type = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.CategoricalSlotRanges, colIndex);

            if (type?.RawType == typeof(VBuffer <int>))
            {
                VBuffer <int> catIndices = default(VBuffer <int>);
                schema.GetMetadata(MetadataUtils.Kinds.CategoricalSlotRanges, colIndex, ref catIndices);
                VBufferUtils.Densify(ref catIndices);
                int columnSlotsCount = vecType.Size;
                if (catIndices.Length > 0 && catIndices.Length % 2 == 0 && catIndices.Length <= columnSlotsCount * 2)
                {
                    int previousEndIndex = -1;
                    isValid = true;
                    var catIndicesValues = catIndices.GetValues();
                    for (int i = 0; i < catIndicesValues.Length; i += 2)
                    {
                        if (catIndicesValues[i] > catIndicesValues[i + 1] ||
                            catIndicesValues[i] <= previousEndIndex ||
                            catIndicesValues[i] >= columnSlotsCount ||
                            catIndicesValues[i + 1] >= columnSlotsCount)
                        {
                            isValid = false;
                            break;
                        }

                        previousEndIndex = catIndicesValues[i + 1];
                    }
                    if (isValid)
                    {
                        categoricalFeatures = catIndicesValues.ToArray();
                    }
                }
            }

            return(isValid);
        }
예제 #6
0
 /// <summary>
 /// Returns the set of column ids which match the value of specified metadata kind.
 /// The metadata type should be of type text.
 /// </summary>
 public static IEnumerable <int> GetColumnSet(this Schema schema, string metadataKind, string value)
 {
     for (int col = 0; col < schema.Count; col++)
     {
         var columnType = schema.GetMetadataTypeOrNull(metadataKind, col);
         if (columnType != null && columnType.IsText)
         {
             ReadOnlyMemory <char> val = default;
             schema.GetMetadata(metadataKind, col, ref val);
             if (ReadOnlyMemoryUtils.EqualsStr(value, val))
             {
                 yield return(col);
             }
         }
     }
 }
예제 #7
0
 /// <summary>
 /// Returns the set of column ids which match the value of specified metadata kind.
 /// The metadata type should be a KeyType with raw type U4.
 /// </summary>
 public static IEnumerable <int> GetColumnSet(this Schema schema, string metadataKind, uint value)
 {
     for (int col = 0; col < schema.Count; col++)
     {
         var columnType = schema.GetMetadataTypeOrNull(metadataKind, col);
         if (columnType != null && columnType.IsKey && columnType.RawKind == DataKind.U4)
         {
             uint val = 0;
             schema.GetMetadata(metadataKind, col, ref val);
             if (val == value)
             {
                 yield return(col);
             }
         }
     }
 }
예제 #8
0
        /// <summary>
        /// Gets the mapping from T into a StringBuilder representation, using various heuristics.
        /// This StringBuilder representation will be a component of the composed KeyValues for the
        /// hash outputs.
        /// </summary>
        public static ValueMapper <T, StringBuilder> GetSimpleMapper <T>(Schema schema, int col)
        {
            Contracts.AssertValue(schema);
            Contracts.Assert(0 <= col && col < schema.ColumnCount);
            var type = schema.GetColumnType(col).ItemType;

            Contracts.Assert(type.RawType == typeof(T));
            var conv = Conversion.Conversions.Instance;

            // First: if not key, then get the standard string converison.
            if (!type.IsKey)
            {
                return(conv.GetStringConversion <T>(type));
            }

            bool identity;

            // Second choice: if key, utilize the KeyValues metadata for that key, if it has one and is text.
            if (schema.HasKeyNames(col, type.KeyCount))
            {
                // REVIEW: Non-textual KeyValues are certainly possible. Should we handle them?
                // Get the key names.
                VBuffer <ReadOnlyMemory <char> > keyValues = default;
                schema.GetMetadata(MetadataUtils.Kinds.KeyValues, col, ref keyValues);
                ReadOnlyMemory <char> value = default;

                // REVIEW: We could optimize for identity, but it's probably not worthwhile.
                var keyMapper = conv.GetStandardConversion <T, uint>(type, NumberType.U4, out identity);
                return
                    ((ref T src, ref StringBuilder dst) =>
                {
                    ClearDst(ref dst);
                    uint intermediate = 0;
                    keyMapper(ref src, ref intermediate);
                    if (intermediate == 0)
                    {
                        return;
                    }
                    keyValues.GetItemOrDefault((int)(intermediate - 1), ref value);
                    dst.AppendMemory(value);
                });
            }

            // Third choice: just use the key value itself, subject to offsetting by the min.
            return(conv.GetKeyStringConversion <T>(type.AsKey));
        }
예제 #9
0
        private static void ShowMetadataValue <T>(IndentedTextWriter itw, Schema schema, int col, string kind, ColumnType type)
        {
            Contracts.AssertValue(itw);
            Contracts.AssertValue(schema);
            Contracts.Assert(0 <= col && col < schema.ColumnCount);
            Contracts.AssertNonEmpty(kind);
            Contracts.AssertValue(type);
            Contracts.Assert(!type.IsVector);
            Contracts.Assert(type.RawType == typeof(T));

            var conv = Conversions.Instance.GetStringConversion <T>(type);

            var value = default(T);
            var sb    = default(StringBuilder);

            schema.GetMetadata(kind, col, ref value);
            conv(in value, ref sb);

            itw.Write(": '{0}'", sb);
        }
예제 #10
0
        private static void ShowMetadataValueVec <T>(IndentedTextWriter itw, Schema schema, int col, string kind, ColumnType type)
        {
            Contracts.AssertValue(itw);
            Contracts.AssertValue(schema);
            Contracts.Assert(0 <= col && col < schema.ColumnCount);
            Contracts.AssertNonEmpty(kind);
            Contracts.AssertValue(type);
            Contracts.Assert(type.IsVector);
            Contracts.Assert(type.ItemType.RawType == typeof(T));

            var conv = Conversions.Instance.GetStringConversion <T>(type.ItemType);

            var value = default(VBuffer <T>);

            schema.GetMetadata(kind, col, ref value);

            itw.Write(": Length={0}, Count={0}", value.Length, value.GetValues().Length);

            using (itw.Nest())
            {
                var sb    = default(StringBuilder);
                int count = 0;
                foreach (var item in value.Items())
                {
                    if ((count % 10) == 0)
                    {
                        itw.WriteLine();
                    }
                    else
                    {
                        itw.Write(", ");
                    }
                    var val = item.Value;
                    conv(in val, ref sb);
                    itw.Write("[{0}] '{1}'", item.Key, sb);
                    count++;
                }
            }
        }
예제 #11
0
        private static void PrintSchema(TextWriter writer, Arguments args, Schema schema, ITransposeSchema tschema)
        {
            Contracts.AssertValue(writer);
            Contracts.AssertValue(args);
            Contracts.AssertValue(schema);
            Contracts.AssertValueOrNull(tschema);
#if !CORECLR
            if (args.ShowJson)
            {
                writer.WriteLine("Json Schema not supported.");
                return;
            }
#endif
            int colLim = schema.ColumnCount;

            var itw = new IndentedTextWriter(writer, "  ");
            itw.WriteLine("{0} columns:", colLim);
            using (itw.Nest())
            {
                var names = default(VBuffer <ReadOnlyMemory <char> >);
                for (int col = 0; col < colLim; col++)
                {
                    var name     = schema.GetColumnName(col);
                    var type     = schema.GetColumnType(col);
                    var slotType = tschema == null ? null : tschema.GetSlotType(col);
                    itw.WriteLine("{0}: {1}{2}", name, type, slotType == null ? "" : " (T)");

                    bool metaVals = args.ShowMetadataValues;
                    if (metaVals || args.ShowMetadataTypes)
                    {
                        ShowMetadata(itw, schema, col, metaVals);
                        continue;
                    }

                    if (!args.ShowSlots)
                    {
                        continue;
                    }
                    if (!type.IsKnownSizeVector)
                    {
                        continue;
                    }
                    ColumnType typeNames;
                    if ((typeNames = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, col)) == null)
                    {
                        continue;
                    }
                    if (typeNames.VectorSize != type.VectorSize || !typeNames.ItemType.IsText)
                    {
                        Contracts.Assert(false, "Unexpected slot names type");
                        continue;
                    }
                    schema.GetMetadata(MetadataUtils.Kinds.SlotNames, col, ref names);
                    if (names.Length != type.VectorSize)
                    {
                        Contracts.Assert(false, "Unexpected length of slot names vector");
                        continue;
                    }

                    using (itw.Nest())
                    {
                        bool verbose = args.Verbose ?? false;
                        foreach (var kvp in names.Items(all: verbose))
                        {
                            if (verbose || !kvp.Value.IsEmpty)
                            {
                                itw.WriteLine("{0}:{1}", kvp.Key, kvp.Value);
                            }
                        }
                    }
                }
            }
        }
예제 #12
0
 public void GetMetadata <TValue>(string kind, int col, ref TValue value)
 {
     Contracts.CheckNonEmpty(kind, nameof(kind));
     Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
     _input.GetMetadata(kind, Sources[col], ref value);
 }