Ejemplo n.º 1
0
 /// <summary>
 /// Creates the transformer from an existing hashing transformer and count table transformer.
 /// </summary>
 /// <param name="env">Environment on which a host named after this class is registered.</param>
 /// <param name="hashingTransformer">Hashing transformer stored on the HashingTransformer member.</param>
 /// <param name="countTable">Count table transformer stored on the CountTable member.</param>
 internal CountTargetEncodingTransformer(
     IHostEnvironment env,
     HashingTransformer hashingTransformer,
     CountTableTransformer countTable)
 {
     // The environment is asserted through the static contract helper first, because the
     // remaining assertions are issued through the environment itself.
     Contracts.AssertValue(env);
     env.AssertValue(hashingTransformer);
     env.AssertValue(countTable);

     _host = env.Register(nameof(CountTargetEncodingTransformer));

     HashingTransformer = hashingTransformer;
     CountTable = countTable;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the input row and the hashing mapper, then caches the getter for the hashed "Bar" column
        /// in either <c>_vecGetter</c> (vector input type) or <c>_getter</c> (any other type).
        /// </summary>
        /// <typeparam name="T">Type of the value exposed by the input row.</typeparam>
        /// <param name="val">Constant returned by the default getter when <paramref name="getter"/> is null.</param>
        /// <param name="type">Type passed to <c>RowImpl.Create</c>; also selects which getter field is filled.</param>
        /// <param name="numberOfBits">Bit count forwarded to the hashing column options.</param>
        /// <param name="getter">Optional source getter; when null, one that always yields <paramref name="val"/> is used.</param>
        private void InitMap<T>(T val, DataViewType type, int numberOfBits = 20, ValueGetter<T> getter = null)
        {
            // Use the caller's getter if there is one, otherwise hand out the constant value.
            ValueGetter<T> source = getter ?? ((ref T dst) => dst = val);
            _inRow = RowImpl.Create(type, source);

            // With the default of 20 bits this is 2^20, i.e. about one million.
            var options = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: numberOfBits);
            var hasher = new HashingTransformer(_env, new[] { options });
            var rowMapper = ((ITransformer)hasher).GetRowToRowMapper(_inRow.Schema);

            var barColumn = rowMapper.OutputSchema["Bar"];
            var hashedRow = rowMapper.GetRow(_inRow, barColumn);

            if (!(type is VectorType))
            {
                _getter = hashedRow.GetGetter<uint>(barColumn);
                return;
            }

            _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(barColumn);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds the input row and the hashing mapper, then caches the getter for the hashed "Bar" column
        /// in either <c>_vecGetter</c> (vector input type) or <c>_getter</c> (any other type).
        /// </summary>
        /// <typeparam name="T">Type of the value exposed by the input row.</typeparam>
        /// <param name="val">Constant returned by the default getter when <paramref name="getter"/> is null.</param>
        /// <param name="type">Type passed to <c>RowImpl.Create</c>; also selects which getter field is filled.</param>
        /// <param name="hashBits">Bit count forwarded to the hashing column info.</param>
        /// <param name="getter">Optional source getter; when null, one that always yields <paramref name="val"/> is used.</param>
        private void InitMap<T>(T val, ColumnType type, int hashBits = 20, ValueGetter<T> getter = null)
        {
            // Use the caller's getter if there is one, otherwise hand out the constant value.
            ValueGetter<T> source = getter ?? ((ref T dst) => dst = val);
            _inRow = RowImpl.Create(type, source);

            // With the default of 20 bits this is 2^20, i.e. about one million.
            var columnInfo = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBits);
            var hasher = new HashingTransformer(_env, new[] { columnInfo });
            var rowMapper = hasher.GetRowToRowMapper(_inRow.Schema);

            // Only the "Bar" output column is activated on the mapped row.
            var barIndex = rowMapper.OutputSchema["Bar"].Index;
            var hashedRow = rowMapper.GetRow(_inRow, c => c == barIndex);

            if (!(type is VectorType))
            {
                _getter = hashedRow.GetGetter<uint>(barIndex);
                return;
            }

            _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(barIndex);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Wraps <paramref name="val"/> in a single-column row named "Foo", runs it through a hashing mapper,
        /// and caches the getter for the "Bar" output in <c>_vecGetter</c> (vector type) or <c>_getter</c>.
        /// </summary>
        /// <typeparam name="T">Type of the value placed in the "Foo" column.</typeparam>
        /// <param name="val">Value exposed by the "Foo" column.</param>
        /// <param name="type">Column type for "Foo"; also selects which getter field is filled.</param>
        /// <param name="hashBits">Bit count forwarded to the hashing column info.</param>
        private void InitMap<T>(T val, ColumnType type, int hashBits = 20)
        {
            var fooColumn = RowColumnUtils.GetColumn("Foo", type, ref val);

            _counted = new Counted();
            var sourceRow = RowColumnUtils.GetRow(_counted, fooColumn);

            // With the default of 20 bits this is 2^20, i.e. about one million.
            var columnInfo = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBits);
            var hasher = new HashingTransformer(_env, new[] { columnInfo });
            var rowMapper = hasher.GetRowToRowMapper(sourceRow.Schema);

            // NOTE(review): the boolean result of TryGetColumnIndex is ignored here, as in the
            // original; confirm that "Bar" is always present in the output schema.
            rowMapper.OutputSchema.TryGetColumnIndex("Bar", out int barIndex);

            // Only the "Bar" column is activated; the third out argument is discarded.
            var hashedRow = rowMapper.GetRow(sourceRow, c => c == barIndex, out var _);

            if (!(type is VectorType))
            {
                _getter = hashedRow.GetGetter<uint>(barIndex);
                return;
            }

            _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(barIndex);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs <paramref name="val"/> through <c>HashingTransformer</c> as a scalar, as a dense vector and as a
        /// sparse vector, and asserts the produced hashes against the expected values supplied by the caller.
        /// </summary>
        /// <typeparam name="T">Type of the value being hashed.</typeparam>
        /// <param name="val">Value to hash.</param>
        /// <param name="type">Primitive type describing <paramref name="val"/>.</param>
        /// <param name="expected">Expected hash for the calls that omit or disable ordered hashing.</param>
        /// <param name="expectedOrdered">Expected hash with ordered hashing for the scalar and for vector slot 0.</param>
        /// <param name="expectedOrdered3">Expected hash with ordered hashing for vector slot 3.</param>
        /// <param name="expectedCombined">Expected single hash of the dense vector with <c>combine: true</c>.</param>
        /// <param name="expectedCombinedSparse">Expected single hash of the sparse vector with <c>combine: true</c>.</param>
        private void HashTestCore<T>(T val, PrimitiveDataViewType type, uint expected, uint expectedOrdered, uint expectedOrdered3, uint expectedCombined, uint expectedCombinedSparse)
        {
            const int hashBitCount = 10;
            const int vecLen = 5;

            // Result holders are reused across all phases, as the getters write into them by reference.
            uint scalarHash = 0;
            VBuffer<uint> vectorHash = default;

            // Phase input #1: a row whose "Foo" entry is the scalar value.
            var annotations = new DataViewSchema.Annotations.Builder();
            annotations.AddPrimitiveValue("Foo", type, val);
            var sourceRow = AnnotationUtils.AnnotationsAsRow(annotations.ToAnnotations());

            // Builds a transformer for the given options and returns the getter of its "Bar" output.
            // It reads sourceRow at call time, so it must be invoked after sourceRow has been
            // pointed at the row the phase wants to hash.
            ValueGetter<TOut> MakeGetter<TOut>(HashingEstimator.ColumnOptions options)
            {
                var transformer = new HashingTransformer(Env, new[] { options });
                var rowMapper = ((ITransformer)transformer).GetRowToRowMapper(sourceRow.Schema);
                var barColumn = rowMapper.OutputSchema["Bar"];
                var hashedRow = rowMapper.GetRow(sourceRow, barColumn);

                return hashedRow.GetGetter<TOut>(barColumn);
            }

            // Scalar, ordering argument omitted.
            var scalarGetter = MakeGetter<uint>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount));
            scalarGetter(ref scalarHash);
            Assert.Equal(expected, scalarHash);

            // Scalar, ordered hashing.
            scalarGetter = MakeGetter<uint>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, useOrderedHashing: true));
            scalarGetter(ref scalarHash);
            Assert.Equal(expectedOrdered, scalarHash);

            // Phase input #2: a dense vector filled with the same value, to check that vector hashing
            // agrees with scalar hashing (every slot when unordered, slot 0 when ordered).
            var denseVec = new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val));

            annotations = new DataViewSchema.Annotations.Builder();
            annotations.Add("Foo", new VectorDataViewType(type, vecLen), (ref VBuffer<T> dst) => denseVec.CopyTo(ref dst));
            sourceRow = AnnotationUtils.AnnotationsAsRow(annotations.ToAnnotations());

            // Dense vector, unordered: every slot must match the unordered scalar hash.
            var vectorGetter = MakeGetter<VBuffer<uint>>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, useOrderedHashing: false));
            vectorGetter(ref vectorHash);

            Assert.Equal(vecLen, vectorHash.Length);
            Assert.All(vectorHash.DenseValues(), v => Assert.Equal(expected, v));

            // Dense vector, ordered.
            vectorGetter = MakeGetter<VBuffer<uint>>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, useOrderedHashing: true));
            vectorGetter(ref vectorHash);

            Assert.Equal(vecLen, vectorHash.Length);
            Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));
            // Each slot is zero exactly when the ordered scalar expectation is zero.
            Assert.All(vectorHash.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

            // Dense vector combined into a single hash.
            scalarGetter = MakeGetter<uint>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, combine: true));
            scalarGetter(ref scalarHash);
            Assert.Equal(expectedCombined, scalarHash);

            // Phase input #3: a sparse vector of length 10 with the value at indices 0, 3 and 7.
            var sparseVec = new VBuffer<T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });

            // NOTE(review): the declared vector type uses vecLen (5) while the buffer above has
            // length 10 - confirm this mismatch is intentional.
            annotations = new DataViewSchema.Annotations.Builder();
            annotations.Add("Foo", new VectorDataViewType(type, vecLen), (ref VBuffer<T> dst) => sparseVec.CopyTo(ref dst));
            sourceRow = AnnotationUtils.AnnotationsAsRow(annotations.ToAnnotations());

            // Sparse vector, unordered.
            vectorGetter = MakeGetter<VBuffer<uint>>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, useOrderedHashing: false));
            vectorGetter(ref vectorHash);

            Assert.Equal(10, vectorHash.Length);
            Assert.Equal(expected, vectorHash.GetItemOrDefault(0));
            Assert.Equal(expected, vectorHash.GetItemOrDefault(3));
            Assert.Equal(expected, vectorHash.GetItemOrDefault(7));

            // Sparse vector, ordered.
            vectorGetter = MakeGetter<VBuffer<uint>>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, useOrderedHashing: true));
            vectorGetter(ref vectorHash);

            Assert.Equal(10, vectorHash.Length);
            Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));

            // Sparse vector combined into a single hash.
            scalarGetter = MakeGetter<uint>(
                new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: hashBitCount, combine: true));
            scalarGetter(ref scalarHash);
            Assert.Equal(expectedCombinedSparse, scalarHash);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Runs <paramref name="val"/> through <c>HashingTransformer</c> as a scalar, as a dense vector and as a
        /// sparse vector, and asserts the produced hashes against the expected values supplied by the caller.
        /// </summary>
        /// <typeparam name="T">Type of the value being hashed.</typeparam>
        /// <param name="val">Value to hash.</param>
        /// <param name="type">Primitive type describing <paramref name="val"/>.</param>
        /// <param name="expected">Expected hash for the calls that omit or disable the ordered option.</param>
        /// <param name="expectedOrdered">Expected hash with the ordered option for the scalar and for vector slot 0.</param>
        /// <param name="expectedOrdered3">Expected hash with the ordered option for vector slot 3.</param>
        private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
        {
            const int hashBitCount = 10;
            const int vecLen = 5;

            // Result holders are reused across all phases, as the getters write into them by reference.
            uint scalarHash = 0;
            VBuffer<uint> vectorHash = default;

            // ---- Scalar input -------------------------------------------------------------------
            var scalarColumn = RowColumnUtils.GetColumn("Foo", type, ref val);
            var scalarRow = RowColumnUtils.GetRow(new Counted(), scalarColumn);

            // Scalar, ordered argument omitted.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(scalarRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(scalarRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<uint>(barIndex)(ref scalarHash);
                Assert.Equal(expected, scalarHash);
            }

            // Scalar, ordered.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount, ordered: true);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(scalarRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(scalarRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<uint>(barIndex)(ref scalarHash);
                Assert.Equal(expectedOrdered, scalarHash);
            }

            // ---- Dense vector input -------------------------------------------------------------
            // A vector filled with the same value, to check that vector hashing agrees with scalar
            // hashing (every slot when unordered, slot 0 when ordered).
            var denseVec = new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val));
            var denseColumn = RowColumnUtils.GetColumn("Foo", new VectorType(type, vecLen), ref denseVec);
            var denseRow = RowColumnUtils.GetRow(new Counted(), denseColumn);

            // Dense, unordered: every slot must match the unordered scalar hash.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount, ordered: false);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(denseRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(denseRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<VBuffer<uint>>(barIndex)(ref vectorHash);

                Assert.Equal(vecLen, vectorHash.Length);
                Assert.All(vectorHash.DenseValues(), v => Assert.Equal(expected, v));
            }

            // Dense, ordered.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount, ordered: true);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(denseRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(denseRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<VBuffer<uint>>(barIndex)(ref vectorHash);

                Assert.Equal(vecLen, vectorHash.Length);
                Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
                Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));
                // Each slot is zero exactly when the ordered scalar expectation is zero.
                Assert.All(vectorHash.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));
            }

            // ---- Sparse vector input ------------------------------------------------------------
            // Length 10 with the value at indices 0, 3 and 7.
            var sparseVec = new VBuffer<T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });

            // NOTE(review): the declared vector type uses vecLen (5) while the buffer above has
            // length 10 - confirm this mismatch is intentional.
            var sparseColumn = RowColumnUtils.GetColumn("Foo", new VectorType(type, vecLen), ref sparseVec);
            var sparseRow = RowColumnUtils.GetRow(new Counted(), sparseColumn);

            // Sparse, unordered.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount, ordered: false);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(sparseRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(sparseRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<VBuffer<uint>>(barIndex)(ref vectorHash);

                Assert.Equal(10, vectorHash.Length);
                Assert.Equal(expected, vectorHash.GetItemOrDefault(0));
                Assert.Equal(expected, vectorHash.GetItemOrDefault(3));
                Assert.Equal(expected, vectorHash.GetItemOrDefault(7));
            }

            // Sparse, ordered.
            {
                var options = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBitCount, ordered: true);
                var rowMapper = new HashingTransformer(Env, new[] { options }).GetRowToRowMapper(sparseRow.Schema);
                rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
                var hashedRow = rowMapper.GetRow(sparseRow, c => c == barIndex, out var _);

                hashedRow.GetGetter<VBuffer<uint>>(barIndex)(ref vectorHash);

                Assert.Equal(10, vectorHash.Length);
                Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
                Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));
            }
        }
        /// <summary>
        /// Builds the chain of transforms that turns the source columns named in <paramref name="args"/> into
        /// n-gram hash columns: an optional value-to-key (term) mapping, a hashing transform written to temporary
        /// columns, an n-gram hashing transform over those temporaries, and finally a drop of the temporaries.
        /// </summary>
        /// <param name="env">Host environment; checked for null and used to register the host.</param>
        /// <param name="args">Extractor arguments; checked for null and must specify at least one column.</param>
        /// <param name="input">Data view the chain is applied to; checked for null.</param>
        /// <param name="termLoaderArgs">Optional term-loader settings. When non-null, a value-to-key mapping
        /// (and, if its <c>DropUnknowns</c> is set, a missing-value drop) is inserted before hashing.</param>
        /// <returns>The result of <c>ColumnSelectingTransformer.CreateDrop</c> applied to the final view,
        /// dropping every temporary column created here.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input,
                                            TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            // To each input column to the NgramHashExtractorArguments, a HashTransform using 31
            // bits (to minimize collisions) is applied first, followed by an NgramHashTransform.
            IDataView view = input;

            // termCols is only allocated when a term mapping was requested; it stays null otherwise.
            List <ValueToKeyMappingTransformer.Column> termCols = null;

            if (termLoaderArgs != null)
            {
                termCols = new List <ValueToKeyMappingTransformer.Column>();
            }
            // One hash column per (output column, source) pair; one n-gram hash column per output column.
            var hashColumns      = new List <HashingTransformer.Column>();
            var ngramHashColumns = new NgramHashingTransformer.Column[args.Column.Length];

            var colCount = args.Column.Length;

            // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source
            // column name when a 'name' destination column name was specified, we use temporary column names.
            // tmpColNames[i][j] is the temporary name for source j of output column i.
            string[][] tmpColNames = new string[colCount][];
            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = args.Column[iinfo];
                // Every column needs a non-blank name and at least one non-blank source.
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0 &&
                               column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                int srcCount = column.Source.Length;
                tmpColNames[iinfo] = new string[srcCount];
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]);
                    tmpColNames[iinfo][isrc] = tmpName;
                    if (termLoaderArgs != null)
                    {
                        // The term mapping reads the original source and writes the temporary column.
                        termCols.Add(
                            new ValueToKeyMappingTransformer.Column
                        {
                            Name   = tmpName,
                            Source = column.Source[isrc]
                        });
                    }

                    // Hashing always writes the temporary column. It reads the original source, unless
                    // a term mapping has already produced the temporary, in which case it hashes that
                    // temporary in place.
                    hashColumns.Add(
                        new HashingTransformer.Column
                    {
                        Name       = tmpName,
                        Source     = termLoaderArgs == null ? column.Source[isrc] : tmpName,
                        // NOTE(review): this per-column value is 30, while the comment above and the
                        // shared HashingTransformer.Arguments below both say 31 - confirm which is intended.
                        HashBits   = 30,
                        Seed       = column.Seed,
                        Ordered    = false,
                        InvertHash = column.InvertHash
                    });
                }

                // The n-gram hash for this output column consumes all of its temporaries.
                ngramHashColumns[iinfo] =
                    new NgramHashingTransformer.Column
                {
                    Name           = column.Name,
                    Source         = tmpColNames[iinfo],
                    AllLengths     = column.AllLengths,
                    HashBits       = column.HashBits,
                    NgramLength    = column.NgramLength,
                    RehashUnigrams = false,
                    Seed           = column.Seed,
                    SkipLength     = column.SkipLength,
                    Ordered        = column.Ordered,
                    InvertHash     = column.InvertHash,
                    // REVIEW: This is an ugly internal hack to get around
                    // the problem that we want the *original* source names surfacing
                    // in the descriptions where appropriate, rather than _tmp000 and
                    // what have you. The alternative is we do something elaborate
                    // with metadata or something but I'm not sure that's better.
                    FriendlyNames = column.FriendlyNames
                };
            }

            if (termLoaderArgs != null)
            {
                // The loop above adds one term column for every hash column.
                h.Assert(Utils.Size(termCols) == hashColumns.Count);
                var termArgs =
                    new ValueToKeyMappingTransformer.Arguments()
                {
                    MaxNumTerms = int.MaxValue,
                    Terms       = termLoaderArgs.Terms,
                    Term        = termLoaderArgs.Term,
                    DataFile    = termLoaderArgs.DataFile,
                    Loader      = termLoaderArgs.Loader,
                    TermsColumn = termLoaderArgs.TermsColumn,
                    Sort        = termLoaderArgs.Sort,
                    Column      = termCols.ToArray()
                };
                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);

                if (termLoaderArgs.DropUnknowns)
                {
                    // Apply the missing-value drop in place (Name == Source) on every term column.
                    var naDropArgs = new MissingValueDroppingTransformer.Arguments {
                        Column = new MissingValueDroppingTransformer.Column[termCols.Count]
                    };
                    for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
                    {
                        naDropArgs.Column[iinfo] =
                            new MissingValueDroppingTransformer.Column {
                            Name = termCols[iinfo].Name, Source = termCols[iinfo].Name
                        };
                    }
                    view = new MissingValueDroppingTransformer(h, naDropArgs, view);
                }
            }

            // Args for the Hash function with multiple columns
            var hashArgs =
                new HashingTransformer.Arguments
            {
                HashBits   = 31,
                Seed       = args.Seed,
                Ordered    = false,
                Column     = hashColumns.ToArray(),
                InvertHash = args.InvertHash
            };

            view = HashingTransformer.Create(h, hashArgs, view);

            // creating the NgramHash function
            var ngramHashArgs =
                new NgramHashingTransformer.Arguments
            {
                AllLengths     = args.AllLengths,
                HashBits       = args.HashBits,
                NgramLength    = args.NgramLength,
                SkipLength     = args.SkipLength,
                RehashUnigrams = false,
                Ordered        = args.Ordered,
                Seed           = args.Seed,
                Column         = ngramHashColumns,
                InvertHash     = args.InvertHash
            };

            view = new NgramHashingTransformer(h, ngramHashArgs, view);
            // Remove every temporary column so only the requested output columns are added to the input.
            return(ColumnSelectingTransformer.CreateDrop(h, view, tmpColNames.SelectMany(cols => cols).ToArray()));
        }