コード例 #1
0
        /// <summary>
        /// Puts <paramref name="data"/> into a single column named "F1", runs it through a
        /// <c>HashTransformer</c> configured with 5 hash bits and seed 42, and asserts that
        /// the value read from output column index 1 matches <paramref name="results"/> row by row.
        /// </summary>
        /// <typeparam name="T">Element type of the input values.</typeparam>
        /// <param name="data">Values used to populate column "F1".</param>
        /// <param name="results">Expected hash value for each row, in row order.</param>
        /// <param name="type">Column type handed to the builder for "F1".</param>
        private void TestHashTransformHelper<T>(T[] data, uint[] results, NumberType type)
        {
            // Source view with the single input column.
            var viewBuilder = new ArrayDataViewBuilder(Env);
            viewBuilder.AddColumn("F1", type, data);
            var source = viewBuilder.GetDataView();

            // One hashed column, fixed bit count and seed so expected values are stable.
            var hashColumn = new HashTransformer.Column
            {
                Name = "F1",
                HashBits = 5,
                Seed = 42
            };
            var hashArgs = new HashTransformer.Arguments
            {
                Column = new HashTransformer.Column[] { hashColumn }
            };

            var transform = HashTransformer.Create(Env, hashArgs, source);

            using (var cursor = transform.GetRowCursor(c => true))
            {
                // NOTE(review): index 1 is presumably the hashed output column - confirm against the schema.
                var getter = cursor.GetGetter<uint>(1);
                uint actual = 0;
                for (int row = 0; row < results.Length; row++)
                {
                    // A row must exist for every expected value.
                    Assert.True(cursor.MoveNext());
                    getter(ref actual);
                    Assert.True(actual == results[row]);
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Takes the data view from <paramref name="builder"/>, hashes the column named "F1V"
        /// with 5 hash bits and seed 42, and asserts that the vector read from output column
        /// index 1 matches <paramref name="results"/> in length and in every element, row by row.
        /// </summary>
        /// <param name="builder">Builder that already holds the input column(s).</param>
        /// <param name="results">Expected hash values: one array per row, one entry per vector slot.</param>
        private void TestHashTransformVectorHelper(ArrayDataViewBuilder builder, uint[][] results)
        {
            var hashColumn = new HashTransformer.Column
            {
                Name = "F1V",
                HashBits = 5,
                Seed = 42
            };
            var hashArgs = new HashTransformer.Arguments
            {
                Column = new HashTransformer.Column[] { hashColumn }
            };

            var source = builder.GetDataView();
            var transform = HashTransformer.Create(Env, hashArgs, source);

            using (var cursor = transform.GetRowCursor(c => true))
            {
                // NOTE(review): index 1 is presumably the hashed output column - confirm against the schema.
                var getter = cursor.GetGetter<VBuffer<uint>>(1);
                VBuffer<uint> actual = new VBuffer<uint>();
                for (int row = 0; row < results.Length; row++)
                {
                    uint[] expected = results[row];

                    // A row must exist for every expected vector.
                    Assert.True(cursor.MoveNext());
                    getter(ref actual);

                    Assert.True(actual.Length == expected.Length);
                    for (int slot = 0; slot < expected.Length; slot++)
                    {
                        Assert.True(actual.GetItemOrDefault(slot) == expected[slot]);
                    }
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Validates the arguments, translates every requested column into an equivalent
        /// <c>HashTransformer.Column</c>, applies the hash transform to <paramref name="input"/>,
        /// and passes the hashed view on to <c>CreateTransformCore</c>.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="args">Transform arguments; at least one column is required and
        /// <c>HashBits</c> must be at least 1 and below <c>NumBitsLim</c>.</param>
        /// <param name="input">Data view to transform; must not be null.</param>
        /// <returns>The transform produced by <c>CreateTransformCore</c>.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("CategoricalHash");

            // The channel is held open for the duration of the construction; it is not
            // otherwise used inside this block.
            using (var channel = host.Start("CategoricalHash"))
            {
                host.CheckValue(args, nameof(args));
                host.CheckValue(input, nameof(input));
                host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

                bool bitsOutOfRange = args.HashBits < 1 || args.HashBits >= NumBitsLim;
                if (bitsOutOfRange)
                {
                    throw host.ExceptUserArg(nameof(args.HashBits), "Number of bits must be between 1 and {0}", NumBitsLim - 1);
                }

                // Arguments for the underlying hash transform: shared settings first,
                // per-column settings filled in below.
                int columnCount = args.Column.Length;
                var hashColumns = new HashTransformer.Column[columnCount];
                var hashArgs = new HashTransformer.Arguments
                {
                    HashBits = args.HashBits,
                    Seed = args.Seed,
                    Ordered = args.Ordered,
                    InvertHash = args.InvertHash,
                    Column = hashColumns
                };

                for (int idx = 0; idx < columnCount; idx++)
                {
                    var requested = args.Column[idx];
                    if (!requested.TrySanitize())
                    {
                        throw host.ExceptUserArg(nameof(Column.Name));
                    }

                    // After a successful sanitize both names are expected to be populated.
                    host.Assert(!string.IsNullOrWhiteSpace(requested.Name));
                    host.Assert(!string.IsNullOrWhiteSpace(requested.Source));

                    hashColumns[idx] = new HashTransformer.Column
                    {
                        HashBits = requested.HashBits,
                        Seed = requested.Seed,
                        Ordered = requested.Ordered,
                        Name = requested.Name,
                        Source = requested.Source,
                        InvertHash = requested.InvertHash,
                    };
                }

                var perColumnKinds = args.Column.Select(c => c.OutputKind).ToList();
                var hashedView = HashTransformer.Create(host, hashArgs, input);

                return CreateTransformCore(
                    args.OutputKind,
                    args.Column,
                    perColumnKinds,
                    hashedView,
                    host,
                    args);
            }
        }
コード例 #4
0
        /// <summary>
        /// Builds the n-gram hash extraction pipeline over <paramref name="input"/>:
        /// an optional <c>TermTransform</c> (followed by an <c>NADropTransform</c> when
        /// <c>DropUnknowns</c> is set), then a <c>HashTransformer</c> over temporary columns,
        /// then an <c>NgramHashTransform</c>, and finally a <c>DropColumnsTransform</c> that
        /// removes the temporary columns.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="args">Extractor arguments; at least one column is required, and every
        /// column needs a non-blank name and at least one non-blank source.</param>
        /// <param name="input">Data view to transform; must not be null.</param>
        /// <param name="termLoaderArgs">When non-null, source columns are first mapped through a
        /// <c>TermTransform</c> configured from these arguments.</param>
        /// <returns>The view with the temporary columns dropped.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input,
                                            TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(LoaderSignature);

            host.CheckValue(args, nameof(args));
            host.CheckValue(input, nameof(input));
            host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            bool useTermLoader = termLoaderArgs != null;
            int columnCount = args.Column.Length;

            // Every source column is hashed first and the n-gram hash is applied to the hashed columns.
            // NOTE(review): the original comment here said the first hash uses 31 bits; the per-column
            // entries below set HashBits = 30 while the shared hash arguments set 31 - confirm which
            // value is intended to take effect.
            IDataView current = input;

            List<TermTransform.Column> termColumns = useTermLoader ? new List<TermTransform.Column>() : null;
            var hashColumns = new List<HashTransformer.Column>();
            var ngramColumns = new NgramHashTransform.Column[columnCount];

            // The extractor maps many source columns to one output column. Temporary column names
            // keep the intermediate results from clobbering a source column when the destination
            // name coincides with it.
            var tempNames = new string[columnCount][];
            for (int colIndex = 0; colIndex < columnCount; colIndex++)
            {
                var requested = args.Column[colIndex];
                host.CheckUserArg(!string.IsNullOrWhiteSpace(requested.Name), nameof(requested.Name));
                host.CheckUserArg(Utils.Size(requested.Source) > 0 &&
                                  requested.Source.All(name => !string.IsNullOrWhiteSpace(name)), nameof(requested.Source));

                int sourceCount = requested.Source.Length;
                var columnTempNames = new string[sourceCount];
                tempNames[colIndex] = columnTempNames;

                for (int srcIndex = 0; srcIndex < sourceCount; srcIndex++)
                {
                    var tempName = input.Schema.GetTempColumnName(requested.Source[srcIndex]);
                    columnTempNames[srcIndex] = tempName;

                    if (useTermLoader)
                    {
                        // Term mapping writes into the temporary column ...
                        termColumns.Add(new TermTransform.Column
                        {
                            Name = tempName,
                            Source = requested.Source[srcIndex]
                        });
                    }

                    // ... and hashing then either rewrites that temporary column in place, or
                    // reads directly from the original source when no term mapping is used.
                    hashColumns.Add(new HashTransformer.Column
                    {
                        Name = tempName,
                        Source = useTermLoader ? tempName : requested.Source[srcIndex],
                        HashBits = 30,
                        Seed = requested.Seed,
                        Ordered = false,
                        InvertHash = requested.InvertHash
                    });
                }

                ngramColumns[colIndex] = new NgramHashTransform.Column
                {
                    Name = requested.Name,
                    Source = columnTempNames,
                    AllLengths = requested.AllLengths,
                    HashBits = requested.HashBits,
                    NgramLength = requested.NgramLength,
                    RehashUnigrams = false,
                    Seed = requested.Seed,
                    SkipLength = requested.SkipLength,
                    Ordered = requested.Ordered,
                    InvertHash = requested.InvertHash,
                    // REVIEW: internal workaround so that descriptions surface the *original*
                    // source names instead of the generated temporary ones. Doing this through
                    // metadata would be more elaborate and not obviously better.
                    FriendlyNames = requested.FriendlyNames
                };
            }

            if (useTermLoader)
            {
                // One term column was added per hash column above.
                host.Assert(Utils.Size(termColumns) == hashColumns.Count);

                var termArgs = new TermTransform.Arguments()
                {
                    MaxNumTerms = int.MaxValue,
                    Terms = termLoaderArgs.Terms,
                    Term = termLoaderArgs.Term,
                    DataFile = termLoaderArgs.DataFile,
                    Loader = termLoaderArgs.Loader,
                    TermsColumn = termLoaderArgs.TermsColumn,
                    Sort = termLoaderArgs.Sort,
                    Column = termColumns.ToArray()
                };
                current = TermTransform.Create(host, termArgs, current);

                if (termLoaderArgs.DropUnknowns)
                {
                    // NA-drop each term column in place (same name for source and destination).
                    var naDropArgs = new NADropTransform.Arguments
                    {
                        Column = termColumns
                            .Select(tc => new NADropTransform.Column { Name = tc.Name, Source = tc.Name })
                            .ToArray()
                    };
                    current = new NADropTransform(host, naDropArgs, current);
                }
            }

            // Hash all temporary columns in a single transform.
            var hashArgs = new HashTransformer.Arguments
            {
                HashBits = 31,
                Seed = args.Seed,
                Ordered = false,
                Column = hashColumns.ToArray(),
                InvertHash = args.InvertHash
            };
            current = HashTransformer.Create(host, hashArgs, current);

            // N-gram hash over the hashed temporary columns.
            var ngramArgs = new NgramHashTransform.Arguments
            {
                AllLengths = args.AllLengths,
                HashBits = args.HashBits,
                NgramLength = args.NgramLength,
                SkipLength = args.SkipLength,
                RehashUnigrams = false,
                Ordered = args.Ordered,
                Seed = args.Seed,
                Column = ngramColumns,
                InvertHash = args.InvertHash
            };
            current = new NgramHashTransform(host, ngramArgs, current);

            // Remove every temporary column introduced above.
            var dropArgs = new DropColumnsTransform.Arguments()
            {
                Column = tempNames.SelectMany(names => names).ToArray()
            };
            return new DropColumnsTransform(host, dropArgs, current);
        }