/// <summary>
/// Creates a cursor over <paramref name="input"/> that shares the bindings of
/// <paramref name="parent"/> and builds one getter for every info index that
/// <c>IsIndexActive</c> reports as active.
/// </summary>
/// <param name="parent">Transform supplying the host, the bindings and <c>MakeGetter</c>.</param>
/// <param name="input">Input cursor, forwarded to the base constructor.</param>
/// <param name="active">
/// Optional activity flags. When non-null, its length is asserted to equal the
/// column count of the parent's bindings.
/// </param>
/// <param name="decorator">Optional decorator handed through to <c>MakeGetter</c>; may be null.</param>
public RowCursor(NgramHashTransform parent, IRowCursor input, bool[] active, FinderDecorator decorator = null)
    : base(parent.Host, input)
{
    Ch.AssertValue(parent);
    Ch.Assert(active == null || active.Length == parent._bindings.ColumnCount);
    Ch.AssertValueOrNull(decorator);

    _bindings = parent._bindings;
    _active = active;

    int infoCount = _bindings.Infos.Length;
    _getters = new Delegate[infoCount];
    for (int slot = 0; slot < infoCount; slot++)
    {
        // Slots that are not active keep their default (null) getter.
        if (!IsIndexActive(slot))
        {
            continue;
        }

        _getters[slot] = parent.MakeGetter(Ch, Input, slot, decorator);
    }
}
/// <summary>
/// Sets up the per-info collector array and the per-source-column text getters
/// used by this helper.
/// </summary>
/// <param name="parent">Owning transform; its bindings and source schema size the internal arrays.</param>
/// <param name="friendlyNames">One entry per info (length is asserted against the bindings' info count).</param>
/// <param name="inputPred">Predicate over source column indices; a text getter is created only where it returns true.</param>
/// <param name="invertHashMaxCounts">One entry per info (length is asserted against the bindings' info count).</param>
public InvertHashHelper(NgramHashTransform parent, string[][] friendlyNames, Func<int, bool> inputPred, int[] invertHashMaxCounts)
{
    Contracts.AssertValue(parent);
    Contracts.AssertValue(friendlyNames);
    Contracts.Assert(friendlyNames.Length == parent._bindings.InfoCount);
    Contracts.AssertValue(inputPred);
    Contracts.AssertValue(invertHashMaxCounts);
    Contracts.Assert(invertHashMaxCounts.Length == parent._bindings.InfoCount);

    _parent = parent;
    _friendlyNames = friendlyNames;
    _invertHashMaxCounts = invertHashMaxCounts;

    // One collector slot per iinfo (some may stay null).
    _iinfoToCollector = new InvertHashCollector<NGram>[_parent._bindings.InfoCount];

    // One getter slot per source column (some may stay null).
    var sourceSchema = _parent.Source.Schema;
    _srcTextGetters = new ValueMapper<uint, StringBuilder>[sourceSchema.ColumnCount];
    for (int col = 0; col < _srcTextGetters.Length; ++col)
    {
        if (!inputPred(col))
        {
            continue;
        }

        _srcTextGetters[col] = InvertHashUtils.GetSimpleMapper<uint>(sourceSchema, col);
    }
}
/// <summary>
/// Builds the transform chain for the n-gram hash extractor: an optional term
/// transform (with optional NA drop), a hash transform over temporary columns,
/// an <see cref="NgramHashTransform"/>, and finally a drop of the temporary columns.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="args">Extractor arguments; at least one column must be specified.</param>
/// <param name="input">Input data view; must not be null.</param>
/// <param name="termLoaderArgs">
/// Optional term loader arguments. When non-null, a <c>TermTransform</c> is applied
/// before hashing, followed by an <c>NADropTransform</c> if <c>DropUnknowns</c> is set.
/// </param>
/// <returns>The data transform produced by dropping the temporary columns from the final view.</returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(LoaderSignature);
    h.CheckValue(args, nameof(args));
    h.CheckValue(input, nameof(input));
    h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    // Every source column of every requested output column is hashed first
    // (to keep collisions low) and the hashed columns are then fed to an
    // NgramHashTransform.
    // NOTE(review): the transform-level hash arguments below use 31 bits while each
    // per-column entry explicitly requests 30 bits; confirm which value is intended
    // to take effect before touching either constant.
    bool useTermLoader = termLoaderArgs != null;
    IDataView view = input;

    List<TermTransform.Column> termCols = useTermLoader ? new List<TermTransform.Column>() : null;
    var hashColumns = new List<HashTransformer.Column>();
    var colCount = args.Column.Length;
    var ngramHashColumns = new NgramHashTransform.Column[colCount];

    // The extractor maps many source columns to one output column. Temporary column
    // names are used for the intermediate results so that a source column is not
    // overwritten when the destination 'name' matches it.
    string[][] tmpColNames = new string[colCount][];
    for (int iinfo = 0; iinfo < colCount; iinfo++)
    {
        var column = args.Column[iinfo];
        h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
        h.CheckUserArg(
            Utils.Size(column.Source) > 0 && column.Source.All(s => !string.IsNullOrWhiteSpace(s)),
            nameof(column.Source));

        int srcCount = column.Source.Length;
        var tmpNamesForColumn = new string[srcCount];
        tmpColNames[iinfo] = tmpNamesForColumn;
        for (int isrc = 0; isrc < srcCount; isrc++)
        {
            var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]);
            tmpNamesForColumn[isrc] = tmpName;

            if (useTermLoader)
            {
                termCols.Add(
                    new TermTransform.Column
                    {
                        Name = tmpName,
                        Source = column.Source[isrc]
                    });
            }

            // With a term loader the hash reads the term transform's output (the temporary
            // column) and overwrites it in place; otherwise it reads the original source.
            hashColumns.Add(
                new HashTransformer.Column
                {
                    Name = tmpName,
                    Source = useTermLoader ? tmpName : column.Source[isrc],
                    HashBits = 30,
                    Seed = column.Seed,
                    Ordered = false,
                    InvertHash = column.InvertHash
                });
        }

        ngramHashColumns[iinfo] =
            new NgramHashTransform.Column
            {
                Name = column.Name,
                Source = tmpNamesForColumn,
                AllLengths = column.AllLengths,
                HashBits = column.HashBits,
                NgramLength = column.NgramLength,
                RehashUnigrams = false,
                Seed = column.Seed,
                SkipLength = column.SkipLength,
                Ordered = column.Ordered,
                InvertHash = column.InvertHash,
                // REVIEW: Internal workaround so that descriptions can surface the
                // *original* source names rather than the temporary column names.
                // The alternative would be something more elaborate based on
                // metadata, which is not obviously better.
                FriendlyNames = column.FriendlyNames
            };
    }

    if (useTermLoader)
    {
        h.Assert(Utils.Size(termCols) == hashColumns.Count);
        var termArgs =
            new TermTransform.Arguments()
            {
                MaxNumTerms = int.MaxValue,
                Terms = termLoaderArgs.Terms,
                Term = termLoaderArgs.Term,
                DataFile = termLoaderArgs.DataFile,
                Loader = termLoaderArgs.Loader,
                TermsColumn = termLoaderArgs.TermsColumn,
                Sort = termLoaderArgs.Sort,
                Column = termCols.ToArray()
            };
        view = TermTransform.Create(h, termArgs, view);

        if (termLoaderArgs.DropUnknowns)
        {
            // Each temporary term column is NA-dropped in place (same name as source).
            var naDropArgs =
                new NADropTransform.Arguments
                {
                    Column = termCols
                        .Select(tc => new NADropTransform.Column { Name = tc.Name, Source = tc.Name })
                        .ToArray()
                };
            view = new NADropTransform(h, naDropArgs, view);
        }
    }

    // Arguments for the hash transform covering all temporary columns.
    var hashArgs =
        new HashTransformer.Arguments
        {
            HashBits = 31,
            Seed = args.Seed,
            Ordered = false,
            Column = hashColumns.ToArray(),
            InvertHash = args.InvertHash
        };
    view = HashTransformer.Create(h, hashArgs, view);

    // Arguments for the n-gram hash transform producing the requested output columns.
    var ngramHashArgs =
        new NgramHashTransform.Arguments
        {
            AllLengths = args.AllLengths,
            HashBits = args.HashBits,
            NgramLength = args.NgramLength,
            SkipLength = args.SkipLength,
            RehashUnigrams = false,
            Ordered = args.Ordered,
            Seed = args.Seed,
            Column = ngramHashColumns,
            InvertHash = args.InvertHash
        };
    view = new NgramHashTransform(h, ngramHashArgs, view);

    // Remove every temporary column that was introduced above.
    var tmpNamesToDrop = tmpColNames.SelectMany(names => names).ToArray();
    return SelectColumnsTransform.CreateDrop(h, view, tmpNamesToDrop);
}
/// <summary>
/// Creates bindings from a model load context. The base class is initialized from
/// <paramref name="ctx"/> and <paramref name="schemaInput"/>; this constructor then
/// records the parent and allocates <c>Types</c> with one slot per info.
/// </summary>
/// <param name="ctx">Model load context forwarded to the base constructor.</param>
/// <param name="schemaInput">Input schema forwarded to the base constructor.</param>
/// <param name="parent">Owning transform, stored for later use.</param>
public Bindings(ModelLoadContext ctx, ISchema schemaInput, NgramHashTransform parent)
    : base(ctx, schemaInput, TestTypes)
{
    _parent = parent;

    // Allocated only; the slots are not filled in by this constructor.
    int infoCount = Infos.Length;
    Types = new VectorType[infoCount];
}
/// <summary>
/// Creates bindings from user-supplied arguments. The base class is initialized from
/// <c>args.Column</c> and <paramref name="schemaInput"/>; this constructor then
/// records the parent and allocates <c>Types</c> with one slot per argument column.
/// </summary>
/// <param name="args">Transform arguments; <c>args.Column</c> defines the columns.</param>
/// <param name="schemaInput">Input schema forwarded to the base constructor.</param>
/// <param name="parent">Owning transform, stored for later use.</param>
public Bindings(Arguments args, ISchema schemaInput, NgramHashTransform parent)
    : base(args.Column, schemaInput, TestTypes)
{
    _parent = parent;

    // Allocated only; the slots are not filled in by this constructor.
    int columnCount = args.Column.Length;
    Types = new VectorType[columnCount];
}