/// <summary>
/// Creates the state of the polynomial transform: registers a host, locates the
/// input column, checks that it is a vector with at most one dimension and builds
/// the output schema (the input schema extended with the polynomial column).
/// </summary>
/// <param name="host">Host environment; must not be null.</param>
/// <param name="input">Input data view; must not be null.</param>
/// <param name="args">Transform arguments; only the first entry of <c>columns</c> and <c>degree</c> are read here.</param>
/// <param name="multiplication">Function used to multiply two values of type <c>TInput</c>; stored as-is.</param>
public PolynomialState(IHostEnvironment host, IDataView input, Arguments args, Func<TInput, TInput, TInput> multiplication)
{
    // Validate before dereferencing so that a null argument raises a contract
    // exception instead of a NullReferenceException.
    Contracts.CheckValue(host, "host");
    _host = host.Register("PolynomialState");
    _host.CheckValue(input, "input");
    _host.CheckValue(args, "args");
    if (args.columns == null)
    {
        throw _host.ExceptParam("args", "No column was specified.");
    }

    _input = input;
    _args = args;
    _multiplication = multiplication;

    var column = _args.columns[0];
    var schema = input.Schema;

    using (var ch = _host.Start("PolynomialState"))
    {
        if (!schema.TryGetColumnIndex(column.Source, out _inputCol))
        {
            throw _host.ExceptParam("inputColumn", "Column '{0}' not found in schema.", column.Source);
        }

        var type = schema.GetColumnType(_inputCol);
        if (!type.IsVector())
        {
            throw _host.Except("Input column type must be a vector.");
        }

        int dim = type.AsVector().DimCount();
        if (dim > 1)
        {
            throw _host.Except("Input column type must be a vector of one dimension.");
        }

        // A size of 0 is propagated unchanged to the output type.
        int inputSize = dim > 0 ? type.AsVector().GetDim(0) : 0;
        int outputSize = inputSize;
        if (inputSize > 0)
        {
            outputSize = TotalCumulated[_args.degree](inputSize);
        }

        // Report the vector sizes; the dimension count is always 0 or 1 here
        // and carries no information.
        ch.Trace("PolynomialTransform {0}->{1}.", inputSize, outputSize);

        // The input schema is extended with one column which keeps the item type
        // of the input vector; only its size changes.
        _schema = Schema.Create(new ExtendedSchema(
            input.Schema,
            new[] { column.Name },
            new[] { new VectorType(type.AsVector().ItemType(), outputSize) }));
    }
}
/// <summary>
/// Builds the getter which computes the polynomial features of the input column.
/// </summary>
/// <returns>
/// A getter which reads the current input vector through <c>_inputGetter</c> and fills
/// its argument with the products enumerated by <c>EnumeratePosition</c>. A dense input
/// produces a dense output, a sparse input produces a sparse output.
/// </returns>
private ValueGetter<VBuffer<TInput>> PolynomialBuilder()
{
    // VBuffer<TInput> is the internal representation of a vector.
    // It can be dense (TInput[]) or sparse.
    // If there are n features, we can expect sum(i=1, d) n^i / i! polynomial features.
    VBuffer<TInput> features = new VBuffer<TInput>();
    int degree = _args.degree;

    // Buffers reused by every call of the returned getter (sparse case only).
    var values = new List<TInput>();
    var indices = new List<int>();
    int[] tempIndices = new int[3];

    // Maps a combination of 1 to 3 feature indices to the index of the
    // corresponding polynomial feature in the output vector.
    Func<IEnumerable<int>, int, int> computeIndex = (IEnumerable<int> sparseIndices, int nbFeatures) =>
    {
        int nb = 0;
        foreach (var i in sparseIndices)
        {
            // Without this guard a combination of more than three indices overflows
            // the scratch array and raises IndexOutOfRangeException instead of the
            // intended "not supported" error.
            if (nb >= tempIndices.Length)
            {
                throw Contracts.ExceptNotSupp("Level should be in [1, 3].");
            }
            tempIndices[nb] = i;
            nb += 1;
        }

        switch (nb)
        {
            case 1:
                return tempIndices[0];
            case 2:
                int d1 = Total[1](nbFeatures);
                int d2 = Total[2](nbFeatures);
                return d1 + (d2 - Total[2](nbFeatures - tempIndices[0])) + (tempIndices[1] - tempIndices[0]);
            case 3:
                int d1_ = Total[1](nbFeatures);
                int d2_ = Total[2](nbFeatures);
                int d3_ = Total[3](nbFeatures);
                int d1d = Total[2](nbFeatures - tempIndices[0]);
                int d2d = Total[2](nbFeatures - tempIndices[1]);
                return d1_ + d2_ + d1d - d2d + tempIndices[2] - tempIndices[1] + // part with N^2
                       d3_ - Total[3](nbFeatures - tempIndices[0]);              // part with N^3
            default:
                throw Contracts.ExceptNotSupp("Level should be in [1, 3].");
        }
    };

    return (ref VBuffer<TInput> polyfeat) =>
    {
        _inputGetter(ref features);

        if (features.IsDense)
        {
            // Dense input: every enumerated combination becomes one output value.
            var poly = EnumeratePosition(features.Count, degree)
                .Select(pos => pos.Select(p => features.Values[p]).Aggregate((a, b) => _multiplication(a, b)))
                .ToArray();
            polyfeat = new VBuffer<TInput>(poly.Length, poly);
        }
        else
        {
            // Sparse input: combinations are enumerated over the explicit entries only,
            // positions are then translated into indices of the output vector.
            values.Clear();
            indices.Clear();
            foreach (var pos in EnumeratePosition(features.Count, degree))
            {
                values.Add(pos.Select(p => features.Values[p]).Aggregate((a, b) => _multiplication(a, b)));
                indices.Add(computeIndex(pos.Select(p => features.Indices[p]), features.Length));
#if (DEBUG)
                // The indices handed to the sparse VBuffer below are expected to be strictly increasing.
                if (indices.Count > 1)
                {
                    if (indices[indices.Count - 1] <= indices[indices.Count - 2])
                    {
                        throw Contracts.Except("Inconsistency");
                    }
                }
#endif
            }

            // Same degree as the one used for the enumeration above.
            int total = TotalCumulated[degree](features.Length);
            polyfeat = new VBuffer<TInput>(total, values.Count, values.ToArray(), indices.ToArray());
        }
    };
}