/// <summary>
/// Builds a typed mapper that pushes one value into an infinite-loop cursor,
/// runs it through the transform and reads the transformed value back.
/// The returned object also owns the output cursor so callers can dispose it.
/// </summary>
ValueMapperDispose<TSrc, TDst> GetMapperDispose<TSrc, TDst>()
{
    var firstView = _sourceToReplace ?? DataViewHelper.GetFirstView(_transform);
    var inputView = new InfiniteLoopViewCursorColumn<TSrc>(_inputIndex, firstView.Schema, ignoreOtherColumn: _ignoreOtherColumn);
    // This is extremely time consuming as the transform is serialized and deserialized.
    var outputView = _sourceToReplace == _transform.Source
        ? ApplyTransformUtils.ApplyTransformToData(_computeEnv, _transform, inputView)
        : ApplyTransformUtils.ApplyAllTransformsToData(_computeEnv, _transform, inputView, _sourceToReplace);
    var index = SchemaHelper.GetColumnIndexDC(outputView.Schema, _outputColumn);
    var newOutputIndex = index;
    var cur = outputView.GetRowCursor(outputView.Schema.Where(c => c.Index == newOutputIndex.Index).ToArray());
    var getter = cur.GetGetter<TDst>(newOutputIndex);
    if (getter == null)
    {
        // typeof(TDst) instead of default(TDst).GetType(): the latter throws
        // NullReferenceException whenever TDst is a reference type.
        throw _env.Except("Unable to get a getter on the transform for type {0}", typeof(TDst));
    }
    return new ValueMapperDispose<TSrc, TDst>((in TSrc src, ref TDst dst) =>
    {
        // Feed the source value, advance the pipeline one row, read the result.
        inputView.Set(in src);
        cur.MoveNext();
        getter(ref dst);
    }, new IDisposable[] { cur });
}
/// <summary>
/// Debug helper: walks through column <paramref name="labName"/> of
/// <paramref name="viewI"/>, checks every row can be read with the getter
/// matching the column type (float vector, float scalar or uint key) and
/// that the view is not empty. Throws on the first failed check.
/// </summary>
protected void DebugChecking0(IDataView viewI, string labName, bool oneO)
{
    var index = SchemaHelper.GetColumnIndexDC(viewI.Schema, labName);
    int nbRows = 0;
    using (var cursor = viewI.GetRowCursor(viewI.Schema.Where(c => c.Index == index.Index).ToArray()))
    {
        if (oneO)
        {
            // At least one of the two candidate getters (float or uint) must exist.
            var gfu = cursor.GetGetter<float>(index);
            var gff = cursor.GetGetter<uint>(index);
            Contracts.Assert(gfu != null || gff != null);
        }
        var ty = viewI.Schema[index.Index].Type;
        if (ty.IsVector() && ty.AsVector().ItemType().RawKind() == DataKind.Single)
        {
            // Float vector: every row must hold a non-empty buffer.
            var getter = cursor.GetGetter<VBuffer<float>>(index);
            var value = new VBuffer<float>();
            while (cursor.MoveNext())
            {
                getter(ref value);
                if (value.Length == 0 || value.Count == 0)
                {
                    throw Host.Except("Issue.");
                }
                ++nbRows;
            }
        }
        else if (!ty.IsVector() && ty.RawKind() == DataKind.Single)
        {
            // Scalar float: just check every row is readable.
            var getter = cursor.GetGetter<float>(index);
            var sch = SchemaHelper.ToString(cursor.Schema);
            var value = 0f;
            while (cursor.MoveNext())
            {
                getter(ref value);
                ++nbRows;
            }
        }
        else if (ty.IsKey() && ty.RawKind() == DataKind.UInt32)
        {
            // uint key: just check every row is readable.
            var getter = cursor.GetGetter<uint>(index);
            uint value = 0;
            while (cursor.MoveNext())
            {
                getter(ref value);
                ++nbRows;
            }
        }
        else
        {
            throw Host.ExceptNotSupp();
        }
    }
    if (nbRows == 0)
    {
        throw Contracts.Except("View is empty.");
    }
}
/// <summary>
/// Builds a mapping from the index of each destination column (looked up by
/// its name in the schema) to the source column it is computed from.
/// </summary>
Dictionary<int, DataViewSchema.Column> BuildMapping()
{
    var mapping = new Dictionary<int, DataViewSchema.Column>();
    foreach (var column in _args.columns)
    {
        var destinationIndex = SchemaHelper.GetColumnIndex(_schema, column.Name);
        var sourceColumn = SchemaHelper.GetColumnIndexDC(_schema, column.Source);
        mapping[destinationIndex] = sourceColumn;
    }
    return mapping;
}
/// <summary>
/// Returns the getters for the PredictedLabel, Score and Probability
/// columns of the cursor, in that order.
/// </summary>
public Delegate[] GetCursorGetter(DataViewRowCursor cursor)
{
    var labelColumn = SchemaHelper.GetColumnIndexDC(cursor.Schema, "PredictedLabel");
    var scoreColumn = SchemaHelper.GetColumnIndexDC(cursor.Schema, "Score");
    var probabilityColumn = SchemaHelper.GetColumnIndexDC(cursor.Schema, "Probability");
    var labelGetter = cursor.GetGetter<bool>(labelColumn);
    var scoreGetter = cursor.GetGetter<float>(scoreColumn);
    var probabilityGetter = cursor.GetGetter<float>(probabilityColumn);
    return new Delegate[] { labelGetter, scoreGetter, probabilityGetter };
}
/// <summary>
/// Debug helper: checks that column <paramref name="labName"/> is a sparse
/// one-hot float vector of logical length <paramref name="count"/> — exactly
/// one stored entry, value 1, index within bounds — and that the view holds
/// at least 10 rows. Throws on the first violated invariant.
/// </summary>
protected void DebugChecking0Vfloat(IDataView viewI, string labName, ulong count)
{
    var index = SchemaHelper.GetColumnIndexDC(viewI.Schema, labName);
    var ty = viewI.Schema[index.Index].Type;
    Contracts.Assert(ty.IsKey() || ty.IsVector() || ty.RawKind() == DataKind.Single);
    using (var cursor = viewI.GetRowCursor(viewI.Schema.Where(i => i.Index == index.Index).ToArray()))
    {
        var getter = cursor.GetGetter<VBuffer<float>>(index);
        var value = new VBuffer<float>();
        int nb = 0;
        while (cursor.MoveNext())
        {
            getter(ref value);
            // The buffer must not be empty.
            if (value.Length == 0 || value.Count == 0)
            {
                throw Host.Except("Issue.");
            }
            // Neither logical length nor stored-entry count may exceed the expected length.
            if ((ulong)value.Length > count || (ulong)value.Count > count)
            {
                throw Host.Except("Issue.");
            }
            // One-hot: exactly 'count' slots and a single stored entry.
            // The extra getter call before throwing looks like a debugging aid
            // (somewhere to break); it does not change the value.
            if ((ulong)value.Length != count || value.Count != 1)
            {
                getter(ref value);
                throw Host.Except("Issue.");
            }
            // The single entry must be exactly 1.
            if (value.Values[0] != 1)
            {
                throw Host.Except("Issue.");
            }
            // The entry's index must be within [0, count).
            if ((ulong)value.Indices[0] >= count)
            {
                getter(ref value);
                throw Host.Except("Issue.");
            }
            // NOTE(review): a negative index would already be caught by the
            // ulong cast check above, so this branch looks unreachable — confirm.
            if (value.Indices[0] < 0)
            {
                throw Host.Except("Issue.");
            }
            ++nb;
        }
        if (nb < 10)
        {
            throw Host.Except("Issue.");
        }
    }
}
/// <summary>
/// Returns a cursor over the view. If the appended nearest-neighbors column
/// (index == _input.Schema.Count) is requested, the cursor computes it;
/// otherwise the underlying cursor is wrapped to keep the extended schema.
/// </summary>
private DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand, DelegateGetRowCursor getterCursor)
{
    ComputeNearestNeighbors();
    _host.AssertValue(_input, "_input");
    var schema = _input.Schema;
    // Any(predicate) instead of Where(predicate).Any(): same result, no
    // intermediate enumerator.
    if (columnsNeeded.Any(c => c.Index == _input.Schema.Count))
    {
        var newColumns = PredicatePropagation(columnsNeeded);
        var oldCols = SchemaHelper.ColumnsNeeded(newColumns, schema);
        var featureIndex = SchemaHelper.GetColumnIndexDC(Schema, _args.column);
        return new NearestNeighborsCursor(getterCursor(oldCols, rand), this, newColumns, featureIndex);
    }
    else
    {
        // The new column is not required. We do not need to compute it. But we need to keep the same schema.
        var oldCols = SchemaHelper.ColumnsNeeded(columnsNeeded, schema);
        return new SameCursor(getterCursor(oldCols, rand), Schema);
    }
}
/// <summary>
/// Returns a set of parallel cursors over the view. If the appended
/// nearest-neighbors column (index == _input.Schema.Count) is requested,
/// each inner cursor is wrapped to compute it; otherwise the cursors are
/// wrapped only to keep the extended schema.
/// </summary>
public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
{
    ComputeNearestNeighbors();
    _host.AssertValue(_input, "_input");
    var schema = _input.Schema;
    // Any(predicate) instead of Where(predicate).Any(): same result, no
    // intermediate enumerator. Consistent with GetRowCursor.
    if (columnsNeeded.Any(c => c.Index == _input.Schema.Count))
    {
        var newColumns = PredicatePropagation(columnsNeeded);
        var oldCols = SchemaHelper.ColumnsNeeded(newColumns, schema);
        var featureIndex = SchemaHelper.GetColumnIndexDC(Schema, _args.column);
        var res = _input.GetRowCursorSet(oldCols, n, rand)
                        .Select(c => new NearestNeighborsCursor(c, this, newColumns, featureIndex)).ToArray();
        return res;
    }
    else
    {
        // The new column is not required. We do not need to compute it. But we need to keep the same schema.
        var oldCols = SchemaHelper.ColumnsNeeded(columnsNeeded, schema);
        return _input.GetRowCursorSet(oldCols, n, rand).Select(c => new SameCursor(c, Schema)).ToArray();
    }
}
/// <summary>
/// Initializes the polynomial transform state: checks the input column is a
/// one-dimension vector, computes the output vector size for the requested
/// degree and builds the extended output schema.
/// </summary>
/// <param name="host">Host environment used for registration, checks and logging.</param>
/// <param name="input">Input view; only columns[0] of the arguments is used.</param>
/// <param name="args">Transform arguments (column pair, degree).</param>
/// <param name="multiplication">How two feature values are multiplied together.</param>
public PolynomialState(IHostEnvironment host, IDataView input, Arguments args, Func<TInput, TInput, TInput> multiplication)
{
    _host = host.Register("PolynomialState");
    _host.CheckValue(input, "input");
    _input = input;
    // _lock = new object();
    _args = args;
    _multiplication = multiplication;
    var column = _args.columns[0];
    var schema = input.Schema;
    using (var ch = _host.Start("PolynomialState"))
    {
        _inputCol = SchemaHelper.GetColumnIndexDC(schema, column.Source);
        var type = schema[_inputCol.Index].Type;
        if (!type.IsVector())
        {
            throw _host.Except("Input column type must be a vector.");
        }
        int dim = type.AsVector().DimCount();
        if (dim > 1)
        {
            throw _host.Except("Input column type must be a vector of one dimension.");
        }
        // TotalCumulated maps the input size to the number of polynomial
        // features for the requested degree; size 0 (unknown) stays 0.
        int size = dim > 0 ? type.AsVector().GetDim(0) : 0;
        if (size > 0)
        {
            size = TotalCumulated[_args.degree](size);
        }
        ch.Trace("PolynomialTransform {0}->{1}.", dim, size);
        // We extend the input schema. The new type has the same type as the input.
        _schema = ExtendedSchema.Create(new ExtendedSchema(input.Schema,
                                        new[] { column.Name },
                                        new[] { new VectorDataViewType(type.AsVector().ItemType(), size) }));
    }
}
/// <summary>
/// Caches the feature column, builds one OPTICS ordering and clusters the
/// points once per epsilon, storing the cluster assignments in _Results and
/// the per-point mapping in _reversedMapping. Runs at most once (guarded by
/// _lock and the _Results null check).
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        if (_Results != null)
        {
            return;
        }
        using (var ch = _host.Start("Optics"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

            // Caching data.
            ch.Info(MessageSensitivity.None, "Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); i++)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                }
            }

            // Mapping.
            // int: index of a cluster
            // long: index of a point
            var mapping = new int[points.Count];
            var mapprev = new Dictionary<long, int>();

            float[] distances = null;
            if (_args.epsilons == null || _args.epsilons.Count() == 0)
            {
                // No epsilon given: estimate one from random pairs of points.
                float mind, maxd;
                distances = new[] { EstimateDistance(ch, points, out mind, out maxd) };
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distances.First(), mind, maxd);
            }
            else
            {
                distances = _args.epsilonsDouble;
            }
            var maxEpsilon = distances.Max();

            _Results = new List<Dictionary<int, ClusteringResult>>();
            _reversedMapping = new List<Dictionary<long, int>>();

            Optics opticsAlgo = new Optics(points, _args.seed);
            // Ordering
            ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
            int nPoints = points.Count;
            // Math.Max(1, ...) prevents a DivideByZeroException in the modulo
            // below when there are fewer than 10 points (nPoints / 10 == 0).
            int cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
            int currentIteration = 0;
            Action progressLogger = () =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.UserData, "Processing {0}/{1}", currentIteration, nPoints);
                }
            };
            OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                maxEpsilon,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);

            // Clustering: one pass per requested epsilon.
            foreach (var epsilon in distances)
            {
                ch.Info(MessageSensitivity.UserData, "Clustering {0} points using epsilon={1}.", points.Count, epsilon);
                Dictionary<long, int> results = opticsOrdering.Cluster(epsilon);
                HashSet<int> clusterIds = new HashSet<int>();
                for (int i = 0; i < results.Count; ++i)
                {
                    var p = points[i];
                    int cluster = results[p.id];
                    mapprev[p.id] = cluster;
                    mapping[i] = cluster;
                    if (cluster != DBScan.NOISE)
                    {
                        clusterIds.Add(cluster);
                    }
                }
                _reversedMapping.Add(mapprev);

                // Cleaning small clusters: clusters below minPoints become noise (-1).
                ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
                var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
                var finalCounts = finalCounts_.ToDictionary(c => c.key, d => d.nb);
                results = results.Select(c => new KeyValuePair<long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                                 .ToDictionary(c => c.Key, c => c.Value);

                // Cleaning.
                ch.Info(MessageSensitivity.None, "Cleaning.");
                // We replace by the original labels.
                var runResults = new Dictionary<int, ClusteringResult>();
                // NOTE(review): this loop indexes 'results' by the loop counter
                // while the loop above indexes it by point id — confirm ids are
                // exactly 0..Count-1, otherwise this throws KeyNotFoundException.
                for (int i = 0; i < results.Count; ++i)
                {
                    runResults[i] = new ClusteringResult()
                    {
                        cl = results[i] != DBScan.NOISE ? results[i] : -1,
                        score = results[i] != DBScan.NOISE ? 1f : 0f
                    };
                }
                _Results.Add(runResults);
                ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", clusterIds.Count);
            }
            sw.Stop();
            ch.Info(MessageSensitivity.UserData, "'Optics' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Caches the feature column, builds the OPTICS ordering and stores, for
/// every point, its position in the ordering plus its reachability and core
/// distances in _Results. Runs at most once (guarded by _lock and the
/// _Results null check).
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        if (_Results != null)
        {
            return;
        }
        using (var ch = _host.Start("Starting Optics"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

            // Caching data.
            ch.Info("Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); i++)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                }
            }

            // Mapping.
            // long: index in the ordering
            // long: index of a point
            var mapping = new long[points.Count];
            var mapprev = new Dictionary<long, long>();

            var distance = (float)_args.epsilon;
            if (distance <= 0)
            {
                // No epsilon given: estimate one from random pairs of points.
                float mind, maxd;
                distance = EstimateDistance(ch, points, out mind, out maxd);
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
            }

            Optics opticsAlgo = new Optics(points, _args.seed);
            // Ordering
            ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
            int nPoints = points.Count;
            // Math.Max(1, ...) prevents a DivideByZeroException in the modulo
            // below when there are fewer than 10 points (nPoints / 10 == 0).
            int cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
            int currentIteration = 0;
            Action progressLogger = () =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.None, "Processing {0}/{1}", currentIteration, nPoints);
                }
            };
            OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                distance,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);

            IReadOnlyDictionary<long, long> results = opticsOrdering.orderingMapping;
            var reachabilityDs = opticsOrdering.reachabilityDistances;
            var coreDs = opticsOrdering.coreDistancesCache;
            for (int i = 0; i < results.Count; ++i)
            {
                // (Removed an unused 'points[i]' local that the original declared here.)
                mapprev[results[i]] = i;
                mapping[i] = results[i];
            }
            _reversedMapping = mapprev;

            // Cleaning.
            ch.Info(MessageSensitivity.None, "Cleaning.");
            // We replace by the original labels.
            _Results = new OpticsOrderingResult[results.Count];
            for (int i = 0; i < results.Count; ++i)
            {
                long pId = points[i].id;
                float? rd;
                float? cd;
                // Missing entries default to +infinity (unreachable / no core distance).
                reachabilityDs.TryGetValue(pId, out rd);
                coreDs.TryGetValue(pId, out cd);
                _Results[i] = new OpticsOrderingResult()
                {
                    id = results[i] != DBScan.NOISE ? results[i] : -1,
                    reachability = (float)rd.GetValueOrDefault(float.PositiveInfinity),
                    core = (float)cd.GetValueOrDefault(float.PositiveInfinity)
                };
            }
            ch.Info(MessageSensitivity.UserData, "Ordered {0} points.", _Results.Count());
            sw.Stop();
            ch.Info(MessageSensitivity.None, "'OpticsOrdering' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Caches the feature column, runs DBScan, removes clusters smaller than
/// minPoints and stores for every point its cluster id and score in
/// _reversedMapping. Runs at most once (guarded by _lock and the
/// _reversedMapping null check).
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        if (_reversedMapping != null)
        {
            return;
        }
        using (var ch = _host.Start("DBScan"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

            // Caching data.
            ch.Info(MessageSensitivity.None, "Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); ++i)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues()));
                }
            }

            // Mapping.
            // int: index of a cluster
            // long: index of a point
            var mapping = new int[points.Count];
            var mapprev = new Dictionary<long, int>();

            float distance = _args.epsilon;
            if (distance <= 0)
            {
                // No epsilon given: estimate one from random pairs of points.
                float mind, maxd;
                distance = EstimateDistance(ch, points, out mind, out maxd);
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
            }

            DBScan dbscanAlgo = new DBScan(points, _args.seed);
            // Clustering.
            ch.Info(MessageSensitivity.UserData, "Clustering {0} points.", points.Count);
            int nPoints = points.Count;
            // Math.Max(1, ...) prevents a DivideByZeroException in the modulo
            // below when there are fewer than 10 points (nPoints / 10 == 0).
            int cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
            int currentIteration = 0;
            Action<int> progressLogger = nClusters =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.UserData, "Processing {0}/{1} - NbClusters={2}", currentIteration, nPoints, nClusters);
                }
            };
            Dictionary<long, int> results = dbscanAlgo.Cluster(
                distance,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);

            // Cleaning small clusters: clusters below minPoints become noise (-1).
            ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
            var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
            var finalCounts = finalCounts_.ToDictionary(c => c.key, d => d.nb);
            results = results.Select(c => new KeyValuePair<long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                             .ToDictionary(c => c.Key, c => c.Value);

            _reversedMapping = new Dictionary<long, Tuple<int, float>>();

            ch.Info(MessageSensitivity.None, "Compute scores.");
            HashSet<int> clusterIds = new HashSet<int>();
            for (int i = 0; i < results.Count; ++i)
            {
                IPointIdFloat p = points[i];
                int cluster = results[p.id];
                mapprev[p.id] = cluster;
                // NOTE(review): mapping[cluster] assumes cluster ids stay below
                // points.Count — confirm against DBScan's id assignment.
                if (cluster >= 0) // -1 is noise
                {
                    mapping[cluster] = cluster;
                }
                mapping[i] = cluster;
                if (cluster != DBScan.NOISE)
                {
                    clusterIds.Add(cluster);
                }
            }
            foreach (var p in points)
            {
                if (mapprev[p.id] < 0)
                {
                    continue;
                }
                _reversedMapping[p.id] = new Tuple<int, float>(mapprev[p.id], dbscanAlgo.Score(p, _args.epsilon, mapprev));
            }

            // Adding points with no clusters.
            foreach (var p in points)
            {
                if (!_reversedMapping.ContainsKey(p.id))
                {
                    _reversedMapping[p.id] = new Tuple<int, float>(-1, float.PositiveInfinity);
                }
            }
            if (_reversedMapping.Count != points.Count)
            {
                throw ch.Except("Mismatch between the number of points. This means some ids are not unique {0} != {1}.", _reversedMapping.Count, points.Count);
            }
            ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", mapprev.Select(c => c.Value).Where(c => c >= 0).Distinct().Count());
            sw.Stop();
            ch.Info(MessageSensitivity.UserData, "'DBScan' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Initializes the shaking-transform state: validates the value mappers to
/// shake, extracts the shaking values and builds the extended output schema
/// whose column types depend on the aggregation strategy.
/// </summary>
public ShakeInputState(IHostEnvironment host, IDataView input, IValueMapper[] toShake, Arguments args)
{
    _host = host.Register("ShakeInputState");
    _host.CheckValue(input, "input");
    _input = input;
    _lock = new object();
    _args = args;
    _toShake = toShake;
    foreach (var vm in toShake)
    {
        if (vm.OutputType.IsVector() && vm.OutputType.AsVector().DimCount() > 1)
        {
            throw _host.Except("If a ValueMapper return a vector, it should have one dimension or zero.");
        }
    }
    _inputCol = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.inputColumn);
    _shakingValues = ExtractShakingValues();
    if (_shakingValues.Length != _args.inputFeaturesInt.Length)
    {
        throw _host.Except("Shaking Values and columns to shake do not have the same dimension {0} and '{1}'.", _args.inputFeaturesInt.Length, _args.values);
    }

    var colTypes = new List<DataViewType>();
    switch (_args.aggregation)
    {
        case ShakeAggregation.concatenate:
            // Output length is the input length times the number of shaking combinations.
            int m = 1;
            foreach (var shakeVal in _shakingValues)
            {
                m *= shakeVal.Length;
            }
            if (m == 0)
            {
                throw _host.Except("No shaking values ('{0}')", _args.values);
            }
            foreach (var c in toShake)
            {
                var vt = c.OutputType.IsVector()
                    ? new VectorDataViewType(c.OutputType.ItemType().AsPrimitive(),
                                             c.OutputType.AsVector().DimCount() == 0 ? 0 : c.OutputType.AsVector().GetDim(0) * m)
                    : new VectorDataViewType(c.OutputType.AsPrimitive(), m);
                colTypes.Add(vt);
            }
            break;
        case ShakeAggregation.add:
            // Results are aggregated by addition: output keeps the input dimension.
            foreach (var c in toShake)
            {
                var vt = c.OutputType.IsVector()
                    ? new VectorDataViewType(c.OutputType.ItemType().AsPrimitive(),
                                             c.OutputType.AsVector().DimCount() == 0 ? 0 : c.OutputType.AsVector().GetDim(0))
                    : new VectorDataViewType(c.OutputType.AsPrimitive(), 1);
                colTypes.Add(vt);
            }
            break;
        default:
            // Fixed typo in the message: "aggregatino" -> "aggregation".
            throw _host.ExceptNotSupp("Unknown aggregation strategy {0}", _args.aggregation);
    }
    _schema = ExtendedSchema.Create(new ExtendedSchema(input.Schema, args.outputColumns, colTypes.ToArray()));
}
/// <summary>
/// Fills _cacheReplica with, for every row id, the number of times the row
/// must be replicated. Without a class column, every row gets a Poisson draw;
/// with one, the class value is parsed according to the column type and the
/// typed LoadCache overload handles the resampling. Runs only once.
/// </summary>
void LoadCache(Random rand)
{
    if (_cacheReplica != null)
    {
        // Already done.
        return;
    }
    // Seed a generator from the arguments when the caller supplied none.
    uint? useed = _args.seed.HasValue ? (uint)_args.seed.Value : (uint?)null;
    if (rand == null)
    {
        rand = RandomUtils.Create(useed);
    }
    using (var ch = _host.Start("Resample: fill the cache"))
    {
        var indexClass = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.column, true);
        using (var cur = _input.GetRowCursor(Schema.Where(c => c.Index == indexClass.Index)))
        {
            if (string.IsNullOrEmpty(_args.column))
            {
                // No class column: draw a Poisson replication count for every row id.
                _cacheReplica = new Dictionary<DataViewRowId, int>();
                var gid = cur.GetIdGetter();
                DataViewRowId did = default(DataViewRowId);
                int rep;
                while (cur.MoveNext())
                {
                    gid(ref did);
                    rep = NextPoisson(_args.lambda, rand);
                    _cacheReplica[did] = rep;
                }
            }
            else
            {
                // Class column given: parse classValue with the column's raw type
                // and delegate to the typed overload.
                var type = _input.Schema[indexClass.Index].Type;
                switch (type.RawKind())
                {
                    case DataKind.Boolean:
                        bool clbool;
                        if (!bool.TryParse(_args.classValue, out clbool))
                        {
                            throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                        }
                        LoadCache<bool>(rand, cur, indexClass, clbool, ch);
                        break;
                    case DataKind.UInt32:
                        uint cluint;
                        if (!uint.TryParse(_args.classValue, out cluint))
                        {
                            throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                        }
                        LoadCache<uint>(rand, cur, indexClass, cluint, ch);
                        break;
                    case DataKind.Single:
                        float clfloat;
                        if (!float.TryParse(_args.classValue, out clfloat))
                        {
                            throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                        }
                        LoadCache<float>(rand, cur, indexClass, clfloat, ch);
                        break;
                    case DataKind.String:
                        // Text needs no parsing, only wrapping into a ReadOnlyMemory.
                        var cltext = new ReadOnlyMemory<char>(_args.classValue.ToCharArray());
                        LoadCache<ReadOnlyMemory<char>>(rand, cur, indexClass, cltext, ch);
                        break;
                    default:
                        throw _host.Except("Unsupported type '{0}'", type);
                }
            }
        }
    }
}