/// <summary>
/// Returns an id getter that derives a unique id per replica: the input id is shifted left
/// by <c>_shift</c> bits (the bits falling off the low word are carried into the high word)
/// and the replica index <c>_copy</c> is added into the freed low bits.
/// When <c>_shift == 0</c> the input id is passed through unchanged.
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter()
{
#if (DEBUG)
    // Debug-only: tracks every id produced so collisions introduced by the shift can be detected.
    Dictionary<DataViewRowId, int> localCache = new Dictionary<DataViewRowId, int>();
#endif
    // We do not change the ID (row to row transform).
    var getId = _inputCursor.GetIdGetter();
    return (ref DataViewRowId pos) =>
    {
        getId(ref pos);
        if (_shift > 0)
        {
            Contracts.Assert(_copy >= 0 && _copy <= _maxReplica);
            // Isolate the top _shift bits of Low (the bits that the shift below pushes out).
            ulong left = pos.Low << _shift;
            left >>= _shift;
            left = pos.Low - left;
            ulong lo = pos.Low << _shift;
            ulong hi = pos.High << _shift;
            // Carry the bits shifted out of Low into the bottom of High.
            hi += left >> (64 - _shift);
            pos = new DataViewRowId(lo + (ulong)_copy, hi);
#if (DEBUG)
            if (localCache.ContainsKey(pos))
                throw Contracts.Except("Id already taken: {0}", pos);
            // BUG FIX: the produced id must be recorded, otherwise the duplicate
            // check above can never fire and the debug guard was dead code.
            localCache[pos] = 1;
#endif
        }
        else
            Contracts.Assert(_copy == 0);
    };
}
/// <summary>
/// Returns an id getter that synthesizes the row id from the cursor position.
/// The getter is only valid while the cursor is positioned on a row.
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter()
{
    ValueGetter<DataViewRowId> getter = (ref DataViewRowId val) =>
    {
        Ch.Check(IsGood, RowCursorUtils.FetchValueStateError);
        val = new DataViewRowId((ulong)Position, 0);
    };
    return getter;
}
/// <summary>
/// Returns an id getter that maps the cursor position to the row id.
/// Throws when the cursor is not positioned on a valid row.
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter() =>
    (ref DataViewRowId val) =>
    {
        Ch.Check(IsGood, "Cannot call ID getter in current state");
        val = new DataViewRowId((ulong)Position, 0);
    };
/// <summary>
/// Builds a delegate that reports whether the two rows currently expose the same id.
/// Also hands back the first row's id getter through <paramref name="idGetter"/>.
/// </summary>
protected Func<bool> GetIdComparer(DataViewRow r1, DataViewRow r2, out ValueGetter<DataViewRowId> idGetter)
{
    var leftGetter = r1.GetIdGetter();
    idGetter = leftGetter;
    var rightGetter = r2.GetIdGetter();
    // Buffers captured by the closure; refilled on every comparison.
    DataViewRowId leftId = default(DataViewRowId);
    DataViewRowId rightId = default(DataViewRowId);
    return () =>
    {
        leftGetter(ref leftId);
        rightGetter(ref rightId);
        return leftId.Equals(rightId);
    };
}
/// <summary>
/// Fills <c>_cacheReplica</c> with a replication count per row id: rows whose class column
/// equals <paramref name="valueClass"/> get a Poisson-distributed count, all other rows get 1.
/// Emits a warning if the condition never matched.
/// </summary>
/// <param name="rand">Random generator used for the Poisson draws.</param>
/// <param name="cur">Cursor over the input data (consumed by this method).</param>
/// <param name="classColumn">Column holding the class value.</param>
/// <param name="valueClass">Class value that triggers resampling.</param>
/// <param name="ch">Channel for warnings.</param>
void LoadCache<TClass>(Random rand, DataViewRowCursor cur, DataViewSchema.Column classColumn, TClass valueClass, IChannel ch)
{
    _cacheReplica = new Dictionary<DataViewRowId, int>();
    var hist = new Dictionary<TClass, long>();
    var gid = cur.GetIdGetter();
    var gcl = cur.GetGetter<TClass>(classColumn);
    DataViewRowId did = default(DataViewRowId);
    TClass cl = default(TClass);
    long nbIn = 0;
    long nbOut = 0;
    // FIX: EqualityComparer avoids boxing for value types and a potential
    // NullReferenceException when cl is a null reference-type value
    // (the original called cl.Equals(valueClass) directly).
    var comparer = EqualityComparer<TClass>.Default;
    while (cur.MoveNext())
    {
        gcl(ref cl);
        gid(ref did);
        // Single-lookup histogram update (the original did ContainsKey + indexer).
        hist.TryGetValue(cl, out long count);
        hist[cl] = count + 1;
        int rep;
        if (comparer.Equals(cl, valueClass))
        {
            rep = NextPoisson(_args.lambda, rand);
            ++nbIn;
        }
        else
        {
            rep = 1;
            ++nbOut;
        }
        _cacheReplica[did] = rep;
    }
    if (nbIn == 0)
        ch.Warning(MessageSensitivity.UserData, "Resample on a condition never happened: nbIn={0} nbOut={1}", nbIn, nbOut);
}
/// <summary>
/// Returns an id getter whose low 64 bits are the cursor position (high bits zero).
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter()
{
    return (ref DataViewRowId val) =>
    {
        val = new DataViewRowId((ulong)Position, 0);
    };
}
/// <summary>
/// Writes the current row id into <paramref name="id"/>: low 64 bits come from
/// <c>_position</c>, the high 64 bits are zero.
/// </summary>
private void IdGetterImplementation(ref DataViewRowId id)
{
    id = new DataViewRowId((ulong)_position, 0);
}
/// <summary>
/// Runs OPTICS once (ordering, then clustering at each requested epsilon) and caches the
/// per-epsilon cluster assignments in <c>_Results</c> / <c>_reversedMapping</c>.
/// The whole computation is guarded by <c>_lock</c> and skipped if already done.
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        if (_Results != null)
        {
            return;
        }
        using (var ch = _host.Start("Optics"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);
            // Caching data.
            ch.Info(MessageSensitivity.None, "Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); i++)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    // NOTE(review): only the low 64 bits of the row id are kept — assumes ids fit in a long; confirm.
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                }
            }
            // Mapping.
            // int: index of a cluster
            // long: index of a point
            var mapping = new int[points.Count];
            float[] distances = null;
            if (_args.epsilons == null || _args.epsilons.Count() == 0)
            {
                // No epsilon given: estimate one from random pairs of points.
                float mind, maxd;
                distances = new[] { EstimateDistance(ch, points, out mind, out maxd) };
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distances.First(), mind, maxd);
            }
            else
            {
                distances = _args.epsilonsDouble;
            }
            var maxEpsilon = distances.Max();
            _Results = new List<Dictionary<int, ClusteringResult>>();
            _reversedMapping = new List<Dictionary<long, int>>();
            Optics opticsAlgo = new Optics(points, _args.seed);
            //Ordering
            ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
            int nPoints = points.Count;
            // Log progress roughly every 10% (capped at every 1000 points).
            int cyclesBetweenLogging = Math.Min(1000, nPoints / 10);
            int currentIteration = 0;
            Action progressLogger = () =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.UserData, "Processing {0}/{1}", currentIteration, nPoints);
                }
            };
            // The ordering is computed once with the largest epsilon and reused for each clustering.
            OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                maxEpsilon,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);
            // Clustering.
            foreach (var epsilon in distances)
            {
                ch.Info(MessageSensitivity.UserData, "Clustering {0} points using epsilon={1}.", points.Count, epsilon);
                // BUG FIX: allocate a fresh dictionary per epsilon. Previously a single
                // instance was created before this loop and added to _reversedMapping on
                // every iteration, so all stored entries aliased the same dictionary and
                // ended up holding only the mapping of the LAST epsilon.
                var mapprev = new Dictionary<long, int>();
                Dictionary<long, int> results = opticsOrdering.Cluster(epsilon);
                HashSet<int> clusterIds = new HashSet<int>();
                for (int i = 0; i < results.Count; ++i)
                {
                    var p = points[i];
                    int cluster = results[p.id];
                    mapprev[p.id] = cluster;
                    mapping[i] = cluster;
                    if (cluster != DBScan.NOISE)
                    {
                        clusterIds.Add(cluster);
                    }
                }
                _reversedMapping.Add(mapprev);
                // Cleaning small clusters.
                ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
                var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
                var finalCounts = finalCounts_.ToDictionary(c => c.key, d => d.nb);
                // Clusters smaller than minPoints are relabeled -1 (noise).
                results = results.Select(c => new KeyValuePair<long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                                 .ToDictionary(c => c.Key, c => c.Value);
                // Cleaning.
                ch.Info(MessageSensitivity.None, "Cleaning.");
                // We replace by the original labels.
                var runResults = new Dictionary<int, ClusteringResult>();
                // NOTE(review): 'results' is keyed by point id but indexed with the loop
                // counter here — this assumes point ids are 0..n-1; confirm.
                for (int i = 0; i < results.Count; ++i)
                {
                    runResults[i] = new ClusteringResult()
                    {
                        cl = results[i] != DBScan.NOISE ? results[i] : -1,
                        score = results[i] != DBScan.NOISE ? 1f : 0f
                    };
                }
                _Results.Add(runResults);
                ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", clusterIds.Count);
            }
            sw.Stop();
            ch.Info(MessageSensitivity.UserData, "'Optics' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Walks both cursors in lockstep and verifies they expose the same number of rows and the
/// same values in every active column (and, optionally, the same ids). Can also count id
/// collisions across rows of the first cursor. Returns false on the first mismatch.
/// </summary>
/// <param name="curs1">Left cursor.</param>
/// <param name="curs2">Right cursor; must have the same column count.</param>
/// <param name="exactTypes">Require exact type equality per column.</param>
/// <param name="exactDoubles">Require exact floating-point equality.</param>
/// <param name="checkId">Compare the two cursors' row ids.</param>
/// <param name="checkIdCollisions">Verify ids of the first cursor are unique.</param>
protected bool CheckSameValues(DataViewRowCursor curs1, DataViewRowCursor curs2, bool exactTypes, bool exactDoubles, bool checkId, bool checkIdCollisions = true)
{
    Contracts.Assert(curs1.Schema.Count == curs2.Schema.Count);

    // Get the comparison delegates for each column.
    int colLim = curs1.Schema.Count;
    Func<bool>[] comps = new Func<bool>[colLim];
    for (int col = 0; col < colLim; col++)
    {
        var f1 = curs1.IsColumnActive(curs1.Schema[col]);
        var f2 = curs2.IsColumnActive(curs2.Schema[col]);
        if (f1 && f2)
        {
            var type1 = curs1.Schema[col].Type;
            var type2 = curs2.Schema[col].Type;
            if (!TestCommon.EqualTypes(type1, type2, exactTypes))
            {
                Fail($"Different types {type1} and {type2}");
                return Failed();
            }
            comps[col] = GetColumnComparer(curs1, curs2, col, type1, exactDoubles);
        }
    }

    ValueGetter<DataViewRowId> idGetter = null;
    Func<bool> idComp = checkId ? GetIdComparer(curs1, curs2, out idGetter) : null;
    HashSet<DataViewRowId> idsSeen = null;
    if (checkIdCollisions && idGetter == null)
        idGetter = curs1.GetIdGetter();
    long idCollisions = 0;
    DataViewRowId id = default(DataViewRowId);

    for (; ; )
    {
        bool f1 = curs1.MoveNext();
        bool f2 = curs2.MoveNext();
        if (f1 != f2)
        {
            if (f1)
                Fail("Left has more rows at position: {0}", curs1.Position);
            else
                Fail("Right has more rows at position: {0}", curs2.Position);
            return Failed();
        }
        if (!f1)
        {
            // Both cursors exhausted: report and return.
            if (idCollisions > 0)
                Fail("{0} id collisions among {1} items", idCollisions, Utils.Size(idsSeen) + idCollisions);
            return idCollisions == 0;
        }
        else if (checkIdCollisions)
        {
            idGetter(ref id);
            // BUG FIX: count every collision. The original incremented only while
            // idCollisions == 0, so the counter was capped at 1 and the summary
            // message above reported a wrong total.
            if (!Utils.Add(ref idsSeen, id))
                idCollisions++;
        }

        Contracts.Assert(curs1.Position == curs2.Position);

        // FIX: the id comparison is per-row; it was previously re-evaluated once
        // per column inside the loop below, doing colLim redundant comparisons.
        if (idComp != null && !idComp())
        {
            Fail("Different values in ID of row {0}", curs1.Position);
            return Failed();
        }
        for (int col = 0; col < colLim; col++)
        {
            var comp = comps[col];
            if (comp != null && !comp())
            {
                Fail("Different values in column {0} of row {1}", col, curs1.Position);
                return Failed();
            }
        }
    }
}
/// <summary>
/// Returns an id getter that always produces the constant id (Low=0, High=1).
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter() =>
    (ref DataViewRowId uid) => uid = new DataViewRowId(0, 1);
/// <summary>
/// Computes the OPTICS ordering once and caches it: <c>_Results</c> receives one
/// <c>OpticsOrderingResult</c> (ordering id, reachability and core distances) per point,
/// and <c>_reversedMapping</c> maps ordering index back to point index.
/// The whole computation is guarded by <c>_lock</c> and skipped if already done.
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        // Already trained: results are cached.
        if (_Results != null)
        {
            return;
        }
        using (var ch = _host.Start("Starting Optics"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);
            // Caching data.
            ch.Info("Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); i++)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    // NOTE(review): only the low 64 bits of the row id are kept — assumes ids fit in a long; confirm.
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                }
            }
            // Mapping.
            // long: index in the ordering
            // long: index of a point
            var mapping = new long[points.Count];
            var mapprev = new Dictionary<long, long>();
            var distance = (float)_args.epsilon;
            if (distance <= 0)
            {
                // No epsilon given: estimate it from random pairs of points.
                float mind, maxd;
                distance = EstimateDistance(ch, points, out mind, out maxd);
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
            }
            Optics opticsAlgo = new Optics(points, _args.seed);
            //Ordering
            ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
            int nPoints = points.Count;
            // Log progress roughly every 10% (capped at every 1000 points).
            int cyclesBetweenLogging = Math.Min(1000, nPoints / 10);
            int currentIteration = 0;
            Action progressLogger = () =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.None, "Processing {0}/{1}", currentIteration, nPoints);
                }
            };
            OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                distance,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);
            IReadOnlyDictionary<long, long> results = opticsOrdering.orderingMapping;
            var reachabilityDs = opticsOrdering.reachabilityDistances;
            var coreDs = opticsOrdering.coreDistancesCache;
            // NOTE(review): indexing 'results' with the int loop counter assumes its keys are 0..n-1 — confirm.
            for (int i = 0; i < results.Count; ++i)
            {
                var p = points[i]; // NOTE(review): 'p' is unused in this loop.
                mapprev[results[i]] = i;
                mapping[i] = results[i];
            }
            _reversedMapping = mapprev;
            // Cleaning.
            ch.Info(MessageSensitivity.None, "Cleaning.");
            // We replace by the original labels.
            _Results = new OpticsOrderingResult[results.Count];
            for (int i = 0; i < results.Count; ++i)
            {
                long pId = points[i].id;
                float? rd;
                float? cd;
                // Missing reachability/core distances default to +infinity below.
                reachabilityDs.TryGetValue(pId, out rd);
                coreDs.TryGetValue(pId, out cd);
                _Results[i] = new OpticsOrderingResult()
                {
                    // DBScan.NOISE marks unassigned points; they are mapped to -1.
                    id = results[i] != DBScan.NOISE ? results[i] : -1,
                    reachability = (float)rd.GetValueOrDefault(float.PositiveInfinity),
                    core = (float)cd.GetValueOrDefault(float.PositiveInfinity)
                };
            }
            ch.Info(MessageSensitivity.UserData, "Ordered {0} points.", _Results.Count());
            sw.Stop();
            ch.Info(MessageSensitivity.None, "'OpticsOrdering' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Runs DBScan clustering once and caches, per point id, the cluster index and a score
/// in <c>_reversedMapping</c> (noise points get cluster -1 and score +infinity).
/// Guarded by <c>_lock</c>; returns immediately if already trained.
/// </summary>
void TrainTransform()
{
    lock (_lock)
    {
        // Already trained.
        if (_reversedMapping != null)
        {
            return;
        }
        using (var ch = _host.Start("DBScan"))
        {
            var sw = Stopwatch.StartNew();
            sw.Start();
            var points = new List<IPointIdFloat>();
            var index = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);
            // Caching data.
            ch.Info(MessageSensitivity.None, "Caching the data.");
            using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
            {
                var getter = cursor.GetGetter<VBuffer<float>>(index);
                var getterId = cursor.GetIdGetter();
                DataViewRowId id = new DataViewRowId();
                VBuffer<float> tmp = new VBuffer<float>();
                for (int i = 0; cursor.MoveNext(); ++i)
                {
                    getter(ref tmp);
                    getterId(ref id);
                    // NOTE(review): only the low 64 bits of the row id are kept — assumes ids fit in a long; confirm.
                    points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues()));
                }
            }
            // Mapping.
            // int: index of a cluster
            // long: index of a point
            var mapping = new int[points.Count];
            var mapprev = new Dictionary<long, int>();
            float distance = _args.epsilon;
            if (distance <= 0)
            {
                // No epsilon given: estimate it from random pairs of points.
                float mind, maxd;
                distance = EstimateDistance(ch, points, out mind, out maxd);
                ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
            }
            DBScan dbscanAlgo = new DBScan(points, _args.seed);
            // Clustering.
            ch.Info(MessageSensitivity.UserData, "Clustering {0} points.", points.Count);
            int nPoints = points.Count;
            // Log progress roughly every 10% (capped at every 1000 points).
            int cyclesBetweenLogging = Math.Min(1000, nPoints / 10);
            int currentIteration = 0;
            Action<int> progressLogger = nClusters =>
            {
                if (++currentIteration % cyclesBetweenLogging == 0)
                {
                    ch.Info(MessageSensitivity.UserData, "Processing {0}/{1} - NbClusters={2}", currentIteration, nPoints, nClusters);
                }
            };
            Dictionary<long, int> results = dbscanAlgo.Cluster(
                distance,
                _args.minPoints,
                seed: _args.seed,
                onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                onPointProcessing: progressLogger);
            // Cleaning small clusters.
            ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
            var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
            var finalCounts = finalCounts_.ToDictionary(c => c.key, d => d.nb);
            // Clusters smaller than minPoints are relabeled -1 (noise).
            results = results.Select(c => new KeyValuePair<long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                     .ToDictionary(c => c.Key, c => c.Value);
            _reversedMapping = new Dictionary<long, Tuple<int, float>>();
            ch.Info(MessageSensitivity.None, "Compute scores.");
            HashSet<int> clusterIds = new HashSet<int>();
            for (int i = 0; i < results.Count; ++i)
            {
                IPointIdFloat p = points[i];
                int cluster = results[p.id];
                mapprev[p.id] = cluster;
                if (cluster >= 0) // -1 is noise
                {
                    // NOTE(review): this writes mapping[cluster] = cluster, which is overwritten
                    // by the line below when cluster == i and could index out of range if a
                    // cluster id can reach points.Count — confirm whether this line is intended.
                    mapping[cluster] = cluster;
                }
                mapping[i] = cluster;
                if (cluster != DBScan.NOISE)
                {
                    clusterIds.Add(cluster);
                }
            }
            foreach (var p in points)
            {
                if (mapprev[p.id] < 0)
                {
                    continue;
                }
                // NOTE(review): Score is given _args.epsilon, not the possibly estimated
                // 'distance' used for clustering above — confirm this is intended.
                _reversedMapping[p.id] = new Tuple<int, float>(mapprev[p.id], dbscanAlgo.Score(p, _args.epsilon, mapprev));
            }
            // Adding points with no clusters.
            foreach (var p in points)
            {
                if (!_reversedMapping.ContainsKey(p.id))
                {
                    _reversedMapping[p.id] = new Tuple<int, float>(-1, float.PositiveInfinity);
                }
            }
            if (_reversedMapping.Count != points.Count)
            {
                throw ch.Except("Mismatch between the number of points. This means some ids are not unique {0} != {1}.", _reversedMapping.Count, points.Count);
            }
            ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", mapprev.Select(c => c.Value).Where(c => c >= 0).Distinct().Count());
            sw.Stop();
            ch.Info(MessageSensitivity.UserData, "'DBScan' finished in {0}.", sw.Elapsed);
        }
    }
}
/// <summary>
/// Returns an id getter whose low 64 bits mirror <c>_position</c> (high bits zero).
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter()
{
    ValueGetter<DataViewRowId> getter = (ref DataViewRowId value) =>
    {
        value = new DataViewRowId((ulong)_position, 0);
    };
    return getter;
}
/// <summary>
/// Fills the replication cache (one replication count per row id).
/// When no class column is specified, every row gets an unconditional Poisson draw;
/// otherwise the class value is parsed according to the column type and the typed
/// overload performs the conditional draws.
/// </summary>
/// <param name="rand">Optional random generator; created from the configured seed when null.</param>
void LoadCache(Random rand)
{
    if (_cacheReplica != null)
        return; // Already done.

    uint? seed = _args.seed.HasValue ? (uint)_args.seed.Value : (uint?)null;
    rand = rand ?? RandomUtils.Create(seed);

    using (var ch = _host.Start("Resample: fill the cache"))
    {
        var indexClass = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.column, true);
        using (var cursor = _input.GetRowCursor(Schema.Where(c => c.Index == indexClass.Index)))
        {
            if (string.IsNullOrEmpty(_args.column))
            {
                // Unconditional resampling: one Poisson draw per row.
                _cacheReplica = new Dictionary<DataViewRowId, int>();
                var idGetter = cursor.GetIdGetter();
                DataViewRowId rowId = default(DataViewRowId);
                while (cursor.MoveNext())
                {
                    idGetter(ref rowId);
                    _cacheReplica[rowId] = NextPoisson(_args.lambda, rand);
                }
            }
            else
            {
                // Conditional resampling: parse the class value with the column's
                // raw type and dispatch to the generic implementation.
                var type = _input.Schema[indexClass.Index].Type;
                var kind = type.RawKind();
                if (kind == DataKind.Boolean)
                {
                    bool clbool;
                    if (!bool.TryParse(_args.classValue, out clbool))
                        throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                    LoadCache<bool>(rand, cursor, indexClass, clbool, ch);
                }
                else if (kind == DataKind.UInt32)
                {
                    uint cluint;
                    if (!uint.TryParse(_args.classValue, out cluint))
                        throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                    LoadCache<uint>(rand, cursor, indexClass, cluint, ch);
                }
                else if (kind == DataKind.Single)
                {
                    float clfloat;
                    if (!float.TryParse(_args.classValue, out clfloat))
                        throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                    LoadCache<float>(rand, cursor, indexClass, clfloat, ch);
                }
                else if (kind == DataKind.String)
                {
                    var cltext = new ReadOnlyMemory<char>(_args.classValue.ToCharArray());
                    LoadCache<ReadOnlyMemory<char>>(rand, cursor, indexClass, cltext, ch);
                }
                else
                    throw _host.Except("Unsupported type '{0}'", type);
            }
        }
    }
}
/// <summary>
/// Returns an id getter whose low 64 bits mirror <c>_rowIndex</c> (high bits zero).
/// </summary>
public override ValueGetter<DataViewRowId> GetIdGetter()
    => (ref DataViewRowId value) => value = new DataViewRowId((ulong)_rowIndex, 0);