public override ValueGetter <DataViewRowId> GetIdGetter()
            {
#if (DEBUG)
                // NOTE(review): nothing in this method ever adds to this dictionary, so the
                // duplicate check below cannot fire as written.
                var seenIds = new Dictionary <DataViewRowId, int>();
#endif
                // The ID is read from the input cursor. When _shift > 0 it is shifted left by
                // _shift bits and the copy number is added to the low part.
                var inputIdGetter = _inputCursor.GetIdGetter();
                ValueGetter <DataViewRowId> replicaIdGetter = (ref DataViewRowId pos) =>
                {
                    inputIdGetter(ref pos);
                    if (_shift <= 0)
                    {
                        // No shift: the input ID is returned unchanged.
                        Contracts.Assert(_copy == 0);
                        return;
                    }

                    Contracts.Assert(_copy >= 0 && _copy <= _maxReplica);

                    // Part of Low that survives a left shift by _shift bits, and the top bits that do not.
                    ulong keptLow  = (pos.Low << _shift) >> _shift;
                    ulong overflow = pos.Low - keptLow;

                    ulong shiftedLow  = pos.Low << _shift;
                    // The bits pushed out of Low are carried into the bottom of High.
                    ulong shiftedHigh = (pos.High << _shift) + (overflow >> (64 - _shift));

                    pos = new DataViewRowId(shiftedLow + (ulong)_copy, shiftedHigh);
#if (DEBUG)
                    if (seenIds.ContainsKey(pos))
                    {
                        throw Contracts.Except("Id already taken: {0}", pos);
                    }
#endif
                };
                return replicaIdGetter;
            }
 public override ValueGetter <DataViewRowId> GetIdGetter()
 {
     // The ID is built from Position (first component) and 0 (second component).
     ValueGetter <DataViewRowId> positionIdGetter = (ref DataViewRowId val) =>
     {
         // The getter is only valid while IsGood holds.
         Ch.Check(IsGood, RowCursorUtils.FetchValueStateError);
         val = new DataViewRowId((ulong)Position, 0);
     };
     return positionIdGetter;
 }
Exemple #3
0
 public override ValueGetter <DataViewRowId> GetIdGetter()
 {
     // The ID is built from Position (first component) and 0 (second component).
     ValueGetter <DataViewRowId> positionIdGetter = (ref DataViewRowId val) =>
     {
         // Reject calls made while IsGood is false.
         Ch.Check(IsGood, "Cannot call ID getter in current state");
         val = new DataViewRowId((ulong)Position, 0);
     };
     return positionIdGetter;
 }
        /// <summary>
        /// Builds a delegate that fetches the current ID of both rows and reports whether they are equal.
        /// </summary>
        /// <param name="r1">First row; its ID getter is also returned through <paramref name="idGetter"/>.</param>
        /// <param name="r2">Second row.</param>
        /// <param name="idGetter">Receives the ID getter of <paramref name="r1"/>.</param>
        /// <returns>A delegate returning true when both rows currently expose equal IDs.</returns>
        protected Func <bool> GetIdComparer(DataViewRow r1, DataViewRow r2, out ValueGetter <DataViewRowId> idGetter)
        {
            var leftGetter = r1.GetIdGetter();
            idGetter = leftGetter;
            var rightGetter = r2.GetIdGetter();

            // Buffers captured by the closure and refreshed on every call.
            var leftId  = default(DataViewRowId);
            var rightId = default(DataViewRowId);

            Func <bool> sameId = () =>
            {
                leftGetter(ref leftId);
                rightGetter(ref rightId);
                return leftId.Equals(rightId);
            };
            return sameId;
        }
        /// <summary>
        /// Fills <c>_cacheReplica</c> with a replica count for every row of <paramref name="cur"/>.
        /// Rows whose class value equals <paramref name="valueClass"/> receive a draw from
        /// <c>NextPoisson(_args.lambda, rand)</c>; every other row receives 1.
        /// </summary>
        /// <param name="rand">Random generator passed to <c>NextPoisson</c>.</param>
        /// <param name="cur">Cursor to iterate; it is advanced until exhausted.</param>
        /// <param name="classColumn">Column holding the class value of each row.</param>
        /// <param name="valueClass">Class value selecting the rows to resample.</param>
        /// <param name="ch">Channel used to warn when no row matched <paramref name="valueClass"/>.</param>
        void LoadCache <TClass>(Random rand, DataViewRowCursor cur, DataViewSchema.Column classColumn, TClass valueClass, IChannel ch)
        {
            _cacheReplica = new Dictionary <DataViewRowId, int>();
            var           gid   = cur.GetIdGetter();
            var           gcl   = cur.GetGetter <TClass>(classColumn);
            DataViewRowId did   = default(DataViewRowId);
            TClass        cl    = default(TClass);
            long          nbIn  = 0;
            long          nbOut = 0;

            // ReadOnlyMemory<char>.Equals compares the backing object, offset and length, not the
            // characters, so text classes must be compared by content. Every other type goes through
            // the default equality comparer, which also tolerates null values.
            Func <TClass, bool> isTargetClass;
            if (typeof(TClass) == typeof(ReadOnlyMemory <char>))
            {
                var target = (ReadOnlyMemory <char>)(object)valueClass;
                isTargetClass = v => ((ReadOnlyMemory <char>)(object)v).Span.SequenceEqual(target.Span);
            }
            else
            {
                var comparer = EqualityComparer <TClass>.Default;
                isTargetClass = v => comparer.Equals(v, valueClass);
            }

            while (cur.MoveNext())
            {
                gcl(ref cl);
                gid(ref did);

                int rep;
                if (isTargetClass(cl))
                {
                    rep = NextPoisson(_args.lambda, rand);
                    ++nbIn;
                }
                else
                {
                    rep = 1;
                    ++nbOut;
                }
                _cacheReplica[did] = rep;
            }
            if (nbIn == 0)
            {
                ch.Warning(MessageSensitivity.UserData, "Resample on a condition never happened: nbIn={0} nbOut={1}", nbIn, nbOut);
            }
        }
 public override ValueGetter <DataViewRowId> GetIdGetter()
 {
     // The ID is built from Position (first component) and 0 (second component).
     return (ref DataViewRowId val) => { val = new DataViewRowId((ulong)Position, 0); };
 }
Exemple #7
0
 /// <summary>
 /// Writes an ID built from <c>_position</c> (first component) and 0 (second component) into <paramref name="id"/>.
 /// </summary>
 private void IdGetterImplementation(ref DataViewRowId id)
 {
     id = new DataViewRowId((ulong)_position, 0);
 }
Exemple #8
0
            /// <summary>
            /// Runs the OPTICS ordering once on the features column of <c>_input</c>, then clusters it for
            /// every epsilon and stores one result dictionary and one reverse mapping per epsilon in
            /// <c>_Results</c> and <c>_reversedMapping</c>. Does nothing when <c>_Results</c> is already set.
            /// The whole method runs under <c>_lock</c>.
            /// </summary>
            void TrainTransform()
            {
                lock (_lock)
                {
                    if (_Results != null)
                    {
                        return;
                    }

                    using (var ch = _host.Start("Optics"))
                    {
                        // StartNew returns a stopwatch that is already running.
                        var sw     = Stopwatch.StartNew();
                        var points = new List <IPointIdFloat>();
                        var index  = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

                        // Caching data.
                        ch.Info(MessageSensitivity.None, "Caching the data.");
                        using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
                        {
                            var           getter   = cursor.GetGetter <VBuffer <float> >(index);
                            var           getterId = cursor.GetIdGetter();
                            DataViewRowId id       = new DataViewRowId();

                            VBuffer <float> tmp = new VBuffer <float>();

                            while (cursor.MoveNext())
                            {
                                getter(ref tmp);
                                getterId(ref id);
                                // Only the low part of the row ID is kept as the point identifier.
                                points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                            }
                        }

                        float[] distances = null;
                        if (_args.epsilons == null || _args.epsilons.Count() == 0)
                        {
                            float mind, maxd;
                            distances = new[] { EstimateDistance(ch, points, out mind, out maxd) };
                            ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distances.First(), mind, maxd);
                        }
                        else
                        {
                            distances = _args.epsilonsDouble;
                        }

                        var maxEpsilon = distances.Max();

                        // Built locally and published only once everything succeeded, so that a failure
                        // does not leave a partially filled _Results that later calls would accept.
                        var allResults  = new List <Dictionary <int, ClusteringResult> >();
                        var allMappings = new List <Dictionary <long, int> >();

                        Optics opticsAlgo = new Optics(points, _args.seed);
                        //Ordering
                        ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
                        int nPoints = points.Count;
                        // At least 1: with fewer than 10 points nPoints / 10 is 0 and the modulo below would throw.
                        int    cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
                        int    currentIteration     = 0;
                        Action progressLogger       = () =>
                        {
                            if (++currentIteration % cyclesBetweenLogging == 0)
                            {
                                ch.Info(MessageSensitivity.UserData, "Processing {0}/{1}", currentIteration, nPoints);
                            }
                        };

                        OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                            maxEpsilon,
                            _args.minPoints,
                            seed: _args.seed,
                            onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                            onPointProcessing: progressLogger);

                        // Clustering.
                        foreach (var epsilon in distances)
                        {
                            ch.Info(MessageSensitivity.UserData, "Clustering {0} points using epsilon={1}.", points.Count, epsilon);
                            Dictionary <long, int> results = opticsOrdering.Cluster(epsilon);

                            // Mapping point id -> cluster for this epsilon. A fresh dictionary per epsilon:
                            // sharing one instance would make every entry of the list alias the last run.
                            var mapprev    = new Dictionary <long, int>();
                            var clusterIds = new HashSet <int>();

                            for (int i = 0; i < results.Count; ++i)
                            {
                                var p       = points[i];
                                int cluster = results[p.id];
                                mapprev[p.id] = cluster;
                                if (cluster != DBScan.NOISE)
                                {
                                    clusterIds.Add(cluster);
                                }
                            }

                            allMappings.Add(mapprev);

                            // Cleaning small clusters.
                            ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
                            var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
                            var finalCounts  = finalCounts_.ToDictionary(c => c.key, d => d.nb);
                            results = results.Select(c => new KeyValuePair <long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                                      .ToDictionary(c => c.Key, c => c.Value);

                            // Cleaning.
                            ch.Info(MessageSensitivity.None, "Cleaning.");
                            // We replace by the original labels.
                            // NOTE(review): results is looked up by loop index here, which assumes point ids
                            // are exactly 0..Count-1 - confirm against the ID getter of the input.
                            var runResults = new Dictionary <int, ClusteringResult>();
                            for (int i = 0; i < results.Count; ++i)
                            {
                                int  cluster = results[i];
                                bool isNoise = cluster == DBScan.NOISE;
                                runResults[i] = new ClusteringResult()
                                {
                                    cl    = isNoise ? -1 : cluster,
                                    score = isNoise ? 0f : 1f
                                };
                            }

                            allResults.Add(runResults);
                            // The count below was taken before small clusters were removed.
                            ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", clusterIds.Count);
                        }

                        // Publish; _Results is the guard tested at the top, so it is assigned last.
                        _reversedMapping = allMappings;
                        _Results         = allResults;

                        sw.Stop();
                        ch.Info(MessageSensitivity.UserData, "'Optics' finished in {0}.", sw.Elapsed);
                    }
                }
            }
        /// <summary>
        /// Advances two cursors in lockstep and checks that they expose the same rows: same number of
        /// rows, equal values in every column active on both sides, and optionally equal row IDs.
        /// </summary>
        /// <param name="curs1">Left cursor.</param>
        /// <param name="curs2">Right cursor; must have as many schema columns as <paramref name="curs1"/>.</param>
        /// <param name="exactTypes">Forwarded to <c>TestCommon.EqualTypes</c> when comparing column types.</param>
        /// <param name="exactDoubles">Forwarded to <c>GetColumnComparer</c>.</param>
        /// <param name="checkId">When true, the IDs of both cursors are compared on every row.</param>
        /// <param name="checkIdCollisions">When true, IDs seen more than once are counted and make the check fail.</param>
        /// <returns>True when no difference and no ID collision was found.</returns>
        protected bool CheckSameValues(DataViewRowCursor curs1, DataViewRowCursor curs2, bool exactTypes, bool exactDoubles, bool checkId, bool checkIdCollisions = true)
        {
            Contracts.Assert(curs1.Schema.Count == curs2.Schema.Count);

            // Get the comparison delegates for each column.
            int colLim = curs1.Schema.Count;

            Func <bool>[] comps = new Func <bool> [colLim];
            for (int col = 0; col < colLim; col++)
            {
                var f1 = curs1.IsColumnActive(curs1.Schema[col]);
                var f2 = curs2.IsColumnActive(curs2.Schema[col]);

                // Columns that are not active on both cursors keep a null comparer and are skipped.
                if (f1 && f2)
                {
                    var type1 = curs1.Schema[col].Type;
                    var type2 = curs2.Schema[col].Type;
                    if (!TestCommon.EqualTypes(type1, type2, exactTypes))
                    {
                        Fail($"Different types {type1} and {type2}");
                        return(Failed());
                    }
                    comps[col] = GetColumnComparer(curs1, curs2, col, type1, exactDoubles);
                }
            }
            ValueGetter <DataViewRowId> idGetter = null;
            Func <bool>             idComp       = checkId ? GetIdComparer(curs1, curs2, out idGetter) : null;
            HashSet <DataViewRowId> idsSeen      = null;

            if (checkIdCollisions && idGetter == null)
            {
                idGetter = curs1.GetIdGetter();
            }
            long          idCollisions = 0;
            DataViewRowId id           = default(DataViewRowId);

            for (; ;)
            {
                bool f1 = curs1.MoveNext();
                bool f2 = curs2.MoveNext();
                if (f1 != f2)
                {
                    if (f1)
                    {
                        Fail("Left has more rows at position: {0}", curs1.Position);
                    }
                    else
                    {
                        Fail("Right has more rows at position: {0}", curs2.Position);
                    }
                    return(Failed());
                }

                if (!f1)
                {
                    // Both cursors are exhausted: report collisions, if any.
                    if (idCollisions > 0)
                    {
                        Fail("{0} id collisions among {1} items", idCollisions, Utils.Size(idsSeen) + idCollisions);
                    }
                    return(idCollisions == 0);
                }

                if (checkIdCollisions)
                {
                    idGetter(ref id);
                    if (!Utils.Add(ref idsSeen, id))
                    {
                        // Count every collision so the totals in the final message are accurate.
                        idCollisions++;
                    }
                }

                Contracts.Assert(curs1.Position == curs2.Position);

                for (int col = 0; col < colLim; col++)
                {
                    var comp = comps[col];
                    if (comp != null && !comp())
                    {
                        Fail("Different values in column {0} of row {1}", col, curs1.Position);
                        return(Failed());
                    }
                }

                // The ID comparison does not depend on the column, so it runs once per row
                // (and also when the schema has no column at all).
                if (idComp != null && !idComp())
                {
                    Fail("Different values in ID of row {0}", curs1.Position);
                    return(Failed());
                }
            }
        }
Exemple #10
0
 // Every call yields the same constant ID, built as DataViewRowId(0, 1).
 public override ValueGetter <DataViewRowId> GetIdGetter()
     => (ref DataViewRowId uid) => uid = new DataViewRowId(0, 1);
            /// <summary>
            /// Computes the OPTICS ordering of the features column of <c>_input</c> and stores, per point,
            /// its position in the ordering with its reachability and core distances in <c>_Results</c>,
            /// plus the reverse mapping in <c>_reversedMapping</c>. Does nothing when <c>_Results</c> is
            /// already set. The whole method runs under <c>_lock</c>.
            /// </summary>
            void TrainTransform()
            {
                lock (_lock)
                {
                    if (_Results != null)
                    {
                        return;
                    }

                    using (var ch = _host.Start("Starting Optics"))
                    {
                        // StartNew returns a stopwatch that is already running.
                        var sw     = Stopwatch.StartNew();
                        var points = new List <IPointIdFloat>();
                        var index  = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

                        // Caching data.
                        ch.Info("Caching the data.");
                        using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
                        {
                            var           getter   = cursor.GetGetter <VBuffer <float> >(index);
                            var           getterId = cursor.GetIdGetter();
                            DataViewRowId id       = new DataViewRowId();

                            VBuffer <float> tmp = new VBuffer <float>();

                            while (cursor.MoveNext())
                            {
                                getter(ref tmp);
                                getterId(ref id);
                                // Only the low part of the row ID is kept as the point identifier.
                                points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                            }
                        }

                        // Mapping.
                        // long: index in the ordering
                        // long: index of a point
                        var mapprev = new Dictionary <long, long>();

                        var distance = (float)_args.epsilon;
                        if (distance <= 0)
                        {
                            // No usable epsilon was given: estimate one from the data.
                            float mind, maxd;
                            distance = EstimateDistance(ch, points, out mind, out maxd);
                            ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
                        }

                        Optics opticsAlgo = new Optics(points, _args.seed);
                        //Ordering
                        ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
                        int nPoints = points.Count;
                        // At least 1: with fewer than 10 points nPoints / 10 is 0 and the modulo below would throw.
                        int cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
                        int currentIteration     = 0;

                        Action progressLogger = () =>
                        {
                            if (++currentIteration % cyclesBetweenLogging == 0)
                            {
                                ch.Info(MessageSensitivity.None, "Processing {0}/{1}", currentIteration, nPoints);
                            }
                        };

                        OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                            distance,
                            _args.minPoints,
                            seed: _args.seed,
                            onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                            onPointProcessing: progressLogger);
                        IReadOnlyDictionary <long, long> results = opticsOrdering.orderingMapping;
                        var reachabilityDs = opticsOrdering.reachabilityDistances;
                        var coreDs         = opticsOrdering.coreDistancesCache;

                        // NOTE(review): results is looked up by loop index, which assumes its keys
                        // are exactly 0..Count-1 - confirm against OpticsOrdering.
                        for (int i = 0; i < results.Count; ++i)
                        {
                            mapprev[results[i]] = i;
                        }

                        // Cleaning.
                        ch.Info(MessageSensitivity.None, "Cleaning.");
                        // We replace by the original labels.
                        // Filled locally and published only when complete, so that a failure does not
                        // leave a partially filled _Results that later calls would accept.
                        var ordered = new OpticsOrderingResult[results.Count];

                        for (int i = 0; i < results.Count; ++i)
                        {
                            long  pId = points[i].id;
                            float?rd;
                            float?cd;

                            // Missing or null distances are reported as +infinity.
                            reachabilityDs.TryGetValue(pId, out rd);
                            coreDs.TryGetValue(pId, out cd);

                            ordered[i] = new OpticsOrderingResult()
                            {
                                id           = results[i] != DBScan.NOISE ? results[i] : -1,
                                reachability = (float)rd.GetValueOrDefault(float.PositiveInfinity),
                                core         = (float)cd.GetValueOrDefault(float.PositiveInfinity)
                            };
                        }

                        // Publish; _Results is the guard tested at the top, so it is assigned last.
                        _reversedMapping = mapprev;
                        _Results         = ordered;

                        ch.Info(MessageSensitivity.UserData, "Ordered {0} points.", ordered.Length);
                        sw.Stop();
                        ch.Info(MessageSensitivity.None, "'OpticsOrdering' finished in {0}.", sw.Elapsed);
                    }
                }
            }
Exemple #12
0
            /// <summary>
            /// Runs DBScan on the features column of <c>_input</c> and stores, for every point id, its
            /// cluster and score in <c>_reversedMapping</c>. Points without a cluster get
            /// (-1, +infinity). Does nothing when <c>_reversedMapping</c> is already set.
            /// The whole method runs under <c>_lock</c>.
            /// </summary>
            void TrainTransform()
            {
                lock (_lock)
                {
                    if (_reversedMapping != null)
                    {
                        return;
                    }

                    using (var ch = _host.Start("DBScan"))
                    {
                        // StartNew returns a stopwatch that is already running.
                        var sw     = Stopwatch.StartNew();
                        var points = new List <IPointIdFloat>();
                        var index  = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

                        // Caching data.
                        ch.Info(MessageSensitivity.None, "Caching the data.");
                        using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
                        {
                            var           getter   = cursor.GetGetter <VBuffer <float> >(index);
                            var           getterId = cursor.GetIdGetter();
                            DataViewRowId id       = new DataViewRowId();

                            VBuffer <float> tmp = new VBuffer <float>();

                            while (cursor.MoveNext())
                            {
                                getter(ref tmp);
                                getterId(ref id);
                                // Only the low part of the row ID is kept as the point identifier.
                                points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues()));
                            }
                        }

                        // Mapping.
                        // long: index of a point
                        // int: index of a cluster
                        var mapprev = new Dictionary <long, int>();

                        float distance = _args.epsilon;
                        if (distance <= 0)
                        {
                            // No usable epsilon was given: estimate one from the data.
                            float mind, maxd;
                            distance = EstimateDistance(ch, points, out mind, out maxd);
                            ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
                        }

                        DBScan dbscanAlgo = new DBScan(points, _args.seed);
                        // Clustering.
                        ch.Info(MessageSensitivity.UserData, "Clustering {0} points.", points.Count);

                        int nPoints = points.Count;
                        // At least 1: with fewer than 10 points nPoints / 10 is 0 and the modulo below would throw.
                        int          cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
                        int          currentIteration     = 0;
                        Action <int> progressLogger       = nClusters =>
                        {
                            if (++currentIteration % cyclesBetweenLogging == 0)
                            {
                                ch.Info(MessageSensitivity.UserData, "Processing  {0}/{1} - NbClusters={2}", currentIteration, nPoints, nClusters);
                            }
                        };

                        Dictionary <long, int> results = dbscanAlgo.Cluster(
                            distance,
                            _args.minPoints,
                            seed: _args.seed,
                            onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                            onPointProcessing: progressLogger);

                        // Cleaning small clusters.
                        ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
                        var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
                        var finalCounts  = finalCounts_.ToDictionary(c => c.key, d => d.nb);
                        results = results.Select(c => new KeyValuePair <long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                                  .ToDictionary(c => c.Key, c => c.Value);

                        // Built locally and published only after the consistency check below, so that a
                        // failure does not leave a partial _reversedMapping that later calls would accept.
                        var reversed = new Dictionary <long, Tuple <int, float> >();

                        ch.Info(MessageSensitivity.None, "Compute scores.");
                        for (int i = 0; i < results.Count; ++i)
                        {
                            IPointIdFloat p = points[i];
                            mapprev[p.id] = results[p.id];
                        }
                        foreach (var p in points)
                        {
                            int cluster = mapprev[p.id];
                            if (cluster < 0)  // -1 is noise
                            {
                                continue;
                            }
                            // Score with the radius actually used for clustering: _args.epsilon is not
                            // positive when the distance had to be estimated.
                            reversed[p.id] = new Tuple <int, float>(cluster, dbscanAlgo.Score(p, distance, mapprev));
                        }

                        // Adding points with no clusters.
                        foreach (var p in points)
                        {
                            if (!reversed.ContainsKey(p.id))
                            {
                                reversed[p.id] = new Tuple <int, float>(-1, float.PositiveInfinity);
                            }
                        }

                        if (reversed.Count != points.Count)
                        {
                            throw ch.Except("Mismatch between the number of points. This means some ids are not unique {0} != {1}.", reversed.Count, points.Count);
                        }

                        _reversedMapping = reversed;

                        ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", mapprev.Select(c => c.Value).Where(c => c >= 0).Distinct().Count());
                        sw.Stop();
                        ch.Info(MessageSensitivity.UserData, "'DBScan' finished in {0}.", sw.Elapsed);
                    }
                }
            }
 // The ID is built from _position (first component) and 0 (second component).
 public override ValueGetter <DataViewRowId> GetIdGetter()
     => (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0);
        /// <summary>
        /// Fills <c>_cacheReplica</c> on first use. Without a class column every row gets a draw from
        /// <c>NextPoisson(_args.lambda, rand)</c>; otherwise <c>_args.classValue</c> is parsed according
        /// to the column type and the typed overload decides which rows are resampled.
        /// </summary>
        /// <param name="rand">Random generator; when null one is created from <c>_args.seed</c>.</param>
        void LoadCache(Random rand)
        {
            if (_cacheReplica != null)
            {
                // Already done.
                return;
            }

            uint?useed = _args.seed.HasValue ? (uint)_args.seed.Value : (uint?)null;

            if (rand == null)
            {
                rand = RandomUtils.Create(useed);
            }

            // classValue is a configuration value, not user-facing text: parse numbers with the
            // invariant culture so the result does not depend on the machine's regional settings.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (var ch = _host.Start("Resample: fill the cache"))
            {
                var indexClass = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.column, true);

                // Columns are selected from _input.Schema, the schema indexClass was resolved against
                // and the one the cursor is opened on.
                using (var cur = _input.GetRowCursor(_input.Schema.Where(c => c.Index == indexClass.Index)))
                {
                    if (string.IsNullOrEmpty(_args.column))
                    {
                        _cacheReplica = new Dictionary <DataViewRowId, int>();
                        var           gid = cur.GetIdGetter();
                        DataViewRowId did = default(DataViewRowId);
                        while (cur.MoveNext())
                        {
                            gid(ref did);
                            _cacheReplica[did] = NextPoisson(_args.lambda, rand);
                        }
                    }
                    else
                    {
                        var type = _input.Schema[indexClass.Index].Type;
                        switch (type.RawKind())
                        {
                        case DataKind.Boolean:
                            bool clbool;
                            if (!bool.TryParse(_args.classValue, out clbool))
                            {
                                throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                            }
                            LoadCache <bool>(rand, cur, indexClass, clbool, ch);
                            break;

                        case DataKind.UInt32:
                            uint cluint;
                            if (!uint.TryParse(_args.classValue, System.Globalization.NumberStyles.Integer, invariant, out cluint))
                            {
                                throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                            }
                            LoadCache <uint>(rand, cur, indexClass, cluint, ch);
                            break;

                        case DataKind.Single:
                            float clfloat;
                            // NumberStyles.Float excludes thousands separators, so "1,5" is rejected
                            // instead of being silently read as 15.
                            if (!float.TryParse(_args.classValue, System.Globalization.NumberStyles.Float, invariant, out clfloat))
                            {
                                throw ch.Except("Unable to parse '{0}'.", _args.classValue);
                            }
                            LoadCache <float>(rand, cur, indexClass, clfloat, ch);
                            break;

                        case DataKind.String:
                            var cltext = new ReadOnlyMemory <char>(_args.classValue.ToCharArray());
                            LoadCache <ReadOnlyMemory <char> >(rand, cur, indexClass, cltext, ch);
                            break;

                        default:
                            throw _host.Except("Unsupported type '{0}'", type);
                        }
                    }
                }
            }
        }
Exemple #15
0
 // The ID is built from _rowIndex (first component) and 0 (second component).
 public override ValueGetter <DataViewRowId> GetIdGetter()
     => (ref DataViewRowId value) => value = new DataViewRowId((ulong)_rowIndex, 0);