Esempio n. 1
0
            /// <summary>
            /// Runs the OPTICS clustering once and caches the outcome in <c>_Results</c> and
            /// <c>_reversedMapping</c>. Subsequent calls return immediately once <c>_Results</c> is set.
            /// </summary>
            /// <remarks>
            /// The whole computation happens under <c>_lock</c>. One OPTICS ordering is generated with
            /// the largest requested epsilon and then reused to extract a clustering for every epsilon.
            /// For each epsilon one entry is appended to both cached lists:
            /// a map "point id -> cluster" (labels as returned by <c>Cluster</c>) and a map
            /// "row position -> ClusteringResult" (labels after small clusters were relabelled to -1).
            /// The cached fields are only assigned after every epsilon has been processed, so a failure
            /// in the middle does not leave a partially filled cache behind.
            /// </remarks>
            void TrainTransform()
            {
                lock (_lock)
                {
                    if (_Results != null)
                    {
                        return;
                    }

                    using (var ch = _host.Start("Optics"))
                    {
                        // StartNew() already starts the stopwatch.
                        var sw     = Stopwatch.StartNew();
                        var points = new List<IPointIdFloat>();
                        var index  = SchemaHelper.GetColumnIndexDC(_input.Schema, _args.features);

                        // Caching data.
                        ch.Info(MessageSensitivity.None, "Caching the data.");
                        using (var cursor = _input.GetRowCursor(_input.Schema.Where(c => c.Index == index.Index)))
                        {
                            var getter   = cursor.GetGetter<VBuffer<float>>(index);
                            var getterId = cursor.GetIdGetter();
                            DataViewRowId   id  = new DataViewRowId();
                            VBuffer<float>  tmp = new VBuffer<float>();

                            while (cursor.MoveNext())
                            {
                                getter(ref tmp);
                                getterId(ref id);
                                // NOTE(review): 'tmp' is reused for every row and the Select below is lazy;
                                // this is only safe if PointIdFloat copies the values in its constructor - confirm.
                                points.Add(new PointIdFloat((long)id.Low, tmp.DenseValues().Select(c => (float)c)));
                            }
                        }

                        // Radii to evaluate: either estimated from the data or taken from the arguments.
                        float[] distances = null;
                        if (_args.epsilons == null || _args.epsilons.Count() == 0)
                        {
                            float mind, maxd;
                            distances = new[] { EstimateDistance(ch, points, out mind, out maxd) };
                            ch.Info(MessageSensitivity.UserData, "epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distances.First(), mind, maxd);
                        }
                        else
                        {
                            distances = _args.epsilonsDouble;
                        }

                        var maxEpsilon = distances.Max();

                        // Built locally and published to the fields only once everything succeeded.
                        var allRuns            = new List<Dictionary<int, ClusteringResult>>();
                        var allReversedMapping = new List<Dictionary<long, int>>();

                        Optics opticsAlgo = new Optics(points, _args.seed);
                        //Ordering
                        ch.Info(MessageSensitivity.UserData, "Generating OPTICS ordering for {0} points.", points.Count);
                        int nPoints = points.Count;
                        // At least 1: with fewer than 10 points nPoints / 10 is 0 and the modulo below would
                        // throw DivideByZeroException.
                        int    cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
                        int    currentIteration     = 0;
                        Action progressLogger       = () =>
                        {
                            if (++currentIteration % cyclesBetweenLogging == 0)
                            {
                                ch.Info(MessageSensitivity.UserData, "Processing {0}/{1}", currentIteration, nPoints);
                            }
                        };

                        OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                            maxEpsilon,
                            _args.minPoints,
                            seed: _args.seed,
                            onShuffle: msg => ch.Info(MessageSensitivity.UserData, msg),
                            onPointProcessing: progressLogger);

                        // Clustering.
                        foreach (var epsilon in distances)
                        {
                            ch.Info(MessageSensitivity.UserData, "Clustering {0} points using epsilon={1}.", points.Count, epsilon);
                            Dictionary<long, int> results = opticsOrdering.Cluster(epsilon);

                            // One dictionary per epsilon: sharing a single instance would make every entry
                            // of the reversed mapping list show the labels of the last epsilon only.
                            // long: id of a point, int: index of a cluster
                            var mapprev    = new Dictionary<long, int>();
                            var clusterIds = new HashSet<int>();

                            for (int i = 0; i < results.Count; ++i)
                            {
                                var p       = points[i];
                                int cluster = results[p.id];
                                mapprev[p.id] = cluster;
                                if (cluster != DBScan.NOISE)
                                {
                                    clusterIds.Add(cluster);
                                }
                            }

                            allReversedMapping.Add(mapprev);

                            // Cleaning small clusters.
                            ch.Info(MessageSensitivity.UserData, "Removing clusters with less than {0} points.", _args.minPoints);
                            var finalCounts_ = results.GroupBy(c => c.Value, (key, g) => new { key = key, nb = g.Count() });
                            var finalCounts  = finalCounts_.ToDictionary(c => c.key, d => d.nb);
                            results = results.Select(c => new KeyValuePair<long, int>(c.Key, finalCounts[c.Value] < _args.minPoints ? -1 : c.Value))
                                      .ToDictionary(c => c.Key, c => c.Value);

                            // Cleaning.
                            ch.Info(MessageSensitivity.None, "Cleaning.");
                            // We replace by the original labels.
                            var runResults = new Dictionary<int, ClusteringResult>();
                            for (int i = 0; i < results.Count; ++i)
                            {
                                // 'results' is keyed by point id, not by row position.
                                int  cluster = results[points[i].id];
                                bool isNoise = cluster == DBScan.NOISE;
                                runResults[i] = new ClusteringResult()
                                {
                                    cl    = isNoise ? -1 : cluster,
                                    score = isNoise ? 0f : 1f
                                };
                            }

                            allRuns.Add(runResults);
                            ch.Info(MessageSensitivity.UserData, "Found {0} clusters.", clusterIds.Count);
                        }

                        // Publish; _Results is the "already trained" marker, so it is assigned last.
                        _reversedMapping = allReversedMapping;
                        _Results         = allRuns;

                        sw.Stop();
                        ch.Info(MessageSensitivity.UserData, "'Optics' finished in {0}.", sw.Elapsed);
                    }
                }
            }
            /// <summary>
            /// Computes the OPTICS ordering once and caches the outcome in <c>_Results</c> and
            /// <c>_reversedMapping</c>. Subsequent calls return immediately once <c>_Results</c> is set.
            /// </summary>
            /// <remarks>
            /// The whole computation happens under <c>_lock</c>. The feature column is read into memory,
            /// the radius is taken from <c>_args.epsilon</c> (or estimated when it is not positive), and
            /// for every row an <c>OpticsOrderingResult</c> with its ordering id, reachability distance
            /// and core distance is stored. Missing distances are reported as positive infinity.
            /// An exception is thrown when the feature column cannot be found or when a row id does
            /// not fit into a <c>long</c>.
            /// </remarks>
            void TrainTransform()
            {
                lock (_lock)
                {
                    if (_Results != null)
                    {
                        return;
                    }

                    using (var ch = _host.Start("Starting Optics"))
                    {
                        // StartNew() already starts the stopwatch.
                        var sw     = Stopwatch.StartNew();
                        var points = new List<IPointIdFloat>();

                        int index;
                        if (!_input.Schema.TryGetColumnIndex(_args.features, out index))
                        {
                            // Except() only builds the exception; it has to be thrown explicitly.
                            throw ch.Except("Unable to find column '{0}'", _args.features);
                        }

                        // Caching data.
                        ch.Info("Caching the data.");
                        using (var cursor = _input.GetRowCursor(i => i == index))
                        {
                            var     getter   = cursor.GetGetter<VBuffer<float>>(index);
                            var     getterId = cursor.GetIdGetter();
                            UInt128 id       = new UInt128();

                            VBuffer<float> tmp = new VBuffer<float>();

                            while (cursor.MoveNext())
                            {
                                getter(ref tmp);
                                getterId(ref id);
                                if (id > long.MaxValue)
                                {
                                    // Except() only builds the exception; it has to be thrown explicitly.
                                    throw ch.Except("An id is outside the range for long {0}", id);
                                }
                                // NOTE(review): 'tmp' is reused for every row and the Select below is lazy;
                                // this is only safe if PointIdFloat copies the values in its constructor - confirm.
                                points.Add(new PointIdFloat((long)id, tmp.DenseValues().Select(c => (float)c)));
                            }
                        }

                        var distance = (float)_args.epsilon;
                        if (distance <= 0)
                        {
                            float mind, maxd;
                            distance = EstimateDistance(ch, points, out mind, out maxd);
                            ch.Info("epsilon (=Radius) was estimating on random couples of points: {0} in [{1}, {2}]", distance, mind, maxd);
                        }

                        Optics opticsAlgo = new Optics(points, _args.seed);
                        //Ordering
                        ch.Info("Generating OPTICS ordering for {0} points.", points.Count);
                        int nPoints = points.Count;
                        // At least 1: with fewer than 10 points nPoints / 10 is 0 and the modulo below would
                        // throw DivideByZeroException.
                        int cyclesBetweenLogging = Math.Max(1, Math.Min(1000, nPoints / 10));
                        int currentIteration     = 0;

                        Action progressLogger = () =>
                        {
                            if (++currentIteration % cyclesBetweenLogging == 0)
                            {
                                ch.Info("Processing {0}/{1}", currentIteration, nPoints);
                            }
                        };

                        OpticsOrdering opticsOrdering = opticsAlgo.Ordering(
                            distance,
                            _args.minPoints,
                            seed: _args.seed,
                            onShuffle: msg => ch.Info(msg),
                            onPointProcessing: progressLogger);
                        IReadOnlyDictionary<long, long> results = opticsOrdering.orderingMapping;
                        var reachabilityDs = opticsOrdering.reachabilityDistances;
                        var coreDs         = opticsOrdering.coreDistancesCache;

                        // Reverse mapping: value stored in the ordering mapping -> row position.
                        // NOTE(review): 'results' is looked up by row position i here while the distance
                        // caches below are looked up by point id; this matches only if ids are 0..n-1 - confirm.
                        var mapprev = new Dictionary<long, long>();
                        for (int i = 0; i < results.Count; ++i)
                        {
                            mapprev[results[i]] = i;
                        }

                        // Cleaning.
                        ch.Info("Cleaning.");
                        // We replace by the original labels.
                        // Filled locally and published only once complete.
                        var ordered = new OpticsOrderingResult[results.Count];

                        for (int i = 0; i < ordered.Length; ++i)
                        {
                            long   pId = points[i].id;
                            float? rd;
                            float? cd;

                            // On a miss TryGetValue leaves null, which becomes +infinity below.
                            reachabilityDs.TryGetValue(pId, out rd);
                            coreDs.TryGetValue(pId, out cd);

                            ordered[i] = new OpticsOrderingResult()
                            {
                                id           = results[i] != DBScan.NOISE ? results[i] : -1,
                                reachability = rd.GetValueOrDefault(float.PositiveInfinity),
                                core         = cd.GetValueOrDefault(float.PositiveInfinity)
                            };
                        }

                        // Publish; _Results is the "already trained" marker, so it is assigned last.
                        _reversedMapping = mapprev;
                        _Results         = ordered;

                        ch.Info("Ordered {0} points.", ordered.Length);
                        sw.Stop();
                        ch.Info("'OpticsOrdering' finished in {0}.", sw.Elapsed);
                    }
                }
            }