예제 #1
0
        /// <summary>
        /// Trains the ensemble: builds the dissimilarity measure over <paramref name="dataset"/> and, for each of the
        /// <c>ClassifierCount</c> members, draws a random sample (with replacement) of centers and stores the value
        /// returned by <c>ComputeBeta</c> for that sample.
        /// </summary>
        /// <param name="model">Model shared by every instance in <paramref name="dataset"/>.</param>
        /// <param name="dataset">Training instances; enumerated exactly once.</param>
        public void Train(InstanceModel model, IEnumerable <Instance> dataset)
        {
            // Materialize once: the source may be a lazy query, and it is needed for the
            // dissimilarity measure, the sample-size computation and every sampling round.
            var instanceList = dataset.ToList();

            _distance = new EuclideanDissimilarity(instanceList, model);

            var randomSampler = new RandomSamplerWithReplacement <Instance>();

            _sd      = new double[ClassifierCount];
            _centers = new IEnumerable <Instance> [ClassifierCount];

            // The sample size does not depend on the classifier index, so compute it once.
            int sampleSize = UseBootstrapSampleCount
                ? BootstrapSampleCount : (BootstrapSamplePercent * instanceList.Count / 100);

            for (int i = 0; i < ClassifierCount; i++)
            {
                // Freeze the sample: if the sampler returns a deferred sequence, every later
                // enumeration of _centers[i] must see the same instances that _sd[i] was computed from.
                List <Instance> sample = randomSampler.GetSample(instanceList, sampleSize).ToList();

                _centers[i] = sample;

                // ComputeBeta receives its own copy, as before, so it cannot disturb the stored centers.
                _sd[i] = ComputeBeta(new List <Instance>(sample));
            }
        }
예제 #2
0
        /// <summary>
        /// Creates a Euclidean dissimilarity for <paramref name="model"/>, scanning <paramref name="instances"/> to
        /// record the minimum and maximum observed value of every numeric (Double/Integer) non-class feature.
        /// The maximum dissimilarity is the square root of the number of features that are either nominal or
        /// have a non-NaN value range.
        /// </summary>
        /// <param name="instances">Instances used to derive the per-feature value ranges; must not be empty.</param>
        /// <param name="model">Model that every instance must belong to.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="model"/> or <paramref name="instances"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// An instance belongs to a different model, or <paramref name="instances"/> is empty.
        /// </exception>
        public EuclideanDissimilarity(IEnumerable <Instance> instances, InstanceModel model)
        {
            if (model == null)
            {
                throw new ArgumentNullException(nameof(model));
            }

            if (instances == null)
            {
                throw new ArgumentNullException(nameof(instances));
            }

            _instanceModel = model;
            var classFeature = model.ClassFeature();

            _features = _instanceModel.Features.Where(f => f != classFeature).ToList();

            int instanceCount = 0;

            foreach (var instance in instances)
            {
                if (_instanceModel != instance.Model)
                {
                    throw new ArgumentOutOfRangeException(nameof(model), model, $"Unable to instantiate {nameof(EuclideanDissimilarity)}: Object found with invalid {nameof(InstanceModel)}.");
                }

                if (instanceCount == 0)
                {
                    // First instance: allocate the range arrays and seed them.
                    // NOTE(review): the arrays hold _features.Count slots but are addressed by feature.Index;
                    // this presumably relies on the class feature having the highest index - confirm.
                    _minFeatureValues = new double[_features.Count];
                    _maxFeatureValues = new double[_features.Count];

                    foreach (var feature in _features)
                    {
                        double value     = instance[feature];
                        bool   isNumeric = feature.FeatureType == FeatureType.Double || feature.FeatureType == FeatureType.Integer;

                        if (double.IsNaN(value))
                        {
                            // Missing value: range stays unknown until a later instance provides one.
                            _minFeatureValues[feature.Index] = double.NaN;
                            _maxFeatureValues[feature.Index] = double.NaN;
                        }
                        else if (isNumeric)
                        {
                            _minFeatureValues[feature.Index] = value;
                            _maxFeatureValues[feature.Index] = value;
                        }
                        else if (feature.FeatureType == FeatureType.Nominal)
                        {
                            // Nominal features have no numeric range.
                            _minFeatureValues[feature.Index] = double.NaN;
                            _maxFeatureValues[feature.Index] = double.NaN;
                        }
                    }
                }
                else
                {
                    // Later instances only widen the range of numeric features.
                    foreach (var feature in _features)
                    {
                        double value = instance[feature];
                        if (double.IsNaN(value))
                        {
                            continue;
                        }

                        if (feature.FeatureType != FeatureType.Double && feature.FeatureType != FeatureType.Integer)
                        {
                            continue;
                        }

                        if (double.IsNaN(_minFeatureValues[feature.Index]) || value < _minFeatureValues[feature.Index])
                        {
                            _minFeatureValues[feature.Index] = value;
                        }

                        if (double.IsNaN(_maxFeatureValues[feature.Index]) || value > _maxFeatureValues[feature.Index])
                        {
                            _maxFeatureValues[feature.Index] = value;
                        }
                    }
                }

                instanceCount++;
            }

            if (instanceCount < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(instances), instances, $"Unable to instantiate {nameof(EuclideanDissimilarity)}: empty vector collection.");
            }

            _maxLessMin = new double[_minFeatureValues.Length];
            double validFeaturesCount = 0;

            for (int i = 0; i < _minFeatureValues.Length; i++)
            {
                _maxLessMin[i] = _maxFeatureValues[i] - _minFeatureValues[i];

                // A feature counts when it has a known numeric range or is nominal.
                if (!double.IsNaN(_maxLessMin[i]) || (_features[i].FeatureType == FeatureType.Nominal))
                {
                    validFeaturesCount++;
                }
            }

            _maxDissimilarity = Math.Sqrt(validFeaturesCount);
        }
예제 #3
0
        /// <summary>
        /// Clusters <paramref name="instances"/> by mining patterns with an <c>UnsupervisedRandomForestMiner</c>,
        /// building a co-occurrence matrix from the instances each pattern matches, and running <c>KMeans</c>
        /// with <c>ClusterCount</c> clusters on it. Every non-null pattern is then assigned to the cluster it
        /// covers the most instances of, and the patterns are filtered through <c>FilterPatterns</c>.
        /// </summary>
        /// <param name="model">
        /// Model of the instances. If it has no class feature, a nominal feature named "class" is appended to
        /// <c>model.Features</c>; if it has one, it must be a <c>NominalFeature</c>.
        /// </param>
        /// <param name="instances">
        /// Instances to cluster. When the model has a class feature, its value is temporarily set to 0 on every
        /// instance and restored before the method exits, including when an exception is thrown.
        /// </param>
        /// <param name="selectedPatterns">Receives the patterns returned by <c>FilterPatterns</c>.</param>
        /// <returns>The clusters produced by <c>KMeans</c>.</returns>
        /// <exception cref="System.ArgumentException">The model's class feature is not nominal.</exception>
        public List <List <Instance> > FindClusters(InstanceModel model, List <Instance> instances, out List <IEmergingPattern> selectedPatterns)
        {
            // Number of trees grown by the unsupervised miner.
            const int treeCount = 100;

            var  existingClassFeature = model.ClassFeature();
            bool isClassPresent       = existingClassFeature != null;

            NominalFeature     classFeature;
            FeatureInformation backupFeatureInformation = null;
            string[]           backupClassValues        = null;
            double[]           backupClassByInstance    = null;

            if (!isClassPresent)
            {
                // No class feature: append a synthetic one at the end of the feature array.
                // NOTE(review): the extended feature array is not reverted afterwards - confirm this is intended.
                classFeature = new NominalFeature("class", model.Features.Length);
                var backupFeatures = model.Features;
                model.Features = new Feature[backupFeatures.Length + 1];
                for (int i = 0; i < backupFeatures.Length; i++)
                {
                    model.Features[i] = backupFeatures[i];
                }

                model.Features[backupFeatures.Length] = classFeature;
            }
            else
            {
                classFeature = existingClassFeature as NominalFeature;
                if (classFeature == null)
                {
                    throw new System.ArgumentException("The class feature of the model must be a nominal feature.", nameof(model));
                }

                // Snapshot everything that is overwritten below so it can be restored on exit.
                backupFeatureInformation = classFeature.FeatureInformation;
                backupClassValues        = classFeature.Values;

                backupClassByInstance = new double[instances.Count];
                for (int i = 0; i < instances.Count; i++)
                {
                    backupClassByInstance[i] = instances[i][classFeature];
                }
            }

            try
            {
                if (isClassPresent)
                {
                    // Collapse every instance onto a single dummy class value for unsupervised mining.
                    for (int i = 0; i < instances.Count; i++)
                    {
                        instances[i][classFeature] = 0;
                    }
                }

                // NOTE(review): five placeholder entries are supplied although only one class value
                // ("Unknown") is declared - confirm what the miner expects.
                classFeature.FeatureInformation = new NominalFeatureInformation()
                {
                    Distribution     = new double[] { 1, 1, 1, 1, 1 },
                    Ratio            = new double[] { 1, 1, 1, 1, 1 },
                    ValueProbability = new double[] { 1, 1, 1, 1, 1 }
                };

                classFeature.Values = new string[1] {
                    "Unknown"
                };

                var miner = new UnsupervisedRandomForestMiner()
                {
                    ClusterCount = ClusterCount, TreeCount = treeCount
                };

                var patterns = miner.Mine(model, instances, classFeature);

                var instIdx = new Dictionary <Instance, int>();

                for (int i = 0; i < instances.Count; i++)
                {
                    instIdx.Add(instances[i], i);
                }

                // Cell [a, b] counts the patterns matching both instance a and instance b;
                // the extra last column accumulates the total of each row.
                int[,] similarityMatrix = new int[instances.Count, instances.Count + 1];
                var coverSetByPattern = new Dictionary <IEmergingPattern, HashSet <Instance> >();

                foreach (var pattern in patterns)
                {
                    if (pattern == null)
                    {
                        continue;
                    }

                    var matchedIndices = new List <int>();
                    var coverSet       = new HashSet <Instance>();
                    for (int i = 0; i < instances.Count; i++)
                    {
                        if (pattern.IsMatch(instances[i]))
                        {
                            matchedIndices.Add(i);
                            coverSet.Add(instances[i]);
                        }
                    }

                    foreach (int row in matchedIndices)
                    {
                        foreach (int column in matchedIndices)
                        {
                            similarityMatrix[row, column] += 1;
                        }

                        // One increment per matched column, applied in a single step.
                        similarityMatrix[row, instances.Count] += matchedIndices.Count;
                    }

                    coverSetByPattern.Add(pattern, coverSet);
                }

                var kmeans = new KMeans()
                {
                    K = ClusterCount, classFeature = classFeature, similarityMatrix = similarityMatrix, instIdx = instIdx
                };
                var clusterList = kmeans.FindClusters(instances);

                var patternClusterList = new List <List <IEmergingPattern> >();

                for (int i = 0; i < ClusterCount; i++)
                {
                    patternClusterList.Add(new List <IEmergingPattern>());
                }

                foreach (var pattern in patterns)
                {
                    if (pattern == null)
                    {
                        continue;
                    }

                    var coverSet      = coverSetByPattern[pattern];
                    var bestIdx       = 0;
                    var maxCoverCount = int.MinValue;
                    pattern.Supports = new double[ClusterCount];
                    pattern.Counts   = new double[ClusterCount];

                    for (int i = 0; i < ClusterCount; i++)
                    {
                        var cluster = clusterList[i];

                        // Intersect yields distinct elements, so this is the number of
                        // distinct covered instances that fall in the cluster.
                        int coverCount = coverSet.Intersect(cluster).Count();

                        pattern.Counts[i] = coverCount;

                        // An empty cluster has no support rather than NaN (0/0).
                        pattern.Supports[i] = cluster.Count > 0 ? 1.0 * coverCount / cluster.Count : 0.0;

                        if (coverCount > maxCoverCount)
                        {
                            maxCoverCount = coverCount;
                            bestIdx       = i;
                        }
                    }

                    patternClusterList[bestIdx].Add(pattern);
                }

                selectedPatterns = FilterPatterns(instances, patternClusterList);

                return(clusterList);
            }
            finally
            {
                // Put the caller's class information back, whether or not clustering succeeded.
                if (isClassPresent)
                {
                    classFeature.FeatureInformation = backupFeatureInformation;
                    classFeature.Values             = backupClassValues;
                    for (int i = 0; i < instances.Count; i++)
                    {
                        instances[i][classFeature] = backupClassByInstance[i];
                    }
                }
            }
        }