/// <summary>
/// Trains the ensemble: builds the dissimilarity measure over <paramref name="dataset"/> and,
/// for each of the <c>ClassifierCount</c> members, draws a sample of centers with replacement
/// and stores the value returned by <c>ComputeBeta</c> for that sample in <c>_sd</c>.
/// </summary>
/// <param name="model">Model describing the features of the instances in <paramref name="dataset"/>.</param>
/// <param name="dataset">Training instances. The sequence is enumerated exactly once.</param>
public void Train(InstanceModel model, IEnumerable<Instance> dataset)
{
    // Materialize once: the data is needed by the dissimilarity, by the sample-size
    // computation and by every sampling round, and the source may be a deferred or
    // one-shot sequence.
    List<Instance> instanceList = dataset.ToList();

    _distance = new EuclideanDissimilarity(instanceList, model);

    var randomSampler = new RandomSamplerWithReplacement<Instance>();
    _sd = new double[ClassifierCount];
    _centers = new IEnumerable<Instance>[ClassifierCount];

    // The sample size does not depend on the classifier index, so compute it once.
    // Either an absolute count or a percentage of the dataset size (integer arithmetic).
    int sampleSize = UseBootstrapSampleCount
        ? BootstrapSampleCount
        : (BootstrapSamplePercent * instanceList.Count / 100);

    for (int i = 0; i < ClassifierCount; i++)
    {
        // Materialize the sample so that the centers stored for this classifier are exactly
        // the ones its beta value was computed from, even if GetSample is lazily evaluated.
        List<Instance> centers = randomSampler.GetSample(instanceList, sampleSize).ToList();
        _centers[i] = centers;
        _sd[i] = ComputeBeta(centers);
    }
}
/// <summary>
/// Creates a Euclidean dissimilarity for the non-class features of <paramref name="model"/>,
/// scanning <paramref name="instances"/> to record the minimum and maximum value observed
/// for every numeric (Double or Integer) feature.
/// </summary>
/// <param name="instances">Instances to scan; each one must reference <paramref name="model"/>.</param>
/// <param name="model">Model that declares the features and the class feature.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="instances"/> or <paramref name="model"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// An instance references a different model, or <paramref name="instances"/> is empty.
/// </exception>
public EuclideanDissimilarity(IEnumerable<Instance> instances, InstanceModel model)
{
    if (instances == null)
    {
        throw new ArgumentNullException(nameof(instances));
    }
    if (model == null)
    {
        throw new ArgumentNullException(nameof(model));
    }

    _instanceModel = model;
    var classFeature = model.ClassFeature();
    _features = _instanceModel.Features.Where(f => f != classFeature).ToList();

    // NOTE(review): the min/max arrays are sized by the number of non-class features but are
    // indexed by feature.Index, and the final loop pairs array slot i with _features[i].
    // This presumably relies on the class feature being the last feature - confirm.
    int instanceCount = 0;
    foreach (var instance in instances)
    {
        if (_instanceModel != instance.Model)
        {
            throw new ArgumentOutOfRangeException(nameof(model), model,
                $"Unable to instantiate {nameof(EuclideanDissimilarity)}: Object found with invalid {nameof(InstanceModel)}.");
        }

        if (_minFeatureValues == null)
        {
            // First instance: allocate the range arrays and seed them.
            _minFeatureValues = new double[_features.Count];
            _maxFeatureValues = new double[_features.Count];
            foreach (var feature in _features)
            {
                double value = instance[feature];
                bool isNumeric = feature.FeatureType == FeatureType.Double
                                 || feature.FeatureType == FeatureType.Integer;

                if (Double.IsNaN(value) || feature.FeatureType == FeatureType.Nominal)
                {
                    // Missing values and nominal features have no numeric range yet.
                    _minFeatureValues[feature.Index] = Double.NaN;
                    _maxFeatureValues[feature.Index] = Double.NaN;
                }
                else if (isNumeric)
                {
                    _minFeatureValues[feature.Index] = value;
                    _maxFeatureValues[feature.Index] = value;
                }
                // Any other feature type keeps the array default (0), as before.
            }
        }
        else
        {
            // Subsequent instances: widen the range of numeric features with known values.
            foreach (var feature in _features)
            {
                double value = instance[feature];
                bool isNumeric = feature.FeatureType == FeatureType.Double
                                 || feature.FeatureType == FeatureType.Integer;
                if (Double.IsNaN(value) || !isNumeric)
                {
                    continue;
                }

                // A NaN bound means no value has been seen so far for this feature.
                if (Double.IsNaN(_minFeatureValues[feature.Index]) || value < _minFeatureValues[feature.Index])
                {
                    _minFeatureValues[feature.Index] = value;
                }
                if (Double.IsNaN(_maxFeatureValues[feature.Index]) || value > _maxFeatureValues[feature.Index])
                {
                    _maxFeatureValues[feature.Index] = value;
                }
            }
        }

        instanceCount++;
    }

    if (instanceCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(instances), instances,
            $"Unable to instantiate {nameof(EuclideanDissimilarity)}: empty vector collection.");
    }

    // Pre-compute the range of every feature and count the features that can contribute to
    // a distance: those with a known numeric range, plus every nominal feature.
    _maxLessMin = new double[_minFeatureValues.Length];
    double validFeaturesCount = 0;
    for (int i = 0; i < _minFeatureValues.Length; i++)
    {
        _maxLessMin[i] = _maxFeatureValues[i] - _minFeatureValues[i];
        if (!Double.IsNaN(_maxLessMin[i]) || _features[i].FeatureType == FeatureType.Nominal)
        {
            validFeaturesCount++;
        }
    }

    _maxDissimilarity = Math.Sqrt(validFeaturesCount);
}
/// <summary>
/// Clusters <paramref name="instances"/> into <c>ClusterCount</c> groups. Patterns are mined
/// with an <c>UnsupervisedRandomForestMiner</c>, a co-occurrence matrix counts how often two
/// instances are matched by the same pattern, <c>KMeans</c> clusters on that matrix, and each
/// pattern is then assigned to the cluster that contains most of the instances it matches.
/// </summary>
/// <param name="model">
/// Model of the instances. While the method runs, its class feature is replaced by a
/// single-valued placeholder; if the model has no class feature, one named "class" is appended.
/// </param>
/// <param name="instances">
/// Instances to cluster. Their class values are set to 0 during the call and restored afterwards.
/// </param>
/// <param name="selectedPatterns">Patterns returned by <c>FilterPatterns</c> for the clusters found.</param>
/// <returns>The clusters produced by <c>KMeans</c>.</returns>
/// <exception cref="System.ArgumentException">
/// The model has a class feature that is not a <c>NominalFeature</c>.
/// </exception>
public List<List<Instance>> FindClusters(InstanceModel model, List<Instance> instances, out List<IEmergingPattern> selectedPatterns)
{
    NominalFeature classFeature;
    FeatureInformation backupFeatureInformation = null;
    string[] backupClassValues = null;
    double[] backupClassByInstance = null;

    var existingClassFeature = model.ClassFeature();
    bool isClassPresent = existingClassFeature != null;

    if (!isClassPresent)
    {
        // No class feature: append a synthetic one at the end of the model's features.
        // NOTE(review): the appended feature is left in model.Features after the call, as in
        // the previous implementation - confirm this is intended.
        classFeature = new NominalFeature("class", model.Features.Length);
        var originalFeatures = model.Features;
        var extendedFeatures = new Feature[originalFeatures.Length + 1];
        for (int i = 0; i < originalFeatures.Length; i++)
        {
            extendedFeatures[i] = originalFeatures[i];
        }
        extendedFeatures[originalFeatures.Length] = classFeature;
        model.Features = extendedFeatures;
    }
    else
    {
        classFeature = existingClassFeature as NominalFeature;
        if (classFeature == null)
        {
            // Fully qualified so this block does not depend on a 'using System;' directive.
            throw new System.ArgumentException("The class feature of the model must be a NominalFeature.", "model");
        }

        // Back up everything that is overwritten below so it can be restored afterwards.
        backupFeatureInformation = classFeature.FeatureInformation;
        backupClassValues = classFeature.Values;
        backupClassByInstance = new double[instances.Count];
        for (int i = 0; i < instances.Count; i++)
        {
            backupClassByInstance[i] = instances[i][classFeature];
            instances[i][classFeature] = 0;
        }
    }

    List<List<Instance>> clusterList;
    try
    {
        // Placeholder class description with a single "Unknown" value.
        classFeature.FeatureInformation = new NominalFeatureInformation()
        {
            Distribution = new double[] { 1, 1, 1, 1, 1 },
            Ratio = new double[] { 1, 1, 1, 1, 1 },
            ValueProbability = new double[] { 1, 1, 1, 1, 1 }
        };
        classFeature.Values = new string[1] { "Unknown" };

        var miner = new UnsupervisedRandomForestMiner()
        {
            ClusterCount = ClusterCount,
            TreeCount = 100
        };

        // Materialize the result: it is traversed twice below and its pattern objects are
        // used as dictionary keys, so both passes must see the same objects.
        var patterns = miner.Mine(model, instances, classFeature).ToList();

        var instIdx = new Dictionary<Instance, int>();
        for (int i = 0; i < instances.Count; i++)
        {
            instIdx.Add(instances[i], i);
        }

        // similarityMatrix[a, b] counts the patterns that match both a and b.
        // The extra last column accumulates the total of each row.
        int[,] similarityMatrix = new int[instances.Count, instances.Count + 1];
        var coverSetByPattern = new Dictionary<IEmergingPattern, HashSet<Instance>>();
        foreach (var pattern in patterns)
        {
            if (pattern == null)
            {
                continue;
            }

            var matchedIndexes = new List<int>();
            var matchedInstances = new HashSet<Instance>();
            for (int i = 0; i < instances.Count; i++)
            {
                if (pattern.IsMatch(instances[i]))
                {
                    matchedIndexes.Add(i);
                    matchedInstances.Add(instances[i]);
                }
            }

            foreach (int row in matchedIndexes)
            {
                foreach (int column in matchedIndexes)
                {
                    similarityMatrix[row, column] += 1;
                }
                // Same total as incrementing once per matched column.
                similarityMatrix[row, instances.Count] += matchedIndexes.Count;
            }

            coverSetByPattern.Add(pattern, matchedInstances);
        }

        var kmeans = new KMeans()
        {
            K = ClusterCount,
            classFeature = classFeature,
            similarityMatrix = similarityMatrix,
            instIdx = instIdx
        };
        clusterList = kmeans.FindClusters(instances);

        var patternClusterList = new List<List<IEmergingPattern>>();
        for (int i = 0; i < ClusterCount; i++)
        {
            patternClusterList.Add(new List<IEmergingPattern>());
        }

        // Assign every pattern to the cluster holding most of the instances it matches.
        foreach (var pattern in patterns)
        {
            if (pattern == null)
            {
                continue;
            }

            var bestIdx = 0;
            var maxCoverCount = int.MinValue;
            pattern.Supports = new double[ClusterCount];
            pattern.Counts = new double[ClusterCount];
            HashSet<Instance> bestCover = null;

            for (int i = 0; i < ClusterCount; i++)
            {
                var currentCover = new HashSet<Instance>(coverSetByPattern[pattern].Intersect(clusterList[i]));
                var currentCoverCount = currentCover.Count;
                int clusterSize = clusterList[i].Count;

                pattern.Counts[i] = currentCoverCount;
                // An empty cluster supports nothing; avoid 0/0 producing NaN.
                pattern.Supports[i] = clusterSize == 0 ? 0.0 : 1.0 * currentCoverCount / clusterSize;

                if (currentCoverCount > maxCoverCount)
                {
                    maxCoverCount = currentCoverCount;
                    bestIdx = i;
                    bestCover = currentCover;
                }
            }

            // Keep only the part of the cover that lies inside the chosen cluster.
            coverSetByPattern[pattern] = bestCover;
            patternClusterList[bestIdx].Add(pattern);
        }

        selectedPatterns = FilterPatterns(instances, patternClusterList);
    }
    finally
    {
        // Restore the caller's class information even when mining or clustering fails,
        // so an exception does not leave the instances with their class values wiped.
        if (isClassPresent)
        {
            classFeature.FeatureInformation = backupFeatureInformation;
            classFeature.Values = backupClassValues;
            for (int i = 0; i < instances.Count; i++)
            {
                instances[i][classFeature] = backupClassByInstance[i];
            }
        }
    }

    return clusterList;
}