/// <summary>
/// Builds the standardised k-means representation of a single user, using the
/// statistics returned by <c>GetStatistics</c>.
/// </summary>
/// <param name="model">User whose age and interests are standardised.</param>
/// <returns>A <c>KMeansData</c> carrying the user's id and the standardised values.</returns>
/// <exception cref="InvalidOperationException">
/// <c>GetStatistics</c> returned no statistics.
/// </exception>
async Task<KMeansData> StandardiseData(UserModel model)
{
    // TODO (original author): age range, age average | interests: max 1, min 0, average
    KMeansStatistics statistics = await GetStatistics();
    if (statistics == null)
    {
        // GetStatistics falls back to FirstOrDefaultAsync, which yields null when
        // nothing is stored; fail with a clear message instead of a
        // NullReferenceException on the first property access.
        throw new InvalidOperationException(
            "K-means statistics are not available; they must be computed before a user can be standardised.");
    }

    // A range of zero would turn the divisions below into NaN/Infinity, which
    // then poisons every distance computed from this point. Fall back to 1 so
    // the value is only centred, not scaled.
    double ageRange = statistics.AgeRange == 0 ? 1 : statistics.AgeRange;
    double hobbiesRange = statistics.HobbiesRange == 0 ? 1 : statistics.HobbiesRange;

    KMeansData kuser = new KMeansData
    {
        Id = model.Id,
        Hobbies = { ["Age"] = (model.Age - statistics.AvgAge) / ageRange }
    };

    // Mark every interest the user has with 1.
    foreach (var userInterest in model.Interests)
    {
        kuser.Hobbies[userInterest] = 1;
    }

    // Finalise: centre and scale every key that a freshly constructed
    // KMeansData exposes.
    KMeansData sample = new KMeansData();
    foreach (var hobby in sample.Hobbies.Keys)
    {
        kuser.Hobbies[hobby] = (kuser.Hobbies[hobby] - statistics.HobbiesAverages[hobby]) / hobbiesRange;
    }

    return kuser;
}
/// <summary>
/// Finds which stored centroid lies nearest to the given user.
/// </summary>
/// <param name="model">User to standardise and compare against the centroids.</param>
/// <returns>
/// Zero-based position of the nearest centroid in the list read from the
/// "Centroids" collection; 0 when no centroid is strictly closer than
/// <c>Double.MaxValue</c> (including when the list is empty).
/// </returns>
public async Task<int> FindClosestCentroidAsync(UserModel model)
{
    var centroidCollection = _database.GetCollection<KMeansData>("Centroids");
    List<KMeansData> centroids = await centroidCollection.Find(_ => true).ToListAsync();
    KMeansData standardised = await StandardiseData(model);

    int nearestIndex = 0;
    double nearestDistance = Double.MaxValue;

    int index = 0;
    foreach (KMeansData centroid in centroids)
    {
        // Squared Euclidean distance over every key of the standardised user.
        double squaredDistance = 0;
        foreach (var coordinate in standardised.Hobbies)
        {
            double delta = coordinate.Value - centroid.Hobbies[coordinate.Key];
            squaredDistance += Math.Pow(delta, 2);
        }

        // Strict comparison: on a tie the earlier centroid wins.
        if (squaredDistance < nearestDistance)
        {
            nearestDistance = squaredDistance;
            nearestIndex = index;
        }

        index++;
    }

    return nearestIndex;
}
/// <summary>
/// Returns the statistics used to standardise users. They are recomputed and
/// stored when the number of users is a non-zero multiple of 300; otherwise the
/// stored document is read back.
/// </summary>
/// <returns>
/// The statistics, or <c>null</c> when they are not recomputed and the
/// "KMeansStatistics" collection holds no document.
/// </returns>
async Task<KMeansStatistics> GetStatistics()
{
    var users = await GetUsers();

    // testing purposes / update statistics each 300 users to get accurate centroids
    bool refreshDue = users.Count != 0 && users.Count % 300 == 0;
    if (!refreshDue)
    {
        return await _database.GetCollection<KMeansStatistics>("KMeansStatistics")
            .Find(_ => true)
            .FirstOrDefaultAsync();
    }

    KMeansStatistics statistics = new KMeansStatistics();

    // Age range = highest age - lowest age in the "Users" collection. The
    // queries are awaited so the calling thread is not blocked on database I/O.
    var userCollection = _database.GetCollection<UserModel>("Users");
    var oldest = await userCollection.Find(_ => true)
        .Sort(Builders<UserModel>.Sort.Descending("Age"))
        .FirstOrDefaultAsync();
    var youngest = await userCollection.Find(_ => true)
        .Sort(Builders<UserModel>.Sort.Ascending("Age"))
        .FirstOrDefaultAsync();
    statistics.AgeRange = oldest.Age - youngest.Age;

    // Average age.
    double ageSum = 0;
    foreach (var userModel in users)
    {
        ageSum += userModel.Age;
    }
    statistics.AvgAge = ageSum / users.Count;

    // Count how many users have each interest ...
    KMeansData sumOfInterests = new KMeansData();
    foreach (var user in users)
    {
        foreach (var userInterest in user.Interests)
        {
            sumOfInterests.Hobbies[userInterest] += 1;
        }
    }

    // ... and turn the counts into per-interest averages.
    foreach (var interest in sumOfInterests.Hobbies.Keys)
    {
        statistics.HobbiesAverages[interest] = sumOfInterests.Hobbies[interest] / users.Count;
    }

    // NOTE(review): HobbiesRange is never assigned here, so it keeps whatever
    // value KMeansStatistics gives it by default - confirm that default is not 0,
    // because StandardiseData divides by it.

    // Replace the stored statistics with the fresh ones.
    await _database.DropCollectionAsync("KMeansStatistics");
    await _database.GetCollection<KMeansStatistics>("KMeansStatistics").InsertOneAsync(statistics);
    return statistics;
}
/// <summary>
/// Assigns every user returned by <c>GetUsers</c> to the nearest centroid stored
/// in the "Centroids" collection.
/// </summary>
/// <returns>
/// The same user list, with each user's <c>Cluster</c> set to the zero-based
/// position of its nearest centroid (0 when there are no centroids).
/// </returns>
public async Task<List<UserModel>> FindClosestCentroidListAsync()
{
    var centroidCollection = _database.GetCollection<KMeansData>("Centroids");
    List<KMeansData> centroids = await centroidCollection.Find(_ => true).ToListAsync();
    List<UserModel> users = await GetUsers();

    foreach (UserModel user in users)
    {
        // NOTE(review): StandardiseData loads the statistics on every call, so
        // this happens once per user.
        KMeansData standardised = await StandardiseData(user);

        // Single pass over the centroids; the strict comparison keeps the
        // earliest centroid on ties.
        int nearestIndex = 0;
        double nearestDistance = 0;
        for (int c = 0; c < centroids.Count; c++)
        {
            double candidate = SquaredDistance(standardised, centroids[c]);
            if (c == 0 || nearestDistance > candidate)
            {
                nearestDistance = candidate;
                nearestIndex = c;
            }
        }

        user.Cluster = nearestIndex;
    }

    return users;

    // Squared Euclidean distance, iterating the keys of the point.
    double SquaredDistance(KMeansData point, KMeansData centroid)
    {
        double total = 0;
        foreach (var coordinate in point.Hobbies)
        {
            total += Math.Pow(coordinate.Value - centroid.Hobbies[coordinate.Key], 2);
        }
        return total;
    }
}
/// <summary>
/// Inserts one document into the "Clusterized" collection.
/// </summary>
/// <param name="model">Document to insert.</param>
public async Task ClusterisedInsert(KMeansData model)
{
    var clusterized = _database.GetCollection<KMeansData>("Clusterized");
    await clusterized.InsertOneAsync(model);
}
/// <summary>
/// Inserts one document into the "KMeans" collection.
/// </summary>
/// <param name="model">Document to insert.</param>
public async Task KMeansInsert(KMeansData model)
{
    var kMeans = _database.GetCollection<KMeansData>("KMeans");
    await kMeans.InsertOneAsync(model);
}
/// <summary>
/// Clusters the data returned by <c>GetKMeansData</c>: picks initial centroids
/// with the anomalous-pattern pass, refines them with up to 30 k-means cycles,
/// then stores the points (ordered by cluster) via <c>ClusterisedInsert</c> and
/// the centroids via <c>InsertCentroids</c>.
/// </summary>
public async Task Clusterize()
{
    List<KMeansData> data = await GetKMeansData();
    List<KMeansData> centroids = await AnomalousPatterns();

    RunKMeans(centroids, data);
    data.Sort((s1, s2) => s1.Cluster.CompareTo(s2.Cluster));

    foreach (var kMeansData in data)
    {
        await ClusterisedInsert(kMeansData);
    }

    foreach (var centroid in centroids)
    {
        await InsertCentroids(centroid);
    }

    // Picks initial centroids:
    // 1. Find the point furthest from the centre.
    // 2. Assign every remaining point to either the centre or that point.
    // 3. If enough points chose the furthest point, keep it as a centroid and
    //    drop its members; repeat until no points are left.
    async Task<List<KMeansData>> AnomalousPatterns()
    {
        // Threshold taken from the original code.
        const int minimumMembers = 4;

        List<KMeansData> centroidsList = new List<KMeansData>();
        List<KMeansData> tempData = await GetKMeansData();
        KMeansData center = new KMeansData(); // 0,0,0,0,0,0... (per the original comment)
        KMeansData furthest = new KMeansData();

        while (tempData.Count != 0)
        {
            furthest = FindFurthestCluster(tempData, furthest);
            if (!tempData.Remove(furthest))
            {
                // No remaining point could be selected (e.g. every distance is
                // NaN), so the list would never shrink; stop instead of looping
                // forever.
                break;
            }

            tempData = AssignClusters(new List<KMeansData> { center, furthest }, tempData);

            // Cluster 1 is the furthest point, cluster 0 the centre.
            if (tempData.Count(x => x.Cluster == 1) >= minimumMembers)
            {
                centroidsList.Add(furthest);
                tempData.RemoveAll(x => x.Cluster == 1);
            }
        }

        return centroidsList;
    }

    // Alternates assignment and centroid update until nothing changes or the
    // cycle limit is reached. Both lists are modified in place.
    void RunKMeans(List<KMeansData> centroidsList, List<KMeansData> points)
    {
        const int maxCycles = 30;
        for (int cycle = 0; cycle < maxCycles; cycle++)
        {
            AssignClusters(centroidsList, points);
            if (!UpdateCentroids(centroidsList, points))
            {
                break;
            }
        }
    }

    // True when every key of x has an equal value in y.
    bool CompareDictionaries(Dictionary<string, double> x, Dictionary<string, double> y)
    {
        foreach (var xKey in x.Keys)
        {
            if (!x[xKey].Equals(y[xKey]))
            {
                return false;
            }
        }

        return true;
    }

    // Returns the point with the largest squared distance from a freshly
    // constructed KMeansData, or the passed-in fallback when no point has a
    // distance greater than Double.MinValue.
    KMeansData FindFurthestCluster(List<KMeansData> tempData, KMeansData furthest)
    {
        KMeansData center = new KMeansData();
        double furthestDistance = Double.MinValue;

        foreach (var point in tempData)
        {
            double dist = 0;
            foreach (var pointHobby in point.Hobbies)
            {
                dist += Math.Pow(pointHobby.Value - center.Hobbies[pointHobby.Key], 2);
            }

            if (dist > furthestDistance)
            {
                furthestDistance = dist;
                furthest = point;
            }
        }

        return furthest;
    }

    // Sets Cluster on every point to the index of its nearest centroid (squared
    // Euclidean distance, earliest centroid wins ties) and returns the same list.
    List<KMeansData> AssignClusters(List<KMeansData> centroidsList, List<KMeansData> clusters)
    {
        foreach (var cluster in clusters)
        {
            // Find distances between the point and the existing centroids.
            double[] distancesFromCentroids = new double[centroidsList.Count];
            for (int i = 0; i < centroidsList.Count; i++)
            {
                double dist = 0;
                foreach (var clusterHobby in cluster.Hobbies)
                {
                    dist += Math.Pow(clusterHobby.Value - centroidsList[i].Hobbies[clusterHobby.Key], 2);
                }

                distancesFromCentroids[i] = dist;
            }

            // Which is the nearest?
            int centroidWithMin = 0;
            for (int i = centroidWithMin + 1; i < distancesFromCentroids.Length; i++)
            {
                if (distancesFromCentroids[centroidWithMin] > distancesFromCentroids[i])
                {
                    centroidWithMin = i;
                }
            }

            cluster.Cluster = centroidWithMin;
        }

        return clusters;
    }

    // Replaces each centroid with the mean of its members and reports whether
    // any centroid changed.
    bool UpdateCentroids(List<KMeansData> centroidsList, List<KMeansData> clusters)
    {
        bool updated = false;

        for (int i = 0; i < centroidsList.Count; i++)
        {
            List<KMeansData> membersOfCluster = clusters.FindAll(member => member.Cluster == i);
            if (membersOfCluster.Count == 0)
            {
                // Averaging over zero members would divide by zero and turn the
                // centroid into NaN for good; keep the previous centroid.
                continue;
            }

            KMeansData sumOfData = new KMeansData();
            foreach (var member in membersOfCluster)
            {
                foreach (var hobbiesKey in centroidsList[0].Hobbies.Keys)
                {
                    sumOfData.Hobbies[hobbiesKey] += member.Hobbies[hobbiesKey];
                }
            }

            KMeansData averageCluster = new KMeansData();
            foreach (var hobbiesKey in clusters[0].Hobbies.Keys)
            {
                averageCluster.Hobbies[hobbiesKey] = sumOfData.Hobbies[hobbiesKey] / membersOfCluster.Count;
            }

            if (!updated && !CompareDictionaries(centroidsList[i].Hobbies, averageCluster.Hobbies))
            {
                updated = true;
            }

            centroidsList[i] = averageCluster;
        }

        return updated;
    }
}
/// <summary>
/// Standardises every user returned by <c>GetUsers</c> and inserts each result
/// through <c>KMeansInsert</c>. Age becomes (age - average) / (max - min); each
/// interest becomes a 0/1 indicator minus that interest's average.
/// </summary>
public async Task StandardiseData()
{
    var users = await GetUsers();
    if (users.Count == 0)
    {
        // Nothing to standardise or insert; also avoids dividing by a count of 0.
        return;
    }

    // AGE: average and range (max - min).
    double ageSum = 0;
    double maxAge = Double.MinValue, minAge = Double.MaxValue;
    foreach (var user in users)
    {
        if (maxAge < user.Age)
        {
            maxAge = user.Age;
        }

        if (minAge > user.Age)
        {
            minAge = user.Age;
        }

        ageSum += user.Age;
    }

    double avgAge = ageSum / users.Count;
    double range = maxAge - minAge;
    if (range == 0)
    {
        // Every user has the same age: dividing by 0 would store NaN as the
        // standardised age. With a range of 1 the value is simply centred.
        range = 1;
    }

    // INTERESTS: each one is an indicator, so max 1, min 0.
    double interestRange = 1;

    List<KMeansData> data = new List<KMeansData>();
    KMeansData sumOfInterests = new KMeansData();
    KMeansData avgOfInterests = new KMeansData();

    foreach (var user in users)
    {
        KMeansData kuser = new KMeansData
        {
            Id = user.Id,
            Hobbies = { ["Age"] = (user.Age - avgAge) / range }
        };

        foreach (var userInterest in user.Interests)
        {
            kuser.Hobbies[userInterest] = 1;
            sumOfInterests.Hobbies[userInterest] += 1;
        }

        data.Add(kuser);
    }

    // Average for each hobby.
    foreach (var interest in sumOfInterests.Hobbies.Keys)
    {
        avgOfInterests.Hobbies[interest] = sumOfInterests.Hobbies[interest] / users.Count;
    }

    // Finalise the data: centre (and scale) each key, then store the point.
    foreach (var kMeansData in data)
    {
        foreach (var hobby in avgOfInterests.Hobbies.Keys)
        {
            kMeansData.Hobbies[hobby] = (kMeansData.Hobbies[hobby] - avgOfInterests.Hobbies[hobby]) / interestRange;
        }

        await KMeansInsert(kMeansData);
    }
}
/// <summary>
/// Inserts one document into the "Centroids" collection.
/// </summary>
/// <param name="centroids">Centroid document to insert.</param>
public async Task InsertCentroids(KMeansData centroids)
{
    var centroidCollection = _database.GetCollection<KMeansData>("Centroids");
    await centroidCollection.InsertOneAsync(centroids);
}