//private double GetQual()
//{
//    double clustQual = 0;
//    foreach (Centroid centroid in mCentroids)
//    {
//        foreach (int itemIdx in centroid.CurrentItems)
//        {
//            clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
//        }
//    }
//    clustQual /= (double)mDataset.Count;
//    return clustQual;
//}

/// <summary>
/// Slides the clustered dataset forward and re-runs k-means on it: removes the first
/// <paramref name="dequeueN"/> examples from the dataset, appends <paramref name="addList"/>,
/// shifts the item indices held by the centroids accordingly, assigns the appended examples
/// to centroids and then iterates until the quality gain no longer exceeds mEps.
/// </summary>
/// <param name="dequeueN">Number of examples to remove from the front of the dataset.</param>
/// <param name="addList">Examples to append to the dataset.</param>
/// <param name="iter">Receives the number of iterations the main loop performed.</param>
/// <returns>A clustering with one root cluster per centroid.</returns>
/// <exception cref="System.ArgumentNullException">addList is null.</exception>
/// <exception cref="System.ArgumentOutOfRangeException">
/// dequeueN is negative or larger than the current dataset size.
/// </exception>
public ClusteringResult Update(int dequeueN, IEnumerable<SparseVector<double>> addList, ref int iter)
{
    // Validate before touching any state: the first phase below already mutates the centroids,
    // so failing later (in RemoveRange / AddRange) would leave them half-updated.
    if (addList == null)
    {
        throw new System.ArgumentNullException("addList");
    }
    if (dequeueN < 0 || dequeueN > mDataset.Count)
    {
        throw new System.ArgumentOutOfRangeException("dequeueN");
    }

    // update centroid data (1): keep only the items that survive the dequeue and let each
    // centroid update itself while the dataset still uses the old indices
    foreach (CentroidData centroid in mCentroids)
    {
        foreach (int item in centroid.CurrentItems)
        {
            if (item >= dequeueN)
            {
                centroid.Items.Add(item);
            }
        }
        centroid.Update(mDataset);
        centroid.UpdateCentroidLen();
    }

    // update dataset; ofs is the index at which the first appended example lands
    mDataset.RemoveRange(0, dequeueN);
    int ofs = mDataset.Count;
    mDataset.AddRange(addList);

    // update centroid data (2): shift the stored item indices down by dequeueN so that they
    // match the dataset after the removal
    foreach (CentroidData centroid in mCentroids)
    {
        Set<int> itemsOfs = new Set<int>();
        foreach (int item in centroid.CurrentItems)
        {
            itemsOfs.Add(item - dequeueN);
        }
        centroid.CurrentItems.Inner.SetItems(itemsOfs);
        centroid.Items.SetItems(itemsOfs);
    }

    // assign new instances
    mLogger.Info("Update", "Initializing ...");
    // The appended examples are read back from the dataset rather than by enumerating addList
    // a second time; a lazily evaluated addList could otherwise yield different items here
    // than the ones AddRange stored, and the indices would no longer line up.
    for (int i = ofs; i < mDataset.Count; i++)
    {
        AssignToMostSimilarCentroid(mDataset[i], i);
    }
    double bestClustQual = RefreshCentroidsAndGetQuality();
    mLogger.Info("Update", "Quality: {0:0.0000}", bestClustQual);

    // main k-means loop
    iter = 0;
    while (true)
    {
        iter++;
        mLogger.Info("Update", "Iteration {0} ...", iter);
        // assign items to clusters
        // *** OPTIMIZE THIS with GetDotProductSimilarity (see this.Cluster) !!! ***
        for (int i = 0; i < mDataset.Count; i++)
        {
            AssignToMostSimilarCentroid(mDataset[i], i);
        }
        double clustQual = RefreshCentroidsAndGetQuality();
        mLogger.Info("Update", "Quality: {0:0.0000} Diff: {1:0.0000}", clustQual, clustQual - bestClustQual);
        // check if done; written as "not (gain > eps)" so that a NaN quality (empty dataset or
        // NaN dot products) ends the loop instead of spinning forever, since every ordered
        // comparison with NaN is false
        if (!(clustQual - bestClustQual > mEps))
        {
            break;
        }
        bestClustQual = clustQual;
    }

    // save the result
    // NOTE(review): this reads Items, whereas the quality computation reads CurrentItems after
    // CentroidData.Update - confirm that Items is still populated at this point.
    ClusteringResult clustering = new ClusteringResult();
    for (int i = 0; i < mK; i++)
    {
        clustering.AddRoot(new Cluster());
        clustering.Roots.Last.Items.AddRange(mCentroids[i].Items);
    }
    return clustering;
}

// Adds itemIdx to the Items of the centroid whose dot product with example is largest;
// ties are broken by shuffling the tied centroid indices with mRnd.
private void AssignToMostSimilarCentroid(SparseVector<double> example, int itemIdx)
{
    double maxSim = double.MinValue;
    ArrayList<int> candidates = new ArrayList<int>();
    for (int j = 0; j < mK; j++)
    {
        double sim = mCentroids[j].GetDotProduct(example);
        if (sim > maxSim)
        {
            maxSim = sim;
            candidates.Clear();
            candidates.Add(j);
        }
        else if (sim == maxSim) // exact equality is intended: it detects ties
        {
            candidates.Add(j);
        }
    }
    if (candidates.Count > 1)
    {
        candidates.Shuffle(mRnd);
    }
    // candidates is empty only if mK is 0 or every similarity is NaN; the example is then
    // left unassigned
    if (candidates.Count > 0)
    {
        mCentroids[candidates[0]].Items.Add(itemIdx);
    }
}

// Updates every centroid from the dataset, then returns the sum of dot products between each
// centroid and its current items divided by the dataset size (NaN when the dataset is empty).
private double RefreshCentroidsAndGetQuality()
{
    foreach (CentroidData centroid in mCentroids)
    {
        centroid.Update(mDataset);
        centroid.UpdateCentroidLen();
    }
    double clustQual = 0;
    foreach (CentroidData centroid in mCentroids)
    {
        foreach (int itemIdx in centroid.CurrentItems)
        {
            clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
        }
    }
    clustQual /= (double)mDataset.Count;
    return clustQual;
}