/// <summary>
/// Calculate the Jaccard similarity (size of the intersection divided by size of the union)
/// of two category lists.
/// </summary>
/// <param name="categorySet1">Set 1. It is only read, never modified.</param>
/// <param name="categorySet2">Set 2. It is only read, never modified.</param>
/// <param name="jaccardType">
/// Jaccard similarity type. With <c>DuplicationInvalid</c> repeated values are collapsed, so both
/// inputs are compared as plain sets. With <c>DuplicationValid</c> repeated values count, so both
/// inputs are compared as multisets (a value occurring twice in both lists contributes two matches).
/// </param>
/// <returns>
/// intersection / union for the two inputs. Two inputs that are both empty are treated as
/// identical and give 1.0 instead of the NaN a 0/0 division would produce.
/// Any other <paramref name="jaccardType"/> value gives <see cref="double.NaN"/>.
/// </returns>
private static double CalJacSimilarity(List<int> categorySet1, List<int> categorySet2, JaccardType jaccardType)
{
    int intersectionSize = 0;
    int unionSize;

    if (jaccardType == JaccardType.DuplicationInvalid)
    {
        // Repeated values are not allowed: collapse each input to its distinct values.
        HashSet<int> distinct1 = new HashSet<int>(categorySet1);
        HashSet<int> distinct2 = new HashSet<int>(categorySet2);

        foreach (int category in distinct1)
        {
            if (distinct2.Contains(category))
            {
                intersectionSize++;
            }
        }

        unionSize = distinct1.Count + distinct2.Count - intersectionSize;
    }
    else if (jaccardType == JaccardType.DuplicationValid)
    {
        // Repeated values are allowed: count how many copies of each value set 2 still has
        // available, and let every element of set 1 consume at most one matching copy.
        Dictionary<int, int> remainingInSet2 = new Dictionary<int, int>();
        foreach (int category in categorySet2)
        {
            remainingInSet2.TryGetValue(category, out int seen);
            remainingInSet2[category] = seen + 1;
        }

        foreach (int category in categorySet1)
        {
            if (remainingInSet2.TryGetValue(category, out int available) && available > 0)
            {
                remainingInSet2[category] = available - 1;
                intersectionSize++;
            }
        }

        // |A ∪ B| = |A| + |B| - |A ∩ B| (with multiplicities).
        unionSize = categorySet1.Count + categorySet2.Count - intersectionSize;
    }
    else
    {
        // Neither known mode selected: nothing is compared, so there is no meaningful ratio.
        return double.NaN;
    }

    if (unionSize == 0)
    {
        // Both inputs are empty; avoid 0/0 and report them as identical.
        return 1.0;
    }

    return (double)intersectionSize / unionSize;
}
/// <summary>
/// Generate the initial cluster centers.
/// The first center is a randomly chosen record. Every further center is the record whose
/// Jaccard similarity to the current "bray center" (the value ReCalBrayCenter returns for all
/// records picked so far, repeats included) is the lowest.
/// </summary>
/// <param name="dataSet">The unclustered dataset, each line with form like "id,item1,item2,....."</param>
/// <param name="classAmount">The amount of clusters that you want it to be</param>
/// <param name="jaccardType">Jaccard similarity type handed through to CalJacSimilarity</param>
/// <returns>
/// The chosen center records. Because seeding also continues while only one center exists,
/// the list holds at least two entries even when <paramref name="classAmount"/> is 1 or less.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="dataSet"/> is empty.</exception>
private static List<string> SetInitCenters(List<string> dataSet, int classAmount, JaccardType jaccardType)
{
    if (dataSet == null)
    {
        throw new ArgumentNullException(nameof(dataSet));
    }

    if (dataSet.Count == 0)
    {
        // Previously surfaced as a DivideByZeroException from the random index computation.
        throw new ArgumentException("The data set must contain at least one record.", nameof(dataSet));
    }

    // Turns "id,item1,item2,..." into its item numbers; the leading id field is skipped.
    List<int> ParseItems(string line)
    {
        string[] fields = line.Split(",");
        List<int> parsed = new List<int>();
        for (int j = 1; j < fields.Length; j++)
        {
            parsed.Add(Convert.ToInt32(fields[j]));
        }

        return parsed;
    }

    // The records never change while seeding, so parse each of them once up front
    // instead of once per seeding round.
    List<List<int>> itemSets = new List<List<int>>(dataSet.Count);
    foreach (string item in dataSet)
    {
        itemSets.Add(ParseItems(item));
    }

    List<string> centers = new List<string>();
    // Every pick, including repeats; this is what the bray center is recomputed from.
    List<string> centersCopy = new List<string>();

    Random rdm = new Random();
    centers.Add(dataSet[rdm.Next(dataSet.Count)]);
    centersCopy.Add(centers[0]);

    // NOTE(review): when the picked record is already a center only centersCopy grows, so
    // termination relies on ReCalBrayCenter eventually steering towards a new record - confirm.
    while (centers.Count < classAmount || centers.Count == 1)
    {
        List<int> brayCenterSet = ParseItems(ReCalBrayCenter(centersCopy));

        // Only records strictly less similar than 1.0 are candidates; on ties the earliest wins.
        // NOTE(review): if no record qualifies, the empty string is recorded as the pick,
        // exactly as before - confirm that this is intended.
        string minJacSimItem = "";
        double minJacSimValue = 1.0;
        for (int i = 0; i < dataSet.Count; i++)
        {
            double currentJacSim = CalJacSimilarity(brayCenterSet, itemSets[i], jaccardType);
            if (currentJacSim < minJacSimValue)
            {
                minJacSimValue = currentJacSim;
                minJacSimItem = dataSet[i];
            }
        }

        centersCopy.Add(minJacSimItem);
        if (!centers.Contains(minJacSimItem))
        {
            centers.Add(minJacSimItem);
        }
    }

    return centers;
}
/// <summary>
/// K-Means using a modified Jaccard similarity instead of Euclidean distance.
/// The initial cluster centers are generated by SetInitCenters; from the second iteration on
/// each center is recomputed from its cluster by ReCalBrayCenter. Iteration stops once every
/// cluster holds the same records as in the previous iteration.
/// The size of every cluster after each iteration is printed to the console and, once
/// clustering has finished, written to a time-stamped record file (one line per cluster).
/// Detailed information is in the author's course project documentation.
/// </summary>
/// <param name="dataSet">The unclustered dataset, each line with form like "id,item1,item2,....."</param>
/// <param name="classAmount">The amount of clusters that you want it to be</param>
/// <param name="jaccardType">Jaccard similarity type handed through to the similarity helpers</param>
/// <returns>One list of records per cluster, <paramref name="classAmount"/> lists in total.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="classAmount"/> is less than 1.</exception>
private static List<List<string>> KMeans(List<string> dataSet, int classAmount, JaccardType jaccardType)
{
    if (classAmount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(classAmount), classAmount, "At least one cluster is required.");
    }

    // Turns "id,item1,item2,..." into its item numbers; the leading id field is skipped.
    List<int> ParseItems(string line)
    {
        string[] fields = line.Split(",");
        List<int> parsed = new List<int>();
        for (int j = 1; j < fields.Length; j++)
        {
            parsed.Add(Convert.ToInt32(fields[j]));
        }

        return parsed;
    }

    // NOTE(review): the record file location is hard-coded; the directory has to exist already.
    // The using block releases the file handle even when clustering fails part-way.
    using (StreamWriter streamWriter = new StreamWriter(@"C:\MediaSlot\CloudDocs\Docs\课程\Data Mining\Records\Iteration" + classAmount + @"\" + DateTime.Now.ToString("yyMMddHHmmss") + ".txt"))
    {
        // record[i] accumulates the size of cluster i after every iteration, space separated.
        string[] record = new string[classAmount];
        Console.WriteLine("K-Means Start");

        // create some room to store the clustered data
        List<List<string>> clusteredDataSet = new List<List<string>>();
        for (int i = 0; i < classAmount; i++)
        {
            clusteredDataSet.Add(new List<string>());
        }

        Console.WriteLine("Calculate Init Centers");
        // calculate the init cluster centers
        List<string> centers = SetInitCenters(dataSet, classAmount, jaccardType);

        // The records never change between iterations, so parse each of them only once.
        List<List<int>> itemSets = new List<List<int>>(dataSet.Count);
        foreach (string item in dataSet)
        {
            itemSets.Add(ParseItems(item));
        }

        // One row per cluster: [cluster index, similarity of the current record to that cluster].
        List<List<double>> JacSimMap = new List<List<double>>();
        for (int i = 0; i < classAmount; i++)
        {
            JacSimMap.Add(new List<double>() { i, 0.0 });
        }

        // a sign of whether "K-means" has converged
        bool isConvergence = false;
        int iterCount = 0;

        // do the k-means iteration
        while (!isConvergence)
        {
            Console.WriteLine("Iter " + iterCount + " Start");

            // snapshot of the previous assignment, used for the convergence judgement below
            List<List<string>> lastClusterdDataSet = new List<List<string>>();
            foreach (List<string> cluster in clusteredDataSet)
            {
                lastClusterdDataSet.Add(new List<string>(cluster));
            }

            // as for the 2nd and the following iterations, recalculate the cluster bray centers first
            if (iterCount > 0)
            {
                for (int i = 0; i < classAmount; i++)
                {
                    centers[i] = ReCalBrayCenter(clusteredDataSet[i]);
                }

                Console.WriteLine("Bray Center Recalculated");
            }

            for (int i = 0; i < classAmount; i++)
            {
                clusteredDataSet[i].Clear();
            }

            // SetInitCenters may hand back more centers than requested (it never returns fewer
            // than two); only the first classAmount of them have a cluster and a JacSimMap row.
            List<List<int>> centersSet = new List<List<int>>();
            for (int i = 0; i < classAmount; i++)
            {
                centersSet.Add(ParseItems(centers[i]));
            }

            Console.WriteLine("Calculate Jaccard Similarity");
            // calculate the jaccard similarity between every item and each cluster center,
            // then allocate the item to the cluster BubbleSort ranks first
            // (presumably the one with the maximum similarity - confirm against BubbleSort)
            for (int n = 0; n < dataSet.Count; n++)
            {
                for (int j = 0; j < centersSet.Count; j++)
                {
                    // The rows were reordered by the previous sort, so rewrite index and value.
                    JacSimMap[j][0] = (double)j;
                    JacSimMap[j][1] = CalJacSimilarity(itemSets[n], centersSet[j], jaccardType);
                }

                BubbleSort(JacSimMap);
                clusteredDataSet[(int)JacSimMap[0][0]].Add(dataSet[n]);
            }

            for (int i = 0; i < classAmount; i++)
            {
                record[i] += clusteredDataSet[i].Count + " ";
                Console.WriteLine("Cluster " + i + " = " + clusteredDataSet[i].Count + ", " + lastClusterdDataSet[i].Count);
            }

            // convergence judgement: converged only if EVERY cluster is unchanged.
            // The flag is set once and the scan stops at the first difference, so a mismatch
            // found in one cluster can no longer be overwritten by a later, unchanged cluster.
            isConvergence = true;
            for (int i = 0; i < classAmount && isConvergence; i++)
            {
                if (lastClusterdDataSet[i].Count != clusteredDataSet[i].Count)
                {
                    Console.WriteLine("Amount not equal");
                    isConvergence = false;
                    break;
                }

                HashSet<string> currentMembers = new HashSet<string>(clusteredDataSet[i]);
                foreach (string item in lastClusterdDataSet[i])
                {
                    if (!currentMembers.Contains(item))
                    {
                        Console.WriteLine("Item not equal");
                        isConvergence = false;
                        break;
                    }
                }
            }

            Console.WriteLine(" ");
            iterCount++;
        }

        for (int i = 0; i < classAmount; i++)
        {
            streamWriter.WriteLine(record[i]);
        }

        return clusteredDataSet;
    }
}