/// <summary>
/// Creates a spectral clustering generator: stores the dataset, starts a hidden MATLAB
/// automation instance and prepares the nearest-neighbour information.
/// </summary>
/// <param name="data">The dataset to cluster.</param>
/// <param name="conv">Convergence threshold used by the k-means stage.</param>
/// <param name="PrintLog">Callback that receives progress messages.</param>
/// <param name="addr">
/// Address of a previously saved adjacency list. When <c>null</c>, the neighbours are
/// computed here instead (9 nearest neighbours per point).
/// </param>
public SpectralClusteringGenerator(Dataset data, double conv, PrintLogFunction PrintLog, string addr)
{
    dataset = data;
    arrData = data.Data.ToArray();
    convergence = conv;

    // Start the MATLAB automation server with its window hidden.
    PrintLog("初始化MATLAB组件...");
    matlab = new MLApp.MLAppClass { Visible = 0 };
    PrintLog("初始化结束...");

    // A supplied address is only remembered here; whether the file exists is not checked.
    if (addr != null)
    {
        addrNeighbours = addr;
        return;
    }

    // No saved adjacency list: build the neighbour graph from scratch.
    PrintLog("初始化Spectral邻居点图, 正在寻找每个点相邻最近的数个点...");
    FindNeighbours(9, PrintLog);
    PrintLog("初始化结束...");
}
/// <summary>
/// Runs k-means clustering into <paramref name="K"/> clusters, repeating the whole procedure
/// <paramref name="repeat"/> times and keeping the best run.
/// </summary>
/// <remarks>
/// A run stops when no point changes its label, or when the mean distance between the old
/// and the new cluster representatives is no longer greater than the convergence threshold.
/// The best run is the one with the highest purity; on equal purity the lower gini wins.
/// A cluster that receives no points keeps its previous representative, so its centroid is
/// never divided by zero.
/// </remarks>
/// <param name="K">Number of clusters.</param>
/// <param name="repeat">Number of independent k-means runs.</param>
/// <param name="PrintLog">Callback that receives progress messages.</param>
/// <returns>The purity and gini indices of the best run.</returns>
public ValidationPair Generate(int K, int repeat, PrintLogFunction PrintLog)
{
    ValidationPair result = new ValidationPair();
    for (int r = 0; r < repeat; r++)
    {
        PrintLog("正在执行第" + r.ToString() + "次kmeans聚类...");

        #region Run one clustering
        presentative = new DataType[K];
        InitializePresentative(K);
        double sumDist = 100;
        while (sumDist > convergence)
        {
            // Assignment step: label every point with its nearest representative.
            bool isOver = true;
            foreach (var data in dataset.Data)
            {
                double minDist = double.MaxValue;
                int lableNow = 0;
                for (int i = 0; i < K; i++)
                {
                    double d = Matrix.Dist(data.features, presentative[i].features);
                    if (d < minDist)
                    {
                        minDist = d;
                        lableNow = i;
                    }
                }
                if (lableNow != data.label)
                {
                    isOver = false;
                }
                data.label = lableNow;
            }

            // No label changed in this iteration: the clustering is stable.
            if (isOver)
            {
                break;
            }

            // Update step: accumulate every point into its cluster in a single pass.
            var featureCount = presentative[0].cntFeatures;
            DataType[] newPresentative = new DataType[K];
            int[] clusterCount = new int[K];
            for (int i = 0; i < K; i++)
            {
                newPresentative[i] = new DataType(featureCount);
            }
            foreach (var data in dataset.Data)
            {
                clusterCount[data.label]++;
                newPresentative[data.label].Add(data);
            }

            // The new representative is the mean of all vectors in the cluster.
            for (int i = 0; i < K; i++)
            {
                if (clusterCount[i] == 0)
                {
                    // Empty cluster: dividing by zero would produce NaN features, which would
                    // end the loop early (NaN > convergence is false). Keep the old one.
                    newPresentative[i] = presentative[i];
                    continue;
                }
                for (int j = 0; j < newPresentative[i].cntFeatures; j++)
                {
                    newPresentative[i].features[j] /= clusterCount[i];
                }
            }

            // Mean distance between the previous and the new representatives.
            sumDist = 0;
            for (int i = 0; i < K; i++)
            {
                sumDist += Matrix.Dist(presentative[i].features, newPresentative[i].features);
            }
            presentative = newPresentative;
            sumDist /= K;
        }
        #endregion

        #region Evaluate this run
        ValidationPair v = ClusterValidater.GetValidation(dataset, K);
        if (v.purity > result.purity)
        {
            result = v;
        }
        else if (v.purity == result.purity && v.gini < result.gini)
        {
            // Same purity: prefer the run with the lower gini index.
            result = v;
        }
        #endregion
    }
    return result;
}
/// <summary>
/// Runs spectral clustering into <paramref name="K"/> clusters, connecting every point to its
/// <paramref name="neighbourCnt"/> nearest neighbours.
/// </summary>
/// <param name="K">Number of clusters.</param>
/// <param name="neighbourCnt">Number of neighbours used per point.</param>
/// <param name="PrintLog">Callback that receives progress messages.</param>
/// <returns>The purity and gini indices of the clustering result.</returns>
public ValidationPair Generate(int K, int neighbourCnt, PrintLogFunction PrintLog)
{
    GetNeighbours(neighbourCnt);

    int pointCount = arrData.Count();
    matlab.Execute("n = " + pointCount + ";");

    // Build the symmetric 0/1 adjacency matrix W: an edge exists when either point
    // lists the other among its nearest neighbours.
    double[,] adjacency = new double[pointCount, pointCount];
    for (int row = 0; row < pointCount; row++)
    {
        for (int slot = 0; slot < neighbourCnt; slot++)
        {
            int col = nearestNeighbours[row][slot];
            adjacency[row, col] = 1;
            adjacency[col, row] = 1;
        }
    }

    // The imaginary part is all zeros; W is real.
    double[,] adjacencyImag = new double[pointCount, pointCount];
    matlab.PutFullMatrix("W", "base", adjacency, adjacencyImag);

    // Degree matrix D: column sums of W (W is symmetric, so these equal the row sums)
    // placed on the diagonal.
    matlab.Execute("s = sum(W);");
    matlab.Execute("D = full(sparse(1:n, 1:n, s));");

    // E = D^(-1/2) * W * D^(-1/2). Its K largest eigenvalues correspond to the K smallest
    // eigenvalues of the normalized Laplacian.
    // NOTE(review): eigs(E, k) is called without a selection option; confirm that its
    // default picks the eigenvalues intended here.
    matlab.Execute("L = D - W;");
    matlab.Execute("E = D^(-1/2)*W*D^(-1/2);");
    matlab.Execute("k=" + K + ";");
    PrintLog("正在计算特征值特征向量...");
    matlab.Execute("[Q, V] = eigs(E, k);");
    PrintLog("特征值特征向量计算完毕");

    // Fetch the eigenvectors: each row of Q becomes the feature vector of one point.
    Array eigenReal = new double[pointCount, K];
    Array eigenImag = new double[pointCount, K];
    matlab.GetFullMatrix("Q", "base", ref eigenReal, ref eigenImag);

    Dataset embedded = new Dataset();
    for (int row = 0; row < pointCount; row++)
    {
        DataType point = new DataType(K);
        for (int col = 0; col < K; col++)
        {
            point.features[col] = (double)eigenReal.GetValue(row, col);
        }
        point.label_grountTruth = arrData[row].label_grountTruth;
        embedded.AddData(point);
    }

    // Cluster the embedded points with k-means (10 repetitions).
    return new KmeansGenerator(embedded, convergence).Generate(K, 10, PrintLog);
}
/// <summary>
/// Preprocessing step: for every point, finds its <paramref name="neighbourCnt"/> nearest
/// points (used later to build the W matrix), stores them in <c>nearestNeighbours</c> and
/// writes them to the file "neighbours_tmp.txt", one comma-separated line per point.
/// </summary>
/// <remarks>
/// A point is never its own neighbour as long as <paramref name="neighbourCnt"/> is smaller
/// than the number of points. Points at equal distance are ordered by ascending index.
/// </remarks>
/// <param name="neighbourCnt">Number of nearest points recorded for every point.</param>
/// <param name="PrintLog">Progress callback; currently not used by this method.</param>
private void FindNeighbours(int neighbourCnt, PrintLogFunction PrintLog)
{
    int n = arrData.Count();
    nearestNeighbours = new Dictionary<int, int[]>();

    // Find the nearest points of every point.
    for (int i = 0; i < n; i++)
    {
        double[] dist = new double[n];
        for (int j = 0; j < n; j++)
        {
            // A point must not be adjacent to itself, so it is pushed to the end of the order.
            dist[j] = (j == i)
                ? double.MaxValue
                : Matrix.Dist(arrData[i].features, arrData[j].features);
        }

        // OrderBy is a stable sort, so ties keep ascending index order.
        int[] order = Enumerable.Range(0, n).OrderBy(j => dist[j]).ToArray();

        int[] nearest = new int[neighbourCnt];
        for (int k = 0; k < neighbourCnt; k++)
        {
            nearest[k] = order[k];
        }
        nearestNeighbours.Add(i, nearest);
    }

    // Save the neighbour table; the using block closes the file even if a write fails.
    using (StreamWriter sw = new StreamWriter(@"neighbours_tmp.txt"))
    {
        for (int i = 0; i < n; i++)
        {
            for (int k = 0; k < neighbourCnt; k++)
            {
                sw.Write(nearestNeighbours[i][k].ToString() + ",");
            }
            sw.WriteLine();
        }
    }
}