Esempio n. 1
0
 /// <summary>
 /// Prompts on the console for a file name and loads that comma-separated file into <c>dataset</c>.
 /// </summary>
 /// <remarks>
 /// Each non-blank line is split on ','. Every field except the last is parsed as a feature value,
 /// and the last field is parsed as the integer ground-truth label.
 /// </remarks>
 static void Input()
 {
     Console.WriteLine("输入读入文件的文件名:");
     fileName = Console.ReadLine();
     dataset = new Dataset();
     // The using block releases the file handle even when a line fails to parse.
     using (StreamReader sr = new StreamReader(fileName))
     {
         string line;
         while ((line = sr.ReadLine()) != null)
         {
             // A blank line (for example a trailing newline) holds no record; parsing it would throw.
             if (line.Trim().Length == 0)
             {
                 continue;
             }

             var tmp = line.Split(',');
             int featureCount = tmp.Length - 1;
             DataType newdata = new DataType(featureCount);
             // The file is comma-delimited, so numbers must use '.' as the decimal separator;
             // parse with the invariant culture so the result does not depend on the machine locale.
             newdata.features = tmp.Take(featureCount)
                                   .Select<string, double>(x => Convert.ToDouble(x, System.Globalization.CultureInfo.InvariantCulture))
                                   .ToArray();
             newdata.cntFeatures = featureCount;
             newdata.label_grountTruth = Convert.ToInt32(tmp[featureCount], System.Globalization.CultureInfo.InvariantCulture);
             dataset.AddData(newdata);
         }
     }
 }
        /// <summary>
        /// Runs k-means clustering into K clusters. The whole procedure is repeated
        /// <paramref name="repeat"/> times and the best-scoring run is returned.
        /// </summary>
        /// <param name="K">Number of clusters.</param>
        /// <param name="repeat">Number of independent clustering runs.</param>
        /// <param name="PrintLog">Callback that receives a progress message at the start of each run.</param>
        /// <returns>
        /// The purity/gini pair of the best run: a run replaces the current best when its purity is
        /// higher, or when its purity is equal and its gini is lower. If no run replaces the initial
        /// value, the default-constructed <c>ValidationPair</c> is returned.
        /// </returns>
        public ValidationPair Generate(int K, int repeat, PrintLogFunction PrintLog)
        {
            ValidationPair result = new ValidationPair();
            for (int r = 0; r < repeat; r++)
            {
                PrintLog("正在执行第" + r.ToString() + "次kmeans聚类...");
                #region Run one clustering pass

                presentative = new DataType[K];
                InitializePresentative(K);
                double sumDist = 100; // any value above the threshold, so the loop body runs at least once
                bool isOver = false;
                while (sumDist > convergence)
                {
                    // Assign every point to its nearest current representative.
                    isOver = true;
                    foreach (var data in dataset.Data)
                    {
                        double minDist = double.MaxValue;
                        int lableNow = 0;
                        for (int i = 0; i < K; i++)
                        {
                            double d = Matrix.Dist(data.features, presentative[i].features);
                            if (d < minDist)
                            {
                                minDist = d;
                                lableNow = i;
                            }
                        }
                        if (lableNow != data.label) isOver = false;
                        data.label = lableNow;
                    }
                    // Stop as soon as an iteration leaves every assignment unchanged.
                    if (isOver) break;

                    // Build the new representatives.
                    DataType[] newPresentative = new DataType[K];
                    for (int i = 0; i < K; i++)
                    {
                        int clusterCount = 0;
                        newPresentative[i] = new DataType(presentative[0].cntFeatures);
                        // The new representative is the mean of all vectors assigned to the cluster.
                        foreach (var data in dataset.Data)
                        {
                            if (data.label != i) continue;
                            clusterCount++;
                            newPresentative[i].Add(data);
                        }
                        if (clusterCount == 0)
                        {
                            // No point was assigned to this cluster. Dividing by zero would produce a
                            // non-finite representative, and the resulting non-finite sumDist would end
                            // the loop early. Keep the previous representative instead.
                            newPresentative[i] = presentative[i];
                            continue;
                        }
                        for (int j = 0; j < newPresentative[i].cntFeatures; j++) newPresentative[i].features[j] /= clusterCount;
                    }
                    // Average distance each representative moved between the two iterations.
                    sumDist = 0;
                    for (int i = 0; i < K; i++)
                    {
                        sumDist += Matrix.Dist(presentative[i].features, newPresentative[i].features);
                    }
                    presentative = newPresentative;
                    sumDist /= K;
                }
                #endregion

                #region Score this run

                ValidationPair v = ClusterValidater.GetValidation(dataset, K);
                if (v.purity>=result.purity)
                {
                    if (v.purity > result.purity) result = v;
                    else if (v.gini < result.gini) result = v;
                }
                #endregion
            }
            return result;
        }
        /// <summary>
        /// Runs spectral clustering into K clusters, connecting every point to its
        /// <paramref name="neighbourCnt"/> nearest neighbours.
        /// </summary>
        /// <param name="K">Number of clusters.</param>
        /// <param name="neighbourCnt">Number of nearest neighbours linked to each point.</param>
        /// <param name="PrintLog">Callback that receives progress messages.</param>
        /// <returns>The purity/gini pair produced by k-means on the spectral embedding.</returns>
        public ValidationPair Generate(int K, int neighbourCnt,PrintLogFunction PrintLog)
        {
            GetNeighbours(neighbourCnt);

            int pointCount = arrData.Count();
            matlab.Execute("n = " + pointCount + ";");

            // Symmetric 0/1 adjacency matrix: an edge in either direction links both points.
            double[,] adjacency = new double[pointCount, pointCount];
            for (int row = 0; row < pointCount; row++)
            {
                for (int k = 0; k < neighbourCnt; k++)
                {
                    int col = nearestNeighbours[row][k];
                    adjacency[row, col] = 1;
                    adjacency[col, row] = 1;
                }
            }

            // Upload W to the MATLAB "base" workspace; the imaginary part is an all-zero matrix.
            Array realW = adjacency;
            Array imagW = new double[pointCount, pointCount];
            matlab.PutFullMatrix("W", "base", realW, imagW);

            // Degree vector; sum(W) adds up columns, which equals the row sums because W is symmetric.
            matlab.Execute("s = sum(W);");
            // Put s on the diagonal of D.
            matlab.Execute("D = full(sparse(1:n, 1:n, s));");
            // Per the original author's note: the K largest eigenvalues of E correspond to the
            // K smallest eigenvalues of the normalized L.
            matlab.Execute("L = D - W;");
            matlab.Execute("E = D^(-1/2)*W*D^(-1/2);");
            matlab.Execute("k=" + K + ";");
            PrintLog("正在计算特征值特征向量...");
            matlab.Execute("[Q, V] = eigs(E, k);");
            PrintLog("特征值特征向量计算完毕");

            // Fetch Q from MATLAB; only its real part is used below.
            Array realQ = new double[pointCount, K];
            Array imagQ = new double[pointCount, K];
            matlab.GetFullMatrix("Q", "base", ref realQ, ref imagQ);

            // Each row of Q becomes the K-dimensional feature vector of one point in a new dataset.
            Dataset embedded = new Dataset();
            for (int row = 0; row < pointCount; row++)
            {
                DataType point = new DataType(K);
                for (int col = 0; col < K; col++)
                {
                    point.features[col] = (double)realQ.GetValue(row, col);
                }
                point.label_grountTruth = arrData[row].label_grountTruth;
                embedded.AddData(point);
            }

            // Cluster the embedded points with k-means (10 repeats).
            return new KmeansGenerator(embedded, convergence).Generate(K, 10, PrintLog);
        }
Esempio n. 4
0
 /// <summary>
 /// Appends one data item to the dataset and sets <c>CntFeatures</c> to that item's feature count.
 /// </summary>
 /// <param name="newData">The item to append; must not be null.</param>
 /// <exception cref="System.ArgumentNullException"><paramref name="newData"/> is null.</exception>
 public void AddData(DataType newData)
 {
     // Reject null before touching any state: newData is dereferenced only after the AddLast
     // call, so without this check a null could be appended before the failure surfaces.
     if (newData == null)
     {
         throw new System.ArgumentNullException("newData");
     }

     data.AddLast(newData);
     CntFeatures = newData.cntFeatures;
 }