Exemplo n.º 1
0
        /// <summary>
        /// Calculates the Jaccard similarity (size of the intersection divided by the size of the union)
        /// of two lists of category ids.
        /// </summary>
        /// <param name="categorySet1">First list of category ids. It is not modified.</param>
        /// <param name="categorySet2">Second list of category ids. It is not modified.</param>
        /// <param name="jaccardType">
        /// <see cref="JaccardType.DuplicationInvalid"/> collapses repeated values inside each list before comparing;
        /// <see cref="JaccardType.DuplicationValid"/> keeps them, so each occurrence can be matched at most once.
        /// </param>
        /// <returns>
        /// A value between 0.0 and 1.0. When both lists are empty the union is empty and 1.0 is returned
        /// (the two sets are identical) instead of the NaN that a 0/0 division would produce.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="jaccardType"/> is neither of the two supported values.
        /// </exception>
        private static double CalJacSimilarity(List <int> categorySet1, List <int> categorySet2, JaccardType jaccardType)
        {
            bool ignoreDuplicates;

            if (jaccardType == JaccardType.DuplicationInvalid)
            {
                ignoreDuplicates = true;
            }
            else if (jaccardType == JaccardType.DuplicationValid)
            {
                ignoreDuplicates = false;
            }
            else
            {
                // Previously an unknown type left both working lists empty and silently yielded NaN.
                throw new ArgumentOutOfRangeException(nameof(jaccardType), jaccardType, "Unsupported Jaccard type.");
            }

            // Count how many times each category of set 2 is still available for matching.
            // When duplicates are ignored every category is counted once at most.
            Dictionary <int, int> available = new Dictionary <int, int>();
            int size2 = 0;
            foreach (int category in categorySet2)
            {
                int count;
                available.TryGetValue(category, out count);
                if (ignoreDuplicates && count > 0)
                {
                    continue;
                }
                available[category] = count + 1;
                size2++;
            }

            // Walk set 1 and consume one matching occurrence of set 2 per element.
            HashSet <int> seen   = ignoreDuplicates ? new HashSet <int>() : null;
            int           size1  = 0;
            int           andSet = 0;
            foreach (int category in categorySet1)
            {
                if (ignoreDuplicates && !seen.Add(category))
                {
                    continue;
                }
                size1++;

                int count;
                if (available.TryGetValue(category, out count) && count > 0)
                {
                    available[category] = count - 1;
                    andSet++;
                }
            }

            // |A or B| = |A| + |B| - |A and B|
            int orSet = size1 + size2 - andSet;
            if (orSet == 0)
            {
                return(1.0);
            }
            return((double)andSet / (double)orSet);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Generates the initial cluster centers.
        /// The first center is a randomly chosen data line; every further center is the data line whose
        /// Jaccard similarity to the bray center of all picks made so far (as computed by ReCalBrayCenter) is the lowest.
        /// </summary>
        /// <param name="dataSet">The unclustered dataset, each line with form like "id,item1,item2,....."</param>
        /// <param name="classAmount">The amount of clusters that you want it to be</param>
        /// <param name="jaccardType">How repeated values are treated when the Jaccard similarity is calculated</param>
        /// <returns>
        /// The chosen centers, without repetitions. At least two centers are always returned,
        /// even when <paramref name="classAmount"/> is smaller than two.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="dataSet"/> is empty, or it does not hold enough distinct lines to ever
        /// produce the required number of centers.
        /// </exception>
        private static List <string> SetInitCenters(List <string> dataSet, int classAmount, JaccardType jaccardType)
        {
            if (dataSet.Count == 0)
            {
                throw new ArgumentException("The data set must contain at least one item.", nameof(dataSet));
            }

            // The selection loop only stops once this many distinct centers exist.
            int requiredCenters = Math.Max(classAmount, 2);

            // A center is always either a data line or the empty placeholder that is picked when no line
            // scores below 1.0, so with fewer distinct candidates than required the loop could never end.
            HashSet <string> possibleCenters = new HashSet <string>(dataSet);
            possibleCenters.Add("");
            if (possibleCenters.Count < requiredCenters)
            {
                throw new ArgumentException("The data set does not contain enough distinct items to pick " + requiredCenters + " centers.", nameof(dataSet));
            }

            // Parse every data line once; the first field is the id and is skipped.
            List <List <int> > itemSets = new List <List <int> >(dataSet.Count);
            foreach (string item in dataSet)
            {
                List <int> itemSet = new List <int>();
                string[]   items   = item.Split(",");
                for (int j = 1; j < items.Length; j++)
                {
                    itemSet.Add(Convert.ToInt32(items[j]));
                }
                itemSets.Add(itemSet);
            }

            List <string> centers     = new List <string>();
            // Every pick, repetitions included; this is the input of the bray center calculation.
            List <string> centersCopy = new List <string>();
            Random        rdm         = new Random();

            centers.Add(dataSet[rdm.Next(dataSet.Count)]);
            centersCopy.Add(centers[0]);

            // NOTE(review): a pick that is already a center only extends centersCopy, so this loop relies on
            // the bray center moving enough to expose a new line eventually - confirm this always happens.
            while (centers.Count < requiredCenters)
            {
                string brayCenter = ReCalBrayCenter(centersCopy);

                List <int> brayCenterSet = new List <int>();
                string[]   brayCenters   = brayCenter.Split(",");
                for (int j = 1; j < brayCenters.Length; j++)
                {
                    brayCenterSet.Add(Convert.ToInt32(brayCenters[j]));
                }

                // Look for the data line that is least similar to the current bray center.
                // The empty string stays selected when no line scores below 1.0.
                string minJacSimItem  = "";
                double minJacSimValue = 1.0;
                for (int i = 0; i < dataSet.Count; i++)
                {
                    double currentJacSim = CalJacSimilarity(brayCenterSet, itemSets[i], jaccardType);

                    if (currentJacSim < minJacSimValue)
                    {
                        minJacSimValue = currentJacSim;
                        minJacSimItem  = dataSet[i];
                    }
                }

                centersCopy.Add(minJacSimItem);
                if (!centers.Contains(minJacSimItem))
                {
                    centers.Add(minJacSimItem);
                }
            }
            return(centers);
        }
Exemplo n.º 3
0
        /// <summary>
        /// K-Means that uses a modified Jaccard similarity instead of the Euclidean distance.
        /// The initial cluster centers come from SetInitCenters; on every later iteration the center of each
        /// cluster is recalculated with ReCalBrayCenter. The loop ends when an iteration leaves every
        /// cluster with the same members as the iteration before.
        /// The cluster sizes of every iteration are written to a record file when the clustering finishes.
        /// </summary>
        /// <param name="dataSet">The unclustered dataset, each line with form like "id,item1,item2,....."</param>
        /// <param name="classAmount">The amount of clusters that you want it to be</param>
        /// <param name="jaccardType">How repeated values are treated when the Jaccard similarity is calculated</param>
        /// <returns>One list per cluster, each holding the raw data lines assigned to that cluster</returns>
        private static List <List <string> > KMeans(List <string> dataSet, int classAmount, JaccardType jaccardType)
        {
            // The using block releases the record file even when the clustering throws.
            using (StreamWriter streamWriter = new StreamWriter(@"C:\MediaSlot\CloudDocs\Docs\课程\Data Mining\Records\Iteration" + classAmount + @"\" + DateTime.Now.ToString("yyMMddHHmmss") + ".txt"))
            {
                // record[i] collects the size of cluster i after every iteration
                string[] record = new string[classAmount];
                Console.WriteLine("K-Means Start");

                //create some room to store the clustered data
                List <List <string> > clusteredDataSet = new List <List <string> >();
                for (int i = 0; i < classAmount; i++)
                {
                    clusteredDataSet.Add(new List <string>());
                }

                Console.WriteLine("Calculate Init Centers");

                //calculate the init cluster center
                List <string> centers = SetInitCenters(dataSet, classAmount, jaccardType);

                //to store the map of a certain item's jaccard similarity with each cluster,
                //every entry is a pair { cluster index, similarity }
                List <List <double> > JacSimMap = new List <List <double> >();
                for (int i = 0; i < classAmount; i++)
                {
                    JacSimMap.Add(new List <double>()
                    {
                        i, 0.0
                    });
                }

                //parse every data line once instead of once per iteration, the first field is the id and is skipped
                List <List <int> > itemSets = new List <List <int> >(dataSet.Count);
                foreach (string item in dataSet)
                {
                    List <int> itemSet = new List <int>();
                    string[]   items   = item.Split(",");
                    for (int j = 1; j < items.Length; j++)
                    {
                        itemSet.Add(Convert.ToInt32(items[j]));
                    }
                    itemSets.Add(itemSet);
                }

                //a sign of whether "K-means" convergence
                bool isConvergence = false;
                int  iterCount     = 0;

                //do the k-means iteration
                while (!isConvergence)
                {
                    Console.WriteLine("Iter " + iterCount + " Start");

                    //keep a copy of the clusters of the last iteration, it is used for the convergence judgement
                    List <List <string> > lastClusterdDataSet = new List <List <string> >();
                    foreach (List <string> cluster in clusteredDataSet)
                    {
                        lastClusterdDataSet.Add(new List <string>(cluster));
                    }

                    //as for 2nd and the following iteration, recalculate the cluster braycenter at the beginning
                    if (iterCount > 0)
                    {
                        for (int i = 0; i < classAmount; i++)
                        {
                            centers[i] = ReCalBrayCenter(clusteredDataSet[i]);
                        }
                        Console.WriteLine("Bray Center Recalculated");
                    }

                    for (int i = 0; i < classAmount; i++)
                    {
                        clusteredDataSet[i].Clear();
                    }

                    //parse the current centers, the first field is the id and is skipped
                    List <List <int> > centersSet = new List <List <int> >();
                    foreach (var center in centers)
                    {
                        List <int> centerSet = new List <int>();
                        string[]   centerss  = center.Split(",");
                        for (int j = 1; j < centerss.Length; j++)
                        {
                            centerSet.Add(Convert.ToInt32(centerss[j]));
                        }
                        centersSet.Add(centerSet);
                    }

                    Console.WriteLine("Calculate Jaccard Similarity");
                    //calculate the jaccard similarity between every item and each cluster center,
                    //then allocate the item to the cluster that BubbleSort puts first
                    for (int n = 0; n < dataSet.Count; n++)
                    {
                        for (int j = 0; j < centersSet.Count; j++)
                        {
                            //the index is rewritten every time because the sort reorders the entries
                            JacSimMap[j][0] = (double)j;
                            JacSimMap[j][1] = CalJacSimilarity(itemSets[n], centersSet[j], jaccardType);
                        }
                        BubbleSort(JacSimMap);
                        clusteredDataSet[(int)JacSimMap[0][0]].Add(dataSet[n]);
                    }

                    for (int i = 0; i < classAmount; i++)
                    {
                        record[i] += clusteredDataSet[i].Count + " ";
                        Console.WriteLine("Cluster " + i + " = " + clusteredDataSet[i].Count + ", " + lastClusterdDataSet[i].Count);
                    }

                    //isconvergence judgement: converged until one cluster proves otherwise.
                    //The flag is set once, before the loop, and the loop condition stops at the first
                    //mismatch, so a difference found in one cluster cannot be overwritten by a later cluster.
                    isConvergence = true;
                    for (int i = 0; i < classAmount && isConvergence; i++)
                    {
                        if (lastClusterdDataSet[i].Count != clusteredDataSet[i].Count)
                        {
                            Console.WriteLine("Amount not equal");
                            isConvergence = false;
                            break;
                        }

                        HashSet <string> currentMembers = new HashSet <string>(clusteredDataSet[i]);
                        foreach (var item in lastClusterdDataSet[i])
                        {
                            if (!currentMembers.Contains(item))
                            {
                                Console.WriteLine("Item not equal");
                                isConvergence = false;
                                break;
                            }
                        }
                    }
                    Console.WriteLine(" ");
                    iterCount++;
                }

                //one line per cluster: its size after every iteration
                for (int i = 0; i < classAmount; i++)
                {
                    streamWriter.WriteLine(record[i]);
                }

                return(clusteredDataSet);
            }
        }