Пример #1
0
        /// <summary>
        /// Computes the Euclidean distance between the vectors of two documents:
        /// the square root of the sum of squared component differences.
        /// </summary>
        /// <param name="doc1">First document; its vector length determines how many components are compared.</param>
        /// <param name="doc2">Second document; must expose at least as many components as <paramref name="doc1"/>.</param>
        /// <returns>The distance, accumulated and returned in single precision.</returns>
        private static float GetDocumentDistance(DocumentVectorTest doc1, DocumentVectorTest doc2)
        {
            float squaredSum = 0.0f;
            int   component  = 0;

            // doc1 dictates the number of compared components.
            while (component < doc1.VectorSpace.Length)
            {
                double delta = doc1.VectorSpace[component] - doc2.VectorSpace[component];
                squaredSum += (float)Math.Pow(delta, 2.0);
                component++;
            }

            // Raising to the power 0.5 yields the square root of the accumulated sum.
            return (float)Math.Pow((double)squaredSum, 0.5);
        }
Пример #2
0
        /// <summary>
        /// Shifts every component of <paramref name="documentVector1"/> by a step derived from
        /// its distance to <paramref name="documentVector2"/> and the constant <paramref name="G"/>.
        /// </summary>
        /// <remarks>
        /// The step is <c>distance * (G / distance^3)</c> and is the same scalar for every component.
        /// NOTE(review): an earlier, disabled variant weighted the step per component by
        /// <c>documentVector2[i] - documentVector1[i]</c> instead of by the scalar distance;
        /// confirm which of the two formulas is intended.
        /// </remarks>
        /// <param name="documentVector1">Document whose vector is modified in place.</param>
        /// <param name="documentVector2">Document towards which the distance is measured; not modified.</param>
        /// <param name="G">Scaling constant of the step.</param>
        /// <returns>The distance between the two documents measured before the move.</returns>
        private static float Move(DocumentVectorTest documentVector1, DocumentVectorTest documentVector2, float G)
        {
            int length   = documentVector1.VectorSpace.Length;
            var distance = GetDocumentDistance(documentVector1, documentVector2);

            // Coincident documents: G / 0 is Infinity (or NaN) and 0 * Infinity is NaN, which would
            // overwrite every coordinate of documentVector1 with NaN. Leave the vector untouched.
            if (distance == 0.0f)
            {
                return distance;
            }

            // The step does not depend on the component index, so compute it once.
            float step = distance * (G / (float)Math.Pow(distance, 3.0));

            for (var i = 0; i < length; i++)
            {
                documentVector1.VectorSpace[i] = documentVector1.VectorSpace[i] + step;
            }
            return distance;
        }
        /// <summary>
        /// Reads a text file of two-dimensional data points (one point per line) and converts
        /// each line into a <c>DocumentVectorTest</c> with a two-element vector.
        /// </summary>
        /// <remarks>
        /// Tokens are separated by any run of space, comma, slash, dot, hyphen or tab characters;
        /// only the first two tokens of a line are used. Because dot and hyphen are separators,
        /// fractional parts and minus signs are not preserved by this format.
        /// Lines that contain no tokens (blank or separator-only) are skipped.
        /// Each document's <c>Content</c> is its 1-based position in the returned list.
        /// Numbers are parsed with the invariant culture so the result does not depend on the
        /// machine's regional settings.
        /// </remarks>
        /// <param name="fileName">Path of the file to read (UTF-8 unless a byte-order mark says otherwise).</param>
        /// <returns>The parsed documents in file order.</returns>
        /// <exception cref="FormatException">
        /// A line has fewer than two tokens, or a token is not a valid number.
        /// </exception>
        public static List <DocumentVectorTest> CreatingTheDocVectorCollection(string fileName)
        {
            List <DocumentVectorTest> TestDocVectorList = new List <DocumentVectorTest>();
            int number_of_lines = 0;   // number of data points parsed so far
            int physicalLine    = 0;   // line number in the file, for error messages only

            char[] separators = { ' ', ',', '/', '.', '-', '\t' };

            const Int32 BufferSize = 128;

            using (var fileStream = File.OpenRead(fileName))
                using (var streamReader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize))
                {
                    String line;
                    while ((line = streamReader.ReadLine()) != null)
                    {
                        physicalLine++;

                        // RemoveEmptyEntries collapses runs of separators (e.g. ", " or leading/trailing
                        // blanks), which would otherwise produce empty tokens that cannot be parsed.
                        var items = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                        if (items.Length == 0)
                        {
                            continue;
                        }
                        if (items.Length < 2)
                        {
                            throw new FormatException(string.Format(
                                "Line {0} of '{1}' must contain at least two values.", physicalLine, fileName));
                        }

                        number_of_lines++;
                        DocumentVectorTest testDoc = new DocumentVectorTest();
                        testDoc.VectorSpace    = new float[2];
                        testDoc.VectorSpace[0] = float.Parse(items[0], System.Globalization.CultureInfo.InvariantCulture);
                        testDoc.VectorSpace[1] = float.Parse(items[1], System.Globalization.CultureInfo.InvariantCulture);
                        testDoc.Content        = number_of_lines.ToString();

                        TestDocVectorList.Add(testDoc);
                    }
                }
            return TestDocVectorList;
        }
Пример #4
0
        /// <summary>
        /// Reduces every centroid in <paramref name="fillCentroidCollection"/> to a single
        /// representative document chosen from its own grouped documents.
        /// </summary>
        /// <remarks>
        /// The centroids are modified in place: each one's GroupedDocument collection is cleared
        /// and refilled with only the selected document. The returned list is a new list holding
        /// the same centroid instances.
        /// NOTE(review): the per-component average computed at the top of the method is never
        /// read afterwards, and <c>minDist</c> is not reset between clusters; see the inline notes.
        /// </remarks>
        /// <param name="fillCentroidCollection">Centroids with their grouped documents; each must contain at least one document, otherwise indexing the selected document fails.</param>
        /// <param name="vectorSpace">All documents; only the vector length of the first element is used, so the list must not be empty.</param>
        /// <returns>A new list containing the (mutated) centroids in their original order.</returns>
        public static List <TestCentroid> UpdateMeans(List <TestCentroid> fillCentroidCollection, List <DocumentVectorTest> vectorSpace)
        {
            // NOTE(review): the initial value of result is replaced before returning, and
            // newVectorSpace is never read.
            List <TestCentroid>       result         = new List <TestCentroid>();
            List <DocumentVectorTest> newVectorSpace = new List <DocumentVectorTest>(vectorSpace);
            int length = vectorSpace[0].VectorSpace.Length;

            float[] newVectorSpaceArray   = new float[length];
            float[] minDistancesToCluster = new float[0];

            for (int i = 0; i < length; i++)
            {
                newVectorSpaceArray[i] = 0.0F;
            }

            // Accumulate, per component, the sum over every grouped document of every centroid.
            for (int c = 0; c < fillCentroidCollection.Count; c++)
            {
                for (int gd = 0; gd < fillCentroidCollection[c].GroupedDocument.Count; gd++)
                {
                    for (int k = 0; k < fillCentroidCollection[c].GroupedDocument[gd].VectorSpace.Length; k++)
                    {
                        newVectorSpaceArray[k] += fillCentroidCollection[c].GroupedDocument[gd].VectorSpace[k];
                    }
                }
            }

            // Divide each component by the cluster's document count, once per document of every
            // cluster (i.e. repeatedly).
            // NOTE(review): newVectorSpaceArray is not read after this loop, so this computation
            // does not influence the result.
            for (int c1 = 0; c1 < fillCentroidCollection.Count; c1++)
            {
                for (int gd1 = 0; gd1 < fillCentroidCollection[c1].GroupedDocument.Count; gd1++)
                {
                    for (int k1 = 0; k1 < fillCentroidCollection[c1].GroupedDocument[gd1].VectorSpace.Length; k1++)
                    {
                        newVectorSpaceArray[k1] = newVectorSpaceArray[k1] / fillCentroidCollection[c1].GroupedDocument.Count;
                    }
                }
            }

            // Only non-zero distances that are <= minDist (initially 0.1) can be selected.
            // NOTE(review): minDist is never reset, so a small distance found in one cluster
            // tightens the threshold for all following clusters.
            float minDist      = 0.1F;
            float currentValue = 0.1F;
            int   index        = 0;

            for (int i = 0; i < fillCentroidCollection.Count; i++)
            {
                minDistancesToCluster = new float[fillCentroidCollection[i].GroupedDocument.Count];
                for (int j = 0; j < fillCentroidCollection[i].GroupedDocument.Count; j++)
                {
                    //minDistancesToCluster = new float[fillCentroidCollection[i].GroupedDocument.Count];
                    // Distance between the cluster's first document and document j
                    // (presumably Euclidean, judging by the helper's name; defined elsewhere).
                    minDistancesToCluster[j] = SimilarityMatrixCalculations.FindEuclideanDistance(fillCentroidCollection[i].GroupedDocument.First().VectorSpace, fillCentroidCollection[i].GroupedDocument[j].VectorSpace);
                    //}

                    // The whole array is rescanned after every new distance; entries that are
                    // not filled yet still hold the default 0 and are skipped by the != 0 test.
                    for (int z = 0; z < minDistancesToCluster.Length; z++)
                    {
                        currentValue = minDistancesToCluster[z];
                        if (currentValue <= minDist && currentValue != 0)
                        {
                            minDist = currentValue;
                            index   = z;
                        }
                        // Original intent noted by the author: find the document closest to the new
                        // vector space; for all documents in the cluster create the vector space.
                    }

                    /*
                     * DocumentVector newClusterCenter = fillCentroidCollection[i].GroupedDocument[index];
                     * fillCentroidCollection[i].GroupedDocument.Clear();
                     * fillCentroidCollection[i].GroupedDocument.Add(newClusterCenter);
                     */
                }
                // Keep only the selected document in this cluster; index stays 0 (the first
                // document) when no distance satisfied the condition above.
                DocumentVectorTest newClusterCenter = fillCentroidCollection[i].GroupedDocument[index];
                index = 0;
                fillCentroidCollection[i].GroupedDocument.Clear();
                fillCentroidCollection[i].GroupedDocument.Add(newClusterCenter);
            }

            minDistancesToCluster = new float[0];
            // New list, same centroid instances.
            result = new List <TestCentroid>(fillCentroidCollection);
            return(result);
        }
        /*
         * private static bool UpdateMeans(List<DocumentVectorTest> data, List<TestCentroid> clustering, List<TestCentroid> means)
         * {
         *  int numClusters = means.Count;
         *  bool changed = false;
         *
         *  List<TestCentroid> newClustering = new List<TestCentroid>(numClusters);
         *  float[] distances = new float[numClusters];
         *  for(int i = 0; i < data.Count; i++)
         *  {
         *      for (int k = 0; k < numClusters; k++)
         *          distances[k] = Logic.ClusteringAlgorithms.SimilarityMatrixCalculations.FindEuclideanDistance(data[i].VectorSpace, means[k].GroupedDocument[0].VectorSpace);
         *
         *      int newClusterID = MinIndex(distances);
         *
         *      if(newClustering[newClusterID].GroupedDocument[0].Content != newClustering[i].GroupedDocument[0].Content)
         *      {
         *          changed = true;
         *          newClustering[i] = newClustering[newClusterID];
         *      }
         *  }
         *  if (changed == false)
         *      return false;
         *
         *
         *  List<TestCentroid> clusterCounts = new List<TestCentroid>(numClusters);
         *  for(int i = 0; i < data.Count; i++)
         *  {
         *      var cluster = newClustering[i];
         *      ++clusterCounts.IndexOf(cluster);
         *  }
         *
         *  return true;
         * }
         */
        #endregion

        #region OldUpdateMeans

        /*
         * public static bool UpdateMeans(List<DocumentVectorTest> data, List<TestCentroid> clustering, List<TestCentroid> means)
         * {
         *  List<DocumentVectorTest> newVectorSpace = new List<DocumentVectorTest>(data);
         *  int vectorSpaceLength = data[0].VectorSpace.Length;
         *  int numClusters = means.Count;
         *  int[] clusterCounts = new int[numClusters];
         *
         #region AccordingToTheAuthorsWeCanOmitThis
         *
         *  for(int c = 0; c < data.Count; c++)
         *  {
         *      var cluster = data[c];
         *      ++clusterCounts[data.IndexOf(cluster)];
         *  }
         *
         #endregion
         *
         *  for (int k = 0; k < numClusters; k++)
         *      if (clusterCounts[k] == 0)
         *          return false;
         *
         *  for (int cl = 0; cl < means.Count; cl++)
         *      clustering[cl].GroupedDocument.Clear();
         *
         *  for(int i = 0; i < data.Count; i++)
         *  {
         *      var cluster = clustering.ElementAt(i);
         *      for (int j = 0; j < cluster.GroupedDocument.Count; j++)
         *          for (int v = 0; v < cluster.GroupedDocument[j].VectorSpace.Length; v++)
         *              clustering[clustering.IndexOf(cluster)].GroupedDocument[j].VectorSpace[v] += data[i].VectorSpace[v];
         *  }
         *
         *  for (int k = 0; k < clustering.Count; ++k)
         *      for (int j = 0; j < clustering[k].GroupedDocument.Count; j++)
         *          for (int z = 0; z < clustering[k].GroupedDocument[j].VectorSpace.Length; z++)
         *              clustering[k].GroupedDocument[j].VectorSpace[z] /= clusterCounts[k];
         *  return true;
         * }
         */
        #endregion

        /// <summary>
        /// Picks a new representative document for every cluster: the cluster member that lies
        /// closest to the arithmetic mean ("ideal center") of the documents in the matching
        /// entry of <paramref name="means"/>.
        /// </summary>
        /// <remarks>
        /// Item1 of the result is <c>true</c> when every cluster's new representative has the
        /// same <c>Content</c> as the first document of the matching entry in
        /// <paramref name="means"/>, i.e. when nothing moved, and <c>false</c> otherwise
        /// (including when there are no clusters).
        /// NOTE(review): the flag was historically held in a local named "changed" although it
        /// is true for unchanged representatives; the polarity is kept for existing callers -
        /// confirm how they interpret it.
        /// Every entry of <paramref name="means"/> and <paramref name="clustering"/> must contain
        /// at least one document, and <paramref name="clustering"/> must not have more entries
        /// than <paramref name="means"/>; otherwise indexing fails.
        /// </remarks>
        /// <param name="data">Not used by this method.</param>
        /// <param name="clustering">Clusters whose members are the candidates for the new representatives.</param>
        /// <param name="means">Current representatives; the mean of each entry's documents is the target point.</param>
        /// <returns>The stability flag described above and the list of new representatives.</returns>
        public static Tuple <bool, List <TestCentroid> > UpdateMeans(List <DocumentVectorTest> data, List <TestCentroid> clustering, List <TestCentroid> means)
        {
            int numClusters = means.Count;

            List <TestCentroid> centers  = TestCentroidInitializer(numClusters);
            List <TestCentroid> newMeans = TestCentroidInitializer(numClusters);

            // Step 1: arithmetic mean of the documents of every entry in means.
            for (int i = 0; i < numClusters; i++)
            {
                var members = means[i].GroupedDocument;

                DocumentVectorTest newCenter = new DocumentVectorTest();
                newCenter.VectorSpace = new float[members[0].VectorSpace.Length];

                for (int j = 0; j < members.Count; j++)
                {
                    for (int v = 0; v < members[j].VectorSpace.Length; v++)
                    {
                        newCenter.VectorSpace[v] += members[j].VectorSpace[v];
                    }
                }

                // Divide once, after all documents have been summed. Dividing inside the
                // per-document loop would rescale the running sum at every step and would not
                // produce the mean for clusters with more than one document.
                for (int v = 0; v < newCenter.VectorSpace.Length; v++)
                {
                    newCenter.VectorSpace[v] = newCenter.VectorSpace[v] / members.Count;
                }

                newCenter.Content = "ideal center of " + i.ToString() + " cluster";
                centers[i].GroupedDocument.Add(newCenter);
            }

            // Step 2: in every cluster select the member nearest to its ideal center
            // (the first member wins ties).
            for (int i = 0; i < clustering.Count; i++)
            {
                var   candidates = clustering[i].GroupedDocument;
                var   center     = centers[i].GroupedDocument[0].VectorSpace;
                float minVal     = float.MaxValue;
                int   minIndex   = 0;

                for (int j = 0; j < candidates.Count; j++)
                {
                    float distance = Distance(candidates[j].VectorSpace, center);
                    if (distance < minVal)
                    {
                        minVal   = distance;
                        minIndex = j;
                    }
                }
                newMeans[i].GroupedDocument.Add(candidates[minIndex]);
            }

            // Step 3: the flag must reflect all clusters, not only the last one compared.
            bool allSame = clustering.Count > 0;
            for (int i = 0; i < clustering.Count; i++)
            {
                if (means[i].GroupedDocument[0].Content != newMeans[i].GroupedDocument[0].Content)
                {
                    allSame = false;
                }
            }

            return new Tuple <bool, List <TestCentroid> >(allSame, newMeans);
        }