/// <summary>
/// Computes the Euclidean distance between the vector spaces of two documents.
/// </summary>
/// <param name="doc1">First document; the length of its vector drives the iteration.</param>
/// <param name="doc2">Second document; it is indexed at every position that <paramref name="doc1"/> has.</param>
/// <returns>The square root of the summed squared component differences.</returns>
private static float GetDocumentDistance(DocumentVectorTest doc1, DocumentVectorTest doc2)
{
    float sumOfSquares = 0.0f;
    int index = 0;

    while (index < doc1.VectorSpace.Length)
    {
        // The difference is taken in single precision first, then widened for Math.Pow.
        double delta = (double)(doc1.VectorSpace[index] - doc2.VectorSpace[index]);
        sumOfSquares += (float)Math.Pow(delta, 2.0);
        index++;
    }

    return (float)Math.Pow((double)sumOfSquares, 0.5);
}
/// <summary>
/// Shifts every component of <paramref name="documentVector1"/> by a step derived from its distance to
/// <paramref name="documentVector2"/>, and reports that distance.
/// </summary>
/// <param name="documentVector1">Document whose VectorSpace is modified in place.</param>
/// <param name="documentVector2">Document used only to measure the distance.</param>
/// <param name="G">Scaling constant applied to the step.</param>
/// <returns>The distance between the two documents, measured before the move.</returns>
/// <remarks>
/// The step added to each component is distance * (G / distance^3), i.e. the same scalar for every dimension.
/// NOTE(review): an earlier, disabled variant scaled the step by the per-component difference
/// (documentVector2 - documentVector1) instead of by the distance; confirm which form is intended.
/// </remarks>
private static float Move(DocumentVectorTest documentVector1, DocumentVectorTest documentVector2, float G)
{
    float distance = GetDocumentDistance(documentVector1, documentVector2);

    // Coinciding documents: distance^3 is 0, so G / 0 is infinite (or NaN when G is 0) and multiplying it
    // by a zero distance would write NaN into every component. Leave the vector untouched instead.
    if (distance == 0.0f)
    {
        return distance;
    }

    // The step does not depend on the component index, so it is computed once.
    float step = distance * (G / (float)Math.Pow(distance, 3.0));

    int length = documentVector1.VectorSpace.Length;
    for (int i = 0; i < length; i++)
    {
        documentVector1.VectorSpace[i] = documentVector1.VectorSpace[i] + step;
    }

    return distance;
}
/// <summary>
/// Reads a text file line by line and turns each line into a two-component document vector.
/// </summary>
/// <param name="fileName">Path of the file to read (decoded as UTF-8, with byte-order-mark detection enabled).</param>
/// <returns>
/// One <see cref="DocumentVectorTest"/> per line, in file order. Content holds the 1-based line number as text.
/// </returns>
/// <remarks>
/// Each line is stripped of leading and trailing spaces and then split on space, comma, slash, dot, hyphen
/// and tab. Only the first two resulting tokens are parsed. Because '.' and '-' are separators, a decimal
/// point or a minus sign splits a token instead of being part of the number. A line that yields fewer than
/// two tokens, or a token that float.Parse rejects (including an empty one), makes the method throw.
/// </remarks>
public static List<DocumentVectorTest> CreatingTheDocVectorCollection(string fileName)
{
    const Int32 BufferSize = 128;
    char[] separators = { ' ', ',', '/', '.', '-', '\t' };
    var documents = new List<DocumentVectorTest>();

    using (var fileStream = File.OpenRead(fileName))
    using (var reader = new StreamReader(fileStream, Encoding.UTF8, true, BufferSize))
    {
        int lineNumber = 0;
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            lineNumber++;
            string[] tokens = row.Trim(' ').Split(separators);

            var document = new DocumentVectorTest();
            document.VectorSpace = new float[2];
            document.VectorSpace[0] = float.Parse(tokens[0]);
            document.VectorSpace[1] = float.Parse(tokens[1]);

            // The line number doubles as the document's identifier.
            document.Content = lineNumber.ToString();

            documents.Add(document);
        }
    }

    return documents;
}
/// <summary>
/// Collapses every cluster to a single representative document, chosen by its distance to the cluster's
/// first document.
/// </summary>
/// <param name="fillCentroidCollection">
/// Clusters to reduce. Each cluster's GroupedDocument list is cleared and refilled with the one selected
/// document, so the passed-in centroids are mutated.
/// </param>
/// <param name="vectorSpace">Not read by this method; retained so the signature stays unchanged for callers.</param>
/// <returns>A new list holding the same (mutated) centroid instances in the same order.</returns>
/// <remarks>
/// For each cluster, the Euclidean distance from its first document to every document is evaluated. The
/// representative is the document with the smallest non-zero distance that does not exceed the running
/// threshold (ties go to the later document); if no document qualifies, the first document is kept.
/// NOTE(review): the running threshold starts at 0.1 and is not reset between clusters, so it can only
/// tighten as clusters are processed. This is carried over unchanged - confirm whether it is intended.
/// The previous body also summed and averaged all document vectors into a scratch array that was never
/// read; that dead work has been removed.
/// </remarks>
public static List<TestCentroid> UpdateMeans(List<TestCentroid> fillCentroidCollection, List<DocumentVectorTest> vectorSpace)
{
    float minDist = 0.1F;

    for (int i = 0; i < fillCentroidCollection.Count; i++)
    {
        var documents = fillCentroidCollection[i].GroupedDocument;
        int index = 0;

        // Single pass: each distance is tested against the running threshold as soon as it is known.
        // "<=" lets a later document with an equal distance replace an earlier one.
        for (int j = 0; j < documents.Count; j++)
        {
            float currentValue = SimilarityMatrixCalculations.FindEuclideanDistance(documents[0].VectorSpace, documents[j].VectorSpace);
            if (currentValue <= minDist && currentValue != 0)
            {
                minDist = currentValue;
                index = j;
            }
        }

        // For an empty cluster this lookup fails, exactly as it did before.
        DocumentVectorTest newClusterCenter = documents[index];
        documents.Clear();
        documents.Add(newClusterCenter);
    }

    return new List<TestCentroid>(fillCentroidCollection);
}
/*
 * private static bool UpdateMeans(List<DocumentVectorTest> data, List<TestCentroid> clustering, List<TestCentroid> means)
 * {
 *  int numClusters = means.Count;
 *  bool changed = false;
 *
 *  List<TestCentroid> newClustering = new List<TestCentroid>(numClusters);
 *  float[] distances = new float[numClusters];
 *  for(int i = 0; i < data.Count; i++)
 *  {
 *      for (int k = 0; k < numClusters; k++)
 *          distances[k] = Logic.ClusteringAlgorithms.SimilarityMatrixCalculations.FindEuclideanDistance(data[i].VectorSpace, means[k].GroupedDocument[0].VectorSpace);
 *
 *      int newClusterID = MinIndex(distances);
 *
 *      if(newClustering[newClusterID].GroupedDocument[0].Content != newClustering[i].GroupedDocument[0].Content)
 *      {
 *          changed = true;
 *          newClustering[i] = newClustering[newClusterID];
 *      }
 *  }
 *  if (changed == false)
 *      return false;
 *
 *
 *  List<TestCentroid> clusterCounts = new List<TestCentroid>(numClusters);
 *  for(int i = 0; i < data.Count; i++)
 *  {
 *      var cluster = newClustering[i]; ++clusterCounts.IndexOf(cluster);
 *  }
 *
 *  return true;
 * }
 */
#endregion
#region OldUpdateMeans
/*
 * public static bool UpdateMeans(List<DocumentVectorTest> data, List<TestCentroid> clustering, List<TestCentroid> means)
 * {
 *  List<DocumentVectorTest> newVectorSpace = new List<DocumentVectorTest>(data);
 *  int vectorSpaceLength = data[0].VectorSpace.Length;
 *  int numClusters = means.Count;
 *  int[] clusterCounts = new int[numClusters];
 *  #region AccordingToTheAuthorsWeCanOmitThis
 *
 *  for(int c = 0; c < data.Count; c++)
 *  {
 *      var cluster = data[c]; ++clusterCounts[data.IndexOf(cluster)];
 *  }
 *  #endregion
 *
 *  for (int k = 0; k < numClusters; k++)
 *      if (clusterCounts[k] == 0)
 *          return false;
 *
 *  for (int cl = 0; cl < means.Count; cl++)
 *      clustering[cl].GroupedDocument.Clear();
 *
 *  for(int i = 0; i < data.Count; i++)
 *  {
 *      var cluster = clustering.ElementAt(i);
 *      for (int j = 0; j < cluster.GroupedDocument.Count; j++)
 *          for (int v = 0; v < cluster.GroupedDocument[j].VectorSpace.Length; v++)
 *              clustering[clustering.IndexOf(cluster)].GroupedDocument[j].VectorSpace[v] += data[i].VectorSpace[v];
 *  }
 *
 *  for (int k = 0; k < clustering.Count; ++k)
 *      for (int j = 0; j < clustering[k].GroupedDocument.Count; j++)
 *          for (int z = 0; z < clustering[k].GroupedDocument[j].VectorSpace.Length; z++)
 *              clustering[k].GroupedDocument[j].VectorSpace[z] /= clusterCounts[k];
 *  return true;
 * }
 */
#endregion

/// <summary>
/// Picks a new representative document for every cluster: the arithmetic mean of the documents in
/// <paramref name="means"/> gives an ideal center, and the member of <paramref name="clustering"/> closest
/// to that center becomes the cluster's representative.
/// </summary>
/// <param name="data">Not read by this method.</param>
/// <param name="clustering">Candidate documents per cluster; cluster i is matched with entry i of <paramref name="means"/>.</param>
/// <param name="means">
/// Documents averaged per cluster to obtain the ideal center. The first document of each entry is also the
/// previous representative used for the comparison in the result flag.
/// </param>
/// <returns>
/// Item1 is true when there is at least one cluster and every cluster's new representative has the same
/// Content as the first document of the matching entry in <paramref name="means"/>. Item2 holds the new
/// representatives, one single-document centroid per cluster.
/// </returns>
/// <remarks>
/// NOTE(review): the previous body kept this flag in a local named "changed" yet set it to true when the
/// Content values matched, and each cluster overwrote the value left by the one before it. The
/// true-on-match meaning is kept so callers see the same polarity; the flag now accounts for all clusters.
/// Confirm with the caller whether true is meant to signal stability or change.
/// </remarks>
public static Tuple<bool, List<TestCentroid>> UpdateMeans(List<DocumentVectorTest> data, List<TestCentroid> clustering, List<TestCentroid> means)
{
    int numClusters = means.Count;
    List<TestCentroid> centers = TestCentroidInitializer(numClusters);
    List<TestCentroid> newMeans = TestCentroidInitializer(numClusters);

    // Step 1: ideal center of each cluster = component-wise mean of its documents.
    for (int i = 0; i < means.Count; i++)
    {
        var members = means[i].GroupedDocument;

        DocumentVectorTest newCenter = new DocumentVectorTest();
        newCenter.VectorSpace = new float[members[0].VectorSpace.Length];

        for (int j = 0; j < members.Count; j++)
        {
            for (int v = 0; v < members[j].VectorSpace.Length; v++)
            {
                newCenter.VectorSpace[v] += members[j].VectorSpace[v];
            }
        }

        // Divide once, after the full sum. Dividing inside the accumulation loop (as before) rescaled the
        // partial sum after every document and did not yield the mean.
        for (int v = 0; v < newCenter.VectorSpace.Length; v++)
        {
            newCenter.VectorSpace[v] = newCenter.VectorSpace[v] / members.Count;
        }

        newCenter.Content = "ideal center of " + i.ToString() + " cluster";
        centers[i].GroupedDocument.Add(newCenter);
    }

    // Step 2: the candidate closest to the ideal center becomes the new representative.
    for (int i = 0; i < clustering.Count; i++)
    {
        var candidates = clustering[i].GroupedDocument;
        float minVal = float.MaxValue;
        int minIndex = 0;

        for (int j = 0; j < candidates.Count; j++)
        {
            float distance = Distance(candidates[j].VectorSpace, centers[i].GroupedDocument[0].VectorSpace);

            // Strict "<" keeps the earliest candidate on ties.
            if (distance < minVal)
            {
                minVal = distance;
                minIndex = j;
            }
        }

        newMeans[i].GroupedDocument.Add(candidates[minIndex]);
    }

    // Step 3: compare old and new representatives across ALL clusters.
    // With no clusters the flag stays false, matching the previous result for that case.
    bool allRepresentativesMatch = clustering.Count > 0;
    for (int i = 0; i < clustering.Count; i++)
    {
        bool sameContent = means[i].GroupedDocument[0].Content == newMeans[i].GroupedDocument[0].Content;
        if (!sameContent)
        {
            allRepresentativesMatch = false;
        }
    }

    return new Tuple<bool, List<TestCentroid>>(allRepresentativesMatch, newMeans);
}