/// <summary>
/// Converts every document of a corpus into its vector-space representation.
/// </summary>
/// <remarks>
/// As a side effect this resets the shared <c>distinctTerms</c> set and the shared
/// <c>documentCollection</c> reference. Each resulting vector has one component per
/// distinct term, and the component value is whatever <c>FindTFIDF</c> returns for that
/// (document, term) pair.
/// </remarks>
/// <param name="collection">Corpus whose <c>DocumentList</c> holds the raw document texts.</param>
/// <returns>One <see cref="DocumentVector"/> per document, in the iteration order of the document list.</returns>
public static List<DocumentVector> ProcessDocumentCollection(DocumentCollection collection)
{
    distinctTerms = new HashSet<string>();
    documentCollection = collection.DocumentList;

    // Pass 1: gather the vocabulary of the whole corpus. The number of distinct
    // terms fixes the dimension of every document vector built below.
    foreach (string text in collection.DocumentList)
    {
        foreach (string token in r.Split(text))
        {
            // Original author's note: this stop-word check is probably not needed.
            if (StopWordsHandler.IsStotpWord(token))
            {
                continue;
            }

            distinctTerms.Add(token);
        }
    }

    // Drop punctuation, whitespace and empty tokens that the splitter may have produced.
    string[] noiseTokens =
    {
        "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ","
    };
    foreach (string noise in noiseTokens)
    {
        distinctTerms.Remove(noise);
    }

    // Pass 2: build one weight vector per document, one slot per distinct term.
    var vectors = new List<DocumentVector>();
    foreach (string docText in documentCollection)
    {
        float[] weights = new float[distinctTerms.Count];
        int slot = 0;
        foreach (string term in distinctTerms)
        {
            weights[slot] = FindTFIDF(docText, term);
            slot++;
        }

        vectors.Add(new DocumentVector
        {
            Content = docText,
            VectorSpace = weights
        });
    }

    return vectors;
}
/// <summary>
/// Returns the index of the cluster center that is most similar to the given document.
/// </summary>
/// <remarks>
/// Each center is represented by the vector of its first grouped document
/// (<c>GroupedDocument[0].VectorSpace</c>), and similarity is obtained from
/// <c>SimilarityMatrics.FindCosineSimilarity</c>. When several centers are equally
/// similar, the one with the lowest index is returned.
/// </remarks>
/// <param name="clusterCenter">Candidate cluster centers; must contain at least one element.</param>
/// <param name="obj">Document whose closest center is wanted.</param>
/// <returns>Zero-based index into <paramref name="clusterCenter"/> of the most similar center.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="clusterCenter"/> is null.</exception>
/// <exception cref="System.ArgumentException"><paramref name="clusterCenter"/> is empty.</exception>
private static int FindClosestClusterCenter(List<Centroid> clusterCenter, DocumentVector obj)
{
    if (clusterCenter == null)
    {
        throw new System.ArgumentNullException("clusterCenter");
    }

    // Without at least one center there is no valid index to return; fail with a
    // descriptive error instead of an index-out-of-range from a scratch array.
    if (clusterCenter.Count == 0)
    {
        throw new System.ArgumentException("At least one cluster center is required.", "clusterCenter");
    }

    int closestIndex = 0;
    float bestSimilarity = SimilarityMatrics.FindCosineSimilarity(
        clusterCenter[0].GroupedDocument[0].VectorSpace,
        obj.VectorSpace);

    // Single pass over the remaining centers; the strict '>' keeps the lowest
    // index when two centers are equally similar.
    for (int i = 1; i < clusterCenter.Count; i++)
    {
        float similarity = SimilarityMatrics.FindCosineSimilarity(
            clusterCenter[i].GroupedDocument[0].VectorSpace,
            obj.VectorSpace);

        if (similarity > bestSimilarity)
        {
            bestSimilarity = similarity;
            closestIndex = i;
        }
    }

    return closestIndex;
}