// Returns the index of the cluster centroid most similar to <paramref name="obj"/>.
// Similarity is cosine similarity over the vector space when sim == "Cosine",
// otherwise a WordNet-based sentence similarity over the raw content.
// Each centroid is represented by its first grouped document; ties resolve to
// the lowest index.
public static int FindClosestClusterCenter(List<Centroid> clusterCenter, DocumentVector obj, string sim)
{
    float[] similarityMeasure = new float[clusterCenter.Count];

    // Hoisted out of the loop: the WordNet dictionary path and the scorer are
    // loop-invariant (the original re-assigned/re-constructed them per centroid).
    SentenceSimilarity semsim = null;
    if (sim != "Cosine")
    {
        // NOTE(review): hard-coded WordNet install path — consider making this configurable.
        Wnlib.WNCommon.path = "C:\\Program Files\\WordNet\\3.0\\dict\\";
        semsim = new SentenceSimilarity();
    }

    for (int i = 0; i < clusterCenter.Count; i++)
    {
        similarityMeasure[i] = sim == "Cosine"
            ? SimilarityMatrics.FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, obj.VectorSpace)
            : semsim.GetScore(clusterCenter[i].GroupedDocument[0].Content, obj.Content);
    }

    // Pick the highest-similarity centroid; start at 1 since element 0 is the
    // running maximum already.
    int index = 0;
    float maxValue = similarityMeasure[0];
    for (int i = 1; i < similarityMeasure.Length; i++)
    {
        if (similarityMeasure[i] > maxValue)
        {
            maxValue = similarityMeasure[i];
            index = i;
        }
    }
    return index;
}
/// <summary>
/// Prepares a collection of documents in vector space.
/// </summary>
/// <param name="collection">Document collection/corpus</param>
/// <returns>List of documents represented in vector space</returns>
public static List<DocumentVector> ProcessDocumentCollection(DocumentCollection collection)
{
    distinctTerms = new HashSet<string>();
    documentCollection = collection.DocumentList;

    // Gather every non-stopword token in the corpus; the dimension of the
    // vector space equals the number of distinct terms collected here.
    foreach (string documentContent in collection.DocumentList)
    {
        foreach (string token in r.Split(documentContent))
        {
            if (!StopWordsHandler.IsStotpWord(token))
            {
                distinctTerms.Add(token);
            }
        }
    }

    // Strip punctuation and whitespace artifacts left over from tokenisation.
    string[] junkTokens = { "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", "," };
    foreach (string junk in junkTokens)
    {
        distinctTerms.Remove(junk);
    }

    // Build one TF-IDF weight vector per document, one slot per distinct term.
    var documentVectorSpace = new List<DocumentVector>();
    foreach (string document in documentCollection)
    {
        float[] weights = new float[distinctTerms.Count];
        int slot = 0;
        foreach (string term in distinctTerms)
        {
            weights[slot++] = FindTFIDF(document, term);
        }

        var vector = new DocumentVector();
        vector.Content = document;
        vector.VectorSpace = weights;
        documentVectorSpace.Add(vector);
    }
    return documentVectorSpace;
}
// Returns the index of the cluster centroid most similar (by cosine similarity)
// to the given document vector. Each centroid is represented by its first
// grouped document; ties resolve to the lowest index.
private static int FindClosestClusterCenter(List<Centroid> clusterCenter, DocumentVector obj)
{
    // Use the Count property / array Length instead of the LINQ Count()
    // extension (the original called the extension on every loop test).
    float[] similarityMeasure = new float[clusterCenter.Count];
    for (int i = 0; i < clusterCenter.Count; i++)
    {
        similarityMeasure[i] = SimilarityMatrics.FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, obj.VectorSpace);
    }

    // Scan for the maximum; start at 1 since element 0 is the running maximum.
    int index = 0;
    float maxValue = similarityMeasure[0];
    for (int i = 1; i < similarityMeasure.Length; i++)
    {
        if (similarityMeasure[i] > maxValue)
        {
            maxValue = similarityMeasure[i];
            index = i;
        }
    }
    return index;
}
/// <summary>
/// Prepares a collection of documents in vector space.
/// Distinct terms are deduplicated case-insensitively and restricted to
/// lengths 4..25; each document gets a TF-IDF weight vector plus a parallel
/// array of the terms carrying non-zero weight.
/// </summary>
/// <param name="collection">Document collection/corpus</param>
/// <returns>List of documents represented in vector space</returns>
public static List<DocumentVector> ProcessDocumentCollection(DocumentCollection collection)
{
    distinctTerms = new HashSet<string>();
    documentCollection = collection.DocumentList;

    // 'seen' gives O(1) case-insensitive dedup. The original called the LINQ
    // Contains(term, StringComparer.CurrentCultureIgnoreCase) extension, which
    // enumerates the entire set for every token — accidentally O(n^2).
    // Equivalence note: case-insensitive equality implies equal length, so
    // tracking all seen tokens (not just length-filtered ones) cannot change
    // which terms pass the 4..25 length filter below.
    HashSet<string> seen = new HashSet<string>(StringComparer.CurrentCultureIgnoreCase);
    foreach (string documentContent in collection.DocumentList)
    {
        foreach (string term in r.Split(documentContent))
        {
            if (!StopWordsHandler.IsStotpWord(term) && seen.Add(term))
            {
                if (term.Length >= 4 && term.Length <= 25)
                {
                    distinctTerms.Add(term);
                }
            }
        }
    }

    // Strip punctuation/markup artifacts left over from tokenisation.
    List<string> removeList = new List<string>() { "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ",", "<", ">", "@", ";", "#" };
    foreach (string s in removeList)
    {
        distinctTerms.Remove(s);
    }

    List<DocumentVector> documentVectorSpace = new List<DocumentVector>();
    foreach (string document in documentCollection)
    {
        int count = 0;
        float[] space = new float[distinctTerms.Count];
        string[] keys = new string[distinctTerms.Count]; // slot stays null where TF-IDF is 0
        foreach (string term in distinctTerms)
        {
            space[count] = FindTFIDF(document, term);
            if (space[count] > 0)
            {
                keys[count] = term; // remember which term this non-zero weight belongs to
            }
            count++;
        }

        DocumentVector _documentVector = new DocumentVector();
        _documentVector.Content = document;
        _documentVector.VectorSpace = space;
        _documentVector.keys = keys;
        documentVectorSpace.Add(_documentVector);
    }

    // Bug fix: the original printed distinctTerms.ToString(), which yields the
    // type name ("System.Collections.Generic.HashSet`1[...]"), not the terms.
    Console.WriteLine(string.Join(", ", distinctTerms));
    return documentVectorSpace;
}