/// <summary>
/// Builds the vector-space representation of every document in a corpus.
/// </summary>
/// <remarks>
/// As a side effect this method reassigns the enclosing type's
/// <c>distinctTerms</c> and <c>documentCollection</c> members.
/// </remarks>
/// <param name="collection">Corpus whose <c>DocumentList</c> supplies the document texts.</param>
/// <returns>
/// One <c>DocumentVector</c> per document, in the order the documents are enumerated.
/// Each vector has one component per distinct term, filled by <c>FindTFIDF</c>.
/// </returns>
public static List<DocumentVector> ProcessDocumentCollection(DocumentCollection collection)
{
    distinctTerms = new HashSet<string>();
    documentCollection = collection.DocumentList;

    // Pass 1: collect the vocabulary. Its size fixes the dimension of
    // every document vector produced below.
    foreach (string text in collection.DocumentList)
    {
        foreach (string token in r.Split(text))
        {
            if (StopWordsHandler.IsStotpWord(token))
            {
                continue;
            }

            distinctTerms.Add(token);
        }
    }

    // Drop punctuation / whitespace fragments that the splitter may have emitted as terms.
    string[] noiseTokens =
    {
        "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ","
    };
    for (int i = 0; i < noiseTokens.Length; i++)
    {
        distinctTerms.Remove(noiseTokens[i]);
    }

    // Pass 2: one weight per vocabulary term, per document.
    List<DocumentVector> vectors = new List<DocumentVector>();
    foreach (string document in documentCollection)
    {
        float[] weights = new float[distinctTerms.Count];
        int slot = 0;
        foreach (string term in distinctTerms)
        {
            weights[slot] = FindTFIDF(document, term);
            slot++;
        }

        vectors.Add(new DocumentVector { Content = document, VectorSpace = weights });
    }

    return vectors;
}
/// <summary>
/// Builds the vector-space representation of every document in a corpus,
/// keeping only terms of a bounded length that are unique ignoring case.
/// </summary>
/// <remarks>
/// As a side effect this method reassigns the enclosing type's
/// <c>distinctTerms</c> and <c>documentCollection</c> members and writes the
/// collected terms to the console.
/// </remarks>
/// <param name="collection">Corpus whose <c>DocumentList</c> supplies the document texts.</param>
/// <returns>
/// One <c>DocumentVector</c> per document, in the order the documents are enumerated.
/// <c>VectorSpace[i]</c> is the value returned by <c>FindTFIDF</c> for the i-th distinct term;
/// <c>keys[i]</c> is that term when the value is positive and is left <c>null</c> otherwise.
/// </returns>
public static List<DocumentVector> ProcessDocumentCollection(DocumentCollection collection)
{
    // Only terms whose length lies in this inclusive range enter the vocabulary.
    const int MinTermLength = 4;
    const int MaxTermLength = 25;

    distinctTerms = new HashSet<string>();
    documentCollection = collection.DocumentList;

    // Mirrors distinctTerms under a case-insensitive comparer so the
    // "already present ignoring case" test is a hash lookup instead of a
    // linear scan of the whole vocabulary for every token.
    HashSet<string> seenIgnoringCase = new HashSet<string>(StringComparer.CurrentCultureIgnoreCase);

    // Pass 1: collect the vocabulary. Its size fixes the dimension of
    // every document vector produced below.
    foreach (string documentContent in collection.DocumentList)
    {
        foreach (string term in r.Split(documentContent))
        {
            if (StopWordsHandler.IsStotpWord(term))
            {
                continue;
            }

            if (term.Length < MinTermLength || term.Length > MaxTermLength)
            {
                continue;
            }

            // Add returns false when an equal-ignoring-case term was already
            // accepted; the first spelling encountered is the one kept.
            if (seenIgnoringCase.Add(term))
            {
                distinctTerms.Add(term);
            }
        }
    }

    // Every entry here is shorter than MinTermLength, so with the current
    // length filter nothing can match; retained as a safeguard in case the
    // filter is ever relaxed.
    List<string> removeList = new List<string>()
    {
        "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ",", "<", ">", "@", ";", "#"
    };
    foreach (string s in removeList)
    {
        distinctTerms.Remove(s);
    }

    // Pass 2: one weight (and optional key) per vocabulary term, per document.
    List<DocumentVector> documentVectorSpace = new List<DocumentVector>();
    foreach (string document in documentCollection)
    {
        float[] space = new float[distinctTerms.Count];
        string[] keys = new string[distinctTerms.Count];

        int count = 0;
        foreach (string term in distinctTerms)
        {
            space[count] = FindTFIDF(document, term);
            if (space[count] > 0)
            {
                // Record the term only for dimensions that carry weight.
                keys[count] = term;
            }

            count++;
        }

        DocumentVector documentVector = new DocumentVector();
        documentVector.Content = document;
        documentVector.VectorSpace = space;
        documentVector.keys = keys;
        documentVectorSpace.Add(documentVector);
    }

    // HashSet<T>.ToString() yields only the type name, so join the terms
    // to make this diagnostic line show the actual vocabulary.
    Console.WriteLine(string.Join(", ", distinctTerms));

    return documentVectorSpace;
}