public static float GetEucliedeanDistance(DocumentVector docVector1, DocumentVector docVector2)
{
    // Note: the previous version accumulated into a shared float from inside a
    // Parallel.For, which is a data race and produced nondeterministic sums.
    // A plain sequential loop is correct here.
    float euclideanDistance = 0;
    for (var i = 0; i < docVector1.VectorSpace.Length; i++)
    {
        float diff = docVector1.VectorSpace[i] - docVector2.VectorSpace[i];
        euclideanDistance += diff * diff;
    }

    var end_result = (float)Math.Sqrt(euclideanDistance);
    return float.IsNaN(end_result) ? 0 : end_result;
}
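// If the vectors are long enough that parallelism pays off, the race-free way
// to parallelize the reduction is Parallel.For with thread-local partial sums.
// This is a minimal sketch, not part of the original code; the method name is
// hypothetical, and it assumes both VectorSpace arrays have equal length, as
// the sequential method above already does.
public static float GetEuclideanDistanceParallel(DocumentVector docVector1, DocumentVector docVector2)
{
    float sumOfSquares = 0f;
    object gate = new object();
    Parallel.For(0, docVector1.VectorSpace.Length,
        () => 0f,                                            // per-thread partial sum
        (i, state, local) =>
        {
            float diff = docVector1.VectorSpace[i] - docVector2.VectorSpace[i];
            return local + diff * diff;
        },
        local => { lock (gate) { sumOfSquares += local; } }  // merge partials once per thread
    );
    float result = (float)Math.Sqrt(sumOfSquares);
    return float.IsNaN(result) ? 0 : result;
}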
public static int FindClosestClusterCenter(List<Centroid> clusterCenter, DocumentVector docVector)
{
    float[] similarityMeasure = new float[clusterCenter.Count];

    // The previous loop condition (i < clusterCenter.Count - 1) skipped the
    // last centroid, leaving its similarity at the default 0; iterate over all.
    for (int i = 0; i < clusterCenter.Count; i++)
    {
        similarityMeasure[i] = SimilarityMatrixCalculations.CalculateCosineSimilarity(
            clusterCenter[i].GroupedDocument[0].VectorSpace, docVector.VectorSpace);
    }

    // Cosine similarity: larger means closer, so pick the maximum.
    int index = 0;
    float maxValue = similarityMeasure[0];
    for (int j = 1; j < similarityMeasure.Length; j++)
    {
        if (similarityMeasure[j] > maxValue)
        {
            maxValue = similarityMeasure[j];
            index = j;
        }
    }
    return index;
}
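// A typical assignment step built on FindClosestClusterCenter. This is an
// illustrative sketch only: the method name is hypothetical, it assumes
// Centroid.GroupedDocument is a settable list (as used above), and it keeps
// each centroid's first document as the representative seed before re-grouping.
public static void AssignDocumentsToClusters(List<Centroid> centroids, List<DocumentVector> documents)
{
    // Reset each cluster to just its seed (FindClosestClusterCenter compares
    // against GroupedDocument[0]).
    foreach (var centroid in centroids)
    {
        var seed = centroid.GroupedDocument[0];
        centroid.GroupedDocument = new List<DocumentVector> { seed };
    }
    foreach (var doc in documents)
    {
        int closest = FindClosestClusterCenter(centroids, doc);
        centroids[closest].GroupedDocument.Add(doc);
    }
}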
public static List<Centroid> GetSeedPoints(List<DocumentVector> docCollection, int count)
{
    List<DocumentVector> documentCollection = docCollection;
    List<Centroid> seedPoints = new List<Centroid>(count);
    Document documentDetails = new Document();
    List<Document> detailedDocumentCollection = new List<Document>();
    int index = 0;

    // Draw the first seed uniformly from the whole collection. The previous
    // upper bound was `count` (the number of clusters), which restricted the
    // choice to the first few documents and could index past the end of a
    // small collection; this caused the exception reported at this point.
    firstIndex = GenerateRandomNumber(0, documentCollection.Count);

    Centroid first_Centroid = new Centroid();
    first_Centroid.GroupedDocument = new List<DocumentVector>();
    DocumentVector docVect = documentCollection[firstIndex];
    try
    {
        first_Centroid.GroupedDocument.Add(docVect);
    }
    catch (Exception ex)
    {
        string processing_log = @"F:\Magistry files\Initialization_cluser_K-means_pp_log.txt";
        using (StreamWriter sw = File.AppendText(processing_log))
        {
            sw.WriteLine(DateTime.Now.ToString() + " The error occurred: " + ex.ToString() + '\n');
        }
        System.Windows.MessageBox.Show("Error occurred: " + ex.ToString(), "Error!", System.Windows.MessageBoxButton.OK);
    }
    seedPoints.Add(first_Centroid); // one centroid seeded with a uniformly random document

    // Each iteration adds exactly one centroid; starting from one seed,
    // count - 1 iterations yield `count` seeds in total (the old bound
    // `i <= count` produced count + 2 of them).
    for (int i = 1; i < count; i++)
    {
        if (seedPoints.Count >= 2)
        {
            Document minDocumentDetails = GetMinDocumenDetailsDistance(detailedDocumentCollection);
            minDocumentDetails.VectorSpace = minDocumentDetails.SeedDocument.VectorSpace;
            minDocumentDetails.document_Content = minDocumentDetails.SeedDocument.Content;

            /* An earlier version resolved article and author IDs at this point:
             * using (var dbContext = new ArticlesDataContainer())
             * {
             *     var PP_article_Id = dbContext.PP_ArticlesSet.SqlQuery(@"SELECT article_Id FROM dbo.PP_ArticlesSet WHERE dbo.PP_ArticlesSet.article_title LIKE " + "%" + minDocumentDetails.document_Content + "%");
             *     minDocumentDetails.document_ID = Convert.ToInt32(PP_article_Id.ToArray()[0]);
             *     var PP_author_Id = dbContext.PP_ArticlesSet.SqlQuery(@"SELECT Author_author_Id FROM dbo.PP_ArticlesAuthor WHERE dbo.PP_ArticlesAuthor.PP_Articles_article_Id=" + PP_article_Id.ToArray()[0]);
             *     foreach (var authors_id in PP_author_Id)
             *     {
             *         for (int j = 0; j <= minDocumentDetails.author_ID.Length - 1; j++)
             *         {
             *             minDocumentDetails.author_ID[j] = Convert.ToInt32(authors_id);
             *         }
             *     }
             * }
             */

            // Weighted draw: documents far from their nearest existing seed are
            // more likely to be chosen (the k-means++ D^2 rule).
            index = GetWeightedProbDist(minDocumentDetails.Weights, minDocumentDetails.Sum);
            DocumentVector subsequentDocument = documentCollection[index];
            Centroid subsequentCentroid = new Centroid();
            subsequentCentroid.GroupedDocument = new List<DocumentVector>();
            subsequentCentroid.GroupedDocument.Add(subsequentDocument);
            seedPoints.Add(subsequentCentroid);

            documentDetails = GetAllDetails(documentCollection, subsequentCentroid, new Document());
            detailedDocumentCollection.Add(documentDetails);
        }
        else
        {
            documentDetails = GetAllDetails(documentCollection, first_Centroid, new Document());
            detailedDocumentCollection.Add(documentDetails);
            index = GetWeightedProbDist(documentDetails.Weights, documentDetails.Sum);
            DocumentVector secondDocumentVector = documentCollection[index];
            Centroid second_Centroid = new Centroid();
            second_Centroid.GroupedDocument = new List<DocumentVector>();
            try
            {
                second_Centroid.GroupedDocument.Add(secondDocumentVector);
            }
            catch (Exception ex)
            {
                string processing_log = @"F:\Magistry files\Initialization_cluser_K-means_pp_log.txt";
                using (StreamWriter sw = File.AppendText(processing_log))
                {
                    sw.WriteLine(DateTime.Now.ToString() + " The error occurred: " + ex.ToString() + '\n');
                }
                System.Windows.MessageBox.Show("Error occurred: " + ex.ToString(), "Error!", System.Windows.MessageBoxButton.OK);
            }
            seedPoints.Add(second_Centroid);

            documentDetails = GetAllDetails(documentCollection, second_Centroid, new Document());
            detailedDocumentCollection.Add(documentDetails);
        }
    }
    // TODO: take the distance to the other cluster centroids into account here.
    // PointDetails minpd = GetMinDPD(pds);
    return seedPoints;
}
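// GetWeightedProbDist is called above but defined elsewhere. A standard
// k-means++ weighted draw looks like the hypothetical sketch below: treat each
// weight (squared distance to the nearest existing seed) as probability mass,
// draw a uniform value in [0, sum), and walk the cumulative sum until it is
// crossed. The signature is assumed from the call sites above.
private static readonly Random weightedRng = new Random();

public static int GetWeightedProbDistSketch(float[] weights, float sum)
{
    double threshold = weightedRng.NextDouble() * sum; // uniform point in the total mass
    double cumulative = 0;
    for (int i = 0; i < weights.Length; i++)
    {
        cumulative += weights[i];
        if (cumulative >= threshold)
        {
            return i; // index i is chosen with probability weights[i] / sum
        }
    }
    return weights.Length - 1; // rounding edge case: fall back to the last index
}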
public static List<DocumentVector> DocumentCollectionProcessing(List<string> collection)
{
    parallelOption.MaxDegreeOfParallelism = 20;
    var vector_space_model_calculation = Stopwatch.StartNew();

    // Build the term vocabulary from the database (lower-cased, deduplicated by
    // the HashSet). An earlier version tokenized the documents and filtered
    // stop words in code instead:
    /*
     * foreach (string documentContent in documentCollection)
     * {
     *     foreach (string term in r.Split(documentContent))
     *     {
     *         if (!StopWordsHandler.IsStotpWord(term))
     *             dTerms.Add(term);
     *     }
     * }
     * List<string> removeList = new List<string>() { "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", "," };
     * foreach (string s in removeList)
     *     dTerms.Remove(s);
     */
    termHashset = new HashSet<string>();
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        dbContext.Terms_Vocabulary.Load();
        foreach (var terms in dbContext.Terms_Vocabulary.Local)
        {
            termHashset.Add(terms.term_value.ToLower());
        }
    }

    List<DocumentVector> documentVectorSpace = new List<DocumentVector>();
    object collectionLock = new object();

    // Parallelized on 04.10.2017 to cut the vector-space build time. Two fixes
    // against the earlier version: `space` and `_documentVector` must be local
    // to the lambda (sharing them across iterations is a data race that mixed
    // up documents and vectors), and List<T>.Add is not thread-safe, so the
    // final Add is taken under a lock.
    Parallel.ForEach(collection, parallelOption, document =>
    {
        int count = 0;
        float[] space = new float[termHashset.Count];
        foreach (string term in termHashset)
        {
            space[count] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collection, document, term);
            count++;
        }

        // (A block that resolved ArticleID against the PG/PP/UMK/UG/WSB
        // article sets lived here until 21.05.2018; it is not used now.)

        DocumentVector _documentVector = new DocumentVector();
        _documentVector.Content = document;
        _documentVector.VectorSpace = space;
        _documentVector.index_Of_Doc_for_labeling = collection.IndexOf(document);
        lock (collectionLock)
        {
            documentVectorSpace.Add(_documentVector);
        }
    });

    vector_space_model_calculation.Stop();
    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: "
            + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":"
            + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
    }
    return documentVectorSpace;
}
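// FindTFIDF (TFIDF2ndrealization) is referenced above but defined elsewhere.
// Below is a minimal sketch of the usual tf-idf computation it is assumed to
// implement: term frequency within the document times a smoothed inverse
// document frequency over the collection. The method name, whitespace
// tokenization, and smoothing are hypothetical; requires System.Linq.
public static float FindTFIDFSketch(List<string> collection, string document, string term)
{
    string[] tokens = document.Split(' ');
    int termCount = tokens.Count(t => string.Equals(t, term, StringComparison.OrdinalIgnoreCase));
    if (termCount == 0)
        return 0f; // term absent from this document

    float tf = (float)termCount / tokens.Length;

    // Number of documents in the collection containing the term at least once.
    int docsWithTerm = collection.Count(d => d.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0);
    float idf = (float)Math.Log((float)collection.Count / (1 + docsWithTerm)); // +1 avoids division by zero

    return tf * idf;
}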
internal static List<DocumentVector> DocumentCollectionProcessingDictionary(Dictionary<int, string> docCollectionDictionary)
{
    parallelOption.MaxDegreeOfParallelism = 20;
    var vector_space_model_calculation = Stopwatch.StartNew();

    termHashset = new HashSet<string>();
    using (var dbContext = new ArticleDBDataModelContainer())
    {
        dbContext.Terms_Vocabulary.Load();
        foreach (var terms in dbContext.Terms_Vocabulary.Local)
        {
            termHashset.Add(terms.term_value.ToLower());
        }
    }

    List<DocumentVector> documentVectorSpace = new List<DocumentVector>();
    object collectionLock = new object();
    var arrayOfDocs = docCollectionDictionary.Keys.ToArray();
    // Hoisted out of the loop: the values snapshot is identical for every
    // document, and rebuilding it per iteration was pure overhead.
    var collectionValue = docCollectionDictionary.Values.ToList();

    // As in DocumentCollectionProcessing: per-document state (`space`, `index`,
    // the DocumentVector) is kept local to the lambda, and the shared result
    // list is guarded by a lock.
    Parallel.ForEach(docCollectionDictionary, parallelOption, document =>
    {
        int count = 0;
        float[] space = new float[termHashset.Count];
        foreach (string term in termHashset)
        {
            space[count] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collectionValue, document.Value, term);
            count++;
        }

        // Position of this key in the key array, used as a stable label index.
        int index = Array.IndexOf(arrayOfDocs, document.Key);

        DocumentVector _documentVector = new DocumentVector();
        _documentVector.ArticleID = document.Key;
        _documentVector.index_Of_Doc_for_labeling = index;
        _documentVector.Content = document.Value;
        _documentVector.VectorSpace = space;
        lock (collectionLock)
        {
            documentVectorSpace.Add(_documentVector);
        }
    });

    vector_space_model_calculation.Stop();
    string processing_log = @"F:\Magistry files\Processing_log.txt";
    using (StreamWriter sw = File.AppendText(processing_log))
    {
        sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: "
            + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":"
            + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
    }
    return documentVectorSpace;
}
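// How the pieces above compose: vectorize the corpus, draw the k-means++
// seeds, then assign every document to its closest seed. This driver is an
// illustrative sketch only; the method name is hypothetical, k = 5 is
// arbitrary, and a full clustering run would iterate the assignment and
// centroid-update steps until convergence rather than assigning once.
public static List<Centroid> ClusterCollectionSketch(List<string> rawDocuments)
{
    int k = 5; // number of clusters, chosen arbitrarily for the example
    List<DocumentVector> vectors = DocumentCollectionProcessing(rawDocuments);
    List<Centroid> seeds = GetSeedPoints(vectors, k);
    foreach (var doc in vectors)
    {
        int closest = FindClosestClusterCenter(seeds, doc);
        seeds[closest].GroupedDocument.Add(doc);
    }
    return seeds; // each centroid now holds its seed plus its assigned documents
}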