public static float GetEucliedeanDistance(DocumentVector docVector1, DocumentVector docVector2)
        {
            // Euclidean distance between two document vectors (assumes both
            // VectorSpace arrays have the same length).
            //
            // Bug fix: the previous version accumulated into a shared float from
            // inside Parallel.For. `+=` is not atomic, so concurrent updates were
            // lost and the result was nondeterministic. For a simple per-component
            // sum a sequential loop is both correct and typically faster than the
            // parallel overhead; a double accumulator also reduces rounding error.
            double sumOfSquares = 0;

            for (int i = 0; i < docVector1.VectorSpace.Length; i++)
            {
                double difference = docVector1.VectorSpace[i] - docVector2.VectorSpace[i];
                sumOfSquares += difference * difference;
            }

            float end_result = (float)Math.Sqrt(sumOfSquares);

            // Guard retained from the original: a NaN distance (e.g. NaN vector
            // components) is reported as 0 rather than propagated.
            if (float.IsNaN(end_result))
            {
                return(0);
            }
            else
            {
                return(end_result);
            }
        }
        public static int FindClosestClusterCenter(List <Centroid> clusterCenter, DocumentVector docVector)
        {
            // Returns the index of the cluster centre whose representative
            // document (GroupedDocument[0]) has the highest cosine similarity
            // to docVector. Assumes clusterCenter is non-empty and every
            // centroid has at least one grouped document.
            //
            // Bug fix: the original scoring loop ran `i < Count - 1`, so the
            // LAST cluster centre was never scored — its slot kept the default
            // value 0 and that centre could never win the arg-max. The loop now
            // covers every centre, and scoring/selection are fused into one pass.
            int   index    = 0;
            float maxValue = float.MinValue;

            for (int i = 0; i < clusterCenter.Count; i++)
            {
                float similarity = SimilarityMatrixCalculations.CalculateCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, docVector.VectorSpace);

                if (similarity > maxValue)
                {
                    maxValue = similarity;
                    index    = i;
                }
            }
            return(index);
        }
        /// <summary>
        /// K-means++-style seeding: picks the first centroid pseudo-randomly,
        /// then selects each subsequent centroid by a weighted probability
        /// distribution built from document distances (via GetAllDetails /
        /// GetWeightedProbDist).
        /// </summary>
        /// <param name="docCollection">All document vectors to seed from.</param>
        /// <param name="count">Requested number of seed centroids.</param>
        /// <returns>The list of seed centroids.</returns>
        public static List <Centroid> GetSeedPoints(List <DocumentVector> docCollection, int count)
        {
            // NOTE(review): the pre-sized list created here is immediately
            // discarded — documentCollection ends up aliasing docCollection.
            List <DocumentVector> documentCollection = new List <DocumentVector>(docCollection.Count);

            documentCollection = docCollection;
            List <Centroid> seedPoints                 = new List <Centroid>(count);
            Document        documentDetails            = new Document();
            List <Document> detailedDocumentCollection = new List <Document>();
            int             index = 0;

            // NOTE(review): the random upper bound is `count` (the number of
            // seeds), not documentCollection.Count — only the first `count`
            // documents can ever become the first seed. Confirm this is intended.
            firstIndex = GenerateRandomNumber(0, count);
            Centroid first_Centroid = new Centroid();

            first_Centroid.GroupedDocument = new List <DocumentVector>();
            DocumentVector docVect = new DocumentVector();

            docVect = documentCollection[firstIndex];
            // A NullReferenceException has been observed here; failures are
            // logged to disk and surfaced via a message box, then swallowed.
            try
            {
                first_Centroid.GroupedDocument.Add(docVect);
            }
            catch (Exception ex)
            {
                string processing_log = @"F:\Magistry files\Initialization_cluser_K-means_pp_log.txt";

                using (StreamWriter sw = File.AppendText(processing_log))
                {
                    sw.WriteLine(DateTime.Now.ToString() + " The error occured: " + ex.ToString() + '\n');
                }

                System.Windows.MessageBox.Show("Error occured: " + ex.ToString(), "Error!", System.Windows.MessageBoxButton.OK);
            }


            seedPoints.Add(first_Centroid); //here we have list with 1 document getting using random index

            // NOTE(review): `i <= count` makes count + 1 iterations, and one seed
            // was already added before the loop (plus the else-branch adds a
            // second) — verify how many seed points callers actually expect.
            for (int i = 0; i <= count; i++)
            {
                if (seedPoints.Count >= 2)
                {
                    // Steady state: pick the next centroid from the weighted
                    // distribution of the minimum-distance document details.
                    Document minDocumentDetails = GetMinDocumenDetailsDistance(detailedDocumentCollection);
                    minDocumentDetails.VectorSpace      = minDocumentDetails.SeedDocument.VectorSpace;
                    minDocumentDetails.document_Content = minDocumentDetails.SeedDocument.Content;

                    /*
                     * using(var dbContext = new ArticlesDataContainer())
                     * {
                     *  var PP_article_Id = dbContext.PP_ArticlesSet.SqlQuery(@"SELECT article_Id FROM dbo.PP_ArticlesSet WHERE dbo.PP_ArticlesSet.article_title LIKE " + "%" + minDocumentDetails.document_Content + "%");
                     *  minDocumentDetails.document_ID = Convert.ToInt32(PP_article_Id.ToArray()[0]);
                     *  var PP_author_Id = dbContext.PP_ArticlesSet.SqlQuery(@"SELECT Author_author_Id FROM dbo.PP_ArticlesAuthor WHERE dbo.PP_ArticlesAuthor.PP_Articles_article_Id="+PP_article_Id.ToArray()[0]);
                     *  foreach(var authors_id in PP_author_Id)
                     *  {
                     *      for(int j=0; j<=minDocumentDetails.author_ID.Length-1; j++)
                     *      {
                     *          minDocumentDetails.author_ID[j] = Convert.ToInt32(authors_id);
                     *      }
                     *  }
                     *
                     * }
                     */
                    index = GetWeightedProbDist(minDocumentDetails.Weights, minDocumentDetails.Sum);
                    DocumentVector SubsequentDocument = new DocumentVector();
                    SubsequentDocument = documentCollection[index];
                    Centroid subsequentCentroid = new Centroid();
                    subsequentCentroid.GroupedDocument = new List <DocumentVector>();
                    subsequentCentroid.GroupedDocument.Add(SubsequentDocument);
                    seedPoints.Add(subsequentCentroid);

                    // Recompute the detail record for the newly chosen centroid.
                    // NOTE(review): original comment reported "no objects in
                    // subsequent document" here — verify GetAllDetails handles it.
                    documentDetails = new Document();
                    documentDetails = GetAllDetails(documentCollection, subsequentCentroid, documentDetails); //the re is no objects in subsequent document - problem
                    detailedDocumentCollection.Add(documentDetails);
                }
                else
                {
                    // Bootstrap branch: derive the second seed from the first
                    // centroid's weighted distance distribution.
                    documentDetails = new Document();
                    documentDetails = GetAllDetails(documentCollection, first_Centroid, documentDetails);
                    detailedDocumentCollection.Add(documentDetails);
                    index = GetWeightedProbDist(documentDetails.Weights, documentDetails.Sum);
                    DocumentVector SecondDocumentVector = new DocumentVector();
                    SecondDocumentVector = documentCollection[index];
                    Centroid second_Centroid = new Centroid();
                    second_Centroid.GroupedDocument = new List <DocumentVector>();
                    try
                    {
                        second_Centroid.GroupedDocument.Add(SecondDocumentVector);
                    }
                    catch (Exception ex)
                    {
                        string processing_log = @"F:\Magistry files\Initialization_cluser_K-means_pp_log.txt";

                        using (StreamWriter sw = File.AppendText(processing_log))
                        {
                            sw.WriteLine(DateTime.Now.ToString() + " The error occured: " + ex.ToString() + '\n');
                        }

                        System.Windows.MessageBox.Show("Error occured: " + ex.ToString(), "Error!", System.Windows.MessageBoxButton.OK);
                    }

                    seedPoints.Add(second_Centroid);

                    documentDetails = new Document();
                    documentDetails = GetAllDetails(documentCollection, second_Centroid, documentDetails);
                    detailedDocumentCollection.Add(documentDetails);
                }
            }
            //PointDetails minpd = GetMinDPD(pds);
            //here we must to calculate distance to other clusters centroids.
            return(seedPoints);
        }
        // Exemple #4 / 0 — scrape artifact from the original code listing,
        // kept as a comment so the file remains valid C#.
        /// <summary>
        /// Builds a TF-IDF vector space model for every document in the collection.
        /// Each document gets one float weight per term of the DB-loaded vocabulary.
        /// </summary>
        /// <param name="collection">Raw document texts to vectorize.</param>
        /// <returns>One DocumentVector per input document (order not guaranteed,
        /// as vectors are produced in parallel).</returns>
        public static List <DocumentVector> DocumentCollectionProcessing(List <String> collection)
        {
            // Bug fixes over the previous version:
            //  * `space` and `_documentVector` were single method-scope locals
            //    captured by the Parallel.ForEach lambda, so ALL worker threads
            //    shared (and overwrote) the same buffer/instance — a data race
            //    that corrupted the produced vectors. Both are now created
            //    per-iteration inside the lambda.
            //  * List<T>.Add is not thread-safe; concurrent additions are now
            //    serialized with a lock.
            parallelOption.MaxDegreeOfParallelism = 20;
            var vector_space_model_calculation = Stopwatch.StartNew();

            // Load the term vocabulary once; it defines the dimensionality and
            // (via enumeration order) the component layout of every vector.
            termHashset = new HashSet <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                dbContext.Terms_Vocabulary.Load();

                foreach (var terms in dbContext.Terms_Vocabulary.Local)
                {
                    termHashset.Add(terms.term_value.ToLower());
                }
            }

            List <DocumentVector> documentVectorSpace = new List <DocumentVector>();
            object collectionGate = new object();

            Parallel.ForEach(collection, parallelOption, document => {
                // Per-iteration buffer: one TF-IDF weight per vocabulary term.
                float[] space = new float[termHashset.Count];
                int count     = 0;

                foreach (string term in termHashset)
                {
                    space[count] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collection, document, term);
                    count++;
                }

                DocumentVector documentVector = new DocumentVector();
                documentVector.Content     = document;
                documentVector.VectorSpace = space;
                // Index of this document in the input list, used later for labeling.
                documentVector.index_Of_Doc_for_labeling = collection.IndexOf(document);

                lock (collectionGate)
                {
                    documentVectorSpace.Add(documentVector);
                }
            });

            vector_space_model_calculation.Stop();

            // Append the elapsed vectorization time to the processing log.
            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: " + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":" + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
            }

            return(documentVectorSpace);
        }
        // Exemple #5 / 0 — scrape artifact from the original code listing,
        // kept as a comment so the file remains valid C#.
        /// <summary>
        /// Builds a TF-IDF vector space model for a keyed document collection.
        /// Each entry's key becomes the resulting vector's ArticleID; the key's
        /// position in the dictionary's key order becomes the labeling index.
        /// </summary>
        /// <param name="docCollectionDictionary">Map of article id to document text.</param>
        /// <returns>One DocumentVector per entry (order not guaranteed, as
        /// vectors are produced in parallel).</returns>
        internal static List <DocumentVector> DocumentCollectionProcessingDictionary(Dictionary <int, string> docCollectionDictionary)
        {
            // Bug fixes over the previous version:
            //  * `space`, `_documentVector` and `index` were method-scope locals
            //    captured by the Parallel.ForEach lambda, so worker threads shared
            //    and overwrote each other's state — a data race. All are now
            //    per-iteration locals.
            //  * List<T>.Add is not thread-safe; additions are now serialized.
            //  * docCollectionDictionary.Values.ToList() was rebuilt on every
            //    parallel iteration although it never changes; it is hoisted.
            parallelOption.MaxDegreeOfParallelism = 20;
            var vector_space_model_calculation = Stopwatch.StartNew();

            // Load the term vocabulary once; it defines the dimensionality and
            // (via enumeration order) the component layout of every vector.
            termHashset = new HashSet <string>();

            using (var dbContext = new ArticleDBDataModelContainer())
            {
                dbContext.Terms_Vocabulary.Load();

                foreach (var terms in dbContext.Terms_Vocabulary.Local)
                {
                    termHashset.Add(terms.term_value.ToLower());
                }
            }

            List <DocumentVector> documentVectorSpace = new List <DocumentVector>();
            object collectionGate   = new object();
            var    arrayOfDocs      = docCollectionDictionary.Keys.ToArray();
            var    collectionValues = docCollectionDictionary.Values.ToList();

            Parallel.ForEach(docCollectionDictionary, parallelOption, document => {
                // Per-iteration buffer: one TF-IDF weight per vocabulary term.
                float[] space = new float[termHashset.Count];
                int count     = 0;

                foreach (string term in termHashset)
                {
                    space[count] = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.FindTFIDF(collectionValues, document.Value, term);
                    count++;
                }

                // Position of this entry's key in the key array, used for labeling
                // (replaces the original hand-written linear scan).
                int index = Array.IndexOf(arrayOfDocs, document.Key);

                DocumentVector documentVector = new DocumentVector();
                documentVector.ArticleID   = document.Key;
                documentVector.index_Of_Doc_for_labeling = index;
                documentVector.Content     = document.Value;
                documentVector.VectorSpace = space;

                lock (collectionGate)
                {
                    documentVectorSpace.Add(documentVector);
                }
            });
            vector_space_model_calculation.Stop();

            // Append the elapsed vectorization time to the processing log.
            string processing_log = @"F:\Magistry files\Processing_log.txt";

            using (StreamWriter sw = File.AppendText(processing_log))
            {
                sw.WriteLine(DateTime.Now.ToString() + " The vector space model calculation time is: " + vector_space_model_calculation.Elapsed.Minutes.ToString() + ":" + vector_space_model_calculation.Elapsed.TotalMilliseconds.ToString());
            }

            return(documentVectorSpace);
        }