Пример #1
0
        private void button4_Click(object sender, EventArgs e)
        {
            // Pick the cached vector-space file for the selected data set:
            // "Reu_01" uses the 01_ file; everything else (including "Re0" and
            // the test data set) uses the default 00_ file.
            string path_data_vSpace = cboxDataSet.Text == "Reu_01"
                ? "E:\\Dropbox\\Masters\\myMSc\\PracticalPart\\Sematic_K-MEANSClustering\\MScDataSets\\Reuters21578\\data\\01_" + maxNoDoc + "_vSpace.xml"
                : "E:\\Dropbox\\Masters\\myMSc\\PracticalPart\\Sematic_K-MEANSClustering\\MScDataSets\\Reuters21578\\data\\00_" + maxNoDoc + "_vSpace.xml";

            if (File.Exists(path_data_vSpace))
            {
                // Reuse the previously serialized vector space.
                vSpace = DeSerializeObject <List <DocumentVector> >(path_data_vSpace);
            }
            else
            {
                // Build it from scratch, report the elapsed time and cache the result.
                var watch = System.Diagnostics.Stopwatch.StartNew();
                vSpace = VectorSpaceModel.ProcessDocumentCollection(docCollection);
                watch.Stop();
                lblTime.Text = "Time: " + watch.ElapsedMilliseconds;
                SerializeObject(vSpace, path_data_vSpace);
            }
            MessageBox.Show("Done");
        }
        public ActionResult TestAlgorithm(TestAlgorithmModel model)
        {
            // Take the requested number of stored documents and append the query text
            // so it participates in the clustering.
            var documents = _taiLieuVanBanService.GetDocuments().Take(model.Amount).ToList();
            documents.Add(model.Name);

            var collection = new DocumentCollection
            {
                DocumentList = documents
            };

            // Build the vector space, cluster it, then locate the stored document
            // closest to the query.
            var vectors   = VectorSpaceModel.ProcessDocumentCollection(collection);
            var centroids = DocumnetClustering.DocumentCluster(model.Cluster, vectors, model.Name);
            var nearest   = DocumnetClustering.FindClosestDocument();

            return View(new TestAlgorithmModel
            {
                Name         = model.Name,
                Amount       = model.Amount,
                Cluster      = model.Cluster,
                Centroids    = centroids,
                DocumentNear = nearest
            });
        }
        /// <summary>
        /// Suggests the hồ sơ (record) that the given document should be filed under,
        /// by clustering all stored documents plus the new one and finding the closest
        /// existing document. Returns JSON { da = suggestion-or-fallback-message }.
        /// </summary>
        public ActionResult StorageSuggestion(string document, string type)
        {
            // Fallback answer when no matching document/record is found.
            string local = "Không tìm thấy tài liêu/văn bản có cùng nội dung! Tạo hồ sơ mới.";

            var hosos = AutoCompleteTextHoSos(GetHoSos());

            var list = _taiLieuVanBanService.GetDocuments();

            list.Add(document);

            var docCollection = new DocumentCollection()
            {
                DocumentList = list
            };

            // Cluster count = number of documents of this type.
            var cluster = _taiLieuVanBanService.CountDocumentType(type);

            List <DocumentVector> vSpace    = VectorSpaceModel.ProcessDocumentCollection(docCollection);
            List <Centroid>       resultSet = DocumnetClustering.DocumentCluster(cluster, vSpace, document);

            string documentNeedSearch = DocumnetClustering.FindClosestDocument();

            if (!string.IsNullOrEmpty(documentNeedSearch))
            {
                var taiLieuVanBan = _taiLieuVanBanService.Get(p => p.NoiDung == documentNeedSearch);

                // BUG FIX: Get() may return null, and FirstOrDefault() may find no
                // matching hoso — previously either case threw a NullReferenceException.
                // Keep the "create a new record" fallback instead.
                var hoso = taiLieuVanBan == null
                    ? null
                    : hosos.FirstOrDefault(p => p.Id == taiLieuVanBan.HoSoId);
                if (hoso != null)
                {
                    local = hoso.Text;
                }
            }

            return(Json(new { da = local }, JsonRequestBehavior.AllowGet));
        }
Пример #4
0
        /// <summary>
        /// Runs k-means++ over the document corpus and writes the label matrix and a
        /// timing/quality report to fixed files, then feeds the clusters to the
        /// visualization generator.
        /// </summary>
        private void KMeansPP(object sender, RoutedEventArgs e)
        {
            clustResultTxtBox.Document.Blocks.Clear();
            var clusterization_stopwatch = Stopwatch.StartNew();

            string algorithm = " k-means++;";
            string PPKMeans_label_resul_path = @"F:\Magistry files\data\PPKMeans_label_result4.txt";
            string PPK_means_report_path     = @"F:\Magistry files\reports\PPKMeans_report4.txt";

            // NOTE(review): docCollection and wordIndex1 are never read afterwards; the
            // calls are kept in case they have loading/caching side effects — confirm
            // they are pure and drop them if so.
            List <string>            docCollection           = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoading();
            Dictionary <int, string> docCollectionDictionary = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoadingToDictionary();
            HashSet <string>         termCollection          = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.getTermCollection();
            Dictionary <string, int> wordIndex1              = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.DocumentsContainsTermToDictionary(docCollectionDictionary, termCollection);
            List <DocumentVector>    vSpace1                 = VectorSpaceModel.DocumentCollectionProcessingDictionary(docCollectionDictionary);

            int totalIteration = 500;   // iteration cap for the Lloyd loop
            int clusterNumber  = Convert.ToInt32(txtboxClusterNumber.Text);

            // k-means++ seeding followed by standard k-means iterations.
            List <Centroid> firstCentroidList = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.InitialCentroidCalculation.CentroidCalculationsForTestKMeansPP(vSpace1, clusterNumber);
            List <Centroid> resultSet         = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.KMeans.KMeansClustering(vSpace1, clusterNumber, totalIteration, firstCentroidList);

            clusterization_stopwatch.Stop();

            // Removed the dead "new int[vSpace1.Count]" allocation: the extraction
            // call returns the label matrix and the old array was discarded.
            int[] PPKMeans_label_matrix = Tests.Label_Matrix.ReleaseVersion_Label_Matrix_Extractions(resultSet, PPKMeans_label_resul_path);

            // Report generation writes the report file; the returned text is only
            // needed if the AppendText line below is re-enabled.
            string message = RaportGeneration.ReleaseRaportGenerationFunction(resultSet, clusterNumber, totalIteration, clusterization_stopwatch, PPK_means_report_path, algorithm);
            //clustResultTxtBox.AppendText(message);
            invokeFilesToVisualizationGenerator(resultSet, algorithm);
        }
Пример #5
0
        /// <summary>
        /// Clusters the current document collection and renders one delimited
        /// section per cluster (with per-document dividers for multi-document
        /// clusters) into richTextBox1; shows the iteration count in a label.
        /// </summary>
        private void btnStartClustering_Click(object sender, EventArgs e)
        {
            List <DocumentVector> vSpace   = VectorSpaceModel.ProcessDocumentCollection(docCollection);
            int             totalIteration = 0;
            List <Centroid> resultSet      = DocumnetClustering.PrepareDocumentCluster(int.Parse(txtClusterNo.Text), vSpace, ref totalIteration);

            // Build the report with a StringBuilder — the previous "msg +=" in a
            // loop was O(n^2) in the total text length. Output is byte-identical.
            var msg   = new System.Text.StringBuilder();
            int count = 1;

            foreach (Centroid c in resultSet)
            {
                msg.AppendFormat("------------------------------[ CLUSTER {0} ]-----------------------------{1}", count, System.Environment.NewLine);
                foreach (DocumentVector document in c.GroupedDocument)
                {
                    msg.Append(document.Content).Append(System.Environment.NewLine);
                    if (c.GroupedDocument.Count > 1)
                    {
                        msg.AppendFormat("{0}-------------------------------------------------------------------------------{0}", System.Environment.NewLine);
                    }
                }
                msg.Append("-------------------------------------------------------------------------------").Append(System.Environment.NewLine);
                count++;
            }

            richTextBox1.Text      = msg.ToString();
            lblTotalIteration.Text = totalIteration.ToString();
        }
Пример #6
0
        /// <summary>
        /// Starts training: builds the vector space from the document collection,
        /// clusters it into txtClusterNum groups, and logs the iteration count.
        /// </summary>
        public void train()
        {
            var vectors = VectorSpaceModel.ProcessDocumentCollection(docCollection);

            totalIteration = 0;
            resultSet      = DocumnetClustering.PrepareDocumentCluster(txtClusterNum, vectors, ref totalIteration);

            Console.WriteLine("totalIteration: " + totalIteration.ToString());
        }
        public ActionResult Test()
        {
            // Smoke-test action: clusters every stored document, using the count of
            // "Thông Báo" documents as k and a fixed query phrase.
            var documents = _taiLieuVanBanService.GetDocuments();

            var collection = new DocumentCollection
            {
                DocumentList = documents
            };

            var k = _taiLieuVanBanService.CountDocumentType("Thông Báo");

            var vectors   = VectorSpaceModel.ProcessDocumentCollection(collection);
            var centroids = DocumnetClustering.DocumentCluster(k, vectors, "thông báo chính phủ mới");

            return View(centroids);
        }
Пример #8
0
        /// <summary>
        /// Identifies the source currency of a price string by clustering the
        /// preprocessed document collection and reading the first comma-separated
        /// token of the best-matching document.
        /// </summary>
        public string GetFromCurrency(string priceToConvert)
        {
            Preprocess(priceToConvert);

            int totalIteration   = 0;
            int final_index      = -1;   // set by PrepareDocumentCluster via ref
            int collectionNumber = docCollection.DocumentList.Count - 1;

            var vectors  = VectorSpaceModel.ProcessDocumentCollection(docCollection);
            var clusters = DocumnetClustering.PrepareDocumentCluster(collectionNumber, vectors, ref totalIteration, ref final_index, currency);

            return clusters[final_index].GroupedDocument[0].Content.Split(',')[0];
        }
Пример #9
0
        /// <summary>
        /// Runs Fuzzy c-Means over the document corpus, writes the membership
        /// matrix, label matrix and report to fixed files, then feeds the clusters
        /// to the visualization generator.
        /// </summary>
        private void FuzzyKMeans_Click(object sender, RoutedEventArgs e)
        {
            clustResultTxtBox.Document.Blocks.Clear();
            var clusterization_stopwatch = Stopwatch.StartNew();

            string algorithm = " Fuzzy c-Means;";

            // NOTE(review): docCollection and wordIndex are never read afterwards; the
            // calls are kept in case they have loading/caching side effects — confirm
            // they are pure and drop them if so.
            List <string>            docCollection           = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoading();
            Dictionary <int, string> docCollectionDictionary = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoadingToDictionary();
            HashSet <string>         termCollection          = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.getTermCollection();
            Dictionary <string, int> wordIndex               = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.DocumentsContainsTermToDictionary(docCollectionDictionary, termCollection);
            List <DocumentVector>    vSpace                  = VectorSpaceModel.DocumentCollectionProcessingDictionary(docCollectionDictionary);

            string Fuzzy_K_means_clusterization_result = @"F:\Magistry files\Fuzzy_KMeans_result6.txt";
            string Fuzzy_K_means_label_result          = @"F:\Magistry files\FCM_label_result6.txt";
            string Fuzzy_K_means_report_path           = @"F:\Magistry files\reports\FCM_report6.txt";

            float fuzziness     = 0.5f;    // FCM fuzzifier
            float epsilon       = 0.003f;  // convergence threshold
            int   clusterNumber = Convert.ToInt32(txtboxClusterNumber.Text);

            List <Centroid> resultSet = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.FuzzyCMeans.CreateClusterSet(clusterNumber);

            // Fcm returns (membership matrix, iteration count).
            var result = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.FuzzyCMeans.Fcm(vSpace, clusterNumber, epsilon, fuzziness);

            float[,] Result_fcm     = result.Item1;
            int      totalIteration = result.Item2;
            Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.FuzzyCMeans.WriteSimilarityArrayToFile(Result_fcm, Fuzzy_K_means_clusterization_result);

            // Hard-assign each document to its dominant cluster.
            var assignedResult = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.FuzzyCMeans.AssignDocsToClusters(Result_fcm, clusterNumber, vSpace, resultSet);

            clusterization_stopwatch.Stop();
            resultSet = assignedResult.Item2;

            // Removed the dead "new int[vSpace.Count]" allocation that was overwritten
            // immediately, and the never-read copy of assignedResult.Item1.
            int[] FuzzyKMeans_label_matrix = Tests.Label_Matrix.ReleaseVersion_Label_Matrix_Extractions(resultSet, Fuzzy_K_means_label_result);

            // Report generation writes the report file; the returned text is only
            // needed if the AppendText line below is re-enabled.
            string message = RaportGeneration.ReleaseRaportGenerationFunction(resultSet, clusterNumber, totalIteration, clusterization_stopwatch, Fuzzy_K_means_report_path, algorithm);
            //clustResultTxtBox.AppendText(message);
            invokeFilesToVisualizationGenerator(resultSet, algorithm);
        }
Пример #10
0
        /// <summary>
        /// Clusters the document collection, then renders per-cluster content into
        /// richTextBox1 and the significant keywords of each cluster into
        /// richTextBox2.
        /// </summary>
        private void btnStartClustering_Click(object sender, EventArgs e)
        {
            List <DocumentVector> vSpace   = VectorSpaceModel.ProcessDocumentCollection(docCollection);
            int             totalIteration = 0;
            List <Centroid> resultSet      = DocumnetClustering.PrepareDocumentCluster(int.Parse(txtClusterNo.Text), vSpace, ref totalIteration);

            // StringBuilders replace the O(n^2) "msg +=" concatenation.
            var           msg   = new System.Text.StringBuilder();
            var           k     = new System.Text.StringBuilder();
            List <string> topic = new List <string>();
            int           count = 1;

            foreach (Centroid c in resultSet)
            {
                msg.AppendFormat("------------------------------[ CLUSTER {0} ]-----------------------------{1}", count, System.Environment.NewLine);
                k.AppendFormat("[ CLUSTER {0} ]", count);
                string max = string.Empty;

                foreach (DocumentVector document in c.GroupedDocument)
                {
                    // Keywords: terms whose weight clears the threshold (skip ".").
                    for (int i = 0; i < document.keys.Length; i++)
                    {
                        if (document.VectorSpace[i] > 0.005 && document.keys[i] != ".")
                        {
                            k.Append(document.keys[i]).Append(',');
                        }
                    }

                    // BUG FIX: the content/divider appends were previously inside the
                    // keys loop above, duplicating every document once per term.
                    msg.Append(document.Content).Append(System.Environment.NewLine);
                    if (c.GroupedDocument.Count > 1)
                    {
                        msg.AppendFormat("{0}-------------------------------------------------------------------------------{0}", System.Environment.NewLine);
                    }
                }

                // BUG FIX: the cluster counter, keyword newline and topic entry
                // previously advanced once per document instead of once per cluster
                // (compare the correct structure of the sibling handler above).
                msg.Append("-------------------------------------------------------------------------------").Append(System.Environment.NewLine);
                k.Append(System.Environment.NewLine);
                // NOTE(review): "max" is never assigned, so topic collects empty
                // strings — presumably a planned "strongest term per cluster" feature.
                topic.Add(max);
                count++;
            }

            richTextBox2.Text = k.ToString();
            richTextBox1.Text = msg.ToString();
            label10.Text      = totalIteration.ToString();
        }
Пример #11
0
        /// <summary>
        /// Converts the preprocessed price into the target currency: detects the
        /// source currency via clustering, records it, looks up the exchange rate
        /// and multiplies it by the parsed value.
        /// </summary>
        public decimal GetExchangedValue(string priceToConvert, string to_currency)
        {
            Preprocess(priceToConvert);

            int totalIteration   = 0;
            int final_index      = -1;   // set by PrepareDocumentCluster via ref
            int collectionNumber = docCollection.DocumentList.Count - 1;

            var vectors  = VectorSpaceModel.ProcessDocumentCollection(docCollection);
            var clusters = DocumnetClustering.PrepareDocumentCluster(collectionNumber, vectors, ref totalIteration, ref final_index, currency);

            // First comma-separated token of the matched document names the source currency.
            string from_currency = clusters[final_index].GroupedDocument[0].Content.Split(',')[0];
            WriteInCurrencyDocument(from_currency, currency);

            decimal rate = GetRate(from_currency, to_currency);
            return value * rate;
        }
Пример #12
0
    protected void Button1_Click(object sender, EventArgs e)
    {
        string word1 = TextBox1.Text;
        string word2 = TextBox2.Text;
        string word3 = TextBox3.Text;

        // All three weight boxes must contain parseable numbers before the
        // vector-space query can run.
        double weight1, weight2, weight3;
        bool weightsValid = Double.TryParse(TextBox4.Text, out weight1) &&
                            Double.TryParse(TextBox5.Text, out weight2) &&
                            Double.TryParse(TextBox6.Text, out weight3);

        if (!weightsValid)
        {
            statusLabel.Text = "Please enter only numbers in the weight boxes!";
            return;
        }

        statusLabel.Text = "Good to go!";
        // Run the weighted vector-space query and render the ranked links.
        Dictionary <int, double> siteRelevancy = VectorSpaceModel.DoVSM(word1, weight1, word2, weight2, word3, weight3);
        displayLinks(siteRelevancy);
    }
Пример #13
0
        /// <summary>
        /// Adds up to four documents from the text boxes to the collection and, in
        /// incremental mode, assigns each new document to its closest existing
        /// cluster — via centroid similarity (KMeans) or via FCM centers read from
        /// a data file (CMeans).
        /// </summary>
        private void btnAdd_Click(object sender, EventArgs e)
        {
            int newDoc = 0;

            if (!string.IsNullOrEmpty(txtDoc1.Text))
            {
                docCollection.DocumentList.Add(txtDoc1.Text);
                newDoc++;
            }
            if (!string.IsNullOrEmpty(txtDoc2.Text))
            {
                docCollection.DocumentList.Add(txtDoc2.Text);
                newDoc++;
            }
            if (!string.IsNullOrEmpty(txtDoc3.Text))
            {
                docCollection.DocumentList.Add(txtDoc3.Text);
                newDoc++;
            }
            if (!string.IsNullOrEmpty(txtDoc4.Text))
            {
                docCollection.DocumentList.Add(txtDoc4.Text);
                newDoc++;
            }

            // Simplified: Count is already an int — the old
            // int.TryParse(Count.ToString(), ...) round-trip could never fail.
            lblTotalDoc.Text = docCollection.DocumentList.Count.ToString();

            txtDoc1.Clear();
            txtDoc2.Clear();
            txtDoc3.Clear();
            txtDoc4.Clear();

            if (ddlType.Text == "Incremental" && DocumnetClustering.mainCentroids.Count > 0)
            {
                switch (ddlIncAlg.Text)
                {
                case "KMeans":
                    // Assign each newly added document to its nearest existing centroid.
                    List <DocumentVector> vSpace = VectorSpaceModel.ProcessDocumentCollection(docCollection);
                    for (int i = 1; i <= newDoc; i++)
                    {
                        DocumentVector obj   = vSpace[vSpace.Count - i];
                        int            index = DocumnetClustering.FindClosestClusterCenter(DocumnetClustering.mainCentroids, obj, ddl_sim.Text);
                        DocumnetClustering.mainCentroids[index].GroupedDocument.Add(obj);
                    }
                    break;

                case "CMeans":
                    List <DocumentVector> vSpace2 = VectorSpaceModel.ProcessDocumentCollection(docCollection);

                    // Load the FCM cluster centers (one comma-separated vector per line).
                    string         outFilepath = @"E:\Dropbox\Masters\myMSc\PracticalPart\Sematic_K-MEANSClustering\FCM\HM_data_Out_centers.dat";
                    List <float[]> values      = new List <float[]>();
                    // BUG FIX: the StreamReader (and underlying FileStream) was never
                    // disposed, leaking the file handle; using guarantees disposal.
                    using (var reader = new StreamReader(File.OpenRead(outFilepath)))
                    {
                        while (!reader.EndOfStream)
                        {
                            var line = reader.ReadLine();
                            values.Add(Array.ConvertAll(line.Split(','), float.Parse));
                        }
                    }
                    int t = values.Count;   // number of centers read

                    for (int i = 0; i < newDoc; i++)
                    {
                        int            closeCenter = 0;
                        float          min         = 1000;   // sentinel above any expected distance
                        int            counter     = 1;      // 1-based center index
                        DocumentVector obj2        = vSpace2[vSpace2.Count - newDoc + i];
                        for (int l = 0; l < t; l++)
                        {
                            float s = ArrayDistanceFunction(values[l], obj2.VectorSpace);
                            if (s < min)
                            {
                                min         = s;
                                closeCenter = counter;
                            }
                            counter++;
                        }

                        MessageBox.Show("Doc:" + (i + 1) + " Close is:" + closeCenter);
                        // counter was 1-based, so shift back to a 0-based index here.
                        DocumnetClustering.mainCentroids[closeCenter - 1].GroupedDocument.Add(obj2);
                    }

                    break;
                }
                printAlll();
            }
        }
Пример #14
0
        /// <summary>
        /// Console pipeline: reads book records from every *.txt file (cp1251) in
        /// the current directory, builds a vector-space model over their texts,
        /// extracts up to 10 TF-IDF keywords per book, then trains a probabilistic
        /// latent semantic analysis (PLSA) model over the keyword lists and dumps
        /// each theme's strongest words to "theme_word.lst".
        /// </summary>
        static void Main(string[] args)
        {
            // Windows-1251: the input files are Cyrillic-encoded.
            var enc   = Encoding.GetEncoding(1251);
            var files = Directory.GetFiles(Directory.GetCurrentDirectory(), "*.txt");

            var list  = new List <Document>();
            var books = new List <BookInfo>();

            // Bidirectional book <-> document maps so results can be traced back.
            Dictionary <BookInfo, Document>     book2doc = new Dictionary <BookInfo, Document>();
            Dictionary <DocumentBase, BookInfo> doc2book = new Dictionary <DocumentBase, BookInfo>();

            var spellProc = new SpellProcessor();

            // Parse every book record from every file; each file may contain many records.
            foreach (var file in files)
            {
                Console.WriteLine("{1}: processing: {0}...", Path.GetFileName(file), DateTime.Now.ToString("T"));
                using (var rdr = new StreamReader(file, enc))
                {
                    while (!rdr.EndOfStream)
                    {
                        var bookInfo = new BookInfo(rdr);
                        // Prefer the description; fall back to annotation, then title.
                        // Texts containing "PDF" look like format markers, not prose.
                        var text     = bookInfo.Descr;
                        if (text == null || text.Contains("PDF"))
                        {
                            text = bookInfo.Annot;
                        }
                        if (text == null || text.Contains("PDF"))
                        {
                            text = bookInfo.Title;
                        }
                        if (string.IsNullOrEmpty(text))
                        {
                            continue;
                        }
                        //
                        bookInfo.ProcessingText = text.ToLower();
                        var document = new Document(text, spellProc);
                        book2doc[bookInfo] = document;
                        doc2book[document] = bookInfo;
                        //
                        books.Add(bookInfo);
                        list.Add(document);
                    }
                }
            }
            // TF-IDF style weighting over the whole corpus.
            var corpus = new Corpus(list);
            var model  = new VectorSpaceModel(corpus);

            // Stop words to exclude from keywords (Russian function words and
            // boilerplate), and words that are always kept (document-type terms).
            string[] skipList = new[] { "о", "общие", "сведение", "даваться", "по", "для", "профиль", "из", "на",
                                        "при", "вуз", "даны", "их", "предназначить", "студент", "подготовка", "бакалавр", "обучение",
                                        "всех", "направление", "обучаться" };
            string[] reqList = new[] { "учебный", "пособие", "методический", "курсовой", "дипломный", "работа",
                                       "проект", "лабораторный" };

            Dictionary <string, List <BookInfo> > keys2book = new Dictionary <string, List <BookInfo> >();

            // Keyword extraction: up to 10 highest-weighted terms per book, skipping
            // stop words and single characters; reqList terms are included even
            // beyond the 10-term cap.
            foreach (var document in list)
            {
                var ww    = model.GetWeights(document);
                var pairs = ww.Where(x => x.Value > 0).Select(x => x).ToList();
                pairs.Sort((a, b) => b.Value.CompareTo(a.Value));
                //
                var book = doc2book[document];
                book.KeyWords = new List <string>();
                foreach (var pair in pairs)
                {
                    var needInclude = book.KeyWords.Count < 10;
                    var key         = pair.Key;
                    if (needInclude)
                    {
                        needInclude = (Array.IndexOf(skipList, key) == -1) && (key.Length > 1);
                    }
                    if (!needInclude)
                    {
                        needInclude = Array.IndexOf(reqList, key) != -1;
                    }
                    if (!needInclude)
                    {
                        continue;
                    }
                    book.KeyWords.Add(key);
                    // Reverse index: keyword -> books carrying it.
                    if (!keys2book.ContainsKey(key))
                    {
                        keys2book[key] = new List <BookInfo>();
                    }
                    keys2book[key].Add(book);
                }
            }

            // PLSA over the keyword lists of books that produced any keywords.
            var srcBooks = list.Select((d, i) => doc2book[d]).Where(b => b.KeyWords != null && b.KeyWords.Count > 0).ToList();
            var srcTexts = srcBooks.Select(x => x.KeyWords).ToList();

            // 10 latent themes, 20 EM steps with progress logging.
            var pl = new ProbLatentSemanticAnalyse(10, srcTexts);

            pl.Train(20, x => Console.WriteLine("{1}: step {0} finished...", x, DateTime.Now.ToString("T")));

            //

            var phi = pl.PhiWt;         // word-given-theme weights
            var qu  = pl.QuDt;          // theme-given-document weights (unused below)
            var wrd = pl.WordIndexes;   // word -> row index into phi

            List <KeyValuePair <string, double> > keyz = new List <KeyValuePair <string, double> >();

            // Dump, per theme, all words with weight >= 0.01 sorted descending.
            using (var wrt = new StreamWriter("theme_word.lst"))
            {
                for (int t = 0; t < pl.ThemeCount; t++)
                {
                    keyz.Clear();

                    wrt.WriteLine("-----------------------------------------------------------------");
                    wrt.WriteLine("theme index: {0}", t);
                    wrt.WriteLine("words collected");

                    foreach (var pair in wrd)
                    {
                        var val = phi[pair.Value, t];
                        if (val >= 0.01)
                        {
                            keyz.Add(new KeyValuePair <string, double>(pair.Key, val));
                        }
                        //if (val >= 0.01) wrt.WriteLine("[{0}] => {1}", pair.Key, val);
                    }

                    keyz.Sort((a, b) => b.Value.CompareTo(a.Value));
                    foreach (var pair in keyz)
                    {
                        wrt.WriteLine("[{0}] => {1}", pair.Key, pair.Value);
                    }
                }
                wrt.Flush();
            }

            // The remainder is retained experimentation (alternative PLSA input,
            // locality-sensitive hashing, similarity matrices, keyword stats).

            //Dictionary<string,int> keyz = new Dictionary<string, int>();
            //Dictionary<int, Dictionary<int, int>> data = new Dictionary<int, Dictionary<int, int>>();
            //for (int d = 0; d < list.Count; d++)
            //{
            //    var document = list[d];
            //    var kws = doc2book[document].KeyWords;
            //    if (kws == null || kws.Count == 0) continue;
            //    data[d] = new Dictionary<int, int>();
            //    foreach (var kw in kws)
            //    {
            //        if (!keyz.ContainsKey(kw)) keyz[kw] = keyz.Count;
            //        var ki = keyz[kw];
            //        if (!data[d].ContainsKey(ki)) data[d][ki] = 0;
            //        data[d][ki] = data[d][ki] + 1;
            //    }
            //}

            //var pl = new ProbLatentSemanticAnalyse(30, data);
            //pl.Train(100);

            ////var d = model.CosineSimilarity(list[0], list[1]);
            //var lsh = new LocalitySensitiveHashing(0.1, 0.5, 0.1, 0.8);
            //lsh.Hashing.Extractor = new NgramExtractor(3);
            //for (int index = 0; index < books.Count; index++) lsh.Process(index, books[index].ProcessingText);
            //for (int index = 0; index < books.Count; index++)
            //{
            //    var indexes = lsh.FindSimiliar(books[index].ProcessingText);
            //    Console.WriteLine("src: {0}", books[index].ProcessingText);
            //    //foreach (var idx in indexes)
            //    //{
            //    //    if (idx == index) continue;
            //    //    var dist = model.CosineSimilarity(list[index], list[idx]);
            //    //    if (dist <= 1e-2) continue;
            //    //    Console.WriteLine();
            //    //    Console.WriteLine("dst: {0} => {1}", books[idx].ProcessingText, dist);
            //    //}

            //    var dists = indexes.Select(idx => model.CosineSimilarity(list[index], list[idx])).ToArray();

            //    Console.WriteLine("------------------------------------------------------------------------------------");
            //}

            ////double[,] dist = new double[list.Count,list.Count];
            ////for (int i = 0; i < list.Count; i++)
            ////{
            ////    for (int j = i; j < list.Count; j++)
            ////    {
            ////        var d = model.CosineSimilarity(list[i], list[j]);
            ////        dist[i, j] = d;
            ////        dist[j, i] = d;
            ////    }
            ////}

            //var wordPairs = keys2book.Select(x => x).ToList();
            //wordPairs.Sort((a,b)=>b.Value.Count.CompareTo(a.Value.Count));

            //using (var wrt = new StreamWriter("keys.lst"))
            //{
            //    foreach (var wp in wordPairs) wrt.WriteLine("{0} => {1}", wp.Key, wp.Value.Count);
            //    wrt.Flush();
            //}


            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Пример #15
0
        private void KMeans(object sender, RoutedEventArgs e)
        {
            clustResultTxtBox.Document.Blocks.Clear();
            var    clusterization_stopwatch = Stopwatch.StartNew();
            string message   = null;
            string algorithm = " k-means;";
            string PKMeans_label_resul_path = @"F:\Magistry files\data\PKMeans_label_result6.txt";
            string K_means_report_path      = @"F:\Magistry files\reports\PKMeans_report6.txt";

            #region OldDataGeneration

            /*
             * List<string> docCollection = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoading();
             * HashSet<string> termCollection = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.getTermCollection();
             * Dictionary<string, int> wordIndex = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.DocumentsContainsTerm(docCollection, termCollection);
             * List<DocumentVector> vSpace = VectorSpaceModel.DocumentCollectionProcessing(docCollection);
             */
            #endregion
            Dictionary <int, string> docCollectionDictionary = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoadingToDictionary();
            HashSet <string>         termCollection          = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.getTermCollection();
            Dictionary <string, int> wordIndex = Logic.ClusteringAlgorithms.Used_functions.TFIDF2ndrealization.DocumentsContainsTermToDictionary(docCollectionDictionary, termCollection);
            List <DocumentVector>    vSpace    = VectorSpaceModel.DocumentCollectionProcessingDictionary(docCollectionDictionary);
            int totalIteration = 500;
            int clusterNumber  = 5;
            clusterNumber = Convert.ToInt32(txtboxClusterNumber.Text);
            List <Centroid> firstCentroidList = new List <Centroid>();
            #region OldClusteringAlgorithm
            //firstCentroidList = CentroidCalculationClass.CentroidCalculationsForKMeans(vSpace, clusterNumber);
            //List<Centroid> resultSet = Logic.ClusteringAlgorithms.Algorithms.KMeansPPImplementations.MyKmeansPPInterpritationcs.NewKMeansClusterization(clusterNumber, docCollection, totalIteration, vSpace, wordIndex, firstCentroidList);
            #endregion
            firstCentroidList = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.KMeans.CentroidCalculationsForKMeans(vSpace, clusterNumber);
            List <Centroid> resultSet = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.KMeans.KMeansClustering(vSpace, clusterNumber, totalIteration, firstCentroidList);
            clusterization_stopwatch.Stop();
            int[] PKMeans_label_matrix = new int[vSpace.Count];
            PKMeans_label_matrix = Tests.Label_Matrix.ReleaseVersion_Label_Matrix_Extractions(resultSet, PKMeans_label_resul_path);
            #region tests_metrics

            //List<string> docs = Tests.DocClasses.SurveyAndMeasurementsClassOfDocuments_ListCreations();
            //List<List<string>> ClassCollection = Tests.DocClasses.ListOfClasses();

            /*
             * var distance = Tests.InterclusterDistances.d_centroids(resultSet);
             * var min_centroid_distances = Tests.InterclusterDistances.d_min_centroids(resultSet);
             * var max_intracluster_d = Tests.IntraclusterDistances.d_max(resultSet);
             * var min_intracluster_d = Tests.IntraclusterDistances.d_min(resultSet);
             *
             * var median_intracluster_d = Tests.IntraclusterDistances.d_sr(resultSet);
             * //string DistanceMetricsFilePath = @"F:\Magistry files\distanceMetrics\KmeansDistanceMetrics1.txt";
             * //for(int iK=0; iK<clusterNumber; iK++)
             * //{
             * //    for (int jK = 0; jK < clusterNumber; jK++)
             * //    {
             * //        File.WriteAllText(DistanceMetricsFilePath, distance[iK, jK].ToString());
             * //    }
             * //}
             *
             *
             * /*
             * var Recall_result = Tests.Recall.Recall_Calculating(resultSet, docs);
             * var Precision_result = Tests.Precision.Precision_Calculating(resultSet, docs);
             * var Purity = Tests.Purity.Purity_Calculating(resultSet, ClassCollection, vSpace);
             * var Fmeasure = Tests.F1Measure.F1_Measure_Calculating(resultSet, ClassCollection);
             * var GMeasure = Tests.F1Measure.G_Measure_Calculating(resultSet, ClassCollection);
             * var NMI = Tests.NormilizedMutualInformation.NMI_Calculating(resultSet, ClassCollection, vSpace);
             * var Entropy = Tests.Entropy.Enthropy_Calculating(resultSet, ClassCollection);
             */
            #endregion
            message = RaportGeneration.ReleaseRaportGenerationFunction(resultSet, clusterNumber, totalIteration, clusterization_stopwatch, K_means_report_path, algorithm);
            //clustResultTxtBox.AppendText(message);
            invokeFilesToVisualizationGenerator(resultSet, algorithm);
        }
Пример #16
0
        /// <summary>
        /// Runs the gravitational clustering algorithm over the full document
        /// collection, writes the label matrix and the run report to disk, and
        /// passes the de-duplicated clusters to the visualization generator.
        /// </summary>
        /// <param name="sender">The button that raised the click event.</param>
        /// <param name="e">Routed event data (unused).</param>
        private void Gravitational_Click(object sender, RoutedEventArgs e)
        {
            clustResultTxtBox.Document.Blocks.Clear();
            var clusterization_stopwatch = Stopwatch.StartNew();
            string algorithm = " Gravitational clustering algorithm;";

            // Build the vector-space model for the whole corpus.
            // NOTE(review): the original handler also computed a term collection and a
            // word index (DocumentsContainsTermToDictionary) that were never used by
            // this algorithm; that expensive dead computation has been removed. It also
            // parsed txtboxClusterNumber into an unused local — the gravitational
            // algorithm does not take a cluster count, so that parse (and its crash
            // risk on non-numeric input) is gone as well.
            Dictionary<int, string> docCollectionDictionary = Logic.ClusteringAlgorithms.Used_functions.CreateDocumentCollection2.GenerateDocumentCollection_withoutLazyLoadingToDictionary();
            List<DocumentVector> vSpace = VectorSpaceModel.DocumentCollectionProcessingDictionary(docCollectionDictionary);

            // Algorithm parameters. Values chosen empirically; see the commit history /
            // source article (section 3.2.2) for the alternatives that were tried.
            float G       = 6.67408313131313131F * (float)Math.Pow(10, (-6)); // gravitational constant
            float deltaG  = 0.001F; // per-iteration decay applied to G
            float epsilon = 0.1F;   // merge/stop distance threshold
            float alpha   = 0.06F;  // threshold used when extracting clusters

            // Iteration count comes from the UI; fall back to the default instead of
            // letting a FormatException escape the event handler on bad input.
            int M = int.TryParse(txtboxIterationCount.Text, out int iterations) ? iterations : 500;

            string gravitational_label_resul_path = @"F:\Magistry files\data\Gravitational_label_result5.txt";
            string Gravitational_report_path      = @"F:\Magistry files\reports\Gravitational_report5.txt";

            var results = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.GravitationalClusteringAlgorithm.GravitationalAlgorithm(vSpace, G, deltaG, M, epsilon);
            var get_Clusters = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.GravitationalClusteringAlgorithm.GetClusters(results, alpha, vSpace);
            List<Centroid> resultSet = Logic.ClusteringAlgorithms.WorkedAlgorithmsFromTest.GravitationalClusteringAlgorithm.RemoveSameElementsFromClusters(get_Clusters);

            // Writes the per-document label matrix to disk as a side effect.
            // NOTE(review): the label matrix and the report below are built from
            // get_Clusters (pre-deduplication) while the visualization uses the
            // de-duplicated resultSet — confirm this asymmetry is intentional.
            int[] label_matrix = Tests.Label_Matrix.ReleaseVersion_Label_Matrix_Extractions(get_Clusters, gravitational_label_resul_path);
            clusterization_stopwatch.Stop();

            string message = RaportGeneration.ReleaseRaportGenerationFunction(get_Clusters, get_Clusters.Count, M, clusterization_stopwatch, Gravitational_report_path, algorithm);
            //clustResultTxtBox.AppendText(message);
            invokeFilesToVisualizationGenerator(resultSet, algorithm);
        }