Exemple #1
0
        /// <summary>
        /// Turns the raw term counts of a query into TF-IDF weights (in place), scores the
        /// query against every entry of <c>TfIdfDictionary</c> and writes the ranked scores to a file.
        /// </summary>
        /// <param name="queryTfIdfDictionary">
        /// Term counts of the query on entry; every value is overwritten with its TF-IDF weight.
        /// </param>
        /// <param name="appendTextToFileName">Suffix appended to the <c>"Vsm"</c> output name.</param>
        private void ExecuteSub(MyDoubleDictionary queryTfIdfDictionary, string appendTextToFileName = "")
        {
            // Highest term count in the query; every count is divided by it below.
            // NOTE(review): if Max is LINQ's Enumerable.Max, an empty query throws
            // InvalidOperationException here - confirm callers never pass one.
            double highestCount = queryTfIdfDictionary.Max(entry => entry.Value);

            // Walk a snapshot so the dictionary itself can be updated inside the loop.
            foreach (var term in queryTfIdfDictionary.ToList())
            {
                // Terms that have no IDF entry get a weight of zero.
                double weight = 0;
                if (IdfDictionary.ContainsKey(term.Key))
                {
                    double normalisedTf = term.Value / highestCount;
                    weight = normalisedTf * IdfDictionary[term.Key];
                }

                queryTfIdfDictionary[term.Key] = weight;
            }

            // One score per code file, as returned by Helper.GetSimilarity
            // (presumably cosine similarity - verify against Helper).
            var scores = new MyDoubleDictionary();
            foreach (var codeFile in TfIdfDictionary)
            {
                double score = Helper.GetSimilarity(queryTfIdfDictionary, codeFile.Value);
                scores.Add(codeFile.Key, score);
            }

            // Persist the scores, highest first.
            string outputName = "Vsm" + appendTextToFileName;
            WriteDocumentVectorToFileOrderedDescending(outputName, scores);
        }
        /// <summary>
        /// Initialisation shared by the VSM / PMI / LSI methods: fills <c>TfDictionary</c>,
        /// <c>IdfDictionary</c> and <c>TfIdfDictionary</c> from <c>CodeFilesWithContent</c>.
        /// </summary>
        public static void InitializeForVsmSimLsi()
        {
            // Pass 1: per-file term frequency and corpus-wide document frequency.
            foreach (var codeFile in CodeFilesWithContent)
            {
                // Term frequencies of this one file.
                // Add(word) is expected to bump the count stored for that word - confirm in MyDoubleDictionary.
                var termCounts = new MyDoubleDictionary();
                foreach (string term in codeFile.Value)
                {
                    termCounts.Add(term);
                }

                // Remember the term frequencies under the file's key.
                TfDictionary.Add(codeFile.Key, termCounts);

                // One Add per distinct term of the file; at this stage IdfDictionary
                // is used to hold the document frequency (DF).
                foreach (var termEntry in termCounts)
                {
                    IdfDictionary.Add(termEntry.Key);
                }
            }

            // Pass 2: replace DF by IDF = log10(T / DF), T being the number of documents.
            int documentTotal = CodeFilesWithContent.Count;

            // Enumerate a snapshot so the dictionary can be written to while looping.
            foreach (var dfEntry in IdfDictionary.ToList())
            {
                double inverseRatio = documentTotal / dfEntry.Value;
                IdfDictionary[dfEntry.Key] = Math.Log10(inverseRatio);
            }

            // Pass 3: TF-IDF weight of every term of every file.
            foreach (var fileTf in TfDictionary)
            {
                var weights = new MyDoubleDictionary();
                foreach (var tfEntry in fileTf.Value)
                {
                    double tfIdf = tfEntry.Value * IdfDictionary[tfEntry.Key];
                    weights.Add(tfEntry.Key, tfIdf);
                }

                TfIdfDictionary.Add(fileTf.Key, weights);
            }
        }
        /// <summary>
        /// Runs the VSM method for one query: builds the query's TF-IDF vector, scores it
        /// against every entry of <c>TfIdfDictionary</c> and writes the ranked scores to a file.
        /// </summary>
        /// <param name="outputFolderPath">Prefix concatenated directly with <c>VsmFileName</c> to form the output path.</param>
        /// <param name="bugName">Label used in the status messages.</param>
        /// <param name="queryText">Terms of the query text.</param>
        public static void ComputeVsm(string outputFolderPath, string bugName, List <string> queryText)
        {
            Utility.Status("Creating VSM: " + bugName);

            // Collect the query's terms into a dictionary, one Add per occurrence.
            var queryVector = new MyDoubleDictionary();
            foreach (string term in queryText)
            {
                queryVector.Add(term);
            }

            // Highest term count in the query; every count is divided by it below.
            // NOTE(review): if Max is LINQ's Enumerable.Max, an empty query throws
            // InvalidOperationException here - confirm callers never pass one.
            double highestCount = queryVector.Max(entry => entry.Value);

            // Convert each count to TF-IDF; iterate a snapshot so the dictionary can be updated.
            foreach (var entry in queryVector.ToList())
            {
                // Terms that have no IDF entry get a weight of zero.
                double weight = 0;
                if (IdfDictionary.ContainsKey(entry.Key))
                {
                    double normalisedTf = entry.Value / highestCount;
                    weight = normalisedTf * IdfDictionary[entry.Key];
                }

                queryVector[entry.Key] = weight;
            }

            // Score every code file against the query vector.
            var scores = new MyDoubleDictionary();
            var calculator = new CosineSimilarityCalculator(queryVector);
            foreach (var codeFile in TfIdfDictionary)
            {
                double score = calculator.GetSimilarity(codeFile.Value);
                scores.Add(codeFile.Key, score);
            }

            // Write the scores in descending order (e.g. Project\001\Results\Vsm.txt).
            string outputPath = outputFolderPath + VsmFileName;
            WriteDocumentVectorToFileOrderedDescending(outputPath, scores);

            Utility.Status("Completed VSM: " + bugName);
        }
        /// <summary>
        /// Writes a vector to a file, one <c>key value</c> line per entry, sorted by value
        /// from highest to lowest.
        /// </summary>
        /// <param name="filePath">Path of the file to write.</param>
        /// <param name="vector">Entries to write.</param>
        /// <param name="asInt">
        /// When <c>true</c> values are formatted with <c>"##"</c>; otherwise with <c>"##.00000"</c>.
        /// </param>
        protected static void WriteDocumentVectorToFileOrderedDescending(string filePath, MyDoubleDictionary vector, bool asInt = false)
        {
            string numberFormat;
            if (asInt)
            {
                numberFormat = "##";
            }
            else
            {
                numberFormat = "##.00000";
            }

            // NOTE(review): ToString(format) uses the current culture, so the decimal
            // separator depends on the machine - confirm whatever reads this file agrees.
            var rankedLines = vector.ToList()
                .OrderByDescending(entry => entry.Value)
                .Select(entry => entry.Key + " " + entry.Value.ToString(numberFormat));

            File.WriteAllLines(filePath, rankedLines);
        }
Exemple #5
0
        /// <summary>
        /// Loads the source-code corpus from <c>_sourceFilePath</c> and rebuilds
        /// <c>CodeFilesWithContent</c>, <c>TfDictionary</c>, <c>IdfDictionary</c>,
        /// <c>TfIdfDictionary</c> and <c>WordAndContainingFiles</c> from it.
        /// </summary>
        private void ProcessSourceCode()
        {
            // Each input line is split on "##": part 0 is the file key, part 1 a
            // comma-separated word list.
            // NOTE(review): a line without a second part would fail on parts[1] - confirm the input format.
            CodeFilesWithContent = new Dictionary <string, List <string> >();
            foreach (var rawLine in File.ReadAllLines(_sourceFilePath))
            {
                var parts = rawLine.SplitWith("##");

                // Words of length 2 or less are dropped.
                string[] words = parts[1].SplitWith(",").Where(w => w.Length > 2).ToArray();

                // Only the first 1/50th of the remaining words is kept.
                // NOTE(review): looks like deliberate down-sampling - confirm it is intended.
                int keepCount = words.Length / 50;
                CodeFilesWithContent.Add(parts[0], words.Take(keepCount).ToList());
            }

            // Term frequency per file and document frequency across the corpus.
            TfDictionary    = new Dictionary <string, MyDoubleDictionary>();
            IdfDictionary   = new MyDoubleDictionary();
            TfIdfDictionary = new Dictionary <string, MyDoubleDictionary>();
            foreach (var codeFile in CodeFilesWithContent)
            {
                // One IncreaseCount call per word occurrence in the file.
                var termCounts = new MyDoubleDictionary();
                foreach (string term in codeFile.Value)
                {
                    termCounts.IncreaseCount(term);
                }

                TfDictionary.Add(codeFile.Key, termCounts);

                // One IncreaseCount call per distinct term of the file; at this
                // stage IdfDictionary holds the document frequency (DF).
                foreach (var termEntry in termCounts)
                {
                    IdfDictionary.IncreaseCount(termEntry.Key);
                }
            }

            // Replace DF by IDF = log10(number of documents / DF).
            int documentTotal = CodeFilesWithContent.Count;

            // Enumerate a snapshot so the dictionary can be written to while looping.
            foreach (var dfEntry in IdfDictionary.ToList())
            {
                double inverseRatio = documentTotal / dfEntry.Value;
                IdfDictionary[dfEntry.Key] = Math.Log10(inverseRatio);
            }

            // TF-IDF weight of every term of every file.
            foreach (var fileTf in TfDictionary)
            {
                var weights = new MyDoubleDictionary();
                foreach (var tfEntry in fileTf.Value)
                {
                    double tfIdf = tfEntry.Value * IdfDictionary[tfEntry.Key];
                    weights.Add(tfEntry.Key, tfIdf);
                }

                TfIdfDictionary.Add(fileTf.Key, weights);
            }

            // Inverted index: word -> keys of the files that contain it.
            WordAndContainingFiles = new Dictionary <string, List <string> >();
            foreach (var codeFile in CodeFilesWithContent)
            {
                foreach (string word in codeFile.Value.Distinct())
                {
                    List <string> containingFiles;
                    if (!WordAndContainingFiles.TryGetValue(word, out containingFiles))
                    {
                        containingFiles = new List <string>();
                        WordAndContainingFiles.Add(word, containingFiles);
                    }

                    containingFiles.Add(codeFile.Key);
                }
            }
        }