コード例 #1
0
ファイル: Jsm.cs プロジェクト: skhatiwada01/ASE192
        /// <summary>
        /// Scores every file in <c>TfDictionary</c> against the query and hands the scores to
        /// <c>WriteDocumentVectorToFileOrderedDescending</c>.
        /// </summary>
        /// <remarks>
        /// Files and the query are each turned into a vector with one slot per distinct word taken
        /// from <c>IdfDictionary</c> and the query. A slot holds the word's value divided by the
        /// corresponding total. The score of a file vector <c>p</c> against the query vector
        /// <c>q</c> is <c>1 - (JensenEntropy(JensenSum(p, q)) - JensenEntropy(p) / 2 - JensenEntropy(q) / 2)</c>.
        /// </remarks>
        /// <param name="queryTfDictionary">Query words mapped to their values; the values are summed to get the query total.</param>
        /// <param name="appendTextToFileName">Text appended to the "Jsm" name passed to the writer.</param>
        private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
        {
            // Shared vocabulary: its order fixes the slot order of every vector built below.
            var vocabulary     = IdfDictionary.Keys.Union(queryTfDictionary.Keys).Distinct().ToList();
            int vocabularySize = vocabulary.Count;

            // One vector per source file.
            var fileVectors = new Dictionary<string, double[]>();
            foreach (var fileEntry in TfDictionary.ToList())
            {
                MyDoubleDictionary fileTf = fileEntry.Value;
                int fileWordTotal         = CodeFilesWithContent[fileEntry.Key].Count;

                // Slots for words missing from this file keep the array's default of 0.
                double[] fileVector = new double[vocabularySize];
                for (int slot = 0; slot < vocabularySize; slot++)
                {
                    var word = vocabulary[slot];
                    if (fileTf.ContainsKey(word))
                    {
                        fileVector[slot] = fileTf[word] / fileWordTotal;
                    }
                }

                fileVectors.Add(fileEntry.Key, fileVector);
            }

            // The query vector uses the same slot layout.
            var queryWords       = new HashSet<string>(queryTfDictionary.Keys);
            var queryTotal       = queryTfDictionary.Sum(entry => entry.Value);
            double[] queryVector = new double[vocabularySize];
            for (int slot = 0; slot < vocabularySize; slot++)
            {
                var word = vocabulary[slot];
                if (queryWords.Contains(word))
                {
                    queryVector[slot] = (double)queryTfDictionary[word] / queryTotal;
                }
            }

            // Similarity of each file to the query.
            MyDoubleDictionary scores = new MyDoubleDictionary();
            foreach (var fileVectorEntry in fileVectors)
            {
                double[] fileVector = fileVectorEntry.Value;

                var combinedEntropy  = fileVector.JensenSum(queryVector).JensenEntropy();
                var halfFileEntropy  = 0.5 * fileVector.JensenEntropy();
                var halfQueryEntropy = 0.5 * queryVector.JensenEntropy();

                var divergence = combinedEntropy - halfFileEntropy - halfQueryEntropy;

                scores.Add(fileVectorEntry.Key, 1 - divergence);
            }

            // Persist the ranking.
            WriteDocumentVectorToFileOrderedDescending("Jsm" + appendTextToFileName, scores);
        }
コード例 #2
0
        /// <summary>
        /// Returns the dot product of the two vectors over their shared keys, divided by the
        /// product of their lengths as reported by <c>GetLength</c>.
        /// </summary>
        /// <param name="vector1">First word-to-weight vector.</param>
        /// <param name="vector2">Second word-to-weight vector.</param>
        /// <returns>
        /// The similarity, or 0 when <paramref name="vector2"/> is empty or when either length is 0
        /// (the ratio would otherwise be NaN).
        /// </returns>
        public static double GetSimilarity(MyDoubleDictionary vector1, MyDoubleDictionary vector2)
        {
            double length1 = GetLength(vector1);
            double length2 = GetLength(vector2);

            // A zero length on either side would make the division below 0/0 (NaN),
            // so treat such vectors as having no similarity.
            double denominator = length1 * length2;
            if (vector2.Count == 0 || denominator == 0)
            {
                return 0;
            }

            // Only words present in both vectors contribute to the dot product.
            double dotProduct = vector1
                .Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
                .Sum(wordWithCount => wordWithCount.Value * vector2[wordWithCount.Key]);

            return dotProduct / denominator;
        }
コード例 #3
0
        /// <summary>
        /// Builds a text listing of the words that occur in both <c>_vector1</c> and
        /// <paramref name="vector2"/>, one word per line.
        /// </summary>
        /// <param name="vector2">Vector to compare against <c>_vector1</c>.</param>
        /// <returns>
        /// Lines of the form <c>word value1, value2</c> joined with <see cref="Environment.NewLine"/>,
        /// where both values are formatted with <c>"##.000"</c>; empty when no word is shared.
        /// </returns>
        public string GetSimilarityText(MyDoubleDictionary vector2)
        {
            // The vector lengths are not part of the output, so they are not computed here.
            var sharedWordLines = _vector1
                .Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
                .Select(wordWithCount =>
                {
                    string value1 = wordWithCount.Value.ToString("##.000");
                    string value2 = vector2[wordWithCount.Key].ToString("##.000");
                    return $"{wordWithCount.Key} {value1}, {value2}";
                });

            return string.Join(Environment.NewLine, sharedWordLines);
        }
コード例 #4
0
        /// <summary>
        /// Computes the VSM ranking: weights the query words, scores every entry of
        /// <c>TfIdfDictionary</c> against the query with <c>CosineSimilarityCalculator</c>,
        /// and writes the scores out in descending order.
        /// </summary>
        /// <param name="outputFolderPath">Path prefix to which <c>VsmFileName</c> is appended to form the output file.</param>
        /// <param name="bugName">Bug name, used only in the status messages.</param>
        /// <param name="queryText">Words of the query text. An empty list is allowed.</param>
        public static void ComputeVsm(string outputFolderPath, string bugName, List<string> queryText)
        {
            Utility.Status("Creating VSM: " + bugName);

            // Build the query dictionary; the single-argument Add presumably accumulates a
            // per-word count (verify against MyDoubleDictionary).
            MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();

            queryText.ForEach(queryTfIdfDictionary.Add);

            // Highest value among the query words. Max() throws on an empty sequence, and the
            // value is only used by the loop below, which does not run for an empty query.
            double maxFrequency = queryTfIdfDictionary.Count == 0
                ? 0
                : queryTfIdfDictionary.Max(x => x.Value);

            // Replace each value with (value / max) * idf; words missing from IdfDictionary get 0.
            // Iterate over a snapshot because the dictionary is modified inside the loop.
            foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
            {
                queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key)
                    ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key]
                    : 0;
            }

            // Similarity of the query to each code file.
            MyDoubleDictionary         similarityDictionary       = new MyDoubleDictionary();
            CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);

            foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
            {
                double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
                similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
            }

            // Write the document vector to file in descending order (e.g. Project\001\Results\Vsm.txt).
            WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);

            Utility.Status("Completed VSM: " + bugName);
        }
コード例 #5
0
        /// <summary>
        /// Computes the Jensen-Shannon ranking: scores every file in <c>TfDictionary</c> against
        /// the query and writes the scores out in descending order.
        /// </summary>
        /// <remarks>
        /// Files and the query are each turned into a vector with one slot per distinct word taken
        /// from <c>IdfDictionary</c> and the query. The score of a file vector <c>p</c> against the
        /// query vector <c>q</c> is
        /// <c>1 - (JensenEntropy(JensenSum(p, q)) - JensenEntropy(p) / 2 - JensenEntropy(q) / 2)</c>.
        /// </remarks>
        /// <param name="outputFolderPath">Output folder of the bug; <c>JenFileName</c> is appended to it to form the output file.</param>
        /// <param name="bugName">Bug name, used only in the status messages.</param>
        /// <param name="queryText">Words of the query text.</param>
        public static void ComputeJen(string outputFolderPath, string bugName, List<string> queryText)
        {
            Utility.Status("Computing JEN: " + bugName);

            // Vocabulary: every word seen in the sources or the query. Its order fixes the slot
            // order of all vectors. Union already yields distinct elements.
            List<string> allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryText).ToList();
            // Vocabulary size, i.e. the length of every vector.
            int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;
            // Source file key -> vector.
            Dictionary<string, double[]> sourceVectors = new Dictionary<string, double[]>();

            foreach (var fileWithTfCount in TfDictionary)
            {
                MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
                // Total number of words in this source file.
                int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
                // Vector for a single source file, holding Pd = f(w, d) / Td.
                // Slots for words missing from the file keep the array's default of 0.
                double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
                for (int i = 0; i < allUniqueWordsInSourceAndQueryCount; i++)
                {
                    string uniqueWord = allUniqueWordsInSourceAndQuery[i];
                    if (tfDictionary.ContainsKey(uniqueWord))
                    {
                        vector[i] = tfDictionary[uniqueWord] / totalWordsInFile;
                    }
                }

                sourceVectors.Add(fileWithTfCount.Key, vector);
            }

            // Build the query vector. Group the query words once so each vocabulary word costs a
            // single lookup instead of two scans of the whole query list.
            double[] queryVector    = new double[allUniqueWordsInSourceAndQueryCount];
            var queryWordGroups     = queryText.ToLookup(word => word);
            double queryWordTotal   = queryText.Count;

            for (int i = 0; i < allUniqueWordsInSourceAndQueryCount; i++)
            {
                // Indexing a lookup with an absent key yields an empty sequence, so the count is 0.
                int occurrences = queryWordGroups[allUniqueWordsInSourceAndQuery[i]].Count();
                if (occurrences > 0)
                {
                    queryVector[i] = occurrences / queryWordTotal;
                }
            }

            // Compute H(p), H(q) and H(p + q) for every source file.
            MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();

            foreach (var sourceFileWithVector in sourceVectors)
            {
                var p          = sourceFileWithVector.Value;
                var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
                var pEntropy   = 1.0 / 2 * p.JensenEntropy();
                var qEntropy   = 1.0 / 2 * queryVector.JensenEntropy();

                var jensenDivergence = sumEntropy - pEntropy - qEntropy;
                var jensenSimilarity = 1 - jensenDivergence;
                // Source file key -> Jensen similarity.
                similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
            }

            // Write the document vector to file in descending order (e.g. Project\001\Results\Jen.txt).
            WriteDocumentVectorToFileOrderedDescending(outputFolderPath + JenFileName, similarityDictionary);

            Utility.Status("DONE Computing JEN: " + bugName);
        }