Example #1
0
        /// <summary>
        /// Scores every entry of <c>CodeFilesWithContent</c> against the query with a
        /// similarity value built from normalized PMI (nPMI) word scores weighted by IDF,
        /// then hands the scores to <c>WriteDocumentVectorToFileOrderedDescending</c>.
        /// </summary>
        /// <param name="queryText">
        /// Query words. Duplicates are kept when summing the query side of the score;
        /// the nPMI matrix itself is built once per distinct word.
        /// </param>
        /// <param name="appendTextToFileName">Text appended to "Pmi" to form the name passed to the writer.</param>
        public void Execute(List <string> queryText, string appendTextToFileName)
        {
            ExecuteBase();

            var tssDocumentDictionary         = new MyDoubleDictionary();
            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            var distinctQueryWordList = queryText.Distinct().ToList();
            var nPmiMatrix            = new MyAnyDictionary <MyDoubleDictionary>();

            // IDF of a query word does not depend on the source file being scored,
            // so it is computed once here instead of once per file.
            var queryIdfByWord = new Dictionary <string, double>();

            // Build one nPMI row per distinct query word (w2) covering every corpus word (w1).
            foreach (var queryWordW2 in distinctQueryWordList)
            {
                var    nPmiDictionary = new MyDoubleDictionary();
                double queryIdf       = 0; // stays 0 when the word never occurs in the corpus

                if (WordAndContainingFiles.ContainsKey(queryWordW2))
                {
                    // Looked up once per query word rather than once per (w1, w2) pair.
                    var filesContainingW2 = WordAndContainingFiles[queryWordW2];
                    int countW2           = filesContainingW2.Count;

                    queryIdf = Math.Log10((double)totalNumberOfDocumentInSource / countW2);

                    foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                    {
                        var filesContainingW1 = WordAndContainingFiles[sourceWordW1];
                        int countW1           = filesContainingW1.Count;
                        int countW1W2         = filesContainingW1.Intersect(filesContainingW2).Count();

                        double nPmi;
                        if (countW1W2 == 0)
                        {
                            // The two words never share a file.
                            nPmi = -1;
                        }
                        else if (countW1 == countW1W2 && countW2 == countW1W2)
                        {
                            // The two words occur in exactly the same files. Handling this
                            // here also keeps log10(1) = 0 out of the denominator below.
                            nPmi = 1;
                        }
                        else
                        {
                            // log(p(w1) * p(w2)) / log(p(w1, w2)) - 1
                            nPmi = (Math.Log10((double)countW1 / totalNumberOfDocumentInSource * countW2 / totalNumberOfDocumentInSource) / Math.Log10((double)countW1W2 / totalNumberOfDocumentInSource)) - 1;
                        }
                        nPmiDictionary.Add(sourceWordW1, nPmi);
                    }
                }
                else
                {
                    // Query word is absent from the corpus: it co-occurs with nothing.
                    foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                    {
                        nPmiDictionary.Add(sourceWordW1, -1.0);
                    }
                }

                nPmiMatrix.Add(queryWordW2, nPmiDictionary);
                queryIdfByWord.Add(queryWordW2, queryIdf);
            }

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                var sourceWords = sourceFileWithWords.Value.ToList();

                // Query side: best nPMI of each query word against this file's words.
                double sumQueryTimeIdf = 0.0;
                double sumQueryIdf     = 0.0;

                foreach (var queryWord in queryText)
                {
                    var    nPmiRow = nPmiMatrix[queryWord];
                    double maxSim  = -1;
                    foreach (var sourceWord in sourceWords)
                    {
                        double currentNPmi = nPmiRow[sourceWord];
                        if (maxSim < currentNPmi)
                        {
                            maxSim = currentNPmi;
                        }
                    }

                    double idf = queryIdfByWord[queryWord];

                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                // Corpus side: best nPMI of each word of this file against the query words.
                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                foreach (string sourceWord in sourceWords)
                {
                    double maxSim = -1;
                    foreach (string useCaseWord in queryText)
                    {
                        double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                        if (maxSim < currentNPmi)
                        {
                            maxSim = currentNPmi;
                        }
                    }

                    // The indexer is used without a guard, so sourceWord is expected
                    // to be a key of WordAndContainingFiles.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                // Average of the two IDF-weighted means; -1 when either side has no IDF weight.
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
        }
Example #2
0
        /// <summary>
        /// Precomputes normalized PMI (nPMI) scores between each distinct query word and
        /// every key of <c>WordAndContainingFiles</c>, and writes one JSON file per query
        /// word into <paramref name="cacheOutputFolderPath"/>. Query words whose file
        /// already exists are skipped.
        /// </summary>
        /// <param name="queryTextList">Query words; duplicates are processed once.</param>
        /// <param name="cacheOutputFolderPath">Folder receiving the files; created when missing.</param>
        public void Execute(List <string> queryTextList, string cacheOutputFolderPath)
        {
            ExecuteBase();

            if (!Directory.Exists(cacheOutputFolderPath))
            {
                Directory.CreateDirectory(cacheOutputFolderPath);
                // NOTE(review): this pause looks like a workaround — confirm it is still needed.
                Thread.Sleep(100);
            }

            int n = CodeFilesWithContent.Count;

            // Materialized so the progress log can report the number of words that are
            // actually iterated (the distinct ones) rather than the raw list length.
            var distinctQueryWords = queryTextList.Distinct().ToList();
            int counter            = 0;

            foreach (var queryWordW2 in distinctQueryWords)
            {
                Log($"{_appendLog} Creating {++counter} of {distinctQueryWords.Count}: {queryWordW2}");

                // NOTE(review): the query word is used verbatim in the file name; presumably
                // query words never contain characters that are invalid in a path — confirm.
                var outputFilePath = Path.Combine(cacheOutputFolderPath, $"_{queryWordW2}.txt");
                if (File.Exists(outputFilePath))
                {
                    continue;
                }

                var nPmiDictionary = new MyDoubleDictionary();

                // When the query word is absent from the corpus every score would be -1 and
                // be filtered out below, so the pairwise loop is skipped entirely.
                if (WordAndContainingFiles.ContainsKey(queryWordW2))
                {
                    // Looked up once per query word rather than once per (w1, w2) pair.
                    var filesContainingW2 = WordAndContainingFiles[queryWordW2];
                    int countW2           = filesContainingW2.Count;

                    foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                    {
                        var filesContainingW1 = WordAndContainingFiles[sourceWordW1];
                        int countW1           = filesContainingW1.Count;
                        int countW1W2         = filesContainingW1.Intersect(filesContainingW2).Count();

                        double nPmi;
                        if (countW1W2 == 0)
                        {
                            // The two words never share a file.
                            nPmi = -1;
                        }
                        else if (countW1 == countW1W2 && countW2 == countW1W2)
                        {
                            // The two words occur in exactly the same files. Handling this
                            // here also keeps log10(1) = 0 out of the denominator below.
                            nPmi = 1;
                        }
                        else
                        {
                            // log(p(w1) * p(w2)) / log(p(w1, w2)) - 1
                            nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n)) - 1;
                        }
                        nPmiDictionary.Add(sourceWordW1, nPmi);
                    }
                }

                // Only scores above -1 are stored, as strings with three decimals.
                // NOTE(review): ToString("#.000") formats with the current culture; any reader
                // of these files must parse with the same culture — confirm.
                var print = nPmiDictionary.Where(x => x.Value > -1).ToDictionary(x => x.Key, x => x.Value.ToString("#.000"));
                File.WriteAllText(outputFilePath, Newtonsoft.Json.JsonConvert.SerializeObject(print));
            }
        }
Example #3
0
        /// <summary>
        /// Scores every entry of <c>CodeFilesWithContent</c> against the query using nPMI
        /// values loaded from per-word JSON files in <paramref name="pmiCacheFolderPath"/>,
        /// weighted by IDF, then hands the scores to
        /// <c>WriteDocumentVectorToFileOrderedDescending</c>.
        /// </summary>
        /// <param name="queryTexts">
        /// Query words. Duplicates are kept when summing the query side of the score;
        /// one file named <c>_{word}.txt</c> is read per distinct word.
        /// </param>
        /// <param name="pmiCacheFolderPath">Folder containing the per-word nPMI files.</param>
        /// <param name="appendTextToFileName">Text appended to "Pmi" to form the name passed to the writer.</param>
        public void Execute(List <string> queryTexts, string pmiCacheFolderPath, string appendTextToFileName)
        {
            var queryTextsDistinct    = queryTexts.Distinct().ToList();
            var tssDocumentDictionary = new MyDoubleDictionary();

            // Load one nPMI row per distinct query word. Words missing from a row are
            // treated as -1 further down.
            var nPmiMatrix = new Dictionary <string, Dictionary <string, double> >();

            foreach (var queryText in queryTextsDistinct)
            {
                var cacheFilePath       = Path.Combine(pmiCacheFolderPath, $"_{queryText}.txt");
                var pmiDictionaryString = Newtonsoft.Json.JsonConvert.DeserializeObject <Dictionary <string, string> >(File.ReadAllText(cacheFilePath));

                // NOTE(review): double.Parse uses the current culture; the files must have
                // been written with the same culture — confirm.
                var pmiDictionary = pmiDictionaryString.ToDictionary(x => x.Key, x => double.Parse(x.Value));
                nPmiMatrix.Add(queryText, pmiDictionary);
            }

            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            // IDF of a query word does not depend on the source file being scored,
            // so it is computed once here instead of once per file.
            var queryIdfByWord = new Dictionary <string, double>();
            foreach (var queryText in queryTextsDistinct)
            {
                // Stays 0 when the word never occurs in the corpus.
                double queryIdf = 0;
                if (WordAndContainingFiles.ContainsKey(queryText))
                {
                    queryIdf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryText].Count);
                }
                queryIdfByWord.Add(queryText, queryIdf);
            }

            int counter = 0;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                counter++;
                Log($"{_appendLog} Running PMI: {counter} of {CodeFilesWithContent.Count}");

                var sourceWords = sourceFileWithWords.Value.ToList();

                // Query side: best nPMI of each query word against this file's words,
                // computed once per distinct word and reused for its duplicates.
                double sumQueryTimeIdf = 0.0;
                double sumQueryIdf     = 0.0;

                var maxSimByQueryWord = new Dictionary <string, double>();

                foreach (var queryWord in queryTexts)
                {
                    double maxSim;
                    if (!maxSimByQueryWord.TryGetValue(queryWord, out maxSim))
                    {
                        // Row fetched once instead of twice per source word inside the lambda.
                        var nPmiRow = nPmiMatrix[queryWord];
                        maxSim = sourceWords.AsParallel().Select(x =>
                        {
                            double score;
                            return nPmiRow.TryGetValue(x, out score) ? score : -1;
                        }).MyMax(-1);
                        maxSimByQueryWord.Add(queryWord, maxSim);
                    }

                    double idf = queryIdfByWord[queryWord];

                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                // Corpus side: best nPMI of each word of this file against the query words,
                // computed once per distinct source word.
                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                var maxSimBySourceWord = new Dictionary <string, double>();

                foreach (var sourceWord in sourceWords)
                {
                    double maxSim;
                    if (!maxSimBySourceWord.TryGetValue(sourceWord, out maxSim))
                    {
                        maxSim = queryTextsDistinct.AsParallel().Select(x =>
                        {
                            double score;
                            return nPmiMatrix[x].TryGetValue(sourceWord, out score) ? score : -1;
                        }).MyMax(-1);
                        maxSimBySourceWord[sourceWord] = maxSim;
                    }

                    // The indexer is used without a guard, so sourceWord is expected
                    // to be a key of WordAndContainingFiles.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                // Average of the two IDF-weighted means; -1 when either side has no IDF weight.
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
        }