Пример #1
0
        /// <summary>
        /// Creates a SQLite database file at <paramref name="path"/> and writes the
        /// contents of <paramref name="contract"/> into it.
        /// </summary>
        /// <remarks>
        /// The tables are set up through <c>CreateWordIdTable</c> and
        /// <c>CreateFrequencyTable</c>. Afterwards, inside a single transaction,
        /// <c>AddWordIdRow</c> is called once per entry of <c>contract.WordId</c>
        /// (together with the matching <c>contract.KeyFrequency</c> entry) and
        /// <c>AddFrequencyRow</c> once per entry of <c>contract.SequenceFrequency</c>.
        /// </remarks>
        /// <param name="contract">Holder of the WordId, KeyFrequency and SequenceFrequency maps to persist.</param>
        /// <param name="path">Path of the database file to create.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="contract"/> or <paramref name="path"/> is <c>null</c>.
        /// </exception>
        public static void Init(CodegramsContract contract, string path)
        {
            // Fail before touching the disk so a bad call does not leave an empty database file behind.
            if (contract == null)
            {
                throw new ArgumentNullException("contract");
            }
            if (path == null)
            {
                throw new ArgumentNullException("path");
            }

            SQLiteConnection.CreateFile(path);

            // The using block guarantees the connection is closed and disposed even
            // when table creation, an insert, or the commit throws.
            using (var sqlite = new SQLiteConnection(string.Format("Data Source={0}", path)))
            {
                sqlite.Open();  // Initiate connection to the db

                CreateWordIdTable(sqlite);
                CreateFrequencyTable(sqlite);

                // All rows are written inside one transaction and committed together.
                using (SQLiteTransaction transaction = sqlite.BeginTransaction())
                {
                    foreach (var pair in contract.WordId)
                    {
                        // Every key of WordId is expected to be present in KeyFrequency;
                        // the indexer throws KeyNotFoundException otherwise.
                        AddWordIdRow(sqlite, pair.Key, contract.KeyFrequency[pair.Key], pair.Value);
                    }
                    foreach (var pair in contract.SequenceFrequency)
                    {
                        AddFrequencyRow(sqlite, pair.Key, pair.Value);
                    }

                    transaction.Commit();
                }
            }   // Disposing the connection closes it; no explicit Close() is needed.

            Console.WriteLine();
        }
Пример #2
0
        /// <summary>
        /// Builds a <see cref="CodegramsContract"/> from the ".zip" files located
        /// directly inside <paramref name="projectsDir"/>.
        /// </summary>
        /// <remarks>
        /// Every archive is read twice to keep memory usage down:
        /// the first pass accumulates length-1 identifier and word counts into
        /// <c>KeyFrequency</c>; keys are then numbered in <c>WordId</c>, starting at 0,
        /// in descending order of their combined identifier + word frequency; the
        /// second pass accumulates length-2 sequence counts into
        /// <c>SequenceFrequency</c>, keyed by the string that
        /// <c>UnicodeEncoder.GetUnicodeKeyFromString</c> produces from the word ids.
        /// </remarks>
        /// <param name="projectsDir">Directory that contains the project archives.</param>
        /// <returns>The populated contract.</returns>
        public CodegramsContract Build(string projectsDir)
        {
            var codegrams = new CodegramsContract();

            codegrams.SequenceFrequency = new Dictionary <string, SequenceFrequency>();
            codegrams.KeyFrequency      = new Dictionary <string, KeyFrequency>();

            codegrams.WordId = new Dictionary <string, int>();

            // File extensions are not linguistic text: compare ordinally instead of
            // with the current culture. The match stays case-sensitive as before.
            var projects = Directory.GetFiles(projectsDir)
                           .Where(p => p.EndsWith(".zip", StringComparison.Ordinal))
                           .ToList();

            // We are optimizing for space, so using two passes over input.
            // 1a) First Pass: Read words and identifiers
            foreach (var project in projects)
            {
                using (var stream = new ZipInputStream(project))
                {
                    // sum word counts in code file
                    foreach (var codeContent in ExtractCodeContent(stream))
                    {
                        var counter        = new CodegramCounter();
                        var identSequences = counter.CountIdentifierSequences(1, codeContent);
                        var wordSequences  = counter.CountWordSequences(1, codeContent);

                        foreach (var ident in identSequences.Keys)
                        {
                            // Single lookup for existing keys instead of ContainsKey + two indexer calls.
                            KeyFrequency frequency;
                            if (!codegrams.KeyFrequency.TryGetValue(ident, out frequency))
                            {
                                frequency = new KeyFrequency();
                                codegrams.KeyFrequency[ident] = frequency;
                            }
                            frequency.IdentifierFrequency += identSequences[ident];
                        }

                        foreach (var word in wordSequences.Keys)
                        {
                            KeyFrequency frequency;
                            if (!codegrams.KeyFrequency.TryGetValue(word, out frequency))
                            {
                                frequency = new KeyFrequency();
                                codegrams.KeyFrequency[word] = frequency;
                            }
                            frequency.WordFrequency += wordSequences[word];
                        }
                    }
                }
                // Note: this message is printed after the project has been counted.
                Console.WriteLine("Counting words in project {0}", project);
            }

            // 1b) Sort by word frequency and assign numerical word id
            int id = 0;

            foreach (var word in codegrams.KeyFrequency
                     .OrderByDescending(w => w.Value.IdentifierFrequency + w.Value.WordFrequency)
                     .Select(w => w.Key)
                     )
            {
                codegrams.WordId[word] = id++;
            }

            // 2) Second Pass: Build Codegrams
            foreach (var project in projects)
            {
                using (var stream = new ZipInputStream(project))
                {
                    // sum word and identifier gram counts in code file
                    foreach (var codeContent in ExtractCodeContent(stream))
                    {
                        var counter          = new CodegramCounter();
                        var identifiersGrams = counter.CountIdentifierSequences(2, codeContent);
                        var wordGrams        = counter.CountWordSequences(2, codeContent);

                        foreach (var phrase in wordGrams.Keys)
                        {
                            // Skip phrases made up solely of '_' characters (this also skips an empty phrase).
                            if (phrase.All(c => c == '_'))
                            {
                                continue;
                            }

                            // The lookup throws KeyNotFoundException if the encoder asks for a
                            // word that was not assigned an id in step 1b.
                            string unicodeKey = UnicodeEncoder.GetUnicodeKeyFromString(phrase, word => codegrams.WordId[word]);

                            SequenceFrequency frequency;
                            if (!codegrams.SequenceFrequency.TryGetValue(unicodeKey, out frequency))
                            {
                                frequency = new SequenceFrequency();
                                codegrams.SequenceFrequency[unicodeKey] = frequency;
                            }
                            frequency.WordSequenceFrequency += wordGrams[phrase];
                        }

                        foreach (var phrase in identifiersGrams.Keys)
                        {
                            if (phrase.All(c => c == '_'))
                            {
                                continue;
                            }

                            string unicodeKey = UnicodeEncoder.GetUnicodeKeyFromString(phrase, word => codegrams.WordId[word]);

                            // Word and identifier sequences share one entry per encoded key.
                            SequenceFrequency frequency;
                            if (!codegrams.SequenceFrequency.TryGetValue(unicodeKey, out frequency))
                            {
                                frequency = new SequenceFrequency();
                                codegrams.SequenceFrequency[unicodeKey] = frequency;
                            }
                            frequency.IdentifierSequenceFrequency += identifiersGrams[phrase];
                        }
                    }
                }
            }
            return(codegrams);
        }