예제 #1
0
        /// <summary>
        /// Computes a salience score for a single line of code from four frequency-based sums:
        /// its words, its identifiers, its word n-grams and its identifier n-grams.
        /// </summary>
        /// <param name="n">Sequence length passed to the word and identifier sequence extractors.</param>
        /// <param name="line">The line of text to score.</param>
        /// <returns>
        /// 0.0 when all four sums are zero; otherwise the smallest positive sum divided by the
        /// largest sum, multiplied by 0.01 when the line has exactly one word and one identifier.
        /// </returns>
        public double LineSalience(int n, string line)
        {
            var gramCounter = new CodegramCounter();

            // Tokenize up front; every sequence is materialized exactly once.
            var identifierNGrams = gramCounter.IdentifierSequences(n, line).ToList();
            var wordNGrams       = gramCounter.WordSequences(n, line).ToList();
            var lowerWords       = gramCounter.AllWords(line).Select(w => w.ToLower()).ToList();
            var lowerIdentifiers = gramCounter.AllIdentifiers(line).Select(ident => ident.ToLower()).ToList();

            // Each token contributes (frequency + 1) divided by the total count of its category.
            double wordTotal = lowerWords.Sum(
                w => (ReadCommands.LookupWordFrequency(Connection, Cache, w) + 1) / (double)WordCount);

            double identifierTotal = lowerIdentifiers.Sum(
                ident => (ReadCommands.LookupIdentifierFrequency(Connection, Cache, ident) + 1) / (double)IdentifierCount);

            double wordNGramTotal = wordNGrams.Sum(
                gram => (SequenceWordFrequency(gram) + 1) / (double)WordSequenceCount);

            double identifierNGramTotal = identifierNGrams.Sum(
                gram => (SequenceIdentifierFrequency(gram) + 1) / (double)IdentifierSequenceCount);

            var totals = new double[] { wordTotal, identifierTotal, wordNGramTotal, identifierNGramTotal };

            // Nothing contributed to any of the sums: there is no ratio to compute.
            if (!totals.Any(t => t != 0.0))
            {
                return 0.0;
            }

            // A line made of exactly one word and one identifier is scaled down by a factor of 100.
            bool isSingleToken = lowerWords.Count == 1 && lowerIdentifiers.Count == 1;
            double multiplier  = isSingleToken ? 0.01 : 1.0;

            double smallestPositive = totals.Where(t => t > 0.0).Min();
            double largest          = totals.Max();

            return (smallestPositive / largest) * multiplier;
        }
예제 #2
0
        /// <summary>
        /// Encodes a phrase as a compact string key: the phrase is split into identifiers, each
        /// identifier into words, every word is mapped to an id through <paramref name="lookup"/>,
        /// and each id is written out as one or more UTF-16 code units.
        /// </summary>
        /// <param name="phrase">Text to split into identifiers and words.</param>
        /// <param name="lookup">Maps a word to its id; an id of -1 aborts the encoding.</param>
        /// <returns>The encoded key, or null when any word maps to -1.</returns>
        /// <remarks>
        /// NOTE(review): the surrogate padding is not reversible for every id. Assuming integer
        /// ids, 0xD800 and 0xDFFF both come out as "\uD800\uDFFF". Confirm whether ids can reach
        /// that range before relying on key uniqueness.
        /// </remarks>
        public static string GetUnicodeKeyFromString(string phrase, WordIdLookup lookup)
        {
            var identifiers = CodegramCounter.Identifiers(phrase);
            var wordIds     = identifiers
                              .SelectMany(identifier => CodegramCounter.Words(identifier))
                              .Select(w => lookup(w))
                              .ToList();

            // All lookups are resolved first; a single -1 means the phrase has no key.
            foreach (var id in wordIds)
            {
                if (id == -1)
                {
                    return null;
                }
            }

            var encoded = new List<char>();

            foreach (var id in wordIds)
            {
                // An id larger than one code unit can hold is spread over several units:
                // emit min(remaining, char.MaxValue), subtract char.MaxValue, repeat until negative.
                for (var remaining = id; remaining >= 0; remaining = remaining - char.MaxValue)
                {
                    var unit = (char)Math.Min(char.MaxValue, remaining);

                    bool isLeadSurrogate  = unit >= '\uD800' && unit <= '\uDBFF';
                    bool isTrailSurrogate = unit >= '\uDC00' && unit <= '\uDFFF';

                    // A lone surrogate is padded into a lead/trail pair:
                    // a trail unit gets a fixed lead before it, a lead unit gets a fixed trail after it.
                    if (isTrailSurrogate)
                    {
                        encoded.Add('\uD800');
                    }

                    encoded.Add(unit);

                    if (isLeadSurrogate)
                    {
                        encoded.Add('\uDFFF');
                    }
                }
            }

            return string.Join("", encoded);
        }
예제 #3
0
        /// <summary>
        /// Builds a <see cref="CodegramsContract"/> from every ".zip" file found directly inside
        /// <paramref name="projectsDir"/>.
        /// </summary>
        /// <remarks>
        /// The archives are read twice to keep memory usage down:
        /// pass 1 accumulates 1-gram identifier and word frequencies per key; every key is then
        /// assigned a numeric id in descending order of combined frequency; pass 2 accumulates
        /// 2-gram frequencies under a key produced by
        /// <c>UnicodeEncoder.GetUnicodeKeyFromString</c> from those ids.
        /// A word that has no id makes the id lookup throw <see cref="KeyNotFoundException"/>.
        /// </remarks>
        /// <param name="projectsDir">Directory whose files are scanned for ".zip" archives.</param>
        /// <returns>The populated contract (key frequencies, word ids and sequence frequencies).</returns>
        public CodegramsContract Build(string projectsDir)
        {
            var codegrams = new CodegramsContract();

            codegrams.SequenceFrequency = new Dictionary<string, SequenceFrequency>();
            codegrams.KeyFrequency      = new Dictionary<string, KeyFrequency>();
            codegrams.WordId            = new Dictionary<string, int>();

            // File extensions are not linguistic text, so compare them ordinally.
            var projects = Directory.GetFiles(projectsDir)
                           .Where(p => p.EndsWith(".zip", StringComparison.Ordinal))
                           .ToList();

            // We are optimizing for space, so using two passes over input.
            // 1a) First pass: read words and identifiers.
            foreach (var project in projects)
            {
                using (var stream = new ZipInputStream(project))
                {
                    // Sum word and identifier counts of every code file in the archive.
                    foreach (var codeContent in ExtractCodeContent(stream))
                    {
                        var counter        = new CodegramCounter();
                        var identSequences = counter.CountIdentifierSequences(1, codeContent);
                        var wordSequences  = counter.CountWordSequences(1, codeContent);

                        foreach (var ident in identSequences.Keys)
                        {
                            GetOrAddKeyFrequency(codegrams, ident).IdentifierFrequency += identSequences[ident];
                        }

                        foreach (var word in wordSequences.Keys)
                        {
                            GetOrAddKeyFrequency(codegrams, word).WordFrequency += wordSequences[word];
                        }
                    }
                }
                Console.WriteLine("Counting words in project {0}", project);
            }

            // 1b) Sort by combined frequency and assign numerical word ids (most frequent gets 0).
            int id = 0;

            foreach (var word in codegrams.KeyFrequency
                     .OrderByDescending(w => w.Value.IdentifierFrequency + w.Value.WordFrequency)
                     .Select(w => w.Key))
            {
                codegrams.WordId[word] = id++;
            }

            // 2) Second pass: build codegrams.
            foreach (var project in projects)
            {
                using (var stream = new ZipInputStream(project))
                {
                    // Sum word and identifier 2-gram counts of every code file in the archive.
                    foreach (var codeContent in ExtractCodeContent(stream))
                    {
                        var counter          = new CodegramCounter();
                        var identifiersGrams = counter.CountIdentifierSequences(2, codeContent);
                        var wordGrams        = counter.CountWordSequences(2, codeContent);

                        foreach (var phrase in wordGrams.Keys)
                        {
                            var entry = GetOrAddSequenceFrequency(codegrams, phrase);
                            if (entry != null)
                            {
                                entry.WordSequenceFrequency += wordGrams[phrase];
                            }
                        }

                        foreach (var phrase in identifiersGrams.Keys)
                        {
                            var entry = GetOrAddSequenceFrequency(codegrams, phrase);
                            if (entry != null)
                            {
                                entry.IdentifierSequenceFrequency += identifiersGrams[phrase];
                            }
                        }
                    }
                }
            }
            return codegrams;
        }

        // Returns the KeyFrequency entry stored under key, registering a new empty entry on first use.
        private static KeyFrequency GetOrAddKeyFrequency(CodegramsContract codegrams, string key)
        {
            KeyFrequency entry;
            if (!codegrams.KeyFrequency.TryGetValue(key, out entry))
            {
                entry = new KeyFrequency();
                codegrams.KeyFrequency[key] = entry;
            }
            return entry;
        }

        // Returns the SequenceFrequency entry for the phrase's encoded key (created on first use),
        // or null when the phrase consists only of '_' characters (or is empty) and must be skipped.
        private static SequenceFrequency GetOrAddSequenceFrequency(CodegramsContract codegrams, string phrase)
        {
            if (phrase.All(c => c == '_'))
            {
                return null;
            }

            string unicodeKey = UnicodeEncoder.GetUnicodeKeyFromString(phrase, word => codegrams.WordId[word]);

            SequenceFrequency entry;
            if (!codegrams.SequenceFrequency.TryGetValue(unicodeKey, out entry))
            {
                entry = new SequenceFrequency();
                codegrams.SequenceFrequency[unicodeKey] = entry;
            }
            return entry;
        }