/// <summary>
/// Scores how salient (informative) a line of code is, based on how rare its
/// words, identifiers, word n-grams and identifier n-grams are in the corpus.
/// </summary>
/// <param name="n">Gram length used when extracting word/identifier sequences.</param>
/// <param name="line">The line of code to score.</param>
/// <returns>
/// 0.0 when the line yields no tokens at all; otherwise the ratio of the
/// smallest positive frequency sum to the largest sum, damped by 0.01 for
/// trivial single-word/single-identifier lines.
/// </returns>
public double LineSalience(int n, string line)
{
    var counter = new CodegramCounter();
    var identifiersGrams = counter.IdentifierSequences(n, line).ToList();
    var wordGrams = counter.WordSequences(n, line).ToList();
    // NOTE(review): ToLower() is culture-sensitive; ToLowerInvariant() would be
    // safer, but changing it could invalidate keys in an existing frequency
    // store — confirm before switching.
    var words = counter.AllWords(line).Select(w => w.ToLower()).ToList();
    var identifiers = counter.AllIdentifiers(line).Select(ident => ident.ToLower()).ToList();

    // Each sum uses add-one (Laplace) smoothing and is normalized by the
    // corresponding corpus-wide total.
    var sumWords = words.Sum(word =>
        (ReadCommands.LookupWordFrequency(Connection, Cache, word) + 1) / (double)WordCount);
    var sumIdentifiers = identifiers.Sum(ident =>
        (ReadCommands.LookupIdentifierFrequency(Connection, Cache, ident) + 1) / (double)IdentifierCount);
    var sumWordGrams = wordGrams.Sum(wordGram =>
        (SequenceWordFrequency(wordGram) + 1) / (double)WordSequenceCount);
    var sumIdentifierGrams = identifiersGrams.Sum(identGram =>
        (SequenceIdentifierFrequency(identGram) + 1) / (double)IdentifierSequenceCount);

    var vals = new[] { sumWords, sumIdentifiers, sumWordGrams, sumIdentifierGrams };

    // All sums are exactly 0.0 only when no tokens were extracted (each added
    // term is strictly positive thanks to the +1 smoothing).
    if (vals.All(v => v == 0.0))
    {
        return 0.0;
    }

    // A line that reduces to exactly one word and one identifier is almost
    // certainly trivial (e.g. a lone keyword); damp its score heavily.
    var multiplier = (words.Count == 1 && identifiers.Count == 1) ? 0.01 : 1.0;

    return (vals.Where(s => s > 0.0).Min() / vals.Max()) * multiplier;
}
/// <summary>
/// Deterministically encodes a phrase into a compact unicode string key: each
/// word is mapped to its numeric id via <paramref name="lookup"/> and each id
/// is emitted as one or more characters.
/// </summary>
/// <param name="phrase">Phrase whose identifiers' words are encoded.</param>
/// <param name="lookup">Maps a word to its id; expected to return -1 for unknown words.</param>
/// <returns>The encoded key, or null when any word is unknown (id == -1).</returns>
public static string GetUnicodeKeyFromString(string phrase, WordIdLookup lookup)
{
    var words = (CodegramCounter.Identifiers(phrase)
                 .SelectMany(id => CodegramCounter.Words(id))
                 .Select(w => lookup(w)))
                .ToList();
    if (words.Any(w => w == -1))
    {
        return null;
    }

    var chars = new List<char>();
    foreach (var word in words)
    {
        // Ids larger than char.MaxValue spill over into extra characters:
        // emit char.MaxValue and carry the remainder into the next character.
        var offset = word;
        while (offset >= 0)
        {
            var ch = (char)Math.Min(char.MaxValue, offset);
            if (ch >= '\uD800' && ch <= '\uDFFF')
            {
                // A raw surrogate code unit would make the key invalid UTF-16,
                // so the 2048 surrogate values are re-encoded as proper pairs.
                // BUG FIX: the previous scheme (leading ch => [ch, '\uDFFF'],
                // trailing ch => ['\uD800', ch]) was not injective — both
                // '\uD800' and '\uDFFF' produced "\uD800\uDFFF". Encoding the
                // surrogate offset into the pair itself keeps distinct ids on
                // distinct keys. (Keys for surrogate-range ids change; any
                // persisted table built with the old encoding must be rebuilt.)
                var delta = ch - '\uD800';                     // 0 .. 0x7FF
                chars.Add((char)('\uD800' + (delta >> 10)));   // '\uD800' or '\uD801' (valid lead)
                chars.Add((char)('\uDC00' + (delta & 0x3FF))); // valid trail surrogate
            }
            else
            {
                chars.Add(ch);
            }
            offset = offset - char.MaxValue;
        }
    }
    return string.Join("", chars);
}
/// <summary>
/// Builds the codegram frequency tables from every ".zip" project archive in
/// <paramref name="projectsDir"/>. Two passes are made over the input to keep
/// memory low: pass 1 counts single words/identifiers and assigns dense
/// numeric word ids (most frequent first, so frequent words get short keys);
/// pass 2 counts 2-gram sequences, keyed by a compact unicode encoding of the
/// constituent word ids.
/// </summary>
/// <param name="projectsDir">Directory containing the project .zip archives.</param>
/// <returns>The populated <see cref="CodegramsContract"/>.</returns>
public CodegramsContract Build(string projectsDir)
{
    var codegrams = new CodegramsContract();
    codegrams.SequenceFrequency = new Dictionary<string, SequenceFrequency>();
    codegrams.KeyFrequency = new Dictionary<string, KeyFrequency>();
    codegrams.WordId = new Dictionary<string, int>();

    var projects = Directory.GetFiles(projectsDir).Where(p => p.EndsWith(".zip")).ToList();

    // Fetch-or-add helpers; TryGetValue avoids the ContainsKey + indexer
    // double (and triple) dictionary lookups the original loops performed.
    KeyFrequency KeyEntry(string key)
    {
        if (!codegrams.KeyFrequency.TryGetValue(key, out var entry))
        {
            entry = new KeyFrequency();
            codegrams.KeyFrequency[key] = entry;
        }
        return entry;
    }
    SequenceFrequency SequenceEntry(string key)
    {
        if (!codegrams.SequenceFrequency.TryGetValue(key, out var entry))
        {
            entry = new SequenceFrequency();
            codegrams.SequenceFrequency[key] = entry;
        }
        return entry;
    }

    // 1a) First pass: count individual words and identifiers per code file.
    foreach (var project in projects)
    {
        using (var stream = new ZipInputStream(project))
        {
            foreach (var codeContent in ExtractCodeContent(stream))
            {
                var counter = new CodegramCounter();
                var identSequences = counter.CountIdentifierSequences(1, codeContent);
                var wordSequences = counter.CountWordSequences(1, codeContent);
                foreach (var ident in identSequences.Keys)
                {
                    KeyEntry(ident).IdentifierFrequency += identSequences[ident];
                }
                foreach (var word in wordSequences.Keys)
                {
                    KeyEntry(word).WordFrequency += wordSequences[word];
                }
            }
        }
        Console.WriteLine("Counting words in project {0}", project);
    }

    // 1b) Sort by combined frequency and assign dense numeric word ids.
    int id = 0;
    foreach (var word in codegrams.KeyFrequency
             .OrderByDescending(w => w.Value.IdentifierFrequency + w.Value.WordFrequency)
             .Select(w => w.Key))
    {
        codegrams.WordId[word] = id++;
    }

    // 2) Second pass: count word and identifier 2-gram sequences.
    // NOTE(review): the id lookup lambda uses the dictionary indexer, which
    // throws (rather than returning -1) for a word missing from pass 1 —
    // presumably every pass-2 word was seen in pass 1; confirm.
    foreach (var project in projects)
    {
        using (var stream = new ZipInputStream(project))
        {
            foreach (var codeContent in ExtractCodeContent(stream))
            {
                var counter = new CodegramCounter();
                var identifiersGrams = counter.CountIdentifierSequences(2, codeContent);
                var wordGrams = counter.CountWordSequences(2, codeContent);

                foreach (var phrase in wordGrams.Keys)
                {
                    // Underscore-only phrases carry no information; skip them.
                    if (phrase.All(c => c == '_'))
                    {
                        continue;
                    }
                    string unicodeKey = UnicodeEncoder.GetUnicodeKeyFromString(phrase, word => codegrams.WordId[word]);
                    SequenceEntry(unicodeKey).WordSequenceFrequency += wordGrams[phrase];
                }
                foreach (var phrase in identifiersGrams.Keys)
                {
                    if (phrase.All(c => c == '_'))
                    {
                        continue;
                    }
                    string unicodeKey = UnicodeEncoder.GetUnicodeKeyFromString(phrase, word => codegrams.WordId[word]);
                    SequenceEntry(unicodeKey).IdentifierSequenceFrequency += identifiersGrams[phrase];
                }
            }
        }
    }
    return codegrams;
}