public static void Analyze(string fileName) { var letterCount = KnownLetters.Length; var singleLetterFrequency = new CappedFrequencyCounter <char>(letterCount); var doubleLetterFrequency = new CappedFrequencyCounter <string>(letterCount * letterCount); var threeLetterFrequency = new CappedFrequencyCounter <string>(letterCount * letterCount * letterCount); var fiveLetterFrequency = new CappedFrequencyCounter <string>(letterCount * letterCount * letterCount * letterCount * letterCount); var tenLetterFrequency = new CappedFrequencyCounter <string>(100000); var letterBuffer = new CircularBuffer <char>(10); var baseIndex = 0uL; var lineIdx = 0; using (var streamReader = new StreamReader(fileName)) { string line; while ((line = streamReader.ReadLine()) != null) { lineIdx++; if (line.StartsWith(">")) { continue; } if (lineIdx % (1000 * 1000) == 0) { Console.WriteLine($"Line {lineIdx}, base {baseIndex}"); } foreach (var c in line) { if (!KnownLetters.Contains(c)) { throw new Exception($"Unknown base {c}"); } baseIndex++; letterBuffer.Put(c); singleLetterFrequency.Add(c); if (baseIndex > 1) { var doubleLetter = new string(letterBuffer.Take(2).ToArray()); doubleLetterFrequency.Add(doubleLetter); } if (baseIndex > 2) { var threeLetter = new string(letterBuffer.Take(3).ToArray()); threeLetterFrequency.Add(threeLetter); } if (baseIndex > 4) { var fiveLetter = new string(letterBuffer.Take(5).ToArray()); fiveLetterFrequency.Add(fiveLetter); } if (baseIndex > 9) { var tenLetter = new string(letterBuffer.Take(10).ToArray()); tenLetterFrequency.Add(tenLetter); } } if (lineIdx > 55 * 1000 * 1000) { break; } } } File.WriteAllText("nucleotide_statistics.txt", "" + singleLetterFrequency.Counter.OrderByDescending(kvp => kvp.Value).Select(kvp => kvp.Key + ": " + kvp.Value).Aggregate((a, b) => a + Environment.NewLine + b) + Environment.NewLine + "--------------------" + Environment.NewLine + doubleLetterFrequency.Counter.OrderByDescending(kvp => kvp.Value).Select(kvp => kvp.Key + ": " + kvp.Value).Aggregate((a, b) => a + Environment.NewLine + b) + Environment.NewLine + "--------------------" + Environment.NewLine + threeLetterFrequency.Counter.OrderByDescending(kvp => kvp.Value).Select(kvp => kvp.Key + ": " + kvp.Value).Aggregate((a, b) => a + Environment.NewLine + b) + Environment.NewLine + "--------------------" + Environment.NewLine + fiveLetterFrequency.Counter.OrderByDescending(kvp => kvp.Value).Select(kvp => kvp.Key + ": " + kvp.Value).Aggregate((a, b) => a + Environment.NewLine + b) + Environment.NewLine + "--------------------" + Environment.NewLine + tenLetterFrequency.Counter.OrderByDescending(kvp => kvp.Value).Select(kvp => kvp.Key + ": " + kvp.Value).Aggregate((a, b) => a + Environment.NewLine + b) + Environment.NewLine); }
public FixedLengthSequenceFrequencyCounter(int sequenceLength, int combinationLimit) { this.SequenceLength = sequenceLength; circularBuffer = new CircularBuffer <char>(sequenceLength); FrequencyCounter = new CappedFrequencyCounter <string>(combinationLimit); }