/// <summary>
/// Reads all peptides from the peptide FASTA file, extracts the exons of each peptide from the
/// nucleotide range it covers on its chromosome, and appends the interleaved nucleotide/alignment
/// text plus exon statistics to <c>&lt;GeneSymbol&gt;.txt</c> in <c>OutputDirectory</c>.
/// </summary>
/// <remarks>
/// Peptides are processed ordered by chromosome so that each chromosome file is located and read
/// only when the chromosome changes. Any failure while slicing, extracting or formatting a single
/// peptide is written (as the exception message) to that peptide's output file and processing
/// continues with the next peptide. A missing or ambiguous chromosome file still aborts the run,
/// because <c>Single()</c> throws outside the per-peptide error handling.
/// </remarks>
private static void SequenceAlignment()
{
    const string peptideFile = @"G:\Projects\HumanGenome\Homo_sapiens.GRCh38.pep.all.fa";

    // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
    Directory.CreateDirectory(OutputDirectory);

    var peptides = PeptideFileReader.ReadPeptides(peptideFile);
    string lastChromosome = null;
    string chromosomeData = null;
    foreach (var peptide in peptides.OrderBy(pep => pep.Chromosome))
    {
        Console.WriteLine(peptide.GeneSymbol + ", chromosome " + peptide.Chromosome);

        // Only hit the file system when the chromosome changes; the directory scan and the
        // file read are both invariant for consecutive peptides on the same chromosome.
        if (chromosomeData == null || peptide.Chromosome != lastChromosome)
        {
            var chromosomeFile = Directory
                .GetFiles(ChromosomeDataDirectory, $"*chromosome_{peptide.Chromosome}.*")
                .Single();
            chromosomeData = File.ReadAllText(chromosomeFile);
            lastChromosome = peptide.Chromosome;
        }

        var peptideDescription = $"{peptide.GeneSymbol}:{peptide.Chromosome}:{peptide.StartBase}:{peptide.EndBase}";
        var outputFile = Path.Combine(OutputDirectory, peptide.GeneSymbol + ".txt");
        try
        {
            // StartBase/EndBase are used as 1-based inclusive positions: shift the start by one
            // and take (End - Start + 1) characters. Kept inside the try so that coordinates
            // outside the chromosome text are reported per peptide instead of aborting the run.
            var nucleotides = chromosomeData.Substring(
                peptide.StartBase - 1,
                peptide.EndBase - peptide.StartBase + 1);

            var exons = ExonExtractor.ExtractExons(nucleotides, new string(peptide.Sequence.ToArray()));
            var alignedSequence = BuildAlignedSequence(nucleotides, exons);

            // Min/Max throw on an empty exon list; that is reported through the catch below.
            var statistics = string.Join(
                Environment.NewLine,
                string.Empty,
                "Statistics:",
                "-------------------------------",
                "Exon count: " + exons.Count,
                "Shortest exon: " + exons.Min(e => e.AminoAcids.Count),
                "Longest exon: " + exons.Max(e => e.AminoAcids.Count),
                "Median exon length: " + exons.Median(e => e.AminoAcids.Count));

            // string.Join builds the text in one pass; pairwise concatenation re-copies the
            // growing string for every line, which is quadratic for long genes.
            var alignmentText = string.Join(
                Environment.NewLine,
                InterleaveLines(SpliceText(nucleotides, 120), SpliceText(alignedSequence, 120)));

            File.AppendAllLines(
                outputFile,
                new[] { peptideDescription, alignmentText, statistics, Environment.NewLine });
        }
        catch (Exception ex)
        {
            // Best effort: record the failure for this peptide and carry on with the next one.
            File.AppendAllLines(
                outputFile,
                new[] { peptideDescription, ex.Message, Environment.NewLine });
        }
    }
}
/// <summary>
/// Scans all peptides from the peptide FASTA file and appends one line to
/// <c>well_matched_peptides.csv</c> for every peptide that starts with 'M' and whose extracted
/// exons contain at least two consecutive exons of <c>longSequenceThreshold</c> or more amino acids.
/// </summary>
/// <remarks>
/// Each output line has the form
/// <c>GeneSymbol:Chromosome:StartBase:EndBase:start;end;start;end;...</c>, where every exon
/// contributes its start nucleotide index and <c>start + 3 * aminoAcidCount - 1</c>.
/// Failures while processing a single peptide are reported on standard error and the peptide is
/// skipped; they no longer disappear silently. A missing or ambiguous chromosome file still
/// aborts the run, because <c>Single()</c> throws outside the per-peptide error handling.
/// </remarks>
private static void WellAlignedSequences()
{
    const string peptideFile = @"G:\Projects\HumanGenome\Homo_sapiens.GRCh38.pep.all.fa";
    const string outputFile = @"well_matched_peptides.csv";
    const int longSequenceThreshold = 10;

    var peptides = PeptideFileReader.ReadPeptides(peptideFile);
    string lastChromosome = null;
    string chromosomeData = null;
    foreach (var peptide in peptides.OrderBy(pep => pep.Chromosome))
    {
        Console.WriteLine(peptide.GeneSymbol + ", chromosome " + peptide.Chromosome);

        // Locate and load the chromosome text only when the chromosome changes.
        if (chromosomeData == null || peptide.Chromosome != lastChromosome)
        {
            var chromosomeFile = Directory
                .GetFiles(ChromosomeDataDirectory, $"*chromosome_{peptide.Chromosome}.*")
                .Single();
            chromosomeData = File.ReadAllText(chromosomeFile);
            lastChromosome = peptide.Chromosome;
        }

        var peptideDescription = $"{peptide.GeneSymbol}:{peptide.Chromosome}:{peptide.StartBase}:{peptide.EndBase}";
        try
        {
            // Cheap filter first: only peptides that start with 'M' are considered, so there is
            // no point slicing nucleotides for the rest.
            if (!peptide.Sequence.Any() || peptide.Sequence.First() != 'M')
            {
                continue;
            }

            // StartBase/EndBase are used as 1-based inclusive positions.
            var nucleotides = chromosomeData.Substring(
                peptide.StartBase - 1,
                peptide.EndBase - peptide.StartBase + 1);

            var exons = ExonExtractor.ExtractExons(nucleotides, new string(peptide.Sequence.ToArray()));
            var exonLengths = exons.Select(e => e.AminoAcids.Count).ToList();

            // Look for two neighbouring exons that both reach the threshold. The explicit loop
            // simply does not run for zero or one exon, whereas Enumerable.Range(0, Count - 1)
            // throws for an empty list.
            var hasContiguousLongSequences = false;
            for (var idx = 0; idx + 1 < exonLengths.Count; idx++)
            {
                if (exonLengths[idx] >= longSequenceThreshold
                    && exonLengths[idx + 1] >= longSequenceThreshold)
                {
                    hasContiguousLongSequences = true;
                    break;
                }
            }

            if (!hasContiguousLongSequences)
            {
                continue;
            }

            var exonRanges = exons.Select(
                e => e.StartNucelotideIndex + ";" + (e.StartNucelotideIndex + 3 * e.AminoAcids.Count - 1));
            var exonData = peptideDescription + ":" + string.Join(";", exonRanges);
            File.AppendAllText(outputFile, exonData + Environment.NewLine);
        }
        catch (Exception ex)
        {
            // Still best effort (one bad peptide must not stop the scan), but leave a trace
            // instead of swallowing the failure.
            Console.Error.WriteLine(peptideDescription + ": " + ex.Message);
        }
    }
}
/// <summary>
/// Smoke test for exon extraction on a small hand-written example: converts a fixed template
/// string to nucleotides, extracts exons for a fixed peptide sequence, and writes the nucleotide
/// string followed by the aligned sequence to <c>peptide_alignment.txt</c>.
/// </summary>
private static void TestExonExtraction()
{
    const string peptideSequence = "MAGVRTTGKLTT";

    // NOTE(review): the template presumably interleaves the peptide's residues with filler;
    // the meaning of '#' is defined by NucleotideFromPeptideSequence - confirm there.
    const string nucleotideTemplate = "IIMAGVLI#ARTTIIVLRTTGKLRTT#ITT";

    var nucleotides = NucleotideFromPeptideSequence(nucleotideTemplate);
    var exons = ExonExtractor.ExtractExons(nucleotides, peptideSequence);
    var alignment = BuildAlignedSequence(nucleotides, exons);

    // First line: raw nucleotides; second line: the alignment built from the extracted exons.
    var outputLines = new[] { nucleotides, alignment };
    File.WriteAllLines(@"peptide_alignment.txt", outputLines);
}