/// <summary>
/// Entry point of the local-alignment program.
/// Reads the reference sequence from chr1.fa (the first line is dropped - presumably the FASTA header -
/// and the remaining lines are concatenated), reads every query record from
/// Prog2-input-NM_032291-10exon-seqs.fa, calls SmithWatermanScore(reference, query) for each record and
/// writes the combined report for all records to Result.txt.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Reference sequence: every line after the first, joined without separators.
    // Joining straight from the array avoids the extra List/array copies of a very large file,
    // and the length guard keeps an empty file from throwing.
    string[] lines = File.ReadAllLines(@"D:\BioInfo Activity\chr1.fa");
    string RefSeq = lines.Length > 1 ? string.Join("", lines, 1, lines.Length - 1) : string.Empty;

    // Query records as (header line, sequence) pairs, in file order.
    List<KeyValuePair<string, string>> PatternData = new List<KeyValuePair<string, string>>();
    string PatternID = string.Empty;
    System.Text.StringBuilder sequence = new System.Text.StringBuilder();

    foreach (string ExonData in File.ReadLines(@"E:\Bio Informatics\Assignment 2\Prog2-input-NM_032291-10exon-seqs.fa"))
    {
        if (ExonData.Contains(">"))
        {
            // A new header closes the previous record. Records without any sequence are skipped.
            if (sequence.Length > 0)
            {
                PatternData.Add(new KeyValuePair<string, string>(PatternID, sequence.ToString()));
                sequence.Length = 0;
            }
            PatternID = ExonData;
        }
        else
        {
            // Append rather than overwrite, so a sequence wrapped over several lines is kept whole
            // and a blank line cannot wipe out the sequence read so far.
            sequence.Append(ExonData.Trim());
        }
    }

    // Flush the last record of the file (there is no following header to trigger it).
    if (sequence.Length > 0)
    {
        PatternData.Add(new KeyValuePair<string, string>(PatternID, sequence.ToString()));
    }

    // Report layout per record:
    //   >chr1.66999824.67000051.NM_032291_exon_0_0_chr1_66999825_f.+    len=227
    //   ref(1-10) = NNNNNNNNNN ; patern(1-10) = TTTCTCTCAG
    //   --- Optimum Smith-Waterman score = 454 (i=227, j=67000051)
    // The third line is whatever SmithWatermanScore returns; it is appended verbatim.
    System.Text.StringBuilder report = new System.Text.StringBuilder();

    // Loop-invariant: first (up to) 10 reference bases. Math.Min guards references shorter than 10.
    string refPrefix = RefSeq.Substring(0, Math.Min(10, RefSeq.Length));

    foreach (KeyValuePair<string, string> PatternValue in PatternData)
    {
        string query = PatternValue.Value;
        string queryPrefix = query.Substring(0, Math.Min(10, query.Length));

        report.Append(PatternValue.Key).Append("\t len=").Append(query.Length).Append("\n");
        report.Append("ref(1-10) = ").Append(refPrefix)
              .Append(" ; patern(1-10) = ").Append(queryPrefix).Append("\n");
        report.Append(SmithWatermanScore(RefSeq, query));
    }

    // Store the complete result for all query sequences.
    File.WriteAllText(@"E:\Bio Informatics\Assignment 2\Result.txt", report.ToString());
}
/// <summary>
/// Entry point. Runs three steps against fixed input files:
/// 1. collapses overlapping exon annotations and writes them to ExonCollapsedFile.txt;
/// 2. writes Masked_chr1.fa, where every region between collapsed exons is passed through ComplementBasesH;
/// 3. counts the mapped reads of the bowtie BED output per exon, sums them per gene and writes GeneID_read.txt.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // chr1 sequence: every line after the first, joined without separators.
    // The length guard keeps an empty file from throwing.
    string[] lines = File.ReadAllLines(@"E:\Bio Informatics\Assignment 8\chr1.fa");
    string CHR1Data = lines.Length > 1 ? string.Join("", lines, 1, lines.Length - 1) : string.Empty;

    // The annotation file is needed by step 1 and step 3; read it once.
    string[] annotationLines = File.ReadAllLines(@"E:\Bio Informatics\Assignment 8\hg19-refseq-exon-annot-chr1_sorted");

    ////Program - 1\\\\
    List<string> FinalData = CollapseExons(annotationLines);
    File.WriteAllText(@"E:\Bio Informatics\Assignment 8\ExonCollapsedFile.txt", string.Join("\n", FinalData.ToArray()));

    ////Program - 2\\\\
    WriteMaskedChromosome(@"E:\Bio Informatics\Assignment 8\Masked_chr1.fa", CHR1Data, FinalData);

    ////Program - 3\\\\
    string[] bedLines = File.ReadAllLines(@"E:\Bio Informatics\Assignment 8\bowtie-0.12.7\BTout-BED-75_Modified_Sorted");
    Dictionary<string, int> ExonReadData = CountReadsPerExon(annotationLines, bedLines);
    Dictionary<string, int> GeneRead = SumReadsPerGene(ExonReadData);

    // One "<gene>\t <count>" row per gene. The tab-plus-space separator is exactly what the earlier
    // KeyValuePair.ToString()/Replace formatting produced; building the row directly keeps that layout
    // without mangling gene names that contain '[', ']' or ','.
    List<string> rows = new List<string>(GeneRead.Count);
    foreach (KeyValuePair<string, int> gene in GeneRead)
    {
        rows.Add(gene.Key + "\t " + gene.Value);
    }
    File.WriteAllText(@"E:\Bio Informatics\Assignment 8\GeneID_read.txt", string.Join("\n", rows.ToArray()));
}

/// <summary>
/// Merges annotation rows whose start lies inside (or at the end of) the region built so far.
/// Rows are tab-separated; column 1 is the start, column 2 the end. Merged regions are emitted as
/// "chr1 start end X 0 +"; a row that starts a new region keeps its own columns 0, 1, 2 and 4.
/// </summary>
/// <param name="annotationLines">Annotation rows; assumed to be ordered by start (the input file name says "sorted" - TODO confirm).</param>
/// <returns>The collapsed regions, one tab-separated row each, in input order.</returns>
private static List<string> CollapseExons(string[] annotationLines)
{
    List<string> collapsed = new List<string>();
    int start = 0;
    int end = 0;

    foreach (string line in annotationLines)
    {
        if (line.Length == 0)
        {
            continue; // tolerate blank lines instead of failing on the column access
        }

        string[] fields = line.Split('\t');
        int exonStart = Convert.ToInt32(fields[1]);
        int exonEnd = Convert.ToInt32(fields[2]);

        // The Count check protects the very first row (e.g. an exon starting at 0), for which there is
        // no previous region to replace.
        if (collapsed.Count > 0 && exonStart <= end)
        {
            // Extend only: an exon lying completely inside the current region must not shrink it.
            if (exonEnd > end)
            {
                end = exonEnd;
            }
            collapsed[collapsed.Count - 1] = "chr1\t" + start + "\t" + end + "\t" + "X\t" + "0\t+";
        }
        else
        {
            start = exonStart;
            end = exonEnd;
            collapsed.Add(fields[0] + "\t" + fields[1] + "\t" + fields[2] + "\t" + "X" + "\t" + fields[4] + "\t" + "+");
        }
    }

    return collapsed;
}

/// <summary>
/// Writes the masked chromosome: text between collapsed exons goes through ComplementBasesH
/// (defined elsewhere; the original comment says it replaces bases with N's - not verified here),
/// exon text is copied unchanged.
/// </summary>
/// <param name="path">Output file path.</param>
/// <param name="chromosome">Full chromosome sequence.</param>
/// <param name="collapsedExons">Rows produced by CollapseExons (start in column 1, end in column 2).</param>
private static void WriteMaskedChromosome(string path, string chromosome, List<string> collapsedExons)
{
    // 'using' guarantees the file is flushed and closed even when a Substring or parse call throws.
    using (TextWriter writer = new StreamWriter(path))
    {
        // NOTE(review): the header is written without a line break, so the sequence follows on the same
        // line unless ComplementBasesH supplies one - confirm this is intended.
        writer.Write(">chr1");

        int cursor = 0; // first position not yet written
        foreach (string region in collapsedExons)
        {
            string[] fields = region.Split('\t');
            int startIndex = Convert.ToInt32(fields[1]);
            int endIndex = Convert.ToInt32(fields[2]);

            // Gap before this exon: write it masked.
            if (cursor < startIndex)
            {
                writer.Write(ComplementBasesH(chromosome.Substring(cursor, startIndex - cursor)));
            }

            // Exon region: write as-is.
            writer.Write(chromosome.Substring(startIndex, endIndex - startIndex));
            cursor = endIndex;
        }
        // NOTE(review): sequence after the last exon is not written (unchanged behaviour) - confirm.
    }
}

/// <summary>
/// Counts, for every annotation row, the BED rows whose start is >= the exon start and whose end is
/// &lt; the exon end, keyed by the exon name in column 3. Exons without any such read get no entry.
/// </summary>
/// <param name="annotationLines">Exon annotation rows (tab-separated; start, end, name in columns 1-3).</param>
/// <param name="bedLines">Mapped-read rows (tab-separated; start in column 1, end in column 2).</param>
/// <returns>Read count per exon name; rows sharing a name are summed.</returns>
private static Dictionary<string, int> CountReadsPerExon(string[] annotationLines, string[] bedLines)
{
    // Parse the read coordinates once, instead of re-reading and re-parsing the BED file per exon.
    List<int> readStarts = new List<int>(bedLines.Length);
    List<int> readEnds = new List<int>(bedLines.Length);
    foreach (string bedLine in bedLines)
    {
        if (bedLine.Length == 0)
        {
            continue;
        }
        string[] bedFields = bedLine.Split('\t');
        readStarts.Add(Convert.ToInt32(bedFields[1]));
        readEnds.Add(Convert.ToInt32(bedFields[2]));
    }

    Dictionary<string, int> exonReads = new Dictionary<string, int>();
    foreach (string line in annotationLines)
    {
        if (line.Length == 0)
        {
            continue;
        }

        string[] fields = line.Split('\t');
        int exonStart = Convert.ToInt32(fields[1]);
        int exonEnd = Convert.ToInt32(fields[2]);
        string exonName = fields[3];

        int hits = 0;
        for (int i = 0; i < readStarts.Count; i++)
        {
            if (readStarts[i] >= exonStart && readEnds[i] < exonEnd)
            {
                hits++;
            }
        }

        // Only exons with at least one read are recorded.
        if (hits == 0)
        {
            continue;
        }

        int previous;
        exonReads.TryGetValue(exonName, out previous); // 0 when the name is new
        exonReads[exonName] = previous + hits;
    }

    return exonReads;
}

/// <summary>
/// Sums exon read counts per gene. The gene name is the part of the exon name before the first "_e";
/// an exon name without "_e" is used unchanged instead of throwing.
/// </summary>
/// <param name="exonReads">Read count per exon name.</param>
/// <returns>Read count per gene name.</returns>
private static Dictionary<string, int> SumReadsPerGene(Dictionary<string, int> exonReads)
{
    Dictionary<string, int> geneReads = new Dictionary<string, int>();
    foreach (KeyValuePair<string, int> exon in exonReads)
    {
        int cut = exon.Key.IndexOf("_e", StringComparison.Ordinal);
        string geneName = cut >= 0 ? exon.Key.Substring(0, cut) : exon.Key;

        int total;
        geneReads.TryGetValue(geneName, out total); // 0 when the gene is new
        geneReads[geneName] = total + exon.Value;
    }

    return geneReads;
}