Пример #1
0
        /// <summary>
        /// Entry point. Reads the reference sequence from chr1.fa and the query (exon) sequences from
        /// Prog2-input-NM_032291-10exon-seqs.fa, runs SmithWatermanScore(string, string) for every query
        /// against the reference, and writes the formatted report to Result.txt.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            //Read the reference file, drop its first (header) line and join the rest into one string.
            string[]      lines = File.ReadAllLines(@"D:\BioInfo Activity\chr1.fa");
            List <string> list  = new List <string>(lines);

            //Guard against an empty file so RemoveAt does not throw.
            if (list.Count > 0)
            {
                list.RemoveAt(0);
            }
            string RefSeq = string.Join("", list.ToArray());

            //Label and bases of the query record currently being read.
            string PatternID   = string.Empty;
            string Pattern_Seq = string.Empty;
            //Every (label, bases) query record found in the input file, in file order.
            List <KeyValuePair <string, string> > PatternData = new List <KeyValuePair <string, string> >();

            //Loop through Prog2-input-NM_032291-10exon-seqs.fa file to store all query sequences into list.
            foreach (string ExonData in File.ReadLines(@"E:\Bio Informatics\Assignment 2\Prog2-input-NM_032291-10exon-seqs.fa"))
            {
                if (ExonData.Contains(">"))
                {
                    //A new label closes the previous record (if it collected any bases).
                    if (Pattern_Seq != string.Empty)
                    {
                        PatternData.Add(new KeyValuePair <string, string>(PatternID, Pattern_Seq));
                        Pattern_Seq = string.Empty;
                    }
                    PatternID = ExonData;
                }
                else
                {
                    //Append rather than overwrite: a record's bases may be wrapped over several lines,
                    //and a blank line must not erase what was already collected.
                    Pattern_Seq += ExonData;
                }
            }
            //Flush the last record; skip it when no bases were collected (empty file or label without bases).
            if (Pattern_Seq != string.Empty)
            {
                PatternData.Add(new KeyValuePair <string, string>(PatternID, Pattern_Seq));
            }

            //First (up to) 10 reference bases; identical for every record, so computed once.
            string RefPrefix = RefSeq.Length > 10 ? RefSeq.Substring(0, 10) : RefSeq;

            //Build one report entry per query sequence in the required format:
            //>chr1.66999824.67000051.NM_032291_exon_0_0_chr1_66999825_f.+		 len=227
            //ref(1-10) = NNNNNNNNNN ; patern(1-10) = TTTCTCTCAG
            //--- Optimum Smith-Waterman score = 454 (i=227, j=67000051)
            System.Text.StringBuilder FinalText = new System.Text.StringBuilder();
            foreach (KeyValuePair <string, string> PatternValue in PatternData)
            {
                string Pattern       = PatternValue.Value;
                //Patterns shorter than 10 bases are shown whole instead of throwing.
                string PatternPrefix = Pattern.Length > 10 ? Pattern.Substring(0, 10) : Pattern;

                string Header          = PatternValue.Key + "\t len=" + Pattern.Length.ToString() + "\n";
                string MiddleText      = "ref(1-10) = " + RefPrefix + " ; patern(1-10) = " + PatternPrefix + "\n";
                //SmithWatermanScore is defined elsewhere; its returned text is appended as-is.
                string score_indexText = SmithWatermanScore(RefSeq, Pattern);

                FinalText.Append(Header).Append(MiddleText).Append(score_indexText);
            }
            //Store the complete result for all query sequences into Result.txt file.
            File.WriteAllText(@"E:\Bio Informatics\Assignment 2\Result.txt", FinalText.ToString());
        }
Пример #2
0
        /// <summary>
        /// Entry point. Runs three stages over fixed input files:
        /// 1. collapses overlapping exon annotations and writes ExonCollapsedFile.txt;
        /// 2. writes Masked_chr1.fa, where regions between collapsed exons are passed through ComplementBasesH;
        /// 3. counts mapped reads per exon, sums them per gene and writes GeneID_read.txt.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            const string chromosomePath = @"E:\Bio Informatics\Assignment 8\chr1.fa";
            const string exonAnnotPath  = @"E:\Bio Informatics\Assignment 8\hg19-refseq-exon-annot-chr1_sorted";
            const string collapsedPath  = @"E:\Bio Informatics\Assignment 8\ExonCollapsedFile.txt";
            const string maskedPath     = @"E:\Bio Informatics\Assignment 8\Masked_chr1.fa";
            const string readsBedPath   = @"E:\Bio Informatics\Assignment 8\bowtie-0.12.7\BTout-BED-75_Modified_Sorted";
            const string geneReadsPath  = @"E:\Bio Informatics\Assignment 8\GeneID_read.txt";

            //Read the chromosome file, drop its first (header) line and join the rest into one string.
            string CHR1Data = string.Join("", File.ReadAllLines(chromosomePath).Skip(1));

            //The exon annotation is needed by stages 1 and 3, so it is read only once.
            string[] exonLines = File.ReadAllLines(exonAnnotPath);

            ////Program - 1\\\\
            List <string> FinalData = CollapseExonLines(exonLines);
            File.WriteAllText(collapsedPath, String.Join("\n", FinalData));

            ////Program - 2\\\\
            WriteMaskedChromosome(maskedPath, CHR1Data, FinalData);

            ////Program - 3\\\\
            Dictionary <string, int> ExonReadData = CountReadsPerExon(exonLines, File.ReadAllLines(readsBedPath));
            Dictionary <string, int> GeneRead     = SumReadsPerGene(ExonReadData);

            //One "gene<TAB>count" line per gene. Built explicitly instead of post-processing
            //KeyValuePair.ToString(), which left a space after the tab and would corrupt
            //names containing '[', ']' or ','.
            String GeneD = String.Join("\n", GeneRead.Select(gene => gene.Key + "\t" + gene.Value.ToString()));
            File.WriteAllText(geneReadsPath, GeneD);
        }

        //Merges overlapping or touching exon lines (tab-separated, assumed sorted by start) into collapsed entries.
        private static List <string> CollapseExonLines(IEnumerable <string> exonLines)
        {
            List <string> collapsed = new List <string>();
            int           Start     = 0;
            int           End       = 0;

            foreach (string ExonData in exonLines)
            {
                if (string.IsNullOrWhiteSpace(ExonData))
                {
                    continue;
                }
                string[] ExonArray = ExonData.Split('\t');
                int      exonStart = Convert.ToInt32(ExonArray[1]);
                int      exonEnd   = Convert.ToInt32(ExonArray[2]);

                if (collapsed.Count > 0 && exonStart <= End)
                {
                    //Overlaps (or touches) the current interval: extend it, never shrink it,
                    //so an exon fully contained in the current interval leaves the end unchanged.
                    if (exonEnd > End)
                    {
                        End = exonEnd;
                    }
                    //Merged entries are written with fixed "chr1" and score "0" columns.
                    collapsed[collapsed.Count - 1] = "chr1\t" + Start.ToString() + "\t" + End.ToString() + "\t" + "X\t" + "0\t+";
                }
                else
                {
                    //Starts a new interval; chromosome, coordinates and score are copied from the input line.
                    Start = exonStart;
                    End   = exonEnd;
                    collapsed.Add(ExonArray[0] + "\t" + ExonArray[1] + "\t" + ExonArray[2] + "\t" + "X" + "\t" + ExonArray[4] + "\t" + "+");
                }
            }
            return collapsed;
        }

        //Writes the masked chromosome: gaps between collapsed exons go through ComplementBasesH, exons are copied verbatim.
        private static void WriteMaskedChromosome(string path, string chromosome, List <string> collapsedExons)
        {
            //using guarantees the file is flushed and closed even if a Substring or parse call throws.
            using (TextWriter tsw = new StreamWriter(path))
            {
                //NOTE(review): the header is written without a line break, so the sequence follows on the
                //same line unless ComplementBasesH emits one - confirm against that helper.
                tsw.Write(">chr1");
                int indexValue = 0;

                foreach (string exon in collapsedExons)
                {
                    string[] ChangeData = exon.Split('\t');
                    int      startIndex = Convert.ToInt32(ChangeData[1]);
                    int      endIndex   = Convert.ToInt32(ChangeData[2]);

                    //Region between the previous exon end and this exon start.
                    //ComplementBasesH is defined elsewhere; presumably it replaces the bases with N's - TODO confirm.
                    if (indexValue < startIndex)
                    {
                        tsw.Write(ComplementBasesH(chromosome.Substring(indexValue, startIndex - indexValue)));
                    }
                    //Exon region is copied unchanged.
                    tsw.Write(chromosome.Substring(startIndex, endIndex - startIndex));
                    indexValue = endIndex;
                }
                //NOTE(review): bases after the last exon are not written, so the output is shorter
                //than the input chromosome - confirm this is intended.
            }
        }

        //Counts, for every exon name, the reads whose start >= exon start and end < exon end; exons with no reads are omitted.
        private static Dictionary <string, int> CountReadsPerExon(string[] exonLines, string[] readLines)
        {
            //Parse the read coordinates once instead of re-reading and re-parsing the file for every exon.
            List <int> readStarts = new List <int>(readLines.Length);
            List <int> readEnds   = new List <int>(readLines.Length);
            foreach (string BToutData in readLines)
            {
                if (string.IsNullOrWhiteSpace(BToutData))
                {
                    continue;
                }
                string[] BToutData_Array = BToutData.Split('\t');
                readStarts.Add(Convert.ToInt32(BToutData_Array[1]));
                readEnds.Add(Convert.ToInt32(BToutData_Array[2]));
            }

            Dictionary <string, int> ExonReadData = new Dictionary <string, int>();
            foreach (string ExonRefData in exonLines)
            {
                if (string.IsNullOrWhiteSpace(ExonRefData))
                {
                    continue;
                }
                string[] ExonRefData_Array = ExonRefData.Split('\t');
                string   ExonName          = ExonRefData_Array[3];
                int      exonStart         = Convert.ToInt32(ExonRefData_Array[1]);
                int      exonEnd           = Convert.ToInt32(ExonRefData_Array[2]);

                //NOTE(review): the end comparison is strict; if read ends are exclusive (standard BED),
                //reads ending exactly at the exon end are not counted - confirm.
                int hits = 0;
                for (int i = 0; i < readStarts.Count; i++)
                {
                    if (readStarts[i] >= exonStart && readEnds[i] < exonEnd)
                    {
                        hits++;
                    }
                }
                if (hits == 0)
                {
                    continue;
                }
                //TryGetValue leaves 'existing' at 0 when the name is new, so one path handles add and update.
                int existing;
                ExonReadData.TryGetValue(ExonName, out existing);
                ExonReadData[ExonName] = existing + hits;
            }
            return ExonReadData;
        }

        //Sums exon read counts per gene; the gene name is the exon name up to the first "_e".
        private static Dictionary <string, int> SumReadsPerGene(Dictionary <string, int> exonReads)
        {
            Dictionary <string, int> GeneRead = new Dictionary <string, int>();
            foreach (KeyValuePair <string, int> ExonRefValue in exonReads)
            {
                //Fall back to the whole name when the "_e" marker is missing instead of throwing.
                int    cut      = ExonRefValue.Key.IndexOf("_e", StringComparison.Ordinal);
                string GeneName = cut >= 0 ? ExonRefValue.Key.Substring(0, cut) : ExonRefValue.Key;

                int total;
                GeneRead.TryGetValue(GeneName, out total);
                GeneRead[GeneName] = total + ExonRefValue.Value;
            }
            return GeneRead;
        }