/**
 * Adds the given coding sequence to the collection member of this object.
 *
 * @param cds the coding sequence to add.
 */
public void AddCodingSequence(CodingSequence cds) => collection.Add(cds);
Exemple #2
0
        /**
         * Parses the raw text of one CDS feature entry into a CodingSequence.
         *
         * The entry is processed line by line (lines are separated by '\n' and
         * surrounding whitespace is ignored):
         *  - the gene, locus_tag, product and protein_id qualifiers fill the matching
         *    fields with the text after the first '=' (double quotes removed);
         *  - the first translation qualifier is collected, including its continuation
         *    lines, up to the line that carries the closing double quote;
         *  - any non-qualifier line whose last space separated token is a plain
         *    location (start..end or complement(start..end)) sets the coordinates
         *    and the orientation. When several such lines exist the last one wins.
         *
         * Values that never appear keep their defaults: empty strings, 0 for both
         * coordinates and true for the reverse flag. Compound locations such as
         * join(...) are not recognised and leave the coordinates untouched.
         *
         * @param currentEntryData raw entry text; must not be null.
         * @returns a CodingSequence built from the parsed values.
         */
        public CodingSequence CreateCodingSequenceEntry(string currentEntryData)
        {
            const string translationKey = "/translation=";

            string locusTag                = "";
            string geneId                  = "";
            string translatedSequence      = "";
            string geneProduct             = "";
            string proteinId               = "";
            bool   isReverse               = true;
            bool   foundTranslatedSequence = false;
            bool   insideTranslation       = false;
            int    start = 0;
            int    end   = 0;

            foreach (string rawLine in currentEntryData.Split('\n'))
            {
                // Trim() also removes a trailing '\r' left behind by Windows line endings.
                string item = rawLine.Trim();
                if (item.Length == 0)
                {
                    continue;
                }

                if (insideTranslation)
                {
                    if (!item.StartsWith("/", StringComparison.Ordinal))
                    {
                        // The value ends on the line that carries the closing quote.
                        insideTranslation   = !item.Contains("\"");
                        translatedSequence += item.Replace(" ", "").Replace("\"", "");
                        continue;
                    }
                    // A new qualifier started before a closing quote was seen: stop collecting
                    // and let the qualifier be handled below.
                    insideTranslation = false;
                }

                // Keys are matched including '=' so that e.g. /gene_synonym is not mistaken for /gene.
                if (item.StartsWith("/gene=", StringComparison.Ordinal))
                {
                    geneId = ReadQualifierValue(item);
                }
                else if (item.StartsWith("/locus_tag=", StringComparison.Ordinal))
                {
                    locusTag = ReadQualifierValue(item);
                }
                else if (item.StartsWith("/product=", StringComparison.Ordinal))
                {
                    geneProduct = ReadQualifierValue(item);
                }
                else if (item.StartsWith("/protein_id=", StringComparison.Ordinal))
                {
                    proteinId = ReadQualifierValue(item);
                }
                else if (item.StartsWith(translationKey, StringComparison.Ordinal))
                {
                    // Only the first translation of an entry is used.
                    if (!foundTranslatedSequence)
                    {
                        string value  = item.Substring(translationKey.Length);
                        bool   opened = value.StartsWith("\"", StringComparison.Ordinal);
                        // Still open when there is no second quote after the opening one.
                        insideTranslation       = opened && value.IndexOf('"', 1) < 0;
                        translatedSequence     += value.Replace("\"", "").Replace(" ", "");
                        foundTranslatedSequence = true;
                    }
                }
                else if (!item.StartsWith("/", StringComparison.Ordinal))
                {
                    // Feature line: the location is the last space separated token.
                    // Patterns are anchored and require digits so that free text containing
                    // ".." is never handed to Int32.Parse. The optional < and > are the
                    // partial-location markers, which are ignored here.
                    string lastToken       = item.Substring(item.LastIndexOf(' ') + 1);
                    Match  reverseLocation = Regex.Match(lastToken, "^complement\\(<?(\\d+)\\.\\.>?(\\d+)\\)$");
                    Match  location        = reverseLocation.Success
                        ? reverseLocation
                        : Regex.Match(lastToken, "^<?(\\d+)\\.\\.>?(\\d+)$");
                    if (location.Success)
                    {
                        start     = Int32.Parse(location.Groups[1].Value);
                        end       = Int32.Parse(location.Groups[2].Value);
                        isReverse = reverseLocation.Success;
                    }
                }
            }

            CodingSequence codingSequence = new CodingSequence(locusTag, geneId, translatedSequence, geneProduct, isReverse, start, end, proteinId);

            return(codingSequence);
        }

        /**
         * Returns the text after the first '=' of a qualifier line with all double quotes removed.
         */
        private static string ReadQualifierValue(string qualifierLine)
        {
            int separatorIndex = qualifierLine.IndexOf('=');
            return qualifierLine.Substring(separatorIndex + 1).Replace("\"", "");
        }
Exemple #3
0
        /**
         * Builds site features for every gene that lies completely inside one of the
         * requested coordinate ranges.
         *
         * Each entry of featureCoordinateArray is expected to look like "start..end".
         * Entries without ".." are skipped. For every gene whose start and end fall
         * inside the range a "gene" feature is added (named after the gene ID, or the
         * locus tag when the ID is empty). When the coding sequence collection has an
         * entry at the same index, a "CDS" feature named after its gene product is
         * added with the same coordinates and orientation.
         *
         * Ranges that matched no gene, or whose bounds are not valid integers, are
         * written to the console as a single list.
         *
         * @param geneCollection genes to search.
         * @param cdsCollection coding sequences, paired with the genes by index.
         * @param featureCoordinateArray requested ranges in "start..end" form.
         * @returns siteFeatureCollection containing the SiteFeature objects, after Sort() was called on it.
         */
        public SiteFeatureCollection FetchSiteFeatures(GeneCollection geneCollection, CodingSequenceCollection cdsCollection, string[] featureCoordinateArray)
        {
            SiteFeatureCollection siteFeatureCollection = new SiteFeatureCollection();
            List <string>         nonMatchedFeatures    = new List <string>();

            foreach (string coordinate in featureCoordinateArray)
            {
                if (!Regex.IsMatch(coordinate, "\\d*\\.\\.\\d*"))
                {
                    continue;
                }

                string[] split = Regex.Split(coordinate, "\\.\\.");
                int      startCoordinate;
                int      endCoordinate;
                // The pattern above also accepts text such as "a..b"; report those as not found
                // instead of letting the integer conversion throw.
                if (!Int32.TryParse(split[0], out startCoordinate) || !Int32.TryParse(split[1], out endCoordinate))
                {
                    nonMatchedFeatures.Add(coordinate);
                    continue;
                }

                bool isMatch = false;
                for (int i = 0; i < geneCollection.collection.Count; i++)
                {
                    Gene gene = geneCollection.collection[i];
                    if (gene.StartCoordinate < startCoordinate || gene.EndCoordinate > endCoordinate)
                    {
                        continue;
                    }

                    isMatch = true;
                    int    geneStartCoordinate = gene.StartCoordinate;
                    int    geneStopCoordinate  = gene.EndCoordinate;
                    string geneId              = gene.ID != "" ? gene.ID : gene.LocusTag;
                    string orientation         = gene.IsReverse ? "R" : "F";

                    siteFeatureCollection.AddSiteFeature(new SiteFeature(geneId, "gene", geneStartCoordinate, geneStopCoordinate, orientation));

                    // The CDS is looked up by the gene's index; only do so when that index exists,
                    // so a shorter CDS collection no longer causes an out-of-range exception.
                    if (i < cdsCollection.collection.Count)
                    {
                        CodingSequence cds = cdsCollection.collection[i];
                        //A CDS entry uses the gene product as name. Other values are the same as the gene values.
                        siteFeatureCollection.AddSiteFeature(new SiteFeature(cds.GeneProduct, "CDS", geneStartCoordinate, geneStopCoordinate, orientation));
                    }
                }

                if (!isMatch)
                {
                    nonMatchedFeatures.Add(coordinate);
                }
            }

            if (nonMatchedFeatures.Count > 0)
            {
                //Display list of site entries that did not match: one header followed by a comma separated list.
                String message = "Some of the provided enrties could not be found. Here is the list of sites; "
                                 + String.Join(", ", nonMatchedFeatures.ToArray());
                Console.WriteLine("\n" + message);
            }

            siteFeatureCollection.Sort();
            return(siteFeatureCollection);
        }
Exemple #4
0
        /**
         * Reads a GenBank flat file and collects its genes, coding sequences and summary data.
         *
         * The file is read line by line:
         *  - until the first gene or CDS feature line is seen, the LOCUS, ORGANISM and
         *    ACCESSION lines are passed to GetSequenceLength, GetOrganism and GetAccessionId;
         *  - feature text is accumulated per entry; a CDS entry is closed by the next
         *    gene line and a gene entry by the next CDS line, and the entry that is
         *    still open is closed by the ORIGIN line;
         *  - after the ORIGIN line, digits and spaces are stripped from every line and
         *    the remainder is appended to the origin sequence, up to the "//" line.
         *
         * The balance handed to the summary is the share of genes that are not
         * reverse, rounded to one decimal; it is 0.0 when no gene was found.
         *
         * NOTE(review): an entry that is still open at the end of a file without an
         * ORIGIN line is not added to either collection.
         *
         * @param inputFile path of the GenBank file to read.
         * @returns the gene collection, coding sequence collection and summary of the file.
         */
        public CollectedGeneBankData ReadGenebankFile(string inputFile)
        {
            GeneCollection           geneCollection           = new GeneCollection();
            CodingSequenceCollection codingSequenceCollection = new CodingSequenceCollection();
            //Split path to get file name.
            string fileName  = Path.GetFileName(inputFile);
            string organism  = "";
            string accession = "";
            string length    = "";
            //A builder keeps appending linear in the sequence length. Fully qualified so no extra using directive is needed.
            System.Text.StringBuilder originSequence = new System.Text.StringBuilder();

            //Booleans for if/else statements.
            bool isFirst            = true;
            bool isOrigin           = false;
            bool currentEntryIsCDS  = false;
            bool currentEntryIsGene = false;
            //Both patterns accept complement and non-complement locations.
            string genePattern = " *gene *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
            string cdsPattern  = " *CDS *(complement)?\\(?\\d*\\.\\.\\d*\\)?";

            string currentEntry = "";

            //The using block closes the file even when parsing throws.
            using (StreamReader reader = new StreamReader(inputFile))
            {
                string gbkLine;
                while ((gbkLine = reader.ReadLine()) != null)
                {
                    //Lines after ORIGIN contain nucleotide data, until the "//" record terminator.
                    if (isOrigin)
                    {
                        if (gbkLine.StartsWith("//", StringComparison.Ordinal))
                        {
                            //The terminator is not sequence data and must not end up in the origin sequence.
                            isOrigin = false;
                        }
                        else
                        {
                            originSequence.Append(Regex.Replace(gbkLine, "(\\d| )", ""));
                        }
                    }
                    //Header lines are only read until the first feature entry was found.
                    if (isFirst)
                    {
                        if (gbkLine.StartsWith("LOCUS"))
                        {
                            length = GetSequenceLength(gbkLine);
                        }
                        if (gbkLine.Contains("  ORGANISM"))
                        {
                            organism = GetOrganism(gbkLine);
                        }
                        if (gbkLine.Contains("ACCESSION"))
                        {
                            accession = GetAccessionId(gbkLine);
                        }
                    }
                    //Entry state machine: at most one of the CDS/gene flags is set at any time.
                    if (currentEntryIsCDS)
                    {
                        if (Regex.IsMatch(gbkLine, genePattern))
                        {
                            //A gene line closes the open CDS entry and starts a gene entry.
                            currentEntryIsGene = true;
                            currentEntryIsCDS  = false;
                            CodingSequence codingSequence = CreateCodingSequenceEntry(currentEntry);
                            codingSequenceCollection.AddCodingSequence(codingSequence);
                            currentEntry = gbkLine + "\n";
                        }
                        else
                        {
                            currentEntry += gbkLine + "\n";
                        }
                    }
                    else if (currentEntryIsGene)
                    {
                        if (Regex.IsMatch(gbkLine, cdsPattern))
                        {
                            //A CDS line closes the open gene entry and starts a CDS entry.
                            currentEntryIsGene = false;
                            currentEntryIsCDS  = true;
                            Gene gene = CreateGeneEntry(currentEntry);
                            geneCollection.AddGene(gene);
                            currentEntry = gbkLine + "\n";
                        }
                        else
                        {
                            currentEntry += gbkLine + "\n";
                        }
                    }
                    else if (isFirst)
                    {
                        //No entry is open yet: wait for the first gene or CDS line.
                        if (Regex.IsMatch(gbkLine, genePattern))
                        {
                            currentEntryIsGene = true;
                            isFirst            = false;
                            currentEntry      += gbkLine + "\n";
                        }
                        else if (Regex.IsMatch(gbkLine, cdsPattern))
                        {
                            currentEntryIsCDS = true;
                            isFirst           = false;
                            currentEntry     += gbkLine + "\n";
                        }
                    }
                    if (gbkLine.StartsWith("ORIGIN"))
                    {
                        //From the next line on, the origin branch at the top of the loop collects the sequence.
                        isOrigin = true;
                        string line = gbkLine.Replace("ORIGIN", "");
                        originSequence.Append(Regex.Replace(line, "(\\d| )", ""));
                        //The feature table ends here, so close the entry that is still open.
                        if (currentEntryIsCDS)
                        {
                            currentEntryIsCDS = false;
                            CodingSequence codingSequence = CreateCodingSequenceEntry(currentEntry);
                            codingSequenceCollection.AddCodingSequence(codingSequence);
                        }
                        else if (currentEntryIsGene)
                        {
                            currentEntryIsGene = false;
                            Gene gene = CreateGeneEntry(currentEntry);
                            geneCollection.AddGene(gene);
                        }
                    }
                }
            }

            int    geneCount          = geneCollection.collection.Count;           //Size of gene collection
            int    cdsCount           = codingSequenceCollection.collection.Count; //Size of coding sequence collection
            double totalGeneCounter   = 0.0;
            double forwardGeneCounter = 0.0;

            //Every gene counts towards the total; genes that are not reverse also count as forward.
            foreach (Gene geneEntry in geneCollection.collection)
            {
                totalGeneCounter++;
                if (!geneEntry.IsReverse)
                {
                    forwardGeneCounter++;
                }
            }
            //Forward/Reverse (FR) balance: forward genes divided by all genes.
            //Without genes the division would be 0/0 (NaN), so 0.0 is reported instead.
            double forwardReverseBalance = 0.0;
            if (totalGeneCounter > 0.0)
            {
                forwardReverseBalance = Math.Round(forwardGeneCounter / totalGeneCounter, 1);
            }

            Summary summary = new Summary(fileName, organism, accession, length, geneCount, forwardReverseBalance, cdsCount, originSequence.ToString());
            CollectedGeneBankData geneBankeData = new CollectedGeneBankData(geneCollection, codingSequenceCollection, summary);

            return(geneBankeData);
        }