/// <summary>
/// Appends a coding sequence to this collection.
/// </summary>
/// <param name="cds">The coding sequence to store.</param>
public void AddCodingSequence(CodingSequence cds)
{
    this.collection.Add(cds);
}
/// <summary>
/// Builds a <see cref="CodingSequence"/> from the raw text of one CDS feature entry.
/// </summary>
/// <remarks>
/// The location is taken from the first line whose last token is a plain
/// <c>start..end</c> or <c>complement(start..end)</c> range. A second such line
/// marks the start of another feature and ends parsing, so qualifiers of
/// trailing features cannot overwrite the CDS values. Quoted qualifier values
/// are read up to their closing quote, including continuation lines.
/// </remarks>
/// <param name="currentEntryData">Entry text with lines separated by '\n'; null is treated as empty.</param>
/// <returns>
/// A coding sequence holding the parsed values. Fields that are not present in
/// the entry keep their defaults (empty strings, coordinates 0, reverse orientation).
/// </returns>
public CodingSequence CreateCodingSequenceEntry(string currentEntryData)
{
    string locusTag = "";
    string geneId = "";
    string translatedSequence = "";
    string geneProduct = "";
    string proteinId = "";
    // Legacy default: the entry counts as reverse until a location says otherwise.
    bool isReverse = true;
    bool foundLocation = false;
    bool foundTranslatedSequence = false;
    int start = 0;
    int end = 0;

    char[] blanks = new char[] { ' ', '\t' };
    string[] entryLines = (currentEntryData ?? "").Split('\n');

    for (int lineIndex = 0; lineIndex < entryLines.Length; lineIndex++)
    {
        string line = entryLines[lineIndex].Trim();
        if (line.Length == 0)
        {
            continue;
        }

        if (line[0] != '/')
        {
            // Not a qualifier: only the last token can be a location.
            string token = line.Substring(line.LastIndexOfAny(blanks) + 1);
            int parsedStart;
            int parsedEnd;
            bool parsedReverse;
            if (TryParseCdsLocation(token, out parsedStart, out parsedEnd, out parsedReverse))
            {
                if (foundLocation)
                {
                    // A second location line starts the next feature.
                    break;
                }

                start = parsedStart;
                end = parsedEnd;
                isReverse = parsedReverse;
                foundLocation = true;
            }

            continue;
        }

        int equalsIndex = line.IndexOf('=');
        if (equalsIndex < 0)
        {
            // Qualifier without a value; nothing to store.
            continue;
        }

        // Exact qualifier name, so "/gene_synonym" is not mistaken for "/gene".
        string name = line.Substring(1, equalsIndex - 1);
        bool isTranslation = name == "translation";

        // Consumes continuation lines too, so wrapped text is never re-read as
        // a location or as another qualifier.
        string value = ReadCdsQualifierValue(
            entryLines, ref lineIndex, line.Substring(equalsIndex + 1), isTranslation ? "" : " ");

        switch (name)
        {
            case "gene":
                geneId = value;
                break;
            case "locus_tag":
                locusTag = value;
                break;
            case "product":
                geneProduct = value;
                break;
            case "protein_id":
                proteinId = value;
                break;
            case "translation":
                // Only the first translation of the entry is kept.
                if (!foundTranslatedSequence)
                {
                    translatedSequence = value.Replace(" ", "");
                    foundTranslatedSequence = true;
                }
                break;
        }
    }

    return new CodingSequence(locusTag, geneId, translatedSequence, geneProduct, isReverse, start, end, proteinId);
}

// Parses "start..end" or "complement(start..end)" (optionally with '<' / '>' partial markers).
private static bool TryParseCdsLocation(string token, out int start, out int end, out bool isReverse)
{
    start = 0;
    end = 0;
    isReverse = false;

    Match match = Regex.Match(token, "^complement\\(<?(\\d+)\\.\\.>?(\\d+)\\)$");
    if (match.Success)
    {
        isReverse = true;
    }
    else
    {
        match = Regex.Match(token, "^<?(\\d+)\\.\\.>?(\\d+)$");
        if (!match.Success)
        {
            return false;
        }
    }

    // TryParse rejects values that do not fit in an Int32 instead of throwing.
    return Int32.TryParse(match.Groups[1].Value, out start)
        && Int32.TryParse(match.Groups[2].Value, out end);
}

// Returns a qualifier value without quotes, appending continuation lines until the closing quote.
private static string ReadCdsQualifierValue(string[] lines, ref int index, string firstPart, string separator)
{
    string value = firstPart.Trim();
    if (value.Length == 0 || value[0] != '"')
    {
        // Unquoted value: taken from this line only.
        return value.Replace("\"", "");
    }

    while (!IsClosedCdsQualifierValue(value) && index + 1 < lines.Length)
    {
        index++;
        value += separator + lines[index].Trim();
    }

    return value.Replace("\"", "").Trim();
}

// True when a quoted value ends with its closing quote (embedded quotes are doubled, so the total is even).
private static bool IsClosedCdsQualifierValue(string value)
{
    if (value.Length < 2 || value[value.Length - 1] != '"')
    {
        return false;
    }

    int quoteCount = 0;
    foreach (char character in value)
    {
        if (character == '"')
        {
            quoteCount++;
        }
    }

    return quoteCount % 2 == 0;
}
/// <summary>
/// Collects the gene and CDS site features that lie inside the requested coordinate ranges.
/// </summary>
/// <remarks>
/// A gene matches a range when its start is at or after the range start and its
/// end is at or before the range end. For every matching gene a "gene" feature
/// is added, followed by a "CDS" feature built from the coding sequence stored
/// at the same index, when one exists. Ranges that match no gene are listed on
/// the console. Entries that are not of the form <c>start..end</c> are skipped.
/// </remarks>
/// <param name="geneCollection">Genes to search.</param>
/// <param name="cdsCollection">Coding sequences, paired with the genes by index.</param>
/// <param name="featureCoordinateArray">Requested ranges, each written as <c>start..end</c>.</param>
/// <returns>The sorted collection of matching site features.</returns>
public SiteFeatureCollection FetchSiteFeatures(GeneCollection geneCollection, CodingSequenceCollection cdsCollection, string[] featureCoordinateArray)
{
    SiteFeatureCollection siteFeatureCollection = new SiteFeatureCollection();
    List<string> nonMatchedFeatures = new List<string>();

    foreach (string coordinate in featureCoordinateArray)
    {
        if (coordinate == null)
        {
            continue;
        }

        // Anchored, with mandatory digits, so malformed input is skipped instead of throwing while parsing.
        Match range = Regex.Match(coordinate, "^\\s*(\\d+)\\.\\.(\\d+)\\s*$");
        int startCoordinate;
        int endCoordinate;
        if (!range.Success
            || !Int32.TryParse(range.Groups[1].Value, out startCoordinate)
            || !Int32.TryParse(range.Groups[2].Value, out endCoordinate))
        {
            continue;
        }

        bool isMatch = false;
        for (int i = 0; i < geneCollection.collection.Count; i++)
        {
            Gene gene = geneCollection.collection[i];
            if (gene.StartCoordinate < startCoordinate || gene.EndCoordinate > endCoordinate)
            {
                continue;
            }

            // Fall back to the locus tag when the gene has no id.
            string geneId = gene.ID != "" ? gene.ID : gene.LocusTag;
            string orientation = gene.IsReverse ? "R" : "F";
            int geneStartCoordinate = gene.StartCoordinate;
            int geneStopCoordinate = gene.EndCoordinate;

            siteFeatureCollection.AddSiteFeature(
                new SiteFeature(geneId, "gene", geneStartCoordinate, geneStopCoordinate, orientation));

            // The CDS feature reuses the gene's coordinates and orientation but is
            // named after the gene product. The CDS is only looked up for matching
            // genes, and only when the CDS collection is long enough.
            if (i < cdsCollection.collection.Count)
            {
                CodingSequence cds = cdsCollection.collection[i];
                string product = cds.GeneProduct;
                siteFeatureCollection.AddSiteFeature(
                    new SiteFeature(product, "CDS", geneStartCoordinate, geneStopCoordinate, orientation));
            }

            isMatch = true;
        }

        if (!isMatch)
        {
            nonMatchedFeatures.Add(coordinate);
        }
    }

    // Report the ranges that matched nothing, and only when there are any.
    if (nonMatchedFeatures.Count > 0)
    {
        string message = "Some of the provided entries could not be found. Here is the list of sites; "
            + string.Join(", ", nonMatchedFeatures.ToArray());
        Console.WriteLine("\n" + message);
    }

    siteFeatureCollection.Sort();
    return siteFeatureCollection;
}
/// <summary>
/// Reads a GenBank flat file and collects its gene entries, CDS entries and summary data.
/// </summary>
/// <remarks>
/// Header values (LOCUS, ORGANISM, ACCESSION) are read until the first gene or
/// CDS line. From then on lines are accumulated into alternating gene and CDS
/// entries: a gene entry is closed by the next CDS line and a CDS entry by the
/// next gene line. The ORIGIN line, or the end of the file, closes the pending
/// entry. Sequence lines after ORIGIN are appended, without digits and spaces,
/// until the "//" record terminator, where reading stops.
/// </remarks>
/// <param name="inputFile">Path of the GenBank file to read.</param>
/// <returns>The gene collection, the coding sequence collection and the file summary.</returns>
public CollectedGeneBankData ReadGenebankFile(string inputFile)
{
    GeneCollection geneCollection = new GeneCollection();
    CodingSequenceCollection codingSequenceCollection = new CodingSequenceCollection();
    string fileName = Path.GetFileName(inputFile);
    string organism = "";
    string accession = "";
    string length = "";
    // A builder keeps appending the sequence lines linear; fully qualified so no extra using is needed.
    System.Text.StringBuilder originSequence = new System.Text.StringBuilder();

    // Parser state.
    bool isFirst = true;
    bool isOrigin = false;
    bool currentEntryIsCDS = false;
    bool currentEntryIsGene = false;

    // Both patterns accept plain and complement locations.
    string genePattern = " *gene *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string cdsPattern = " *CDS *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string currentEntry = "";

    // The using block closes the file even when parsing throws.
    using (StreamReader reader = new StreamReader(inputFile))
    {
        string gbkLine;
        while ((gbkLine = reader.ReadLine()) != null)
        {
            if (isOrigin)
            {
                // "//" ends the record; it is not part of the sequence.
                if (gbkLine.StartsWith("//", StringComparison.Ordinal))
                {
                    break;
                }

                // Every other line after ORIGIN is sequence data.
                originSequence.Append(Regex.Replace(gbkLine, "(\\d| )", ""));
                continue;
            }

            // Header fields are only looked for before the first gene/CDS entry.
            if (isFirst)
            {
                if (gbkLine.StartsWith("LOCUS"))
                {
                    length = GetSequenceLength(gbkLine);
                }
                if (gbkLine.Contains(" ORGANISM"))
                {
                    organism = GetOrganism(gbkLine);
                }
                if (gbkLine.Contains("ACCESSION"))
                {
                    accession = GetAccessionId(gbkLine);
                }
            }

            bool isGeneLine = Regex.IsMatch(gbkLine, genePattern);
            bool isCdsLine = Regex.IsMatch(gbkLine, cdsPattern);

            if (currentEntryIsCDS)
            {
                if (isGeneLine)
                {
                    // A gene line closes the CDS entry and opens a gene entry.
                    currentEntryIsGene = true;
                    currentEntryIsCDS = false;
                    codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (currentEntryIsGene)
            {
                if (isCdsLine)
                {
                    // A CDS line closes the gene entry and opens a CDS entry.
                    currentEntryIsGene = false;
                    currentEntryIsCDS = true;
                    geneCollection.AddGene(CreateGeneEntry(currentEntry));
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (isFirst && isGeneLine)
            {
                currentEntryIsGene = true;
                isFirst = false;
                currentEntry += gbkLine + "\n";
            }
            else if (isFirst && isCdsLine)
            {
                currentEntryIsCDS = true;
                isFirst = false;
                currentEntry += gbkLine + "\n";
            }

            if (gbkLine.StartsWith("ORIGIN"))
            {
                // From the next line on, lines are handled as sequence data.
                isOrigin = true;
                string line = gbkLine.Replace("ORIGIN", "");
                originSequence.Append(Regex.Replace(line, "(\\d| )", ""));

                // ORIGIN ends the feature table: store the pending entry.
                if (currentEntryIsCDS)
                {
                    currentEntryIsCDS = false;
                    codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
                }
                else if (currentEntryIsGene)
                {
                    currentEntryIsGene = false;
                    geneCollection.AddGene(CreateGeneEntry(currentEntry));
                }
            }
        }
    }

    // The file ended without an ORIGIN line: store the entry that is still open.
    if (currentEntryIsCDS)
    {
        codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
    }
    else if (currentEntryIsGene)
    {
        geneCollection.AddGene(CreateGeneEntry(currentEntry));
    }

    int geneCount = geneCollection.collection.Count;
    int cdsCount = codingSequenceCollection.collection.Count;

    int forwardGeneCount = 0;
    foreach (Gene geneEntry in geneCollection.collection)
    {
        if (!geneEntry.IsReverse)
        {
            forwardGeneCount++;
        }
    }

    // Share of forward genes among all genes, rounded to one decimal;
    // 0 when there are no genes (avoids a NaN from 0/0).
    double value = geneCount > 0 ? (double)forwardGeneCount / geneCount : 0.0;
    double forwardReverseBalance = Math.Round(value, 1);

    Summary summary = new Summary(fileName, organism, accession, length, geneCount, forwardReverseBalance, cdsCount, originSequence.ToString());
    CollectedGeneBankData geneBankeData = new CollectedGeneBankData(geneCollection, codingSequenceCollection, summary);
    return geneBankeData;
}