/// <summary>
/// Looks up every requested gene in <paramref name="geneCollection"/>, fills in its
/// nucleotide sequence from <paramref name="originSequence"/> and collects the hits.
/// </summary>
/// <param name="geneCollection">Collection whose genes are searched.</param>
/// <param name="geneArray">
/// Search terms. A gene matches a term when its <c>ID</c> or its <c>LocusTag</c> contains the term;
/// only the first matching gene per term is used.
/// </param>
/// <param name="originSequence">Sequence the per-gene sequences are cut from.</param>
/// <returns>
/// A new <c>GeneCollection</c> with the matched genes, after <c>SortOnID()</c> has been called on it.
/// Terms without a match are reported on the console and otherwise ignored.
/// </returns>
/// <remarks>
/// The matched <c>Gene</c> objects are the same instances that live in
/// <paramref name="geneCollection"/>; their <c>Sequence</c> property is overwritten here.
/// </remarks>
public GeneCollection FetchGeneData(GeneCollection geneCollection, string[] geneArray, string originSequence)
{
    List<string> nonMatchedGenes = new List<string>();
    GeneCollection updatedGeneCollection = new GeneCollection();

    foreach (string givenGene in geneArray)
    {
        bool isMatch = false;

        foreach (Gene gene in geneCollection.collection)
        {
            if (gene.ID.Contains(givenGene) || gene.LocusTag.Contains(givenGene))
            {
                // NOTE(review): this treats StartCoordinate as a 0-based offset and
                // EndCoordinate as exclusive. If the coordinates are stored 1-based/inclusive
                // the slice is shifted by one - confirm against how Gene is populated.
                int start = gene.StartCoordinate;
                int distance = gene.EndCoordinate - gene.StartCoordinate;
                gene.Sequence = originSequence.Substring(start, distance);

                updatedGeneCollection.AddGene(gene);
                isMatch = true;
                break; // Only the first matching gene is taken for each search term.
            }
        }

        // If no match was found: remember the term so it can be reported below.
        if (!isMatch)
        {
            nonMatchedGenes.Add(givenGene);
        }
    }

    // If all or some entries did not match: display the list of those entries.
    if (nonMatchedGenes.Any())
    {
        string message = "Some of the provided entries could not be found. Here is the list of genes; "
            + string.Join(", ", nonMatchedGenes);
        Console.WriteLine("\n" + message);
    }

    updatedGeneCollection.SortOnID();
    return updatedGeneCollection;
}
/// <summary>
/// Reads a GenBank file line by line and collects its gene entries, coding sequence (CDS)
/// entries, origin sequence and a summary of the header information.
/// </summary>
/// <param name="inputFile">Path of the file to read.</param>
/// <returns>
/// A <c>CollectedGeneBankData</c> built from the gene collection, the coding sequence
/// collection and a <c>Summary</c> (file name, organism, accession, length, gene count,
/// forward/reverse balance, CDS count and origin sequence).
/// </returns>
public CollectedGeneBankData ReadGenebankFile(string inputFile)
{
    GeneCollection geneCollection = new GeneCollection();
    CodingSequenceCollection codingSequenceCollection = new CodingSequenceCollection();

    // Strip the directory part of the path to get the file name.
    string fileName = Path.GetFileName(inputFile);
    string organism = "";
    string accession = "";
    string length = "";

    // Accumulated in a builder: the sequence is appended to once per input line, and
    // repeated string concatenation would copy the whole sequence on every line.
    System.Text.StringBuilder originSequence = new System.Text.StringBuilder();

    // Parser state.
    bool isFirst = true;              // True until the first gene/CDS line has been seen.
    bool isOrigin = false;            // True once the ORIGIN line has been passed.
    bool currentEntryIsCDS = false;   // The entry being collected is a CDS entry.
    bool currentEntryIsGene = false;  // The entry being collected is a gene entry.

    // Both patterns accept complement and non-complement locations.
    string genePattern = " *gene *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string cdsPattern = " *CDS *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string currentEntry = "";

    // 'using' guarantees the file handle is released, also when a helper throws.
    using (StreamReader reader = new StreamReader(inputFile))
    {
        string gbkLine;
        while ((gbkLine = reader.ReadLine()) != null)
        {
            // Every line after ORIGIN is treated as nucleotide data: digits and spaces
            // are removed and the rest is appended to the origin sequence.
            // NOTE(review): nothing else is filtered, so a record terminator line such as
            // "//" would end up in the sequence as well - confirm whether that is intended.
            if (isOrigin)
            {
                originSequence.Append(Regex.Replace(gbkLine, "(\\d| )", ""));
            }

            // Header fields are only looked for until the first gene/CDS entry starts.
            if (isFirst)
            {
                if (gbkLine.StartsWith("LOCUS"))
                {
                    length = GetSequenceLength(gbkLine);
                }
                if (gbkLine.Contains(" ORGANISM"))
                {
                    organism = GetOrganism(gbkLine);
                }
                if (gbkLine.Contains("ACCESSION"))
                {
                    accession = GetAccessionId(gbkLine);
                }
            }

            if (currentEntryIsCDS)
            {
                if (Regex.IsMatch(gbkLine, genePattern))
                {
                    // A gene line closes the CDS entry collected so far and opens a gene entry.
                    currentEntryIsGene = true;
                    currentEntryIsCDS = false;
                    CodingSequence codingSequence = CreateCodingSequenceEntry(currentEntry);
                    codingSequenceCollection.AddCodingSequence(codingSequence);
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (currentEntryIsGene)
            {
                if (Regex.IsMatch(gbkLine, cdsPattern))
                {
                    // A CDS line closes the gene entry collected so far and opens a CDS entry.
                    currentEntryIsGene = false;
                    currentEntryIsCDS = true;
                    Gene gene = CreateGeneEntry(currentEntry);
                    geneCollection.AddGene(gene);
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (isFirst)
            {
                // No entry is open yet: the first gene or CDS line starts the first entry.
                if (Regex.IsMatch(gbkLine, genePattern))
                {
                    currentEntryIsGene = true;
                    isFirst = false;
                    currentEntry += gbkLine + "\n";
                }
                else if (Regex.IsMatch(gbkLine, cdsPattern))
                {
                    currentEntryIsCDS = true;
                    isFirst = false;
                    currentEntry += gbkLine + "\n";
                }
            }

            if (gbkLine.StartsWith("ORIGIN"))
            {
                // From the next line on, the isOrigin branch at the top of the loop takes over.
                isOrigin = true;
                string line = gbkLine.Replace("ORIGIN", "");
                originSequence.Append(Regex.Replace(line, "(\\d| )", ""));

                // ORIGIN also ends whichever entry is still open; at this point the ORIGIN
                // line itself has already been appended to currentEntry by the code above.
                if (currentEntryIsCDS)
                {
                    currentEntryIsCDS = false;
                    CodingSequence codingSequence = CreateCodingSequenceEntry(currentEntry);
                    codingSequenceCollection.AddCodingSequence(codingSequence);
                }
                else if (currentEntryIsGene)
                {
                    currentEntryIsGene = false;
                    Gene gene = CreateGeneEntry(currentEntry);
                    geneCollection.AddGene(gene);
                }
            }
        }
    }

    int geneCount = geneCollection.collection.Count;           // Size of gene collection.
    int cdsCount = codingSequenceCollection.collection.Count;  // Size of coding sequence collection.

    // Count all genes and, separately, the genes that are not reverse (i.e. forward).
    double totalGeneCounter = 0.0;
    double forwardGeneCounter = 0.0;
    foreach (Gene geneEntry in geneCollection.collection)
    {
        totalGeneCounter++;
        if (!geneEntry.IsReverse)
        {
            forwardGeneCounter++;
        }
    }

    // Forward/Reverse (FR) balance: share of forward genes, rounded to one decimal.
    // NOTE(review): with zero genes this is 0.0 / 0.0, i.e. NaN - confirm that Summary copes.
    double value = forwardGeneCounter / totalGeneCounter;
    double forwardReverseBalance = Math.Round(value, 1);

    Summary summary = new Summary(fileName, organism, accession, length, geneCount, forwardReverseBalance, cdsCount, originSequence.ToString());
    CollectedGeneBankData geneBankData = new CollectedGeneBankData(geneCollection, codingSequenceCollection, summary);
    return geneBankData;
}