/// <summary>
/// Looks up the requested genes through <c>GbkFeatureFetcher.FetchGeneData</c> and writes each one
/// to the console as a "&gt;" header line followed by its sequence wrapped at 80 characters.
/// </summary>
/// <param name="geneCollection">Genes to search in.</param>
/// <param name="geneArray">Requested gene identifiers, passed on to FetchGeneData.</param>
/// <param name="originSequence">Sequence passed on to FetchGeneData, which fills in each gene's Sequence.</param>
internal void FetchGeneDisplay(GeneCollection geneCollection, string[] geneArray, string originSequence)
{
    const int lineWidth = 80;
    GeneCollection matchedGenes = new GbkFeatureFetcher().FetchGeneData(geneCollection, geneArray, originSequence);

    foreach (Gene gene in matchedGenes.collection)
    {
        // The header carries the gene ID, or the locus tag when the ID is empty.
        string label = gene.ID != "" ? gene.ID : gene.LocusTag;
        string record = ">" + label + newLine;

        string bases = gene.Sequence;
        int total = bases.Length;
        int offset = 0;
        while (offset < total)
        {
            int next = offset + lineWidth;
            if (next < total)
            {
                record += bases.Substring(offset, lineWidth) + newLine;
            }
            else
            {
                // Last chunk: whatever remains, without a trailing line break of its own.
                record += bases.Substring(offset, total - offset);
            }
            offset = next;
        }

        Console.WriteLine(record);
    }

    Console.WriteLine(newLine);
}
/// <summary>
/// Prints the gene and CDS features returned by <c>GbkFeatureFetcher.FetchSiteFeatures</c> as
/// semicolon-separated rows under a FEATURE;TYPE;START;STOP;ORIENTATION header.
/// </summary>
/// <param name="geneCollection">Genes passed on to FetchSiteFeatures.</param>
/// <param name="cdsCollection">Coding sequences passed on to FetchSiteFeatures.</param>
/// <param name="featureArray">Requested coordinate entries, passed on to FetchSiteFeatures.</param>
public void FetchFeatures(GeneCollection geneCollection, CodingSequenceCollection cdsCollection, string[] featureArray)
{
    // Run the lookup before writing the header: FetchSiteFeatures may write its own report to the
    // console, and that report must not end up between the header and the rows.
    GbkFeatureFetcher fetcher = new GbkFeatureFetcher();
    SiteFeatureCollection siteFeatureCollection = fetcher.FetchSiteFeatures(geneCollection, cdsCollection, featureArray);

    // WriteLine (not Write) so the header is always on a line of its own, whether or not the
    // fetcher printed anything.
    Console.WriteLine("FEATURE;TYPE;START;STOP;ORIENTATION");
    foreach (SiteFeature siteFeature in siteFeatureCollection.collection)
    {
        Console.WriteLine(siteFeature.GeneID + ";" + siteFeature.Type + ";" + siteFeature.StartCoordinate + ";" + siteFeature.EndCoordinate + ";" + siteFeature.Orientation);
    }
    Console.WriteLine(newLine);
}
/// <summary>
/// Collects, for every requested name, the first gene whose ID or locus tag contains that name,
/// fills in the gene's Sequence from <paramref name="originSequence"/> and returns the hits sorted
/// with SortOnID. Names without a hit are listed in a console message.
/// </summary>
/// <param name="geneCollection">Genes to search in; matched genes get their Sequence assigned.</param>
/// <param name="geneArray">Requested names; matched by substring against ID and LocusTag.</param>
/// <param name="originSequence">Sequence the gene sequences are cut from.</param>
/// <returns>A new GeneCollection holding the matched genes.</returns>
public GeneCollection FetchGeneData(GeneCollection geneCollection, string[] geneArray, string originSequence)
{
    GeneCollection matches = new GeneCollection();
    List<string> missing = new List<string>();

    foreach (String requested in geneArray)
    {
        // Only the first gene that matches is taken.
        Gene hit = null;
        foreach (Gene candidate in geneCollection.collection)
        {
            if (!candidate.ID.Contains(requested) && !candidate.LocusTag.Contains(requested))
            {
                continue;
            }
            hit = candidate;
            break;
        }

        if (hit == null)
        {
            missing.Add(requested);
            continue;
        }

        // NOTE(review): the slice starts at StartCoordinate and is End - Start long; whether that
        // covers the whole gene depends on how the coordinates were stored - confirm.
        int span = hit.EndCoordinate - hit.StartCoordinate;
        hit.Sequence = originSequence.Substring(hit.StartCoordinate, span);
        matches.AddGene(hit);
    }

    // Report the names that matched no gene, if there are any.
    if (missing.Any())
    {
        string report = "Some of the provided enrties could not be found. Here is the list of genes; "
                        + string.Join(", ", missing);
        Console.WriteLine("\n" + report);
    }

    matches.SortOnID();
    return matches;
}
/// <summary>
/// Prints every hit returned by <c>GbkFeatureFetcher.FetchSearchSiteData</c> as a
/// semicolon-separated row under a POSITION;SEQUENCE;GENE header.
/// </summary>
/// <param name="geneCollection">Genes passed on to FetchSearchSiteData.</param>
/// <param name="originSequence">Sequence passed on to FetchSearchSiteData.</param>
/// <param name="siteArray">Requested nucleotide sites, passed on to FetchSearchSiteData.</param>
public void FetchSites(GeneCollection geneCollection, string originSequence, string[] siteArray)
{
    // Run the search before writing the header: FetchSearchSiteData only writes to the console
    // when some sites were not matched, so the header cannot depend on it for its line break.
    GbkFeatureFetcher fetcher = new GbkFeatureFetcher();
    List<SearchSiteCollection> list = fetcher.FetchSearchSiteData(geneCollection, originSequence, siteArray);

    // WriteLine (not Write) keeps the first row from being appended to the header line.
    Console.WriteLine("POSITION;SEQUENCE;GENE");
    foreach (SearchSiteCollection searchSiteCollection in list)
    {
        foreach (SearchSite searchSite in searchSiteCollection.collection)
        {
            Console.WriteLine(searchSite.StartPosition + ";" + searchSite.Site + ";" + searchSite.GeneName);
        }
    }
    Console.WriteLine(newLine);
}
/// <summary>
/// Searches the upper-cased origin sequence for every requested site and returns one
/// SearchSiteCollection per entry of <paramref name="siteArray"/>, in the same order.
/// Sites that were not matched are listed in a console message.
/// </summary>
/// <remarks>
/// Each character of a site that is a key of the table from CreateIupacTable is replaced, one
/// key at a time, by every character of the table value; the resulting strings are the sequences
/// searched for. NOTE(review): a site containing two different table keys is never fully
/// expanded (each target still contains the other key), and a site without any table key yields
/// no targets at all - confirm against the contents of CreateIupacTable.
/// </remarks>
/// <param name="geneCollection">Genes used to label each hit.</param>
/// <param name="originSequence">Sequence to search; upper-cased before searching. Sites are used as given.</param>
/// <param name="siteArray">Requested nucleotide sites.</param>
/// <returns>One collection of hits per requested site.</returns>
public List<SearchSiteCollection> FetchSearchSiteData(GeneCollection geneCollection, string originSequence, string[] siteArray)
{
    Dictionary<string, string> iupacTable = CreateIupacTable();
    List<SearchSiteCollection> finalSiteList = new List<SearchSiteCollection>();
    List<string> nonMatchedSites = new List<string>();
    string upperOriginSequence = originSequence.ToUpper();

    foreach (string site in siteArray)
    {
        SearchSiteCollection searchSiteCollection = new SearchSiteCollection();
        List<string> targetSequences = new List<string>();
        bool isMatch = false;

        foreach (char nucleotideSite in site)
        {
            string nucSite = nucleotideSite.ToString();
            string newNucleotides;
            if (!iupacTable.TryGetValue(nucSite, out newNucleotides))
            {
                continue;
            }

            foreach (char newNuc in newNucleotides)
            {
                string targetSequence = site.Replace(nucSite, newNuc.ToString());
                // A code that occurs twice in the site produces the same targets twice; keeping
                // each target once stops every hit from being reported as a duplicate row.
                if (!targetSequences.Contains(targetSequence))
                {
                    targetSequences.Add(targetSequence);
                }
            }
            searchSiteCollection.searchSiteRegexMessage = "site search: " + nucSite + " (regex: [" + newNucleotides + "])";
        }

        foreach (string targetSequence in targetSequences)
        {
            // Start at index 0 so a hit on the very first base is found, and step one base past
            // each hit so overlapping occurrences are still reported. Ordinal comparison: the
            // data is plain nucleotide letters, no culture rules apply.
            int index = upperOriginSequence.IndexOf(targetSequence, StringComparison.Ordinal);
            while (index >= 0)
            {
                // Reported position is index + 1: string indices start at 0, sequence positions at 1.
                string strIndex = (index + 1).ToString();

                string geneName = "INTERGENIC";
                foreach (Gene gene in geneCollection.collection)
                {
                    // NOTE(review): the 0-based index is compared with the gene coordinates while
                    // the 1-based position is reported - confirm which convention the coordinates use.
                    if (index >= gene.StartCoordinate && index <= gene.EndCoordinate)
                    {
                        geneName = gene.ID != "" ? gene.ID : gene.LocusTag;
                        // NOTE(review): only hits inside a gene mark the site as matched, so a
                        // site with intergenic hits only is still listed as not found - confirm.
                        isMatch = true;
                    }
                }

                // Targets are distinct and all as long as the site, so a position can only be
                // reached once per site; no per-position duplicate check is needed.
                searchSiteCollection.AddSearchSite(new SearchSite(strIndex, targetSequence, geneName));

                index = upperOriginSequence.IndexOf(targetSequence, index + 1, StringComparison.Ordinal);
            }
        }

        finalSiteList.Add(searchSiteCollection);
        if (!isMatch)
        {
            nonMatchedSites.Add(site);
        }
    }

    // Report the sites that were not matched, if there are any.
    if (nonMatchedSites.Any())
    {
        string message = "Some of the provided entries could not be found. Here is the list of nucleotide sites; "
                         + string.Join(", ", nonMatchedSites);
        Console.WriteLine("\n" + message);
    }

    return finalSiteList;
}
/// <summary>
/// Builds the gene and CDS features that lie completely inside each requested coordinate range.
/// </summary>
/// <remarks>
/// An entry must have the form "start..end" with both bounds written as digits. For every gene
/// whose start and end fall inside the range a "gene" feature is added, followed by a "CDS"
/// feature that reuses the gene's coordinates and orientation and takes its name from the coding
/// sequence at the same list index. NOTE(review): pairing genes and coding sequences by index
/// assumes both lists line up - confirm; genes beyond the end of the CDS list get no CDS row.
/// Entries that are malformed or match no gene are listed in a console message.
/// </remarks>
/// <param name="geneCollection">Genes to test against each range.</param>
/// <param name="cdsCollection">Coding sequences, read by the same index as the genes.</param>
/// <param name="featureCoordinateArray">Requested ranges, each written as "start..end".</param>
/// <returns>The collected SiteFeature objects after Sort() has been called on the collection.</returns>
public SiteFeatureCollection FetchSiteFeatures(GeneCollection geneCollection, CodingSequenceCollection cdsCollection, string[] featureCoordinateArray)
{
    SiteFeatureCollection siteFeatureCollection = new SiteFeatureCollection();
    List<string> nonMatchedFeatures = new List<string>();

    foreach (string coordinate in featureCoordinateArray)
    {
        // Anchored, with at least one digit on each side, so entries such as ".." or "abc..def"
        // are rejected here instead of throwing while being parsed. TryParse also covers
        // numbers too large for an int.
        Match range = Regex.Match(coordinate, "^(\\d+)\\.\\.(\\d+)$");
        int startCoordinate;
        int endCoordinate;
        if (!range.Success
            || !Int32.TryParse(range.Groups[1].Value, out startCoordinate)
            || !Int32.TryParse(range.Groups[2].Value, out endCoordinate))
        {
            nonMatchedFeatures.Add(coordinate);
            continue;
        }

        bool isMatch = false;
        for (int i = 0; i < geneCollection.collection.Count; i++)
        {
            Gene gene = geneCollection.collection[i];
            if (gene.StartCoordinate < startCoordinate || gene.EndCoordinate > endCoordinate)
            {
                continue;
            }

            string geneId = gene.ID != "" ? gene.ID : gene.LocusTag;
            string orientation = gene.IsReverse ? "R" : "F";
            siteFeatureCollection.AddSiteFeature(
                new SiteFeature(geneId, "gene", gene.StartCoordinate, gene.EndCoordinate, orientation));

            // The CDS list may be shorter than the gene list; only read it when the index exists.
            if (i < cdsCollection.collection.Count)
            {
                CodingSequence cds = cdsCollection.collection[i];
                siteFeatureCollection.AddSiteFeature(
                    new SiteFeature(cds.GeneProduct, "CDS", gene.StartCoordinate, gene.EndCoordinate, orientation));
            }

            isMatch = true;
        }

        if (!isMatch)
        {
            nonMatchedFeatures.Add(coordinate);
        }
    }

    // List the entries that produced nothing. The leading line break is written in every case,
    // also when the list is empty, so that a partial line already on the console is ended
    // before any rows are printed.
    string message = "";
    if (nonMatchedFeatures.Any())
    {
        message = "Some of the provided enrties could not be found. Here is the list of sites; "
                  + string.Join(", ", nonMatchedFeatures);
    }
    Console.WriteLine("\n" + message);

    siteFeatureCollection.Sort();
    return siteFeatureCollection;
}
/// <summary>
/// Bundles the genes, coding sequences and summary of one parsed file.
/// </summary>
/// <param name="collectedGenes">Stored in <c>geneCollection</c>.</param>
/// <param name="cdsCollection">Stored in <c>codingSequenceCollection</c>.</param>
/// <param name="sum">Stored in <c>summary</c>.</param>
public CollectedGeneBankData(GeneCollection collectedGenes, CodingSequenceCollection cdsCollection, Summary sum)
{
    this.summary = sum;
    this.codingSequenceCollection = cdsCollection;
    this.geneCollection = collectedGenes;
}
/// <summary>
/// Reads a GenBank flat file line by line and collects its gene entries, CDS entries, origin
/// sequence and a summary.
/// </summary>
/// <remarks>
/// Lines are gathered into the current entry until a line matching the opposite pattern starts
/// the next one: a gene line closes a CDS entry and a CDS line closes a gene entry. The last
/// pending entry is closed at the ORIGIN line, or at the end of the file when there is no ORIGIN
/// line. Every line after ORIGIN, with digits and spaces removed, is appended to the origin
/// sequence until a line starting with "//" is reached; reading stops there, so nothing after
/// that line is processed.
/// The forward balance in the summary is forward genes divided by all genes, rounded to one
/// decimal; it is NaN when no gene was read (0.0 / 0.0).
/// </remarks>
/// <param name="inputFile">Path of the file to read.</param>
/// <returns>The collected genes, coding sequences and summary.</returns>
public CollectedGeneBankData ReadGenebankFile(string inputFile)
{
    GeneCollection geneCollection = new GeneCollection();
    CodingSequenceCollection codingSequenceCollection = new CodingSequenceCollection();
    string fileName = Path.GetFileName(inputFile);
    string organism = "";
    string accession = "";
    string length = "";
    // Built with a StringBuilder: the origin grows by one chunk per sequence line.
    System.Text.StringBuilder originBuilder = new System.Text.StringBuilder();

    // Parser state.
    bool isFirst = true;             // true until the first gene or CDS line has been seen
    bool isOrigin = false;           // true once the ORIGIN line has been passed
    bool currentEntryIsCDS = false;
    bool currentEntryIsGene = false;

    // Both patterns accept plain and complement(...) locations.
    string genePattern = " *gene *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string cdsPattern = " *CDS *(complement)?\\(?\\d*\\.\\.\\d*\\)?";
    string currentEntry = "";

    // using: the reader, and with it the file handle, is released even when parsing throws.
    using (StreamReader reader = new StreamReader(inputFile))
    {
        string gbkLine;
        while ((gbkLine = reader.ReadLine()) != null)
        {
            if (isOrigin)
            {
                // "//" ends the sequence block; it must not become part of the sequence.
                if (gbkLine.StartsWith("//", StringComparison.Ordinal))
                {
                    break;
                }
                originBuilder.Append(Regex.Replace(gbkLine, "(\\d| )", ""));
            }

            // Header fields are only looked for until the first gene or CDS line.
            if (isFirst)
            {
                if (gbkLine.StartsWith("LOCUS", StringComparison.Ordinal))
                {
                    length = GetSequenceLength(gbkLine);
                }
                if (gbkLine.Contains(" ORGANISM"))
                {
                    organism = GetOrganism(gbkLine);
                }
                if (gbkLine.Contains("ACCESSION"))
                {
                    accession = GetAccessionId(gbkLine);
                }
            }

            if (currentEntryIsCDS)
            {
                if (Regex.IsMatch(gbkLine, genePattern))
                {
                    // A gene line closes the CDS entry and opens a gene entry.
                    currentEntryIsGene = true;
                    currentEntryIsCDS = false;
                    codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (currentEntryIsGene)
            {
                if (Regex.IsMatch(gbkLine, cdsPattern))
                {
                    // A CDS line closes the gene entry and opens a CDS entry.
                    currentEntryIsGene = false;
                    currentEntryIsCDS = true;
                    geneCollection.AddGene(CreateGeneEntry(currentEntry));
                    currentEntry = gbkLine + "\n";
                }
                else
                {
                    currentEntry += gbkLine + "\n";
                }
            }
            else if (isFirst)
            {
                // No entry open yet: the first gene or CDS line opens one.
                if (Regex.IsMatch(gbkLine, genePattern))
                {
                    currentEntryIsGene = true;
                    isFirst = false;
                    currentEntry += gbkLine + "\n";
                }
                else if (Regex.IsMatch(gbkLine, cdsPattern))
                {
                    currentEntryIsCDS = true;
                    isFirst = false;
                    currentEntry += gbkLine + "\n";
                }
            }

            if (gbkLine.StartsWith("ORIGIN", StringComparison.Ordinal))
            {
                // From the next line on, every line is sequence data.
                isOrigin = true;
                string line = gbkLine.Replace("ORIGIN", "");
                originBuilder.Append(Regex.Replace(line, "(\\d| )", ""));

                // ORIGIN also ends the feature section: close the pending entry.
                if (currentEntryIsCDS)
                {
                    currentEntryIsCDS = false;
                    codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
                }
                else if (currentEntryIsGene)
                {
                    currentEntryIsGene = false;
                    geneCollection.AddGene(CreateGeneEntry(currentEntry));
                }
            }
        }
    }

    // A file without an ORIGIN line never closes its last entry inside the loop; do it here so
    // the final gene or CDS is not lost.
    if (currentEntryIsCDS)
    {
        codingSequenceCollection.AddCodingSequence(CreateCodingSequenceEntry(currentEntry));
    }
    else if (currentEntryIsGene)
    {
        geneCollection.AddGene(CreateGeneEntry(currentEntry));
    }

    string originSequence = originBuilder.ToString();
    int geneCount = geneCollection.collection.Count;
    int cdsCount = codingSequenceCollection.collection.Count;

    // Share of genes on the forward strand.
    double forwardGeneCounter = 0.0;
    foreach (Gene geneEntry in geneCollection.collection)
    {
        if (!geneEntry.IsReverse)
        {
            forwardGeneCounter++;
        }
    }
    double totalGeneCounter = geneCount;
    double forwardReverseBalance = Math.Round(forwardGeneCounter / totalGeneCounter, 1);

    Summary summary = new Summary(fileName, organism, accession, length, geneCount, forwardReverseBalance, cdsCount, originSequence);
    return new CollectedGeneBankData(geneCollection, codingSequenceCollection, summary);
}
/// <summary>
/// Entry point: parses the command line, reads the given GenBank file and runs every fetch
/// that was requested through the options (summary, genes, CDS, features, sites).
/// </summary>
/// <param name="args">Command-line arguments as passed to the program.</param>
static void Main(string[] args)
{
    var options = new Options();
    if (args.Length == 0)
    {
        Console.WriteLine("No arguments we're provided.");
        // NOTE(review): if GetHelp returns the help text instead of printing it, the result is discarded here - confirm.
        options.GetHelp();
        Console.WriteLine("Press any key to close the console.");
        Console.ReadKey();
        return;
    }

    Parser parser = new Parser();
    FileReader reader = new FileReader();
    GeneBankDisplayFetcher displayFetcher = new GeneBankDisplayFetcher();
    try
    {
        if (!parser.ParseArguments(args, options))
        {
            Console.WriteLine("The commandline parser could not find any arguments. Please use the --help function for options.");
            Console.WriteLine("Press any key to close the console.");
            // Diagnostic output: parse result, parsed input file and the first arguments received.
            Console.WriteLine(parser.ParseArguments(args, options));
            Console.WriteLine(options.InputFile);
            // At most the first two arguments; safe when only one argument was given.
            Console.WriteLine(string.Join(" ", args, 0, Math.Min(2, args.Length)));
            Console.ReadKey();
            return;
        }

        string gbkFile = options.InputFile;

        // The file must exist AND carry one of the two extensions. The extension test is grouped
        // explicitly so a missing ".gbk" file is rejected here, and the null check keeps a
        // missing input option from throwing.
        bool hasGenBankExtension = gbkFile != null
            && (gbkFile.EndsWith(".gb", StringComparison.Ordinal) || gbkFile.EndsWith(".gbk", StringComparison.Ordinal));
        if (!hasGenBankExtension || !File.Exists(gbkFile))
        {
            Console.WriteLine("Given file " + gbkFile + " seems to not exist on your computer. Please check your input.");
            Console.WriteLine("Press any key to close the console.");
            Console.ReadKey();
            return;
        }

        CollectedGeneBankData geneBankData = reader.ReadGenebankFile(gbkFile);
        GeneCollection geneCollection = geneBankData.geneCollection;
        Summary summary = geneBankData.summary;
        CodingSequenceCollection cdsCollection = geneBankData.codingSequenceCollection;

        if (options.Summary)
        {
            displayFetcher.FetchSummary(summary);
        }

        string[] geneArray = options.FetchGenes;
        if (geneArray != null && geneArray.Length != 0)
        {
            Console.WriteLine("Fetching gene sequences...");
            displayFetcher.FetchGeneDisplay(geneCollection, geneArray, summary.OriginSequence);
        }

        string[] cdsArray = options.FetchCDS;
        if (cdsArray != null && cdsArray.Length != 0)
        {
            Console.WriteLine("Fetching cds product sequences...");
            displayFetcher.FetchCDSs(cdsCollection, cdsArray);
        }

        string[] featureArray = options.FetchFeatures;
        if (featureArray != null && featureArray.Length != 0)
        {
            Console.WriteLine("Fetching gene location features...");
            displayFetcher.FetchFeatures(geneCollection, cdsCollection, featureArray);
        }

        string[] siteArray = options.FetchSites;
        if (siteArray != null && siteArray.Length != 0)
        {
            Console.WriteLine("Fetching given nucleotide sites...");
            displayFetcher.FetchSites(geneCollection, summary.OriginSequence, siteArray);
        }

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
    catch (Exception e)
    {
        // Last-resort handler: report the failure and keep the window open.
        Console.WriteLine("Encounted an exception while parsing." + e + "\nPlease check your cmd arguments and try again, or use the --help function.");
        Console.WriteLine("Press any key to close the console.");
        Console.ReadKey();
    }
}