/// <summary>
/// Copy constructor used by the clone method: builds a new link from an existing one.
/// </summary>
/// <param name="other">CrossReferenceLink instance whose Type and Numbers are copied.</param>
private CrossReferenceLink(CrossReferenceLink other)
{
    this.Type = other.Type;

    // A new list is built from the source entries rather than reusing the source's list instance.
    List<string> numbersCopy = new List<string>(other.Numbers);
    this.Numbers = numbersCopy;
}
/// <summary>
/// Helper method to parse the metadata of gen bank data.
/// Walks the rows of <paramref name="cellRange"/> from <paramref name="rowIndex"/> onwards,
/// copying each recognised header row into <paramref name="metadata"/>. Parsing stops at the
/// FEATURES row or at the end of the range. Rows whose key cell is null or blank are skipped,
/// and rows with an unrecognised key are ignored.
/// </summary>
/// <param name="metadata">Metadata object that receives the parsed values.</param>
/// <param name="cellRange">Range of cells; the first dimension is the row.</param>
/// <param name="rowIndex">Index of the row at which to start parsing.</param>
/// <returns>
/// Index of the FEATURES row when one is found; otherwise the row index at which
/// the loop ran off the end of the range.
/// </returns>
/// <exception cref="FormatException">
/// The ACCESSION value is blank, or a non-blank SEGMENT value could not be read as two
/// integers and no segment has been set on the metadata.
/// </exception>
private static int ParseGenBankMetadata(GenBankMetadata metadata, object[,] cellRange, int rowIndex)
{
    while (rowIndex < cellRange.GetLength(0))
    {
        object keyCell = cellRange[rowIndex, KeyColumnIndex];
        if (keyCell == null || string.IsNullOrWhiteSpace(keyCell.ToString()))
        {
            // Rows without a key carry nothing this method handles.
            rowIndex++;
            continue;
        }

        string key = keyCell.ToString().ToUpperInvariant();
        if (key.Equals(FEATURES))
        {
            // The features section is not parsed by this method.
            break;
        }

        object valueCell = cellRange[rowIndex, ValueColumnIndex];
        string value = valueCell != null ? valueCell.ToString() : string.Empty;

        string[] tokens;
        switch (key)
        {
            case LOCUS:
                // The helper returns the row to resume from; step back one row to
                // compensate for the increment at the bottom of the loop.
                rowIndex = ParseLocus(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case DEFINITION:
                metadata.Definition = value;
                break;

            case ACCESSION:
                metadata.Accession = new GenBankAccession();
                if (string.IsNullOrWhiteSpace(value))
                {
                    throw new FormatException(String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        ACCESSION));
                }

                // First entry is the primary accession; the rest are secondary.
                string[] accessions = value.Split(' ');
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case DBLINK:
                // Only a blank value is skipped. (The guard used to be inverted,
                // which discarded every non-blank DBLINK value.)
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                // Expected shape: "<type>:<number>[,<number>...]".
                tokens = value.Split(':');
                if (tokens.Length == 2)
                {
                    if (metadata.DbLinks == null)
                    {
                        metadata.DbLinks = new List<CrossReferenceLink>(2);
                    }

                    var curLink = new CrossReferenceLink();
                    if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        curLink.Type = CrossReferenceType.Project;
                    }
                    else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        curLink.Type = CrossReferenceType.BioProject;
                    }
                    else
                    {
                        // Any other type name is recorded as a Trace Assembly Archive link.
                        curLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    foreach (string number in tokens[1].Split(','))
                    {
                        curLink.Numbers.Add(number);
                    }

                    metadata.DbLinks.Add(curLink);
                }

                break;

            case DBSOURCE:
                metadata.DbSource = value;
                break;

            case VERSION:
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                tokens = value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                // first token contains accession and version
                Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                metadata.Version = new GenBankVersion();
                if (m.Success)
                {
                    metadata.Version.Version = m.Groups["version"].Value;

                    // The first token in the data from the accession line is referred to as
                    // the primary accession number, and should be the one used here in the
                    // version line.
                    metadata.Version.Accession = m.Groups["accession"].Value;
                }

                if (tokens.Length > 1)
                {
                    // second token contains primary ID
                    m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (m.Success)
                    {
                        metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                    }
                }

                break;

            case SEGMENT:
                if (string.IsNullOrWhiteSpace(value))
                {
                    break;
                }

                // Splits on each of the characters ' ', 'o' and 'f' (not on the word " of "),
                // so "1 of 2" yields the tokens "1" and "2".
                tokens = value.Split(new char[] { ' ', 'o', 'f' }, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 2)
                {
                    int current;
                    int count;
                    if (int.TryParse(tokens[0], out current) && int.TryParse(tokens[1], out count))
                    {
                        metadata.Segment = new SequenceSegment();
                        metadata.Segment.Current = current;
                        metadata.Segment.Count = count;
                    }
                }

                if (metadata.Segment == null)
                {
                    // Name the SEGMENT key in the error (it used to name ACCESSION).
                    throw new FormatException(String.Format(
                        CultureInfo.InvariantCulture,
                        Resources.UnrecognizedGenBankMetadataFormat,
                        SEGMENT));
                }

                break;

            case KEYWORDS:
                metadata.Keywords = value;
                break;

            case SOURCE:
                // Same resume-row convention as LOCUS.
                rowIndex = ParseSource(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case REFERENCE:
                // Same resume-row convention as LOCUS.
                rowIndex = ParseReference(metadata, cellRange, rowIndex);
                rowIndex--;
                break;

            case PRIMARY:
                metadata.Primary = value;
                break;

            case COMMENT:
                if (!string.IsNullOrWhiteSpace(value))
                {
                    // Each non-empty line of the cell becomes its own comment entry.
                    tokens = value.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    foreach (string str in tokens)
                    {
                        metadata.Comments.Add(str);
                    }
                }

                break;
        }

        rowIndex++;
    }

    return rowIndex;
}
/// <summary>
/// Parses the GenBank headers from the GenBank file.
/// Parses everything before the features section: each header line is dispatched on its
/// header keyword and the parsed value is stored on the sequence's GenBank metadata.
/// Parsing stops at the first FEATURES, BASE COUNT, ORIGIN or CONTIG line, or when the
/// input is exhausted.
/// </summary>
/// <param name="sequence">The sequence whose GenBank metadata is populated.</param>
/// <param name="noOfSequence">The current sequence index; for index 0 the first line is read from the stream.</param>
/// <param name="line">The line to start parsing from.</param>
/// <param name="stream">The stream reader.</param>
/// <returns>The line at which header parsing stopped (null when the input ended).</returns>
/// <exception cref="InvalidDataException">
/// A second LOCUS line is found, the PRIMARY line does not have exactly four tokens,
/// an unknown header is found, or no LOCUS line was parsed.
/// </exception>
private string ParseHeaders(ref Sequence sequence, int noOfSequence, string line, StreamReader stream)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data;
    string[] tokens;
    string lineData;

    // only allow one locus line
    bool haveParsedLocus = false;

    if (noOfSequence == 0)
    {
        line = string.Empty;
        line = GoToNextLine(line, stream);
    }

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;
    while ((line != null) && !haveFinishedHeaders)
    {
        // Computed once; reused by the switch and by the unknown-header error path.
        string lineHeader = GetLineHeader(line, DataIndent);
        switch (lineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(CultureInfo.CurrentCulture, Properties.Resource.ParserSecondLocus);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                line = ParseLocusByTokens(line, ref sequence, stream);

                // The sequence is passed by ref, so re-read the metadata from it.
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;

                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 0)
                {
                    // A VERSION header without data has nothing to match; warn and move on
                    // rather than indexing into an empty token array.
                    ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + line);
                    line = GoToNextLine(line, stream);
                    break;
                }

                // first token contains accession and version
                Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                metadata.Version = new GenBankVersion();
                if (m.Success)
                {
                    metadata.Version.Version = m.Groups["version"].Value;

                    // The first token in the data from the accession line is referred to as
                    // the primary accession number, and should be the one used here in the
                    // version line.
                    string versionLineAccession = m.Groups["accession"].Value;
                    if (metadata.Accession == null)
                    {
                        ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                    }
                    else if (!versionLineAccession.Equals(metadata.Accession.Primary))
                    {
                        ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                    }
                    else
                    {
                        metadata.Version.Accession = metadata.Accession.Primary;
                    }
                }

                if (tokens.Length > 1)
                {
                    // second token contains primary ID
                    m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (m.Success)
                    {
                        metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                    }
                }

                line = GoToNextLine(line, stream);
                break;

            case "PROJECT":
                // Expected shape: "<name>:<number>[,<number>...]".
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier { Name = tokens[0] };
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + line);
                }

                line = GoToNextLine(line, stream);
                break;

            case "SOURCE":
                line = ParseSource(line, ref sequence, stream);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                line = ParseReferences(line, ref sequence, stream);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                line = ParseComments(line, ref sequence, stream);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object. The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // The first line must contain exactly four whitespace-separated tokens.
                if (tokens.Length != 4)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
                metadata.Primary = primaryData;

                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "ACCESSION":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                metadata.Accession = new GenBankAccession();

                // First entry is the primary accession; the rest are secondary.
                string[] accessions = data.Split(' ');
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                data = ParseMultiLineData(ref line, "\n", DataIndent, stream);
                metadata.DbLinks = new List<CrossReferenceLink>();
                foreach (string link in data.Split('\n'))
                {
                    // Expected shape of each entry: "<type>:<number>[,<number>...]".
                    tokens = link.Split(':');
                    if (tokens.Length == 2)
                    {
                        CrossReferenceLink newLink = new CrossReferenceLink();
                        if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.Project;
                        }
                        else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.BioProject;
                        }
                        else if (string.Compare(tokens[0], TraceAssemblyArchive, StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }
                        else
                        {
                            newLink.Type = CrossReferenceType.None;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            newLink.Numbers.Add(tokens[i]);
                        }

                        metadata.DbLinks.Add(newLink);
                    }
                    else
                    {
                        // Log the offending entry: 'line' was passed by ref to the multi-line
                        // reader above and no longer refers to the DBLINK text.
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + link);
                    }
                }

                break;

            case "DBSOURCE":
                metadata.DbSource = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "SEGMENT":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                const string delimeter = "of";

                // Splits on the individual characters 'o' and 'f', so "1 of 2" yields
                // "1 " and " 2"; the parts are trimmed before parsing.
                tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // Warnings log the parsed data: 'line' was passed by ref to the multi-line
                // reader above and no longer refers to the SEGMENT text.
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    int outvalue;
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                lineData = GetLineData(line, DataIndent);
                ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", lineHeader, lineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    lineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    return line;
}
/// <summary>
/// Parses the GenBank headers from the GenBank file.
/// Parses everything before the features section: each header line is dispatched on its
/// header keyword and the parsed value is stored on the sequence's GenBank metadata.
/// Parsing stops at the first FEATURES, BASE COUNT, ORIGIN or CONTIG line, or when the
/// input is exhausted.
/// </summary>
/// <param name="sequence">The sequence whose GenBank metadata is populated.</param>
/// <param name="noOfSequence">The current sequence index; for index 0 the first line is read from the stream.</param>
/// <param name="line">The line to start parsing from.</param>
/// <param name="stream">The stream reader.</param>
/// <returns>The line at which header parsing stopped (null when the input ended).</returns>
/// <exception cref="InvalidDataException">
/// A second LOCUS line is found, the PRIMARY line does not have exactly four tokens,
/// an unknown header is found, or no LOCUS line was parsed.
/// </exception>
private string ParseHeaders(ref Sequence sequence, int noOfSequence, string line, StreamReader stream)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data;
    string[] tokens;
    string lineData;

    // only allow one locus line
    bool haveParsedLocus = false;

    if (noOfSequence == 0)
    {
        line = string.Empty;
        line = GoToNextLine(line, stream);
    }

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;
    while ((line != null) && !haveFinishedHeaders)
    {
        // Computed once; reused by the switch and by the unknown-header error path.
        string lineHeader = GetLineHeader(line, DataIndent);
        switch (lineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(CultureInfo.CurrentCulture, Properties.Resource.ParserSecondLocus);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                line = ParseLocusByTokens(line, ref sequence, stream);

                // The sequence is passed by ref, so re-read the metadata from it.
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;

                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 0)
                {
                    // A VERSION header without data has nothing to match; warn and move on
                    // rather than indexing into an empty token array.
                    ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + line);
                    line = GoToNextLine(line, stream);
                    break;
                }

                // first token contains accession and version
                Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                metadata.Version = new GenBankVersion();
                if (m.Success)
                {
                    metadata.Version.Version = m.Groups["version"].Value;

                    // The first token in the data from the accession line is referred to as
                    // the primary accession number, and should be the one used here in the
                    // version line.
                    string versionLineAccession = m.Groups["accession"].Value;
                    if (metadata.Accession == null)
                    {
                        ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                    }
                    else if (!versionLineAccession.Equals(metadata.Accession.Primary))
                    {
                        ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                    }
                    else
                    {
                        metadata.Version.Accession = metadata.Accession.Primary;
                    }
                }

                if (tokens.Length > 1)
                {
                    // second token contains primary ID
                    m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (m.Success)
                    {
                        metadata.Version.GiNumber = m.Groups["primaryID"].Value;
                    }
                }

                line = GoToNextLine(line, stream);
                break;

            case "PROJECT":
                // Expected shape: "<name>:<number>[,<number>...]".
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier { Name = tokens[0] };
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + line);
                }

                line = GoToNextLine(line, stream);
                break;

            case "SOURCE":
                line = ParseSource(line, ref sequence, stream);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                line = ParseReferences(line, ref sequence, stream);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                line = ParseComments(line, ref sequence, stream);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object. The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                lineData = GetLineData(line, DataIndent);
                tokens = lineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // The first line must contain exactly four whitespace-separated tokens.
                if (tokens.Length != 4)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserPrimaryLineError,
                        line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(ref line, Environment.NewLine, DataIndent, stream);
                metadata.Primary = primaryData;

                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "ACCESSION":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                metadata.Accession = new GenBankAccession();

                // First entry is the primary accession; the rest are secondary.
                string[] accessions = data.Split(' ');
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                data = ParseMultiLineData(ref line, "\n", DataIndent, stream);
                metadata.DbLinks = new List<CrossReferenceLink>();
                foreach (string link in data.Split('\n'))
                {
                    // Expected shape of each entry: "<type>:<number>[,<number>...]".
                    tokens = link.Split(':');
                    if (tokens.Length == 2)
                    {
                        CrossReferenceLink newLink = new CrossReferenceLink();
                        if (string.Compare(tokens[0], CrossReferenceType.Project.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.Project;
                        }
                        else if (string.Compare(tokens[0], CrossReferenceType.BioProject.ToString(), StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.BioProject;
                        }
                        else if (string.Compare(tokens[0], TraceAssemblyArchive, StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            newLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }
                        else
                        {
                            newLink.Type = CrossReferenceType.None;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            newLink.Numbers.Add(tokens[i]);
                        }

                        metadata.DbLinks.Add(newLink);
                    }
                    else
                    {
                        // Log the offending entry: 'line' was passed by ref to the multi-line
                        // reader above and no longer refers to the DBLINK text.
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + link);
                    }
                }

                break;

            case "DBSOURCE":
                metadata.DbSource = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(ref line, " ", DataIndent, stream);
                break;

            case "SEGMENT":
                data = ParseMultiLineData(ref line, " ", DataIndent, stream);
                const string delimeter = "of";

                // Splits on the individual characters 'o' and 'f', so "1 of 2" yields
                // "1 " and " 2"; the parts are trimmed before parsing.
                tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                // Warnings log the parsed data: 'line' was passed by ref to the multi-line
                // reader above and no longer refers to the SEGMENT text.
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    int outvalue;
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                lineData = GetLineData(line, DataIndent);
                ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", lineHeader, lineData);
                string errMessage = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParseHeaderError,
                    lineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    return line;
}
/// <summary>
/// Copy constructor used by the clone method: builds a new link from an existing one.
/// </summary>
/// <param name="other">CrossReferenceLink instance whose Type and Numbers are copied.</param>
private CrossReferenceLink(CrossReferenceLink other)
{
    this.Type = other.Type;

    // A new list is built from the source entries rather than reusing the source's list instance.
    List<string> numbersCopy = new List<string>(other.Numbers);
    this.Numbers = numbersCopy;
}