/// <summary>
/// Parses one matched sequence-data row and appends its symbols to the sequence
/// whose common name equals the row name, creating that sequence the first time
/// the name is seen.
/// </summary>
/// <param name="bioValidator">Validator given to newly created sequences and used to classify each character.</param>
/// <param name="seqdataline">Regex match that exposes the "Rowname", "Startindex" and "Seqdata" groups.</param>
private void LoadSequenceData(IBioValidator bioValidator, Match seqdataline)
{
    string rowName = seqdataline.Groups["Rowname"].Value;
    Debug.Assert(!string.IsNullOrEmpty(rowName));

    // Row names are identifiers read from the file, so compare them ordinally
    // instead of with the current culture's linguistic rules.
    AE2Sequence sequence = (AE2Sequence)_sequences.FirstOrDefault(
        seq => string.Equals(seq.CommonName, rowName, StringComparison.Ordinal));
    if (sequence == null)
    {
        sequence = new AE2Sequence(bioValidator) { CommonName = rowName, ScientificName = rowName };
        _sequences.Add(sequence);
    }

    // Column at which this row's data begins.
    int startIndex = Int32.Parse(seqdataline.Groups["Startindex"].Value);

    // AE2 omits the blanks, so pad with gaps up to the start column.
    // The pad count is computed once, up front: AlignedData.Count grows with
    // every Add, so re-reading it in the loop condition would stop the padding
    // about halfway and shift every later symbol into the wrong column.
    int missingGaps = startIndex - sequence.AlignedData.Count;
    for (int i = 0; i < missingGaps; i++)
    {
        sequence.AlignedData.Add(BioSymbol.Gap);
    }

    string seqdata = seqdataline.Groups["Seqdata"].Value;
    int index = 0;
    while (index < seqdata.Length)
    {
        IBioSymbol symbol;
        if (seqdata[index] == '\\')
        {
            // A backslash consumes the next three characters, which are handed to
            // ConvertOctal to produce the real character.
            // NOTE(review): a row that ends in a truncated escape indexes past the
            // end of the string here, exactly as before — confirm whether the row
            // regex rules that out.
            symbol = CharacterToBioSymbol(
                bioValidator,
                ConvertOctal(new[] { seqdata[index + 1], seqdata[index + 2], seqdata[index + 3] }));
            index += 4;
        }
        else
        {
            symbol = CharacterToBioSymbol(bioValidator, seqdata[index]);
            index++;
        }

        // Characters that map to no symbol are stored as gaps so columns stay aligned.
        sequence.AlignedData.Add(symbol ?? BioSymbol.Gap);
    }
}
/// <summary>
/// Creates the sequence entry for one record of a FastA file.
/// </summary>
/// <remarks>
/// The header is split on "::". When that yields exactly three parts, the second
/// becomes the scientific name and the third the common name; otherwise the whole
/// header is used as the scientific name. A missing common name falls back to
/// "&lt;Not Available&gt;" and a missing scientific name falls back to the common name.
/// </remarks>
/// <param name="owner">File the sequence belongs to; passed on to the data provider.</param>
/// <param name="loadAllIntoMemory">
/// When true the symbols are read into a list immediately and the data provider is
/// disposed; when false a virtualizing list backed by the provider is used.
/// </param>
/// <param name="header">Header text of the record; expected to be non-empty.</param>
/// <param name="startPos">Start position passed on to the data provider.</param>
/// <param name="length">Length passed on to the data provider and to its range load.</param>
/// <param name="firstNonGap">Value stored as the first data column.</param>
/// <param name="count">Count passed on to the data provider.</param>
/// <param name="validator">Validator stored on the sequence and passed on to the data provider.</param>
internal FastASequence(FastAFile owner, bool loadAllIntoMemory, string header, long startPos, int length, int firstNonGap, int count, IBioValidator validator)
{
    Debug.Assert(header.Length > 0);

    string[] headerParts = header.Split(new[] { "::" }, StringSplitOptions.None);
    if (headerParts.Length != 3)
    {
        ScientificName = header;
    }
    else
    {
        ScientificName = headerParts[1];
        CommonName = headerParts[2];
    }

    Validator = validator;
    FirstDataColumn = firstNonGap;

    // Fill in whichever names the header did not supply.
    if (string.IsNullOrEmpty(CommonName))
    {
        CommonName = "<Not Available>";
    }

    if (string.IsNullOrEmpty(ScientificName))
    {
        ScientificName = CommonName;
    }

    _dp = new FastAFileSequenceDataProvider(owner, startPos, length, count, validator);

    if (!loadAllIntoMemory)
    {
        // Keep the provider alive: the virtualizing list reads through it.
        _alignedList = new VirtualizingList<IBioSymbol>(_dp, Math.Min(1024, length), 60);
        return;
    }

    // Everything is copied up front, so the provider is no longer needed.
    _alignedList = new List<IBioSymbol>(_dp.LoadRange(0, length));
    _dp.Dispose();
    _dp = null;
}
/// <summary>
/// Clears the sequence list, reads the file line by line, feeds every sequence-data
/// row to <c>LoadSequenceData</c>, and then computes each sequence's first data column.
/// </summary>
/// <param name="bioValidator">Validator passed through to <c>LoadSequenceData</c>.</param>
/// <returns>The number of sequences in the list after loading.</returns>
private int LoadSequences(IBioValidator bioValidator)
{
    _sequences.Clear();

    using (var reader = File.OpenText(Filename))
    {
        // ReadLine returns null only at end of file. An empty string is just a
        // blank line, so it is skipped instead of ending the load early and
        // silently dropping the rest of the file.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                continue;
            }

            // Header metadata rows are recognised but not loaded in this release.
            if (Regex.IsMatch(line, HeaderRegex))
            {
                continue;
            }

            // Lines with these prefixes are never treated as sequence data.
            if (line.StartsWith("#-", StringComparison.Ordinal) || line.StartsWith("#:", StringComparison.Ordinal))
            {
                continue;
            }

            // Match once and reuse the result rather than running the pattern twice.
            Match row = Regex.Match(line, SeqdataRowRegex);
            if (row.Success)
            {
                LoadSequenceData(bioValidator, row);
            }
        }
    }

    // Calculate the first data column in each found sequence.
    // NOTE(review): FirstOrDefault yields 0 when a sequence holds no nucleotide
    // symbol at all, which is indistinguishable from "column 0 is a nucleotide" —
    // confirm that is what consumers of FirstDataColumn expect.
    Parallel.ForEach(_sequences.Cast<AE2Sequence>(), seq =>
    {
        seq.FirstDataColumn = Enumerable.Range(0, seq.AlignedData.Count).FirstOrDefault(
            i => seq.AlignedData[i].Type == BioSymbolType.Nucleotide);
    });

    return _sequences.Count;
}
/// <summary>
/// Maps a single sequence character to a symbol.
/// </summary>
/// <param name="bioValidator">Validator consulted for any character other than '~', '|' and '-'.</param>
/// <param name="ch">Character to convert.</param>
/// <returns>
/// <c>BioSymbol.None</c> for '~' or '|'; <c>BioSymbol.Gap</c> for '-'; a new nucleotide
/// symbol when the validator accepts the character; otherwise <c>null</c>.
/// </returns>
private static IBioSymbol CharacterToBioSymbol(IBioValidator bioValidator, char ch)
{
    if (ch == '~' || ch == '|')
    {
        return BioSymbol.None;
    }

    if (ch == '-')
    {
        return BioSymbol.Gap;
    }

    if (bioValidator.IsValid(ch))
    {
        return new BioSymbol(BioSymbolType.Nucleotide, ch);
    }

    // Not a marker and not accepted by the validator: the caller decides what to do.
    return null;
}
/// <summary>
/// Memory-maps the file, scans it byte by byte, and adds one <c>FastASequence</c>
/// for every header marker found. Afterwards every sequence is forced to the
/// longest length seen.
/// </summary>
/// <param name="bioValidator">
/// Validator used to find the first valid symbol of each sequence; also passed to
/// every created sequence.
/// </param>
/// <returns>The number of sequences in the list after loading.</returns>
private int LoadSequences(IBioValidator bioValidator)
{
    var headerBuffer = new byte[512];
    string currHeader = string.Empty;
    long startPos = -1;
    int totalSequenceCount = 0, startingNonGap = -1;
    bool loadIntoMemory = _loadAllIntoMemory || (new FileInfo(Filename).Length < SmallFileSize);

    MmFile = MemoryMappedFile.CreateFromFile(Filename, FileMode.Open);
    using (var fs = MmFile.CreateViewStream(0, 0, MemoryMappedFileAccess.Read))
    {
        // A zero byte or end of stream ends the scan.
        // NOTE(review): zero is presumably the padding a mapped view can carry
        // past the real end of the file — confirm.
        int db;
        while ((db = fs.ReadByte()) > 0)
        {
            if ((char)db == '>') // Header for species?
            {
                // Just finished a sequence?
                if (currHeader != string.Empty)
                {
                    if (MaxSequenceLength < totalSequenceCount)
                    {
                        MaxSequenceLength = totalSequenceCount;
                    }

                    // No valid symbol seen: report the position just past the data.
                    if (startingNonGap == -1)
                    {
                        startingNonGap = totalSequenceCount;
                    }

                    // NOTE(review): fs.Position is already past the marker byte just
                    // read, so this length includes that byte — left as is; confirm
                    // the data provider tolerates it.
                    _sequences.Add(
                        new FastASequence(this, loadIntoMemory, currHeader, startPos,
                                          (int)(fs.Position - startPos), startingNonGap,
                                          totalSequenceCount, bioValidator));
                }

                // Start new sequence
                totalSequenceCount = 0;
                startingNonGap = -1;

                // Read the header text. Line terminators seen before any header byte
                // are skipped, so the header is the first non-empty run of bytes.
                int headerLength = 0;
                while (true)
                {
                    db = fs.ReadByte();
                    if (db <= 0)
                    {
                        // End of data: stop even when the header is still empty.
                        // Waiting for a non-empty header here would spin forever.
                        break;
                    }

                    if (db == 0x0a || db == 0x0d)
                    {
                        if (headerLength > 0)
                        {
                            break;
                        }

                        continue;
                    }

                    // Grow the buffer so a header longer than the initial size
                    // cannot overrun it.
                    if (headerLength == headerBuffer.Length)
                    {
                        Array.Resize(ref headerBuffer, headerBuffer.Length * 2);
                    }

                    headerBuffer[headerLength++] = (byte)db;
                }

                currHeader = Encoding.ASCII.GetString(headerBuffer, 0, headerLength);

                // This is the start of the nucleotide chain
                startPos = fs.Position;
            }
            else if (db != 0x0a && db != 0x0d)
            {
                if (startingNonGap == -1 && bioValidator.IsValid((char)db))
                {
                    startingNonGap = totalSequenceCount;
                }

                totalSequenceCount++;
            }
        }

        // Handle the final sequence in the file.
        Debug.Assert(currHeader != string.Empty);
        Debug.Assert(totalSequenceCount > 0);
        Debug.Assert(startPos > 0);

        if (MaxSequenceLength < totalSequenceCount)
        {
            MaxSequenceLength = totalSequenceCount;
        }

        if (startingNonGap == -1)
        {
            startingNonGap = totalSequenceCount;
        }

        // The terminating read only advanced Position when it returned a zero
        // byte; at true end of stream (-1) Position is unchanged. Subtracting one
        // unconditionally would cut off the last data byte in that case.
        long dataEnd = (db == 0) ? fs.Position - 1 : fs.Position;

        _sequences.Add(
            new FastASequence(this, loadIntoMemory, currHeader, startPos,
                              (int)(dataEnd - startPos), startingNonGap,
                              totalSequenceCount, bioValidator));
    }

    // Force all sequences to have the same "virtual" length
    _sequences.ForEach(ns => ((FastASequence)ns).ForceLength(MaxSequenceLength));

    // Once everything has been copied into memory the mapping is no longer needed.
    if (loadIntoMemory)
    {
        MmFile.Dispose();
        MmFile = null;
    }

    return _sequences.Count;
}
/// <summary>
/// Creates a data provider that reads through a read-only view stream opened on
/// the owner's memory-mapped file.
/// </summary>
/// <param name="owner">File whose <c>MmFile</c> mapping supplies the view stream.</param>
/// <param name="startPos">Offset passed to <c>CreateViewStream</c> as the start of the view.</param>
/// <param name="length">Size passed to <c>CreateViewStream</c> for the view.</param>
/// <param name="count">Value stored in <c>_count</c>.</param>
/// <param name="bv">Validator stored in <c>_validator</c>.</param>
public FastAFileSequenceDataProvider(FastAFile owner, long startPos, int length, int count, IBioValidator bv)
{
    // The view is opened for reading only.
    _fs = owner.MmFile.CreateViewStream(startPos, length, MemoryMappedFileAccess.Read);
    _count = count;
    _validator = bv;
}
/// <summary>
/// Creates a sequence with an empty symbol list and the given validator.
/// </summary>
/// <param name="validator">Validator assigned to the <c>Validator</c> property.</param>
public AE2Sequence(IBioValidator validator)
{
    // Start with an empty, growable list of symbols.
    _alignedList = new List <IBioSymbol>();
    Validator = validator;
}