Ejemplo n.º 1
0
        /// <summary>
        /// Reads the aligned sequence that the given sequence pointer refers to from the input file.
        /// </summary>
        /// <param name="pointer">
        /// Sequence pointer whose first index offset is the file position to read from.
        /// </param>
        /// <returns>
        /// The aligned sequence parsed from the line read at that position, or null when the
        /// pointer's first offset is not smaller than its second offset.
        /// </returns>
        /// <exception cref="ArgumentNullException">pointer is null.</exception>
        /// <exception cref="NotSupportedException">No input file name is available.</exception>
        public IAlignedSequence ParseAlignedSequence(SequencePointer pointer)
        {
            if (pointer == null)
            {
                throw new ArgumentNullException("pointer");
            }

            if (string.IsNullOrEmpty(_fileName))
            {
                throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
            }

            // An empty or inverted offset range means there is nothing to read.
            bool hasContent = pointer.IndexOffsets[0] < pointer.IndexOffsets[1];
            if (!hasContent)
            {
                return null;
            }

            // Open a reader when none exists yet or the current one can no longer be read.
            bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
            if (!readerUsable)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            _mbfStreamReader.Seek(pointer.IndexOffsets[0], SeekOrigin.Begin);
            string line = _mbfStreamReader.ReadLine();

            return ParseSequence(line, _isReadOnly);
        }
Ejemplo n.º 2
0
        public void ValidateFVQSPProperties()
        {
            // Assigns each settable provider property, reads it back, and checks the
            // provider's Count; the parser is disposed even when an assertion fails.
            FastQParser fastQParser = new FastQParser();
            IVirtualSequenceParser parserObj = fastQParser;

            try
            {
                FileVirtualQualitativeSequenceProvider provider =
                    new FileVirtualQualitativeSequenceProvider(parserObj, GetSequencePointer())
                    {
                        BlockSize = 5,
                        IsReadOnly = false,
                        MaxNumberOfBlocks = 10,
                        SequencePointerInstance = GetSequencePointer()
                    };

                SequencePointer expectedPointer = GetSequencePointer();

                Assert.AreEqual(5, provider.BlockSize);
                Assert.AreEqual(10, provider.MaxNumberOfBlocks);
                Assert.IsFalse(provider.IsReadOnly);
                Assert.AreEqual(expectedPointer.AlphabetName, provider.SequencePointerInstance.AlphabetName);
                Assert.AreEqual(26, provider.Count);

                const string successMessage = @"FVQSP Bvt : Successfully validated all the properties";
                ApplicationLog.WriteLine(successMessage);
                Console.WriteLine(successMessage);
            }
            finally
            {
                fastQParser.Dispose();
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads a run of raw symbol bytes from the input file, starting at an index
        /// relative to the beginning of the sequence described by the sequence pointer.
        /// </summary>
        /// <param name="startIndex">Zero-based index within the sequence at which reading begins.</param>
        /// <param name="count">Number of symbols requested; must be positive.</param>
        /// <param name="seqPointer">Pointer whose first index offset marks the start of the sequence in the file.</param>
        /// <returns>The bytes returned by the stream reader for the requested file range.</returns>
        /// <exception cref="NotSupportedException">No input file name is available.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (string.IsNullOrEmpty(_fileName))
            {
                throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
            }

            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            if (0 > startIndex)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (0 >= count)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Open a reader when none exists yet or the current one can no longer be read.
            bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
            if (!readerUsable)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            // Translate the sequence-relative index into a file position.
            long filePosition = seqPointer.IndexOffsets[0] + startIndex;

            return _mbfStreamReader.ReadBytes(filePosition, count);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Walks the reader line by line and writes one sequence pointer per line to the
        /// sidecar file. Does nothing unless data virtualization is enabled and the
        /// sidecar file exists.
        /// </summary>
        /// <remarks>
        /// The walk stops at the end of input or at the first line starting with "@".
        /// Any exception raised during the walk is swallowed after the sidecar provider
        /// has been asked to clean up.
        /// </remarks>
        /// <param name="mbfReader">A reader for the sequence alignment text.</param>
        private void ParseSequences(MBFStreamReader mbfReader)
        {
            // Nothing to record unless DV is enabled and the sidecar file is present.
            if (!IsDataVirtualizationEnabled || !_sidecarFileProvider.SidecarFileExists)
            {
                return;
            }

            try
            {
                while (mbfReader.HasLines && !mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
                {
                    SequencePointer linePointer = new SequencePointer();
                    linePointer.AlphabetName = Alphabets.DNA.Name;

                    // Start offset is where the current line begins; end offset is start plus line length.
                    linePointer.IndexOffsets[0] = mbfReader.CurrentLineStartingIndex;
                    linePointer.IndexOffsets[1] = mbfReader.CurrentLineStartingIndex + mbfReader.Line.Length;

                    // Persist each pointer immediately rather than collecting them first.
                    _sidecarFileProvider.WritePointer(linePointer);

                    mbfReader.GoToNextLine();
                    _lineCount++;
                }

                _sidecarFileProvider.Close();
            }
            catch (Exception)
            {
                _sidecarFileProvider.Cleanup();
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds a DNA sequence pointer with no ID and the given starting line.
        /// </summary>
        /// <param name="startLine">Value assigned to the pointer's StartingLine.</param>
        /// <returns>The newly created sequence pointer.</returns>
        static SequencePointer GetSequencePointer(int startLine)
        {
            return new SequencePointer
            {
                AlphabetName = "DNA",
                StartingLine = startLine,
                Id = null
            };
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a DNA sequence pointer with no ID for use with the BAM parser.
        /// </summary>
        /// <param name="LineNumber">Value assigned to the pointer's StartingLine.</param>
        /// <param name="startIndex">Value stored as the pointer's first index offset.</param>
        /// <param name="endIndex">Value stored as the pointer's second index offset.</param>
        /// <returns>The newly created sequence pointer.</returns>
        private static SequencePointer GetBAMSequencePointer(int LineNumber,
                                                             int startIndex, int endIndex)
        {
            SequencePointer bamPointer = new SequencePointer { AlphabetName = "DNA" };

            bamPointer.IndexOffsets[0] = startIndex;
            bamPointer.IndexOffsets[1] = endIndex;
            bamPointer.Id = null;
            bamPointer.StartingLine = LineNumber;

            return bamPointer;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Builds the fixed sequence pointer used by these tests: a DNA pointer with a
        /// hard-coded ID, index offsets 104 and 1405, and starting line 1.
        /// </summary>
        /// <returns>The newly created sequence pointer.</returns>
        private static SequencePointer GetSequencePointer()
        {
            const string sequenceId =
                "gi|186972394|gb|EU490707.1| Selenipedium aequinoctiale maturase K (matK) gene, partial cds; chloroplast";

            SequencePointer fixedPointer = new SequencePointer
            {
                AlphabetName = "DNA",
                Id = sequenceId
            };

            fixedPointer.IndexOffsets[0] = 104;
            fixedPointer.IndexOffsets[1] = 1405;
            fixedPointer.StartingLine = 1;

            return fixedPointer;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds the fixed sequence pointer used by these tests: a DNA pointer with a
        /// hard-coded ID, starting line 1, and index offsets spanning 26 positions from 40.
        /// </summary>
        /// <returns>The newly created sequence pointer.</returns>
        private static SequencePointer GetSequencePointer()
        {
            const int sequenceStart = 40;
            const int sequenceLength = 26;

            SequencePointer fixedPointer = new SequencePointer
            {
                AlphabetName = "DNA",
                Id = "SRR002012.1 Oct4:5:1:871:340 length=26"
            };

            fixedPointer.IndexOffsets[0] = sequenceStart;
            fixedPointer.IndexOffsets[1] = sequenceStart + sequenceLength;
            fixedPointer.StartingLine = 1;

            return fixedPointer;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Parses a range of sequence items starting from the specified index in the sequence.
        /// </summary>
        /// <param name="startIndex">The zero-based index at which to begin parsing.</param>
        /// <param name="count">
        /// The number of symbols to parse. When the requested range runs past the end of the
        /// sequence, it is reduced to the number of symbols remaining after startIndex.
        /// </param>
        /// <param name="seqPointer">The sequence pointer of that sequence.</param>
        /// <returns>
        /// A read-only sequence holding the parsed symbols, or null when startIndex lies at
        /// or beyond the end of the sequence described by seqPointer.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (0 > startIndex)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (0 >= count)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            // if the start index exceeds the sequence boundary
            if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
            {
                return(null);
            }

            IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
            Sequence  sequence = new Sequence(alphabet)
            {
                // Must be writable while the parsed symbols are inserted below.
                IsReadOnly = false
            };

            if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            long filePosition = (long)startIndex + seqPointer.IndexOffsets[0];

            int sequenceLength = (int)(seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0]);

            // Compare in 64-bit arithmetic: count + startIndex can exceed int.MaxValue, and a
            // wrapped (negative) 32-bit sum would skip the clamp and request a read past the
            // end of the sequence.
            if ((long)count + startIndex >= sequenceLength)
            {
                count = sequenceLength - startIndex;
            }

            char[] buffer = _mbfStreamReader.ReadChars(filePosition, count);
            sequence.InsertRange(0, new string(buffer));

            // default for partial load
            sequence.IsReadOnly = true;

            return(sequence);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Measures and prints the parse time and CPU utilization for the first, middle
        /// and last blocks of a sequence.
        /// </summary>
        /// <param name="parserObj">Fasta parser used to parse each block.</param>
        /// <param name="pointerObj">Sequence pointer passed to the parser for each block.</param>
        /// <param name="seq">Sequence whose Count determines the middle and last block positions.</param>
        /// <param name="seqCountToRead">Number of symbols per block, as an integer string.</param>
        private void GetBlockPerfNumber(FastaParser parserObj,
                                        SequencePointer pointerObj,
                                        ISequence seq, string seqCountToRead)
        {
            // Parse the block size once, outside the timed regions, so string parsing
            // does not count towards the measured ParseRange time.
            int blockSize = Int32.Parse(seqCountToRead);

            ReportBlockPerf("FirstBlock", parserObj, 0, blockSize, pointerObj);
            ReportBlockPerf("MiddleBlock", parserObj, seq.Count / 2, blockSize, pointerObj);
            ReportBlockPerf("LastBlock", parserObj, seq.Count - blockSize, blockSize, pointerObj);
        }

        /// <summary>
        /// Times a single ParseRange call and prints its elapsed time and the current
        /// CPU counter value, both prefixed with the given block name.
        /// </summary>
        /// <param name="blockName">Label that prefixes both output lines.</param>
        /// <param name="parserObj">Fasta parser used to parse the block.</param>
        /// <param name="startIndex">Zero-based index at which the block starts.</param>
        /// <param name="count">Number of symbols in the block.</param>
        /// <param name="pointerObj">Sequence pointer passed to the parser.</param>
        private void ReportBlockPerf(string blockName, FastaParser parserObj,
                                     int startIndex, int count,
                                     SequencePointer pointerObj)
        {
            _watchObj.Reset();
            _watchObj.Start();

            // Only the elapsed time matters here; the parsed block itself is discarded.
            parserObj.ParseRange(startIndex, count, pointerObj);

            _watchObj.Stop();

            Console.WriteLine(string.Format("{0} Perf Time : {1} Secs",
                                            blockName,
                                            TimeSpan.FromMilliseconds(
                                                _watchObj.ElapsedMilliseconds).TotalSeconds.ToString()));
            Console.WriteLine(string.Format("{0} CPU Utilization : {1}",
                                            blockName,
                                            _cpuCounterObj.NextValue().ToString()));
        }
Ejemplo n.º 11
0
        public void ValidateFVQSPVirtualSeqParserSeqPointerConstructor()
        {
            // Checks that a provider obtained from GetVirtualSequenceProvider reports 26
            // items and carries the same sequence ID as the reference sequence pointer.
            FileVirtualQualitativeSequenceProvider provider = GetVirtualSequenceProvider();
            SequencePointer expectedPointer = GetSequencePointer();

            Assert.AreEqual(26, provider.Count);
            Assert.AreEqual(expectedPointer.Id, provider.SequencePointerInstance.Id);

            // The same message goes to both the application log and the console.
            string successMessage = @"FVQSP Bvt : Successfully validated the constructor 
                FileVirtualQualitativeSequenceProvider(IVirtualSequenceParser, SequencePointer)";
            ApplicationLog.WriteLine(successMessage);
            Console.WriteLine(successMessage);
        }
Ejemplo n.º 12
0
        public void PerformObjectModelPerf()
        {
            // Reads the perf-test settings from the XML configuration, parses the input
            // file with data virtualization enforced, then reports block-level and
            // whole-sequence perf numbers for the first parsed sequence.
            string inputFilePath = Utility._xmlUtil.GetTextValue(
                Constants.ObjectModelNodeName, Constants.FilePathNode);
            string seqCountToRead = Utility._xmlUtil.GetTextValue(
                Constants.ObjectModelNodeName, Constants.SequenceRangeToRead);

            Assert.IsNotNullOrEmpty(inputFilePath);

            FastaParser fastaParser = new FastaParser { EnforceDataVirtualization = true };
            IList <ISequence> parsedSequences = fastaParser.Parse(inputFilePath, false);

            // Build the sequence pointer entirely from configured values.
            SequencePointer configuredPointer = new SequencePointer
            {
                AlphabetName = Utility._xmlUtil.GetTextValue(
                    Constants.ObjectModelNodeName, Constants.AlphabetNode),
                Id = Utility._xmlUtil.GetTextValue(
                    Constants.ObjectModelNodeName, Constants.SequenceIDNode)
            };

            configuredPointer.IndexOffsets[0] = int.Parse(
                Utility._xmlUtil.GetTextValue(
                    Constants.ObjectModelNodeName, Constants.StartIndexNode));
            configuredPointer.IndexOffsets[1] = int.Parse(
                Utility._xmlUtil.GetTextValue(
                    Constants.ObjectModelNodeName, Constants.EndIndexNode));
            configuredPointer.StartingLine = int.Parse(
                Utility._xmlUtil.GetTextValue(
                    Constants.ObjectModelNodeName, Constants.StartLineNode));

            GetBlockPerfNumber(fastaParser, configuredPointer, parsedSequences[0],
                               seqCountToRead);
            GetSequencePerfNumber(parsedSequences[0]);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Parses a range of sequence items starting from the specified index in the sequence.
        /// </summary>
        /// <param name="startIndex">The zero-based index at which to begin parsing.</param>
        /// <param name="count">The number of symbols to parse.</param>
        /// <param name="seqPointer">The sequence pointer of that sequence.</param>
        /// <returns>
        /// A read-only sequence holding the parsed symbols, or null when startIndex lies at
        /// or beyond the ending index recorded in seqPointer.
        /// </returns>
        /// <exception cref="NotSupportedException">No input file name is available.</exception>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (string.IsNullOrEmpty(_fileName))
            {
                throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
            }

            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (count <= 0)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Fail with a descriptive exception instead of a NullReferenceException
            // raised from inside the alphabet lookup below.
            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
            Sequence  sequence = new Sequence(alphabet)
            {
                // Must be writable while the parsed symbols are inserted below.
                IsReadOnly = false
            };

            // Use 64-bit arithmetic so a large starting index is neither truncated
            // nor overflowed before the boundary comparison.
            long start = (long)seqPointer.StartingIndex + startIndex;

            if (start >= seqPointer.EndingIndex)
            {
                return(null);
            }

            // One newline sequence per starting line is added to the read offset below.
            int includesNewline = seqPointer.StartingLine * Environment.NewLine.Length;
            int len             = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

            using (BioTextReader bioReader = new BioTextReader(_fileName))
            {
                string str = bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + includesNewline, count, len);
                sequence.InsertRange(0, str);
            }

            // default for partial load
            sequence.IsReadOnly = true;

            return(sequence);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Reads the sequence ID for the given sequence pointer from the input file and
        /// stores it on the pointer.
        /// </summary>
        /// <param name="pointer">
        /// Sequence pointer whose first index offset and starting line determine the file
        /// position to read from; its Id is overwritten with the value read.
        /// </param>
        /// <returns>The sequence ID that was read and assigned to the pointer.</returns>
        /// <exception cref="ArgumentNullException">pointer is null.</exception>
        public string GetSequenceID(SequencePointer pointer)
        {
            if (pointer == null)
            {
                throw new ArgumentNullException("pointer");
            }

            // Open a reader when none exists yet or the current one can no longer be read.
            bool readerUsable = _mbfStreamReader != null && _mbfStreamReader.CanRead;
            if (!readerUsable)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            // Step back from the sequence start by the starting-line value, then load
            // the line found at that position.
            var seekPosition = pointer.IndexOffsets[0] - pointer.StartingLine;
            _mbfStreamReader.Seek(seekPosition, SeekOrigin.Begin);
            _mbfStreamReader.ReadLine();

            // The ID is taken from field 2 of the line just read.
            pointer.Id = _mbfStreamReader.GetLineField(2);

            return pointer.Id;
        }
Ejemplo n.º 15
0
        public void ValidateBAMParseAlignedSeqWithSeqPointer()
        {
            // Parses the configured file with data virtualization enforced, then parses a
            // single aligned sequence through a sequence pointer and compares its query
            // sequence with the expected value from the XML configuration.
            string expectedSequence = _utilityObj._xmlUtil.GetTextValue(
                Constants.BAMToSAMConversionNode, Constants.ExpectedSeqWithPointersNode);
            string samFilePath = _utilityObj._xmlUtil.GetTextValue(
                Constants.BAMToSAMConversionNode, Constants.FilePathNode);
            string startingLineForPointer = _utilityObj._xmlUtil.GetTextValue(
                Constants.BAMToSAMConversionNode, Constants.LineNumberToPointNode);
            string startIndex = _utilityObj._xmlUtil.GetTextValue(
                Constants.BAMToSAMConversionNode, Constants.StartIndexNode);
            string endIndex = _utilityObj._xmlUtil.GetTextValue(
                Constants.BAMToSAMConversionNode, Constants.EndIndexNode);

            using (BAMParser bamParser = new BAMParser())
            {
                bamParser.EnforceDataVirtualization = true;

                SequenceAlignmentMap alignmentMap = bamParser.Parse(samFilePath);
                Assert.IsNotNull(alignmentMap);

                // Convert the configured pointer coordinates before building the pointer.
                int pointerLine = Int32.Parse(startingLineForPointer, (IFormatProvider)null);
                int pointerStart = Int32.Parse(startIndex, (IFormatProvider)null);
                int pointerEnd = Int32.Parse(endIndex, (IFormatProvider)null);

                SequencePointer pointerObj =
                    GetBAMSequencePointer(pointerLine, pointerStart, pointerEnd);

                // Parse one aligned sequence through the pointer.
                SAMAlignedSequence alignedSeq =
                    (SAMAlignedSequence)bamParser.ParseAlignedSequence(pointerObj);

                Assert.AreEqual(expectedSequence,
                                alignedSeq.QuerySequence.ToString());

                string consoleMessage = string.Format(
                    (IFormatProvider)null,
                    "BAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
                    alignedSeq.Sequences[0].ToString());
                Console.WriteLine(consoleMessage);

                string logMessage = string.Format(
                    (IFormatProvider)null,
                    "BAM Parser BVT : Sequence alignment aligned seq validate successfully");
                ApplicationLog.WriteLine(logMessage);
            }
        }
Ejemplo n.º 16
0
        public void ValidateSequencePointerProperties()
        {
            // Assigns every SequencePointer property used here and checks that each one
            // reads back the value that was assigned.
            const string alphabetName = "Dna";
            const string pointerId = "PointerID";
            const int startOffset = 1;
            const int endOffset = 10;
            const int startingLine = 1;

            SequencePointer pointerObj = new SequencePointer
            {
                AlphabetName = alphabetName,
                Id = pointerId
            };

            pointerObj.IndexOffsets[0] = startOffset;
            pointerObj.IndexOffsets[1] = endOffset;
            pointerObj.StartingLine = startingLine;

            Assert.AreEqual(alphabetName, pointerObj.AlphabetName);
            Assert.AreEqual(pointerId, pointerObj.Id);
            Assert.AreEqual(startOffset, pointerObj.IndexOffsets[0]);
            Assert.AreEqual(endOffset, pointerObj.IndexOffsets[1]);
            Assert.AreEqual(startingLine, pointerObj.StartingLine);

            const string successMessage =
                "Sequence Pointer Bvt : Successfully validated all the properties";
            ApplicationLog.WriteLine(successMessage);
            Console.WriteLine(successMessage);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Parses a range of symbols starting from the specified index in the sequence.
        /// </summary>
        /// <param name="startIndex">The zero-based index at which to begin parsing.</param>
        /// <param name="count">
        /// The number of symbols to parse. When the requested range runs past the end of the
        /// sequence, it is reduced to the number of symbols remaining after startIndex.
        /// </param>
        /// <param name="seqPointer">The sequence pointer of that sequence.</param>
        /// <returns>
        /// The bytes read for the requested range, or null when startIndex lies at or
        /// beyond the end of the sequence described by seqPointer.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        public byte[] ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (0 > startIndex)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (0 >= count)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            // if the start index exceeds the sequence boundary
            if ((long)startIndex + seqPointer.IndexOffsets[0] >= seqPointer.IndexOffsets[1])
            {
                return(null);
            }

            if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            long filePosition = (long)startIndex + seqPointer.IndexOffsets[0];

            int sequenceLength = (int)(seqPointer.IndexOffsets[1] - seqPointer.IndexOffsets[0]);

            // Compare in 64-bit arithmetic: count + startIndex can exceed int.MaxValue, and a
            // wrapped (negative) 32-bit sum would skip the clamp and request a read past the
            // end of the sequence.
            if ((long)count + startIndex >= sequenceLength)
            {
                count = sequenceLength - startIndex;
            }

            return(_mbfStreamReader.ReadBytes(filePosition, count));
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Parses a range of sequence items starting from the specified index in the sequence.
        /// </summary>
        /// <param name="startIndex">The zero-based index at which to begin parsing.</param>
        /// <param name="count">The number of symbols to parse.</param>
        /// <param name="seqPointer">The sequence pointer of that sequence.</param>
        /// <returns>
        /// A read-only sequence holding the parsed symbols, or null when startIndex lies at
        /// or beyond the ending index recorded in seqPointer.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (0 > startIndex)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (0 >= count)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Fail with a descriptive exception instead of a NullReferenceException
            // raised from inside the alphabet lookup below.
            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
            Sequence  sequence = new Sequence(alphabet);

            // Must be writable while the parsed symbols are inserted below.
            sequence.IsReadOnly = false;

            // Use 64-bit arithmetic so a large starting index is neither truncated
            // nor overflowed before the boundary comparison.
            long start = (long)seqPointer.StartingIndex + startIndex;

            if (start >= seqPointer.EndingIndex)
            {
                return(null);
            }

            // One newline sequence per starting line is added to the read offset below.
            int includesNewline = seqPointer.StartingLine * Environment.NewLine.Length;
            int len             = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

            using (BioTextReader bioReader = new BioTextReader(_fileName))
            {
                string sequenceString = bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + includesNewline, count, len);
                sequence.InsertRange(0, sequenceString);
            }

            // default for partial load
            sequence.IsReadOnly = true;

            return(sequence);
        }
Ejemplo n.º 19
0
        public void ValidateSAMParseAlignedSeqWithSeqPointer()
        {
            // Parses the configured SAM file with data virtualization enforced, then
            // parses a single aligned sequence through a sequence pointer and compares
            // its query sequence with the expected value from the XML configuration.
            string expectedSequence = Utility._xmlUtil.GetTextValue(
                Constants.SAMFileWithAllFieldsNode, Constants.ExpectedSeqWithPointersNode);
            string samFilePath = Utility._xmlUtil.GetTextValue(
                Constants.SAMFileWithAllFieldsNode, Constants.FilePathNode);
            string lineNumberForPointer = Utility._xmlUtil.GetTextValue(
                Constants.SAMFileWithAllFieldsNode, Constants.LineNumberToPointNode);

            SAMParser samParser = new SAMParser { EnforceDataVirtualization = true };

            SequenceAlignmentMap alignmentMap = samParser.Parse(samFilePath);
            Assert.IsNotNull(alignmentMap);

            // Build the pointer from the configured line number and fixed file offsets.
            int pointerLine = Int32.Parse(lineNumberForPointer);
            SequencePointer pointerObj = GetSequencePointer(pointerLine);
            pointerObj.IndexOffsets[0] = 156;
            pointerObj.IndexOffsets[1] = 304;

            // Parse one aligned sequence through the pointer.
            SAMAlignedSequence alignedSeq =
                (SAMAlignedSequence)samParser.ParseAlignedSequence(pointerObj);

            Assert.AreEqual(expectedSequence,
                            alignedSeq.QuerySequence.ToString());

            string consoleMessage = string.Format(null,
                                                  "SAM Parser BVT : Sequence alignment aligned seq {0} validate successfully",
                                                  alignedSeq.Sequences[0].ToString());
            Console.WriteLine(consoleMessage);

            string logMessage = string.Format(null,
                                              "SAM Parser BVT : Sequence alignment aligned seq validate successfully");
            ApplicationLog.WriteLine(logMessage);
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Parses a range of sequence items starting from the specified index in the sequence.
        /// </summary>
        /// <param name="startIndex">The zero-based index at which to begin parsing.</param>
        /// <param name="count">The number of symbols to parse.</param>
        /// <param name="seqPointer">The sequence pointer of the specified sequence.</param>
        /// <returns>A read-only sequence holding the parsed symbols.</returns>
        /// <exception cref="NotSupportedException">No input file name is available.</exception>
        /// <exception cref="ArgumentOutOfRangeException">startIndex is negative or count is not positive.</exception>
        /// <exception cref="ArgumentNullException">seqPointer is null.</exception>
        public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
        {
            if (string.IsNullOrEmpty(_fileName))
            {
                throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
            }

            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (count <= 0)
            {
                throw new ArgumentOutOfRangeException("count");
            }

            // Fail with a descriptive exception instead of a NullReferenceException
            // raised from inside the alphabet lookup below.
            if (seqPointer == null)
            {
                throw new ArgumentNullException("seqPointer");
            }

            IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
            Sequence  sequence = new Sequence(alphabet)
            {
                // Must be writable while the parsed symbols are inserted below.
                IsReadOnly = false
            };

            if (_mbfStreamReader == null || !_mbfStreamReader.CanRead)
            {
                _mbfStreamReader = new MBFStreamReader(_fileName);
            }

            // Translate the sequence-relative index into a file position (64-bit arithmetic).
            long fileIndex = (long)startIndex + seqPointer.IndexOffsets[0];

            char[] buffer = _mbfStreamReader.ReadChars(fileIndex, count);
            sequence.InsertRange(0, new string(buffer));

            // default for partial load
            sequence.IsReadOnly = true;

            return(sequence);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Reads the sequence ID for the given sequence pointer by scanning the input file
        /// backwards from the sequence start until an '@' byte is found, then stores the
        /// rest of that line on the pointer.
        /// </summary>
        /// <param name="pointer">
        /// Sequence pointer whose first index offset and starting line determine where the
        /// backward scan begins; its Id is overwritten with the value read.
        /// </param>
        /// <returns>The text following the '@' byte up to the end of that line.</returns>
        /// <exception cref="ArgumentNullException">pointer is null.</exception>
        public string GetSequenceID(SequencePointer pointer)
        {
            if (pointer == null)
            {
                throw new ArgumentNullException("pointer");
            }

            using (StreamReader sourceReader = new StreamReader(_fileName))
            {
                // One newline sequence per starting line is added to the stored offset.
                int newlineBytes = pointer.StartingLine * Environment.NewLine.Length;
                Stream source = sourceReader.BaseStream;

                source.Seek(pointer.IndexOffsets[0] + newlineBytes, SeekOrigin.Begin);

                // Each pass moves back two bytes and reads one, so the scan retreats one
                // byte at a time until the byte just read is '@'.
                do
                {
                    source.Seek(-2, SeekOrigin.Current);
                }
                while (source.ReadByte() != '@');

                // The stream is now positioned just after '@'; the rest of the line is the ID.
                pointer.Id = sourceReader.ReadLine();

                return pointer.Id;
            }
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Parses a single FASTQ record from a reader into a QualitativeSequence.
        /// A record is read as four consecutive lines: an '@' header line, the sequence
        /// line, a '+' header line and the quality score line. After a successful parse
        /// the reader has been advanced past the quality score line.
        /// </summary>
        /// <param name="mbfReader">
        /// A reader for a biological sequence text, positioned on the '@' header line.
        /// </param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting QualitativeSequence should be in readonly mode or not.
        /// If this flag is set to true then the resulting QualitativeSequence's IsReadOnly property
        /// will be set to true, otherwise it will be set to false.
        /// </param>
        /// <returns>A new QualitativeSequence instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException">Thrown when mbfReader is null.</exception>
        /// <exception cref="FileFormatException">
        /// Thrown when any of the four lines is missing or malformed, when the sequence and
        /// quality score lines differ in length, or when no alphabet can be identified.
        /// </exception>
        private IQualitativeSequence ParseOneWithFastQFormat(MBFStreamReader mbfReader, bool isReadOnly)
        {
            // Fail fast with a descriptive exception, as the sibling MBFStreamReader-based
            // parsers do, instead of a NullReferenceException on first use.
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            SequencePointer sequencePointer = new SequencePointer();

            // Line 1: must exist and start with '@'.
            if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("@", StringComparison.Ordinal))
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Process header line.
            // NOTE(review): GetLineField(2) presumably returns the text that follows the
            // leading marker character - confirm against MBFStreamReader.
            string id = mbfReader.GetLineField(2).Trim();

            // Save the sequence starting index (reader position taken while on the header line).
            sequencePointer.IndexOffsets[0] = mbfReader.Position;

            // Line 2: the sequence itself; must be present and non-empty.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidSequenceLine, id);
            }

            string sequenceLine = mbfReader.Line;

            // Save the sequence ending index: start offset plus the length of the sequence line.
            sequencePointer.IndexOffsets[1] = sequencePointer.IndexOffsets[0] + sequenceLine.Length;

            // Line 3: must exist and start with '+'.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || !mbfReader.Line.StartsWith("+", StringComparison.Ordinal))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
            }

            // The '+' line may repeat the record id; when it does, it has to match the '@' line.
            string qualScoreId = mbfReader.GetLineField(2).Trim();
            if (!string.IsNullOrEmpty(qualScoreId) && !string.Equals(id, qualScoreId, StringComparison.Ordinal))
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
            }

            // Line 4: the quality scores; must be present and non-empty.
            mbfReader.GoToNextLine();
            if (!mbfReader.HasLines || string.IsNullOrEmpty(mbfReader.Line))
            {
                throw CreateFastQFormatException(Resource.FastQ_EmptyQualityScoreLine, id);
            }

            // Get the quality scores from the fourth line.
            string qualityLine = mbfReader.Line;
            byte[] qualScores  = ASCIIEncoding.ASCII.GetBytes(qualityLine);

            // One quality score is expected per sequence symbol.
            if (sequenceLine.Length != qualityLine.Length)
            {
                throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoresLength, id);
            }

            // Move past this record so the reader is left on whatever follows it.
            mbfReader.GoToNextLine();

            IAlphabet alphabet = Alphabet;

            // Identify alphabet from the sequence text if it is not specified on the parser.
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, sequenceLine);

                if (alphabet == null)
                {
                    throw CreateFastQFormatException(Resource.InvalidSymbolInString, sequenceLine);
                }
            }

            FastQFormatType fastQType = FastqType;

            // Identify fastq format type if AutoDetectFastQFormat property is set to true.
            if (AutoDetectFastQFormat)
            {
                fastQType = IdentifyFastQFormatType(qualScores);
            }

            QualitativeSequence sequence;

            if (Encoding == null)
            {
                sequence = new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores);
            }
            else
            {
                sequence = new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);
            }

            sequence.ID         = id;
            sequence.IsReadOnly = isReadOnly;

            // Complete the pointer and register it with the parser's pointer list.
            sequencePointer.AlphabetName = sequence.Alphabet.Name;
            sequencePointer.Id           = sequence.ID;
            _sequencePointers.Add(sequencePointer);

            // Attach a virtual data provider configured with the parser's block settings.
            FileVirtualQualitativeSequenceProvider dataProvider = new FileVirtualQualitativeSequenceProvider(this, sequencePointer)
            {
                BlockSize         = _blockSize,
                MaxNumberOfBlocks = _maxNumberOfBlocks
            };

            sequence.VirtualQualitativeSequenceProvider = dataProvider;
            return(sequence);
        }

        /// <summary>
        /// Builds the parser's wrapped format-error message (detail text embedded in
        /// Resource.IOFormatErrorMessage together with the parser Name), reports it
        /// through Trace and returns the exception for the caller to throw.
        /// </summary>
        /// <param name="detailFormat">Composite format string describing the specific problem.</param>
        /// <param name="argument">Value substituted into <paramref name="detailFormat"/>.</param>
        /// <returns>A FileFormatException carrying the reported message.</returns>
        private FileFormatException CreateFastQFormatException(string detailFormat, string argument)
        {
            string detail  = string.Format(CultureInfo.CurrentCulture, detailFormat, argument);
            string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(message);
            return new FileFormatException(message);
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Reads one FASTA record - a '>' header line followed by its data lines -
        /// from the supplied text reader and turns it into a sequence.
        /// </summary>
        /// <param name="bioReader">Text reader currently positioned on the record's header line.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence once
        /// parsing has finished.
        /// </param>
        /// <returns>The Sequence instance built for this record.</returns>
        protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            // The record has to open with a '>' header.
            if (!bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                string headerError = string.Format(CultureInfo.InvariantCulture,
                                                   Resource.INVAILD_INPUT_FILE,
                                                   Resource.FASTA_NAME);
                Trace.Report(headerError);
                throw new FileFormatException(headerError);
            }

            string recordId = bioReader.GetLineField(2).Trim();

            // A pointer is only created when the block size exceeds the full-load default;
            // the header line is then counted in the running line/length totals.
            SequencePointer pointer = null;
            if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
            {
                _lineCount++;
                _lineLength += bioReader.Line.Length;

                pointer = new SequencePointer();
                pointer.StartingLine = _lineCount;
            }

            bioReader.GoToNextLine();

            // Use the configured alphabet, or detect one from the first data line.
            IAlphabet detected = Alphabet;
            if (detected == null)
            {
                detected = _commonSequenceParser.IdentifyAlphabet(detected, bioReader.Line);
                if (detected == null)
                {
                    string firstLineError = string.Format(CultureInfo.InvariantCulture,
                                                          Resource.InvalidSymbolInString,
                                                          bioReader.Line);
                    Trace.Report(firstLineError);
                    throw new FileFormatException(firstLineError);
                }
            }

            Sequence result;
            if (Encoding != null)
            {
                result = new Sequence(detected, Encoding, string.Empty);
                result.IsReadOnly = false;
            }
            else
            {
                result = new Sequence(detected);
            }

            result.ID = recordId;

            // Set once the offset of this record's first data line has been captured.
            bool startRecorded = false;

            // Walk the data lines until the next header or the end of the input.
            for (;
                 bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase);
                 bioReader.GoToNextLine())
            {
                if (Alphabet == null)
                {
                    // No alphabet configured: re-identify with every line and rebuild the
                    // sequence when the identified alphabet differs from the current one.
                    detected = _commonSequenceParser.IdentifyAlphabet(result.Alphabet, bioReader.Line);
                    if (detected == null)
                    {
                        string lineError = string.Format(CultureInfo.InvariantCulture,
                                                         Resource.InvalidSymbolInString,
                                                         bioReader.Line);
                        Trace.Report(lineError);
                        throw new FileFormatException(lineError);
                    }

                    if (result.Alphabet != detected)
                    {
                        Sequence rebuilt = new Sequence(detected, Encoding, result);
                        rebuilt.IsReadOnly = false;

                        result.Clear();
                        result = rebuilt;
                    }
                }

                if (_blockSize > 0)
                {
                    // Virtualized load: only track where the data starts and how long it is.
                    if (!startRecorded)
                    {
                        _sequenceBeginsAt = _lineLength;
                        startRecorded     = true;
                    }

                    _lineLength += bioReader.Line.Length;
                    _lineCount++;
                }
                else
                {
                    // Full load: append the line's symbols to the sequence.
                    result.InsertRange(result.Count, bioReader.Line);
                }
            }

            if (result.MoleculeType == MoleculeType.Invalid)
            {
                result.MoleculeType = CommonSequenceParser.GetMoleculeType(result.Alphabet);
            }

            result.IsReadOnly = isReadOnly;

            // Full load: nothing more to attach.
            if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
            {
                return result;
            }

            if (pointer != null)
            {
                pointer.AlphabetName  = result.Alphabet.Name;
                pointer.Id            = result.ID;
                pointer.StartingIndex = _sequenceBeginsAt;
                pointer.EndingIndex   = _lineLength;
                _sequencePointers.Add(pointer);
            }

            _sequenceCount++;

            // Hand the sequence a virtual provider configured with the parser's block settings.
            FileVirtualSequenceProvider provider = new FileVirtualSequenceProvider(this, pointer);
            provider.BlockSize         = _blockSize;
            provider.MaxNumberOfBlocks = _maxNumberOfBlocks;

            result.VirtualSequenceProvider = provider;
            return result;
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Parses a single FASTA sequence from a file using MBFStreamReader.
        /// This method is only used in data virtualization scenarios.
        /// The method scans the record's lines to work out its offsets (and, when a sidecar
        /// file provider is present, its block indices); it does not insert any symbols into
        /// the returned sequence. The sequence is returned with a FileVirtualSequenceProvider
        /// attached.
        /// </summary>
        /// <param name="mbfReader">The MBFStreamReader of the file to be parsed.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence should be in read-only mode.
        /// If this flag is set to true then the resulting sequence's IsReadOnly property
        /// will be set to true, otherwise it will be set to false.
        /// </param>
        /// <returns>The parsed sequence.</returns>
        /// <exception cref="ArgumentNullException">Thrown when mbfReader is null.</exception>
        /// <exception cref="FileFormatException">
        /// Thrown when the current line does not start with '>' or when no alphabet
        /// can be identified for a line of the record.
        /// </exception>
        protected ISequence ParseOneWithSpecificFormat(MBFStreamReader mbfReader, bool isReadOnly)
        {
            SequencePointer sequencePointer = new SequencePointer();

            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            string message;

            // The record must begin with a '>' header line.
            if (!mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.InvariantCulture,
                                        Resource.INVALID_INPUT_FILE,
                                        Resource.FASTA_NAME);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Process header line.
            Sequence sequence;
            string   id = mbfReader.GetLineField(2).Trim();

            // Save initial start and end indices. Both offsets start at the current reader
            // position; the end offset is advanced line by line in the loop below.
            // NOTE(review): StartingLine receives the distance between the reader position and
            // the start of the current line rather than a line number - confirm that this is
            // what consumers of SequencePointer expect.
            sequencePointer.StartingLine    = (int)(mbfReader.Position - mbfReader.CurrentLineStartingIndex);
            sequencePointer.IndexOffsets[0] = mbfReader.Position;
            sequencePointer.IndexOffsets[1] = mbfReader.Position;

            mbfReader.GoToNextLine();

            IAlphabet alphabet = Alphabet;

            // When no alphabet is configured on the parser, identify one from the first line
            // after the header.
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);

                if (alphabet == null)
                {
                    message = string.Format(CultureInfo.InvariantCulture,
                                            Resource.InvalidSymbolInString,
                                            mbfReader.Line);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            if (Encoding == null)
            {
                sequence = new Sequence(alphabet);
            }
            else
            {
                sequence = new Sequence(alphabet, Encoding, string.Empty)
                {
                    IsReadOnly = false
                };
            }

            // Block-index bookkeeping (only acted upon when _sidecarFileProvider is not null):
            //   currentBlockSize         - characters (symbols plus newline characters) counted
            //                              into the block currently being filled
            //   symbolCount              - running symbol total; starts at -1
            //                              (NOTE(review): presumably so the value written is the
            //                              zero-based index of a block's last symbol - confirm)
            //   newLineCharacterCount    - newline character count reported by the reader for
            //                              the current line
            //   prenewLineCharacterCount - newline characters that did not fit into a finished
            //                              block and are carried into the next one
            //   lineLength               - characters of the current line not yet assigned to a block
            int currentBlockSize         = 0;
            int symbolCount              = -1;
            int newLineCharacterCount    = mbfReader.NewLineCharacterCount;
            int prenewLineCharacterCount = 0;
            int lineLength = mbfReader.Line.Length;

            sequence.ID = id;

            // Consume lines until the next '>' header or the end of the input.
            while (mbfReader.HasLines && !mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                // Only the line's own length is added to the end offset.
                sequencePointer.IndexOffsets[1] += mbfReader.Line.Length;
                if (Alphabet == null)
                {
                    // No alphabet configured: re-identify using the sequence's current alphabet
                    // and this line.
                    alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);

                    if (alphabet == null)
                    {
                        message = string.Format(CultureInfo.InvariantCulture,
                                                Resource.InvalidSymbolInString,
                                                mbfReader.Line);
                        Trace.Report(message);
                        throw new FileFormatException(message);
                    }

                    // The identified alphabet changed: build a replacement sequence from the
                    // existing one and clear the old instance.
                    if (sequence.Alphabet != alphabet)
                    {
                        Sequence seq = new Sequence(alphabet, Encoding, sequence)
                        {
                            IsReadOnly = false
                        };
                        sequence.Clear();
                        sequence = seq;
                    }
                }

                newLineCharacterCount = mbfReader.NewLineCharacterCount;
                lineLength            = mbfReader.Line.Length;

                // Distribute this line over blocks of _blockSize characters. Skipped entirely
                // when there is no sidecar file provider.
                while (lineLength != 0 && _sidecarFileProvider != null)
                {
                    if (lineLength + currentBlockSize + newLineCharacterCount <= _blockSize)
                    {
                        // The rest of the line, including its newline characters, fits into
                        // the current block.
                        symbolCount      += lineLength;
                        currentBlockSize += lineLength + newLineCharacterCount;
                        lineLength        = 0;
                    }
                    else
                    {
                        // Fill the current block to capacity and keep the remainder of the line.
                        symbolCount += _blockSize - currentBlockSize;
                        lineLength   = lineLength - (_blockSize - currentBlockSize);
                        if (lineLength <= 0)
                        {
                            // All symbols fitted and only newline characters overflowed:
                            // lineLength is zero or negative here, so adding it removes the
                            // over-count from symbolCount, and the newline characters that did
                            // not fit are carried over to the next block.
                            symbolCount += lineLength;
                            prenewLineCharacterCount = newLineCharacterCount + lineLength;
                            lineLength = 0;
                        }

                        currentBlockSize = _blockSize;
                    }

                    if (currentBlockSize == _blockSize)
                    {
                        // Block is full: write its index to the sidecar file and start the next
                        // block with any carried-over newline characters.
                        _sidecarFileProvider.WriteBlockIndex(symbolCount);
                        currentBlockSize         = prenewLineCharacterCount;
                        prenewLineCharacterCount = 0;
                    }
                }

                mbfReader.GoToNextLine();
            }

            if (_sidecarFileProvider != null)
            {
                // Write the final block index: the running symbol count when the record spans
                // more than one block and the last block holds more than newline characters,
                // otherwise 0.
                if (sequencePointer.IndexOffsets[1] - sequencePointer.IndexOffsets[0] > _blockSize &&
                    currentBlockSize - newLineCharacterCount > 0)
                {
                    _sidecarFileProvider.WriteBlockIndex(symbolCount);
                }
                else
                {
                    _sidecarFileProvider.WriteBlockIndex(0);
                }
            }

            if (sequence.MoleculeType == MoleculeType.Invalid)
            {
                sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
            }

            sequence.IsReadOnly = isReadOnly;

            sequencePointer.AlphabetName = sequence.Alphabet.Name;
            sequencePointer.Id           = sequence.ID;

            if (_sidecarFileProvider != null)
            {
                // Write each sequence pointer to the sidecar file immediately
                _sidecarFileProvider.WritePointer(sequencePointer);
            }

            // Attach a virtual data provider configured with the parser's block settings.
            FileVirtualSequenceProvider dataprovider = new FileVirtualSequenceProvider(this, sequencePointer)
            {
                BlockSize         = _blockSize,
                MaxNumberOfBlocks = _maxNumberOfBlocks
            };

            sequence.VirtualSequenceProvider = dataprovider;
            return(sequence);
        }