示例#1
0
        /// <summary>
        /// Parses one matched data row and appends its symbols to the sequence
        /// named by that row, creating the sequence the first time its name is seen.
        /// </summary>
        /// <param name="bioValidator">Validator given to newly created sequences and used to classify each character.</param>
        /// <param name="seqdataline">Regex match exposing the "Rowname", "Startindex" and "Seqdata" groups.</param>
        private void LoadSequenceData(IBioValidator bioValidator, Match seqdataline)
        {
            string rowName = seqdataline.Groups["Rowname"].Value;

            Debug.Assert(!string.IsNullOrEmpty(rowName));

            // Row names are identifiers read from the file, so compare them ordinally
            // rather than with the current culture's linguistic rules.
            AE2Sequence sequence = (AE2Sequence)_sequences.FirstOrDefault(
                seq => string.Equals(seq.CommonName, rowName, StringComparison.Ordinal));

            if (sequence == null)
            {
                sequence = new AE2Sequence(bioValidator)
                {
                    CommonName = rowName, ScientificName = rowName
                };
                _sequences.Add(sequence);
            }

            // Column at which this row's data starts (machine-readable, so parse culture-invariantly).
            int startIndex = Int32.Parse(seqdataline.Groups["Startindex"].Value,
                                         System.Globalization.CultureInfo.InvariantCulture);

            // AE2 omits the blanks, so pad with gaps up to the start column.
            // The bound is re-checked against the live Count on every pass; the previous
            // "i < startIndex - Count" form shrank as items were added and stopped
            // roughly halfway, leaving the following data misaligned.
            while (sequence.AlignedData.Count < startIndex)
            {
                sequence.AlignedData.Add(BioSymbol.Gap);
            }

            string seqdata = seqdataline.Groups["Seqdata"].Value;

            for (int index = 0; index < seqdata.Length;)
            {
                IBioSymbol symbol;

                // A backslash introduces a three-digit octal escape. Only treat it as one when
                // all three digits are present; a truncated escape at the end of the row falls
                // through to the per-character path instead of indexing past the string.
                bool isOctalEscape = seqdata[index] == '\\' && index + 3 < seqdata.Length;
                if (isOctalEscape)
                {
                    symbol = CharacterToBioSymbol(bioValidator, ConvertOctal(new[] { seqdata[index + 1], seqdata[index + 2], seqdata[index + 3] }));
                    index += 4;
                }
                else
                {
                    symbol = CharacterToBioSymbol(bioValidator, seqdata[index]);
                    index++;
                }

                // Characters that map to no symbol are stored as gaps so columns stay aligned.
                sequence.AlignedData.Add(symbol ?? BioSymbol.Gap);
            }
        }
示例#2
0
        /// <summary>
        /// Creates a sequence backed by a region of a FASTA file, either loaded fully
        /// into memory or exposed through a virtualizing list over the data provider.
        /// </summary>
        /// <param name="owner">File passed to the data provider that reads this sequence.</param>
        /// <param name="loadAllIntoMemory">When true, all symbols are read up front and the provider is released.</param>
        /// <param name="header">Header text; either a plain name or three fields separated by "::".</param>
        /// <param name="startPos">Start position handed to the data provider.</param>
        /// <param name="length">Length handed to the data provider and used as the range to load.</param>
        /// <param name="firstNonGap">Value stored as <c>FirstDataColumn</c>.</param>
        /// <param name="count">Count handed to the data provider.</param>
        /// <param name="validator">Validator stored on the sequence and handed to the data provider.</param>
        internal FastASequence(FastAFile owner, bool loadAllIntoMemory, string header, long startPos, int length,
                               int firstNonGap, int count, IBioValidator validator)
        {
            Debug.Assert(header.Length > 0);

            // A three-field header supplies the scientific name (second field) and the
            // common name (third field); the first field is not used. Any other shape is
            // taken verbatim as the scientific name.
            string[] bits = header.Split(new[] { "::" }, StringSplitOptions.None);
            if (bits.Length == 3)
            {
                ScientificName = bits[1];
                CommonName     = bits[2];
            }
            else
            {
                ScientificName = header;
            }

            Validator       = validator;
            FirstDataColumn = firstNonGap;

            if (string.IsNullOrEmpty(CommonName))
            {
                CommonName = "<Not Available>";
            }
            if (string.IsNullOrEmpty(ScientificName))
            {
                ScientificName = CommonName;
            }

            _dp = new FastAFileSequenceDataProvider(owner, startPos, length, count, validator);

            if (loadAllIntoMemory)
            {
                try
                {
                    _alignedList = new List <IBioSymbol>(_dp.LoadRange(0, length));
                }
                finally
                {
                    // The provider is only needed for the initial load; release it even when
                    // loading throws so it is not left undisposed behind a failed constructor.
                    _dp.Dispose();
                    _dp = null;
                }
            }
            else
            {
                // NOTE(review): 1024 and 60 are passed straight to VirtualizingList; their
                // meaning (presumably page size and a cache setting) is not visible here.
                _alignedList = new VirtualizingList <IBioSymbol>(_dp, Math.Min(1024, length), 60);
            }
        }
示例#3
0
        /// <summary>
        /// Reads the file line by line, passing every sequence-data row to
        /// <c>LoadSequenceData</c>, then computes the first data column of each sequence.
        /// </summary>
        /// <param name="bioValidator">Validator handed to the row parser.</param>
        /// <returns>The number of sequences held once loading has finished.</returns>
        private int LoadSequences(IBioValidator bioValidator)
        {
            _sequences.Clear();

            using (var fs = File.OpenText(Filename))
            {
                // NOTE(review): reading stops at the first empty line as well as at end of
                // file; confirm the format never has blank lines ahead of data rows.
                for (string line = fs.ReadLine(); !string.IsNullOrEmpty(line); line = fs.ReadLine())
                {
                    if (Regex.IsMatch(line, HeaderRegex))
                    {
                        // Header metadata rows are recognised but not loaded in this release.
                        continue;
                    }

                    // "#-" and "#:" rows are never data. These are file markers, so test the
                    // prefix ordinally instead of with culture-sensitive rules.
                    if (line.StartsWith("#-", StringComparison.Ordinal) ||
                        line.StartsWith("#:", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // Match once and reuse the result rather than running the pattern twice.
                    Match row = Regex.Match(line, SeqdataRowRegex);
                    if (row.Success)
                    {
                        LoadSequenceData(bioValidator, row);
                    }
                }
            }

            // Calculate the first data column in each found sequence.
            // FirstOrDefault yields 0 when a sequence holds no nucleotide at all.
            Parallel.ForEach(_sequences.Cast <AE2Sequence>(), seq =>
            {
                seq.FirstDataColumn =
                    Enumerable.Range(0, seq.AlignedData.Count).FirstOrDefault(
                        i => seq.AlignedData[i].Type == BioSymbolType.Nucleotide);
            });

            return _sequences.Count;
        }
示例#4
0
        /// <summary>
        /// Translates a raw character into the symbol that represents it.
        /// </summary>
        /// <param name="bioValidator">Validator consulted for any character that is not a special marker.</param>
        /// <param name="ch">Character to translate.</param>
        /// <returns>
        /// <c>BioSymbol.None</c> for '~' or '|', <c>BioSymbol.Gap</c> for '-', a new nucleotide
        /// symbol when the validator accepts the character, and <c>null</c> otherwise.
        /// </returns>
        private static IBioSymbol CharacterToBioSymbol(IBioValidator bioValidator, char ch)
        {
            // Special markers are resolved without consulting the validator.
            if (ch == '~' || ch == '|')
            {
                return BioSymbol.None;
            }

            if (ch == '-')
            {
                return BioSymbol.Gap;
            }

            if (!bioValidator.IsValid(ch))
            {
                // Unknown character: the caller decides what to substitute.
                return null;
            }

            return new BioSymbol(BioSymbolType.Nucleotide, ch);
        }
示例#5
0
        /// <summary>
        /// Scans the file through a memory-mapped view, creating one sequence per
        /// '>' header line, then forces every sequence to the length of the longest one.
        /// </summary>
        /// <param name="bioValidator">
        /// Validator used to locate the first valid symbol of each sequence; it is also
        /// passed on to every sequence that is created.
        /// </param>
        /// <returns>The number of entries in the sequence list once loading has finished.</returns>
        private int LoadSequences(IBioValidator bioValidator)
        {
            string currHeader = string.Empty;
            long   startPos = -1;
            int    totalSequenceCount = 0, startingNonGap = -1;
            bool   loadIntoMemory = _loadAllIntoMemory || (new FileInfo(Filename).Length < SmallFileSize);

            MmFile = MemoryMappedFile.CreateFromFile(Filename, FileMode.Open);
            using (var headerBytes = new MemoryStream(512))
            using (var fs = MmFile.CreateViewStream(0, 0, MemoryMappedFileAccess.Read))
            {
                while (true)
                {
                    // -1 is end of stream; a zero byte is also treated as end of data.
                    int db = fs.ReadByte();
                    if (db <= 0)
                    {
                        break;
                    }

                    if ((char)db == '>') // Header for species?
                    {
                        // Just finished a sequence?
                        if (currHeader != string.Empty)
                        {
                            if (MaxSequenceLength < totalSequenceCount)
                            {
                                MaxSequenceLength = totalSequenceCount;
                            }
                            if (startingNonGap == -1)
                            {
                                startingNonGap = totalSequenceCount;
                            }
                            // NOTE(review): fs.Position is already past the '>' just read, so this
                            // length includes that byte; the final sequence below subtracts one.
                            _sequences.Add(
                                new FastASequence(this, loadIntoMemory, currHeader, startPos,
                                                  (int)(fs.Position - startPos), startingNonGap, totalSequenceCount, bioValidator));
                        }

                        // Start new sequence
                        totalSequenceCount = 0;
                        startingNonGap     = -1;
                        currHeader         = ReadHeaderLine(fs, headerBytes);

                        // This is the start of the nucleotide chain
                        startPos = fs.Position;
                    }
                    else if (db != 0x0a && db != 0x0d)
                    {
                        if (startingNonGap == -1 && bioValidator.IsValid((char)db))
                        {
                            startingNonGap = totalSequenceCount;
                        }
                        totalSequenceCount++;
                    }
                }

                // Handle the final sequence in the file. An empty header means no header text
                // was ever read (no '>' line, or the data ended right after one), so there is
                // nothing to add.
                if (currHeader != string.Empty)
                {
                    Debug.Assert(totalSequenceCount > 0);
                    Debug.Assert(startPos > 0);

                    if (MaxSequenceLength < totalSequenceCount)
                    {
                        MaxSequenceLength = totalSequenceCount;
                    }
                    if (startingNonGap == -1)
                    {
                        startingNonGap = totalSequenceCount;
                    }

                    _sequences.Add(
                        new FastASequence(this, loadIntoMemory, currHeader, startPos,
                                          (int)(fs.Position - startPos) - 1, startingNonGap, totalSequenceCount, bioValidator));
                }
            }

            // Force all sequences to have the same "virtual" length
            _sequences.ForEach(ns => ((FastASequence)ns).ForceLength(MaxSequenceLength));

            if (loadIntoMemory)
            {
                // Every sequence was loaded eagerly, so the mapping is no longer needed.
                MmFile.Dispose();
                MmFile = null;
            }

            return _sequences.Count;
        }

        /// <summary>
        /// Reads header text up to the next line break, skipping empty lines, and
        /// returns it decoded as ASCII. Returns whatever was collected (possibly an
        /// empty string) when the data ends first.
        /// </summary>
        /// <param name="stream">Stream positioned just after the '>' marker.</param>
        /// <param name="scratch">Reusable growable buffer; it is reset on entry.</param>
        /// <returns>The header text, or an empty string if the data ended before any was read.</returns>
        private static string ReadHeaderLine(Stream stream, MemoryStream scratch)
        {
            // A growable buffer replaces the former fixed 512-byte array, which overflowed
            // on longer header lines.
            scratch.SetLength(0);

            while (true)
            {
                int b = stream.ReadByte();
                bool endOfData = b <= 0;

                if (endOfData || b == 0x0a || b == 0x0d)
                {
                    // Line breaks seen before any header text are skipped, but end of data
                    // always stops the scan; previously it looped forever in that case.
                    if (endOfData || scratch.Length > 0)
                    {
                        return Encoding.ASCII.GetString(scratch.ToArray());
                    }
                }
                else
                {
                    scratch.WriteByte((byte)b);
                }
            }
        }
示例#6
0
 /// <summary>
 /// Creates a data provider that reads one region of the owner's memory-mapped file.
 /// </summary>
 /// <param name="owner">File whose <c>MmFile</c> supplies the read-only view stream.</param>
 /// <param name="startPos">Offset passed to <c>CreateViewStream</c> as the start of the view.</param>
 /// <param name="length">Size passed to <c>CreateViewStream</c> for the view.</param>
 /// <param name="count">Value stored as the provider's count.</param>
 /// <param name="bv">Validator stored on the provider.</param>
 public FastAFileSequenceDataProvider(FastAFile owner, long startPos, int length, int count, IBioValidator bv)
 {
     var mappedFile = owner.MmFile;

     _validator = bv;
     _count = count;
     _fs = mappedFile.CreateViewStream(startPos, length, MemoryMappedFileAccess.Read);
 }
示例#7
0
 /// <summary>
 /// Creates a sequence with an empty symbol list and the given validator.
 /// </summary>
 /// <param name="validator">Validator assigned to the <c>Validator</c> property.</param>
 public AE2Sequence(IBioValidator validator)
 {
     // Start with no aligned symbols; they are appended as rows are parsed.
     this._alignedList = new List<IBioSymbol>();
     this.Validator = validator;
 }