Beispiel #1
0
        /// <summary>
        /// Reads every record of a FASTQ file and copies its sequence string onto the
        /// alignment item registered under the same query name.
        /// </summary>
        /// <param name="items">
        /// Alignment items to update. They are indexed by <c>Qname</c> with
        /// <c>ToDictionary</c>, which throws if two items share a <c>Qname</c>.
        /// </param>
        /// <param name="fastqFile">Path handed to <c>StreamUtils.GetReader</c>.</param>
        public static void FillOriginalSequence(this IEnumerable <TrimedSAMAlignedItem> items, string fastqFile)
        {
            var itemsByQname = items.ToDictionary(entry => entry.Qname);
            var fastqParser  = new FastqReader();

            using (var input = StreamUtils.GetReader(fastqFile))
            {
                // Parse returns null once the input is exhausted, which ends the loop.
                for (FastqSequence record = fastqParser.Parse(input); record != null; record = fastqParser.Parse(input))
                {
                    // The lookup key is the record name run through StringBefore(" ") and then StringBefore("\t").
                    var queryName = record.Name.StringBefore(" ").StringBefore("\t");

                    TrimedSAMAlignedItem target;
                    if (!itemsByQname.TryGetValue(queryName, out target))
                    {
                        // Records without a matching alignment item are ignored.
                        continue;
                    }

                    target.OriginalSequence = record.SeqString;
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Streams FASTQ records from <c>options.InputFile</c>, clips a trailing CCAA, CCA or CC
        /// from each accepted read and writes the records to <paramref name="outputFile"/>.
        /// </summary>
        /// <remarks>
        /// Records are written to "<c>outputFile</c>.tmp" first; the temporary file is moved to
        /// <paramref name="outputFile"/> only after the whole input has been processed.
        /// A trailing CC is clipped only when the map read from <c>options.CCAFile</c> holds
        /// <c>true</c> for the read name. Every written record gets a new <c>Reference</c> made of
        /// the original name, <c>SmallRNAConsts.NTA_TAG</c> and the clipped suffix (empty when
        /// nothing was clipped).
        /// </remarks>
        /// <param name="accept">Predicate applied to every parsed record; rejected records are skipped.</param>
        /// <param name="map">
        /// Supplies the count of each read through <c>GetCount</c>. When <c>HasCountFile</c> is true,
        /// a tab-separated "<c>outputFile</c>.dupcount" file is written as well.
        /// </param>
        /// <param name="outputFile">Destination path of the processed FASTQ file.</param>
        /// <param name="dic">
        /// Tallies keyed by the unclipped read length. Missing entries are created, and the
        /// CCAA / CCA / CC / notNTA member matching the read is increased by the read's count.
        /// </param>
        private void DoProcess(Func <FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary <int, CountItem> dic)
        {
            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

            var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

            var parser = new FastqReader();
            var writer = new FastqWriter();

            StreamWriter swCount = null;

            try
            {
                // Opened inside the try block so the writer is closed even when writing the header fails.
                if (map.HasCountFile)
                {
                    swCount = new StreamWriter(outputFile + ".dupcount");
                    swCount.WriteLine("Query\tCount\tSequence");
                }

                int readcount = 0;
                var tmpFile   = outputFile + ".tmp";

                using (var sr = StreamUtils.GetReader(options.InputFile))
                // The second argument is true when the output name ends with ".gz" (compared ordinally, ignoring case);
                // presumably it switches the writer to gzip output - confirm against StreamUtils.GetWriter.
                using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
                {
                    FastqSequence seq;
                    while ((seq = parser.Parse(sr)) != null)
                    {
                        readcount++;
                        if (readcount % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads processed", readcount);
                        }

                        if (!accept(seq))
                        {
                            continue;
                        }

                        var name     = seq.Name;
                        var sequence = seq.SeqString;
                        var score    = seq.Score;
                        var count    = map.GetCount(seq.Name);

                        if (swCount != null)
                        {
                            // First dupcount line: the read as parsed, before any clipping or renaming.
                            swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                        }

                        // Tallies are keyed by the length of the read before clipping.
                        CountItem item;
                        if (!dic.TryGetValue(sequence.Length, out item))
                        {
                            item = new CountItem();
                            dic[sequence.Length] = item;
                        }

                        // Nucleotide suffixes are compared ordinally; the longest suffix is tested first.
                        var clipped = string.Empty;
                        if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                        {
                            clipped    = "CCAA";
                            item.CCAA += count;
                        }
                        else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                        {
                            clipped   = "CCA";
                            item.CCA += count;
                        }
                        else if (sequence.EndsWith("CC", StringComparison.Ordinal))
                        {
                            // A bare CC is only clipped when the CCA map holds true for this read name.
                            bool isCCA;
                            if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                            {
                                clipped  = "CC";
                                item.CC += count;
                            }
                        }

                        if (clipped.Length > 0)
                        {
                            sequence      = sequence.Substring(0, sequence.Length - clipped.Length);
                            seq.SeqString = sequence;
                            // Keep the score string the same length as the clipped sequence.
                            seq.Score     = score.Substring(0, sequence.Length);
                            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                        }
                        else
                        {
                            item.notNTA  += count;
                            seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                        }

                        writer.Write(sw, seq);

                        if (swCount != null)
                        {
                            // Second dupcount line: the record as written, after Reference and sequence were updated.
                            swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                        }
                    }
                }

                File.Move(tmpFile, outputFile);
            }
            finally
            {
                // Test the writer itself rather than map.HasCountFile so the cleanup can never dereference null.
                if (swCount != null)
                {
                    swCount.Close();
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Loads every count file returned by <c>options.GetCountFiles()</c> and writes either a
        /// per-sequence table (when <c>options.Sequences</c> is non-empty) or the grouped output
        /// plus a ".read.count" file.
        /// </summary>
        /// <remarks>
        /// Rows of each count file are filtered before loading:
        /// when specific sequences are requested, column 2 must be one of them;
        /// otherwise, when the file's <c>AdditionalFile</c> exists, column 0 must match a read name
        /// found in that FASTQ file (names are cut with <c>StringBefore(SmallRNAConsts.NTA_TAG)</c>);
        /// otherwise every row is kept.
        /// </remarks>
        /// <returns>
        /// The list passed to <c>OutputGroup</c> with the ".read.count" path appended. Nothing is
        /// added to it when only specific sequences are exported.
        /// </returns>
        public override IEnumerable <string> Process()
        {
            var outputs        = new List<string>();
            var countsBySample = new Dictionary<string, List<SmallRNASequence>>();

            bool sequencesRequested = options.Sequences != null && options.Sequences.Count > 0;

            foreach (var countFile in options.GetCountFiles())
            {
                var keptNames = new HashSet<string>();

                // Default filter keeps every row; the branches below narrow it.
                Func<string[], bool> accept = fields => true;

                if (sequencesRequested)
                {
                    foreach (var wanted in options.Sequences)
                    {
                        keptNames.Add(wanted);
                        Console.WriteLine(wanted);
                    }

                    accept = fields => keptNames.Contains(fields[2]);
                }
                else if (File.Exists(countFile.AdditionalFile))
                {
                    // Keep only the reads that are present in the accompanying FASTQ file.
                    Progress.SetMessage("Reading " + countFile.AdditionalFile + "...");

                    var fastqReader = new FastqReader();
                    using (var input = StreamUtils.GetReader(countFile.AdditionalFile))
                    {
                        for (FastqSequence read = fastqReader.Parse(input); read != null; read = fastqReader.Parse(input))
                        {
                            keptNames.Add(read.Name.StringBefore(SmallRNAConsts.NTA_TAG));
                        }
                    }

                    accept = fields => keptNames.Contains(fields[0]);
                }

                Progress.SetMessage("Reading " + countFile.File + "...");
                countsBySample[countFile.Name] = ReadCountFile(countFile, accept);
            }

            var samples = countsBySample.Keys.OrderBy(m => m).ToArray();

            if (!sequencesRequested)
            {
                OutputGroup(outputs, countsBySample, samples);

                var readOutput = Path.ChangeExtension(options.OutputFile, ".read.count");
                new SmallRNASequenceFormat(options.TopNumber, options.ExportFasta).WriteToFile(readOutput, countsBySample);
                outputs.Add(readOutput);
            }
            else
            {
                using (var sw = new StreamWriter(options.OutputFile))
                {
                    sw.WriteLine("Sequence\t{0}", samples.Merge("\t"));

                    foreach (var wanted in options.Sequences)
                    {
                        // One cell per sample: the count of the first matching entry, or "0" when the sample lacks the sequence.
                        var cells = samples
                                    .Select(sample => countsBySample[sample].FirstOrDefault(entry => entry.Sequence.Equals(wanted)))
                                    .Select(hit => hit == null ? "0" : hit.Count.ToString());

                        sw.WriteLine("{0}\t{1}", wanted, cells.Merge("\t"));
                    }
                }
            }

            Progress.End();

            return outputs;
        }
Beispiel #4
0
        /// <summary>
        /// Writes the reads of <c>options.InputFile</c> that are not excluded to
        /// <c>options.OutputFile</c>, each read name at most once.
        /// </summary>
        /// <remarks>
        /// The exclusion set is seeded from the query names found in <c>options.XmlFile</c> and from
        /// the lines of <c>options.ExcludeFile</c> (each file is used only if it exists); every name
        /// is cut with <c>StringBefore(SmallRNAConsts.NTA_TAG)</c>. Written read names are added to
        /// the same set, so a later record with the same name is skipped.
        /// When <c>options.CountFile</c> exists, a "<c>OutputFile</c>.dupcount" file with name and
        /// count is written alongside; a written read missing from the count map raises an exception.
        /// </remarks>
        /// <returns>The result list, which this method never adds to.</returns>
        /// <exception cref="UserTerminatedException">
        /// Thrown when cancellation is pending at one of the checks made every 100000 reads.
        /// </exception>
        public override IEnumerable <string> Process()
        {
            var result = new List<string>();
            var except = new HashSet<string>();

            if (File.Exists(options.XmlFile))
            {
                // Exclude the reads mapped to features, regardless of their number of mismatches.
                var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
                except.UnionWith(allmapped
                                 .SelectMany(g => g)
                                 .SelectMany(f => f.Locations)
                                 .SelectMany(l => l.SamLocations)
                                 .Select(sl => sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG)));
            }

            if (File.Exists(options.ExcludeFile))
            {
                except.UnionWith(File.ReadAllLines(options.ExcludeFile)
                                 .Select(line => line.StringBefore(SmallRNAConsts.NTA_TAG)));
            }

            SmallRNACountMap cm = options.GetCountMap();

            // Snapshot the tagged keys first: the loop below adds entries to the same dictionary.
            var taggedKeys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
            foreach (var tagged in taggedKeys)
            {
                cm.Counts[tagged.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[tagged];
            }

            StreamWriter swCount = File.Exists(options.CountFile)
                                   ? new StreamWriter(options.OutputFile + ".dupcount")
                                   : null;

            Progress.SetMessage("output unmapped query...");
            try
            {
                using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz")))
                using (var sr = StreamUtils.GetReader(options.InputFile))
                {
                    var reader = new FastqReader();
                    var writer = new FastqWriter();

                    int processed = 0;
                    for (FastqSequence ss = reader.Parse(sr); ss != null; ss = reader.Parse(sr))
                    {
                        processed++;

                        if (processed % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads", processed);
                            if (Progress.IsCancellationPending())
                            {
                                throw new UserTerminatedException();
                            }
                        }

                        // The reference is rewritten before the exclusion lookup, which then uses ss.Name.
                        ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;

                        if (except.Contains(ss.Name) || (Accept != null && !Accept(ss)))
                        {
                            continue;
                        }

                        except.Add(ss.Name);
                        writer.Write(sw, ss);

                        if (swCount == null)
                        {
                            continue;
                        }

                        int cmcount;
                        if (cm.Counts.TryGetValue(ss.Name, out cmcount))
                        {
                            swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
                        }
                        else
                        {
                            throw new Exception(string.Format("Cannot find {0} in count map", ss.Name));
                        }
                    }
                }
            }
            finally
            {
                if (swCount != null)
                {
                    swCount.Close();
                }
            }

            Progress.End();

            return result;
        }
Beispiel #5
0
        /// <summary>
        /// Expands every read of <c>options.InputFile</c> into up to four records - the full read and
        /// the read with its last 1, 2 and 3 bases removed - and writes them to <c>options.OutputFile</c>.
        /// </summary>
        /// <remarks>
        /// Each written record gets a <c>Reference</c> made of the original name,
        /// <c>SmallRNAConsts.NTA_TAG</c> and the removed bases (empty for the full read); its score
        /// string is shortened to the same length as the sequence. Variants shorter than
        /// <c>options.MinimumReadLength</c> are not written. When the count map reports
        /// <c>HasCountFile</c>, a tab-separated "<c>OutputFile</c>.dupcount" file receives one line
        /// per written record.
        /// </remarks>
        /// <returns>A list containing <c>options.OutputFile</c>.</returns>
        public override IEnumerable <string> Process()
        {
            var result = new List <string>();

            var gzipped = options.OutputFile.ToLower().EndsWith(".gz");

            result.Add(options.OutputFile);

            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + options.OutputFile + "...");
            var parser = new FastqReader();
            var writer = new FastqWriter();

            var          map     = options.GetCountMap();
            StreamWriter swCount = null;

            try
            {
                // Opened inside the try block so the finally clause closes it on every exit path.
                if (map.HasCountFile)
                {
                    swCount = new StreamWriter(options.OutputFile + ".dupcount");
                    swCount.WriteLine("Query\tCount\tSequence");
                }

                int readcount = 0;

                using (var sr = StreamUtils.GetReader(options.InputFile))
                using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped))
                {
                    FastqSequence seq;
                    while ((seq = parser.Parse(sr)) != null)
                    {
                        readcount++;
                        if (readcount % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads processed", readcount);
                        }

                        // Capture the original values: seq is mutated and reused for every variant below.
                        var name     = seq.Name;
                        var sequence = seq.SeqString;
                        var score    = seq.Score;
                        var len      = sequence.Length;
                        var count    = map.GetCount(seq.Name);

                        // i is the number of bases removed from the end of the read (0 to 3).
                        for (int i = 0; i < 4; i++)
                        {
                            var newlen = len - i;
                            if (newlen < options.MinimumReadLength)
                            {
                                // Longer clips only get shorter, so no later variant can qualify.
                                break;
                            }

                            var clipped = i == 0 ? string.Empty : sequence.Substring(newlen);

                            seq.SeqString = sequence.Substring(0, newlen);
                            seq.Score     = score.Substring(0, newlen);
                            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                            writer.Write(sw, seq);

                            if (swCount != null)
                            {
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                            }
                        }
                    }
                }
            }
            finally
            {
                // Previously the count writer stayed open when reading or writing threw.
                if (swCount != null)
                {
                    swCount.Close();
                }
            }

            Progress.End();

            return result;
        }