示例#1
0
        /// <summary>
        /// Copies FASTQ reads from <c>options.InputFile</c> to <c>options.OutputFile</c>, skipping
        /// excluded reads, reads rejected by <c>Accept</c>, and reads whose name was already written.
        /// </summary>
        /// <remarks>
        /// The exclusion set is built from two optional files, each used only when it exists:
        /// <c>options.XmlFile</c> (query names of every SAM location of every feature location) and
        /// <c>options.ExcludeFile</c> (one name per line). Names are stored as the text before
        /// <c>SmallRNAConsts.NTA_TAG</c>.
        /// When <c>options.CountFile</c> exists, a "name TAB count" line is written to
        /// <c>options.OutputFile + ".dupcount"</c> for every read that is written.
        /// Output is gzip-compressed when the output file name ends with ".gz" (any letter case).
        /// </remarks>
        /// <returns>The list created at the top of the method; nothing is ever added to it.</returns>
        /// <exception cref="UserTerminatedException">
        /// Cancellation was pending at one of the checks made every 100000 reads.
        /// </exception>
        /// <exception cref="KeyNotFoundException">
        /// A written read has no entry in the count map while the count file is being produced.
        /// </exception>
        public override IEnumerable<string> Process()
        {
            var result = new List<string>();

            // Names of reads that must not be written, stored without the NTA tag suffix.
            var except = new HashSet<string>();

            if (File.Exists(options.XmlFile))
            {
                // Exclude the reads mapped to features no matter how many mismatches they have.
                var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
                except.UnionWith(from g in allmapped
                                 from f in g
                                 from l in f.Locations
                                 from sl in l.SamLocations
                                 select sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
            }

            if (File.Exists(options.ExcludeFile))
            {
                except.UnionWith(from l in File.ReadAllLines(options.ExcludeFile)
                                 select l.StringBefore(SmallRNAConsts.NTA_TAG));
            }

            SmallRNACountMap cm = options.GetCountMap();

            // Make the count of every tagged key also reachable under its untagged name.
            // The keys are snapshotted first because the dictionary is modified in the loop.
            var keys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
            foreach (var key in keys)
            {
                cm.Counts[key.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[key];
            }

            // Ordinal comparison: a file extension is not linguistic text.
            var gzipped = options.OutputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);

            StreamWriter swCount = null;
            try
            {
                // Opened inside the try so that the finally block closes it on every failure path.
                if (File.Exists(options.CountFile))
                {
                    swCount = new StreamWriter(options.OutputFile + ".dupcount");
                }

                Progress.SetMessage("output unmapped query...");

                using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped))
                using (var sr = StreamUtils.GetReader(options.InputFile))
                {
                    FastqReader reader = new FastqReader();
                    FastqWriter writer = new FastqWriter();

                    FastqSequence ss;
                    var count = 0;
                    while ((ss = reader.Parse(sr)) != null)
                    {
                        count++;

                        // Report progress and honour cancellation every 100000 reads.
                        if (count % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads", count);
                            if (Progress.IsCancellationPending())
                            {
                                throw new UserTerminatedException();
                            }
                        }

                        // NOTE(review): the checks below read ss.Name right after Reference is
                        // reassigned; presumably the Reference setter re-derives Name without the
                        // tag - confirm against FastqSequence.
                        ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
                        if (except.Contains(ss.Name))
                        {
                            continue;
                        }

                        if (Accept != null && !Accept(ss))
                        {
                            continue;
                        }

                        // Remember the name so that later reads with the same name are skipped.
                        except.Add(ss.Name);
                        writer.Write(sw, ss);

                        if (swCount != null)
                        {
                            int cmcount;
                            if (!cm.Counts.TryGetValue(ss.Name, out cmcount))
                            {
                                throw new KeyNotFoundException(string.Format("Cannot find {0} in count map", ss.Name));
                            }
                            swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
                        }
                    }
                }
            }
            finally
            {
                if (swCount != null)
                {
                    swCount.Close();
                }
            }

            Progress.End();

            return result;
        }
示例#2
0
        /// <summary>
        /// Reads FASTQ records from <c>options.InputFile</c>, clips a trailing "CCAA", "CCA" or
        /// (conditionally) "CC" from each accepted read, tags the read name with
        /// <c>SmallRNAConsts.NTA_TAG</c> plus the clipped bases, and writes the result to
        /// <paramref name="outputFile"/>.
        /// </summary>
        /// <remarks>
        /// Output is first written to <c>outputFile + ".tmp"</c> and moved to
        /// <paramref name="outputFile"/> only after every read has been processed. It is
        /// gzip-compressed when <paramref name="outputFile"/> ends with ".gz" (any letter case).
        /// A trailing "CC" is clipped only when the read name maps to <c>true</c> in the table
        /// loaded from <c>options.CCAFile</c>.
        /// When <c>map.HasCountFile</c> is set, <c>outputFile + ".dupcount"</c> is written with a
        /// "Query/Count/Sequence" header.
        /// </remarks>
        /// <param name="accept">Filter; reads for which it returns false are skipped entirely.</param>
        /// <param name="map">Supplies the count of each read and whether a count file is written.</param>
        /// <param name="outputFile">Path of the FASTQ file to produce.</param>
        /// <param name="dic">
        /// Tallies keyed by the read length before clipping; missing entries are created and the
        /// CCAA / CCA / CC / notNTA totals are increased by each read's count.
        /// </param>
        private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
        {
            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

            // Read name -> flag parsed with bool.Parse; only consulted for reads ending in "CC".
            var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

            var parser = new FastqReader();
            var writer = new FastqWriter();

            // Ordinal comparison: a file extension is not linguistic text.
            var gzipped = outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);

            StreamWriter swCount = null;
            try
            {
                // Opened inside the try so a failure while writing the header still closes it.
                if (map.HasCountFile)
                {
                    swCount = new StreamWriter(outputFile + ".dupcount");
                    swCount.WriteLine("Query\tCount\tSequence");
                }

                int readcount = 0;
                var tmpFile = outputFile + ".tmp";
                using (var sr = StreamUtils.GetReader(options.InputFile))
                using (var sw = StreamUtils.GetWriter(tmpFile, gzipped))
                {
                    FastqSequence seq;
                    while ((seq = parser.Parse(sr)) != null)
                    {
                        readcount++;
                        if (readcount % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads processed", readcount);
                        }

                        if (!accept(seq))
                        {
                            continue;
                        }

                        // Capture the original values; seq is modified in place further down.
                        var name = seq.Name;
                        var sequence = seq.SeqString;
                        var score = seq.Score;
                        var count = map.GetCount(seq.Name);

                        // NOTE(review): the count line is written here and again after clipping,
                        // so each accepted read produces two lines in the count file. Kept as-is;
                        // confirm whether the pre-clipping line is intended.
                        if (swCount != null)
                        {
                            swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                        }

                        // Tallies are bucketed by the length of the read before any clipping.
                        CountItem item;
                        if (!dic.TryGetValue(sequence.Length, out item))
                        {
                            item = new CountItem();
                            dic[sequence.Length] = item;
                        }

                        // Longest suffix first, so "CCAA" is not mistaken for a shorter match.
                        string clipped;
                        if (sequence.EndsWith("CCAA"))
                        {
                            clipped = "CCAA";
                            sequence = sequence.Substring(0, sequence.Length - 4);
                            item.CCAA += count;
                        }
                        else if (sequence.EndsWith("CCA"))
                        {
                            clipped = "CCA";
                            sequence = sequence.Substring(0, sequence.Length - 3);
                            item.CCA += count;
                        }
                        else if (sequence.EndsWith("CC"))
                        {
                            bool isCCA;
                            if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                            {
                                clipped = "CC";
                                sequence = sequence.Substring(0, sequence.Length - 2);
                                item.CC += count;
                            }
                            else
                            {
                                clipped = string.Empty;
                                item.notNTA += count;
                            }
                        }
                        else
                        {
                            clipped = string.Empty;
                            item.notNTA += count;
                        }

                        if (!string.IsNullOrEmpty(clipped))
                        {
                            // Keep the quality string the same length as the shortened sequence.
                            var newlen = sequence.Length;
                            seq.SeqString = sequence;
                            seq.Score = score.Substring(0, newlen);
                            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                        }
                        else
                        {
                            seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                        }

                        writer.Write(sw, seq);
                        if (swCount != null)
                        {
                            swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                        }
                    }
                }

                // File.Move throws when the destination exists, which would discard a completed
                // run; remove any previous output first.
                if (File.Exists(outputFile))
                {
                    File.Delete(outputFile);
                }
                File.Move(tmpFile, outputFile);
            }
            finally
            {
                // Test the writer itself rather than re-reading map.HasCountFile.
                if (swCount != null)
                {
                    swCount.Close();
                }
            }
        }
示例#3
0
        /// <summary>
        /// For every FASTQ read in <c>options.InputFile</c>, writes the read itself followed by
        /// copies with the last one, two and three bases removed to <c>options.OutputFile</c>.
        /// </summary>
        /// <remarks>
        /// Each written record is named "original name + <c>SmallRNAConsts.NTA_TAG</c> + removed
        /// bases" (nothing after the tag for the untrimmed copy). Copies stop as soon as the
        /// remaining length would fall below <c>options.MinimumReadLength</c>.
        /// Output is gzip-compressed when the output file name ends with ".gz" (any letter case).
        /// When the count map reports a count file, <c>options.OutputFile + ".dupcount"</c> is
        /// written with a "Query/Count/Sequence" header and one line per written record.
        /// </remarks>
        /// <returns>A list containing <c>options.OutputFile</c>.</returns>
        public override IEnumerable<string> Process()
        {
            var result = new List<string>();

            // Ordinal comparison: a file extension is not linguistic text.
            var gzipped = options.OutputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);

            result.Add(options.OutputFile);

            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + options.OutputFile + "...");
            var parser = new FastqReader();
            var writer = new FastqWriter();

            var map = options.GetCountMap();
            StreamWriter swCount = null;

            try
            {
                // Opened inside the try so the finally block closes it on every failure path.
                if (map.HasCountFile)
                {
                    swCount = new StreamWriter(options.OutputFile + ".dupcount");
                    swCount.WriteLine("Query\tCount\tSequence");
                }

                int readcount = 0;

                using (var sr = StreamUtils.GetReader(options.InputFile))
                using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped))
                {
                    FastqSequence seq;
                    while ((seq = parser.Parse(sr)) != null)
                    {
                        readcount++;
                        if (readcount % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads processed", readcount);
                        }

                        // Capture the original values; seq is overwritten for each copy below.
                        var name = seq.Name;
                        var sequence = seq.SeqString;
                        var score = seq.Score;
                        var len = sequence.Length;
                        var count = map.GetCount(seq.Name);

                        // i is the number of bases removed from the 3' end (0 = untrimmed).
                        for (int i = 0; i < 4; i++)
                        {
                            var newlen = len - i;
                            if (newlen < options.MinimumReadLength)
                            {
                                break;
                            }

                            string clipped = i == 0 ? string.Empty : sequence.Substring(newlen);

                            seq.SeqString = sequence.Substring(0, newlen);
                            seq.Score = score.Substring(0, newlen);
                            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                            writer.Write(sw, seq);

                            if (swCount != null)
                            {
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                            }
                        }
                    }
                }
            }
            finally
            {
                if (swCount != null)
                {
                    swCount.Close();
                }
            }

            Progress.End();

            return result;
        }