/// <summary>
/// Reads the smallRNA coverage regions from the input file, looks every candidate seed up in
/// the target seed map and writes one tab-delimited row per matched target to the output file.
/// </summary>
/// <returns>A single-element array holding the path of the written output file.</returns>
public override IEnumerable<string> Process()
        {
            Progress.SetMessage("Reading T2C smallRNA...");
            // NOTE(review): the third argument is presumably the list of smallRNA tags to exclude
            // (here lincRNA) - confirm against ParclipUtils.GetSmallRNACoverageRegion.
            var mappedSmallRNA = ParclipUtils.GetSmallRNACoverageRegion(options.InputFile, null, new string[] { SmallRNAConsts.lincRNA });

            // Process the smallRNAs in descending order of average coverage.
            mappedSmallRNA.Sort((m1, m2) => m2.Coverages.Average().CompareTo(m1.Coverages.Average()));

            Progress.SetMessage("Build target {0} mers...", options.MinimumSeedLength);
            var targetSeedMap = ParclipUtils.BuildTargetSeedMap(options, m => true, progress: this.Progress);

            Progress.SetMessage("Finding target...");
            using (var sw = new StreamWriter(options.OutputFile))
            {
                sw.WriteLine("SmallRNA\tChr\tStart\tEnd\tStrand\tSeed\tSeedOffset\tSeedLength\tSeedCoverage\tTarget\tTargetCoverage\tTargetGeneSymbol\tTargetName");

                foreach (var t2c in mappedSmallRNA)
                {
                    var seq = t2c.Sequence.ToUpper();

                    int[] offsets = GetPossibleOffsets(t2c.Name);

                    foreach (var offset in offsets)
                    {
                        // A seed that would run past the end of the sequence cannot be extracted:
                        // Substring would throw ArgumentOutOfRangeException and abort the whole run,
                        // so such an offset is skipped instead.
                        if (offset + options.MinimumSeedLength > seq.Length)
                        {
                            continue;
                        }

                        var seed     = seq.Substring(offset, options.MinimumSeedLength);
                        var coverage = t2c.Coverages.Skip(offset).Take(options.MinimumSeedLength).Average();
                        if (coverage < options.MinimumCoverage)
                        {
                            continue;
                        }

                        List<SeedItem> target;
                        if (targetSeedMap.TryGetValue(seed, out target))
                        {
                            // Highest-coverage targets first. This sorts the list stored in the map in place.
                            target.Sort((m1, m2) =>
                            {
                                return(m2.Coverage.CompareTo(m1.Coverage));
                            });

                            // Only names that do not start with the miRNA tag go through FindLongestTarget;
                            // the others keep the targets found for the minimum-length seed.
                            if (!t2c.Name.StartsWith(SmallRNAConsts.miRNA))
                            {
                                target = ParclipUtils.FindLongestTarget(target, t2c, seq, offset, options.MinimumSeedLength, int.MaxValue, options.MinimumCoverage);
                            }

                            for (int j = 0; j < target.Count; j++)
                            {
                                // NOTE(review): the reported seed length is taken from the first target for
                                // every row - confirm that all targets in the list share the same length.
                                var finalSeed = seq.Substring(offset, target[0].Sequence.Length);

                                // First half of the row: the smallRNA and its seed.
                                sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}", t2c.Name, t2c.Seqname, t2c.Start, t2c.End, t2c.Strand, finalSeed, offset, finalSeed.Length, Math.Round(coverage));

                                // Second half of the row: the matched target.
                                var t = target[j];
                                sw.WriteLine("\t{0}:{1}-{2}:{3}\t{4}\t{5}\t{6}",
                                             t.Seqname,
                                             t.Start,
                                             t.End,
                                             t.Strand,
                                             t.Coverage,
                                             t.GeneSymbol,
                                             t.Name);
                            }
                        }
                    }
                }
            }

            return(new[] { options.OutputFile });
        }
Example #2
0
        /// <summary>
        /// Reads the candidate sequences supplied through the options, builds a target seed map
        /// restricted to the seeds those candidates can produce, and writes one tab-delimited row
        /// per matched target to the output file.
        /// </summary>
        /// <returns>A single-element array holding the path of the written output file.</returns>
        public override IEnumerable<string> Process()
        {
            var candidates = options.ReadSeeds();

            Progress.SetMessage("Total {0} seeds readed.", candidates.Length);
            var offsets = GetPossibleOffsets(string.Empty);

            Progress.SetMessage("Build target {0} mers...", options.MinimumSeedLength);

            // Collect every seed the candidates can yield. Candidates too short for a given offset
            // are filtered out here, mirroring the length check in the search loop below; without
            // the filter Substring throws ArgumentOutOfRangeException before the search even starts.
            var seeds = (from seq in candidates
                         from offset in offsets
                         where offset + options.MinimumSeedLength <= seq.Length
                         select seq.Substring(offset, options.MinimumSeedLength)).ToList();
            var targetSeedMap = ParclipUtils.BuildTargetSeedMap(options, seeds, this.Progress);

            Progress.SetMessage("Finding target...");
            using (var sw = new StreamWriter(options.OutputFile))
            {
                sw.WriteLine("Sequence\tSeed\tSeedOffset\tSeedLength\tTarget\tTargetCoverage\tTargetGeneSymbol\tTargetName");

                foreach (var seq in candidates)
                {
                    foreach (var offset in offsets)
                    {
                        // NOTE(review): breaking (rather than continuing) presumably relies on the
                        // offsets being in ascending order - confirm against GetPossibleOffsets.
                        if (seq.Length < offset + options.MinimumSeedLength)
                        {
                            break;
                        }

                        var seed = seq.Substring(offset, options.MinimumSeedLength);

                        List<SeedItem> target;

                        if (targetSeedMap.TryGetValue(seed, out target))
                        {
                            if (target.ConvertAll(l => l.Coverage).Distinct().Count() == 1)
                            {
                                // All targets share one coverage value, so coverage cannot rank them;
                                // order them by chromosome and start position instead.
                                GenomeUtils.SortChromosome(target, m => m.Seqname, m => m.Start);
                            }
                            else
                            {
                                // Highest-coverage targets first.
                                target.Sort((m1, m2) =>
                                {
                                    return(m2.Coverage.CompareTo(m1.Coverage));
                                });
                            }

                            var longest = ParclipUtils.ExtendToLongestTarget(target, null, seq, offset, options.MinimumSeedLength, int.MaxValue, options.MinimumCoverage);

                            for (int j = 0; j < longest.Count; j++)
                            {
                                var t         = longest[j];
                                // The reported seed spans the length of this particular target.
                                var finalSeed = seq.Substring(offset, (int)t.Length);
                                sw.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}:{5}-{6}:{7}\t{8}\t{9}\t{10}",
                                             seq,
                                             finalSeed,
                                             offset,
                                             finalSeed.Length,
                                             t.Seqname,
                                             t.Start,
                                             t.End,
                                             t.Strand,
                                             t.Coverage,
                                             t.GeneSymbol,
                                             t.Name);
                            }
                        }
                    }
                }
            }

            return(new[] { options.OutputFile });
        }
Example #3
0
        /// <summary>
        /// Reads the smallRNA coverage regions from the input file, looks every candidate seed up in
        /// the target seed map and writes one tab-delimited row per matched target, including the
        /// target seed offset and its unique read count, to the output file.
        /// </summary>
        /// <returns>A single-element array holding the path of the written output file.</returns>
        public override IEnumerable<string> Process()
        {
            Progress.SetMessage("Reading smallRNA...");

            // Exclude lncRNA.
            var mappedSmallRNA = ParclipUtils.GetSmallRNACoverageRegion(options.InputFile, includeSmallRNATags: null, excudeSmallRNATags: SmallRNAConsts.lncRNA);

            // Process the smallRNAs in descending order of average per-position coverage.
            mappedSmallRNA.Sort((m1, m2) => m2.Coverages.Average(l => l.Coverage).CompareTo(m1.Coverages.Average(l => l.Coverage)));

            Progress.SetMessage("Build target {0} mers...", options.MinimumSeedLength);
            var targetSeedMap = ParclipUtils.BuildTargetSeedMap(options, m => true, progress: this.Progress);

            Progress.SetMessage("Finding target...");
            using (var sw = new StreamWriter(options.OutputFile))
            {
                sw.WriteLine("SmallRNA\tChr\tStart\tEnd\tStrand\tSeed\tSeedOffset\tSeedLength\tSeedCoverage\tTarget\tTargetCoverage\tTargetGeneSymbol\tTargetName\tTargetSeedOffset\tTargetSeedUniqueRead");

                foreach (var t2c in mappedSmallRNA)
                {
                    var seq = t2c.Sequence.ToUpper();

                    int[] offsets = GetPossibleOffsets(t2c.Name);

                    foreach (var offset in offsets)
                    {
                        // Sequences too short for this offset are reported on stderr and skipped.
                        // NOTE(review): ">=" also skips a seed that ends exactly at the end of the
                        // sequence - confirm whether that is intended.
                        if (offset + options.MinimumSeedLength >= seq.Length)
                        {
                            Console.Error.WriteLine("t2c={0}/{1}, seq={2}, offset={3}, minseed={4}", t2c.Name, t2c.GeneSymbol, seq, offset, options.MinimumSeedLength);
                            continue;
                        }

                        var seed     = seq.Substring(offset, options.MinimumSeedLength);
                        var coverage = t2c.Coverages.Skip(offset).Take(options.MinimumSeedLength).Average(l => l.Coverage);
                        if (coverage < options.MinimumCoverage)
                        {
                            continue;
                        }

                        List<SeedItem> target;
                        if (targetSeedMap.TryGetValue(seed, out target))
                        {
                            // Highest-coverage targets first. This sorts the list stored in the map in place.
                            target.Sort((m1, m2) =>
                            {
                                return(m2.Coverage.CompareTo(m1.Coverage));
                            });

                            var longest = ParclipUtils.FindLongestTarget(target, t2c, seq, offset, options.MinimumSeedLength, int.MaxValue, options.MinimumCoverage);

                            for (int j = 0; j < longest.Count; j++)
                            {
                                var t         = longest[j];
                                // The reported seed spans the length of this particular target.
                                var finalSeed = seq.Substring(offset, (int)t.Length);

                                // First half of the row: the smallRNA and its seed.
                                sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}", t2c.Name, t2c.Seqname, t2c.Start, t2c.End, t2c.Strand, finalSeed, offset, finalSeed.Length, Math.Round(coverage));

                                // Second half of the row: the matched target. The unique read count is
                                // obtained from the target itself for the final seed length.
                                sw.WriteLine("\t{0}:{1}-{2}:{3}\t{4}\t{5}\t{6}\t{7}\t{8}",
                                             t.Seqname,
                                             t.Start,
                                             t.End,
                                             t.Strand,
                                             t.Coverage,
                                             t.GeneSymbol,
                                             t.Name,
                                             t.SourceOffset,
                                             t.GetSeedUniqueReadCount(finalSeed.Length));
                            }
                        }
                    }
                }
            }

            return(new[] { options.OutputFile });
        }