Exemple #1
        /// <summary>
        /// Summarizes every "*.design.tsv" file found in <paramref name="dir"/> into two
        /// tab-separated reports written into the same directory: "design_overlap.tsv"
        /// (one row per sample, one column per platform) and "design_summary.tsv"
        /// (platform-by-platform counts of shared samples).
        /// </summary>
        /// <param name="dir">Directory that is searched for design files and that receives both reports.</param>
        public static void SummarizeExtractedData(string dir)
            var designs = Directory.GetFiles(dir, "*.design.tsv");

            using (var sw = new StreamWriter(dir + "\\design_overlap.tsv"))
                using (var swSummary = new StreamWriter(dir + "\\design_summary.tsv"))
                    // NOTE(review): MapItemReader(2, 4) presumably takes column 2 as key and column 4 as value - confirm.
                    // Below, item.Key is used as the sample name and item.Value.Value as the platform name.
                    var reader = new MapItemReader(2, 4);
                    // Flatten the items of all design files into a single list.
                    var data   = (from design in designs
                                  from item in reader.ReadFromFile(design)
                                  select item).ToList();
                    var dataMap   = data.ToGroupDictionary(m => m.Key);
                    var samples   = (from d in data select d.Key).Distinct().OrderBy(m => m).ToList();
                    var platforms = (from d in data select d.Value.Value).Distinct().ToList();

                    // Two-level lookup: platform -> sample -> entry; used for the pairwise overlap counts below.
                    var dMap = (from d in data
                                select new { Sample = d.Key, Platform = d.Value.Value }).ToDoubleDictionary(m => m.Platform, m => m.Sample);

                    // Overlap report: header row lists the platforms.
                    sw.WriteLine("Sample\t" + platforms.Merge("\t"));
                    foreach (var sample in samples)
                        // Set of platforms on which this sample occurs.
                        var sampleMap = new HashSet <string>(dataMap[sample].ConvertAll(m => m.Value.Value));
                        // One cell per platform: "+" when the sample occurs on it, empty otherwise.
                        // NOTE(review): the call that consumes this expression (presumably a sw.WriteLine
                        // of the sample row) is not visible in this excerpt.
                                     (from p in platforms
                                      select sampleMap.Contains(p) ? "+" : "").Merge("\t"));

                    // Summary report: header row lists the platforms.
                    swSummary.WriteLine("\t" + platforms.Merge("\t"));
                    for (int i = 0; i < platforms.Count; i++)
                        for (int j = 0; j < platforms.Count; j++)
                            // Cell (i, j) = number of samples present on both platform i and platform j.
                            swSummary.Write("\t{0}", dMap[platforms[i]].Keys.Intersect(dMap[platforms[j]].Keys).Count());
Exemple #2
        /// <summary>
        /// Builds per-group smallRNA category count reports. The tab-separated file
        /// options.InputFile lists group name, sample name and smallRNA info file per line;
        /// for every group a "[group].catcount" long-format file, a "[group].catcount.tsv"
        /// category-by-sample table and, when the R template exists, a "[group].catcount.r"
        /// script (which is then executed with R) are written to options.OutputDirectory.
        /// </summary>
        /// <returns>
        /// NOTE(review): the return statement is not visible in this excerpt; presumably the
        /// <c>result</c> list declared below - confirm.
        /// </returns>
        public override IEnumerable <string> Process()
            // Keep only lines that carry at least the three required columns.
            var entries = (from line in File.ReadAllLines(options.InputFile)
                           let parts = line.Split('\t')
                                       where parts.Length >= 3
                                       select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

            var groups = entries.GroupBy(m => m.GroupName).ToList();

            var result = new List <string>();

            foreach (var group in groups)
                // Step 1: long-format count file, one row per sample/category/level.
                var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
                using (var sw = new StreamWriter(catfile))

                    foreach (var entry in group)
                        Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");

                        // Headerless key/value file read as key = column 0, value = column 1.
                        var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                        // Values are parsed as double and rounded to whole numbers.
                        var totalReads    = Math.Round(double.Parse(map["TotalReads"].Value));
                        var mappedReads   = Math.Round(double.Parse(map["MappedReads"].Value));
                        var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                        // Level 0 rows: the raw totals.
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 1 rows: total reads split into unmapped / mapped-but-not-smallRNA / smallRNA.
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 2 rows: one per biotype that is present in the info file.
                        foreach (var biotype in SmallRNAConsts.Biotypes)
                            if (map.ContainsKey(biotype))
                                sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));

                // Step 2: read the count file back. Skip(1) drops its first line.
                // NOTE(review): a header write to catfile is not visible in this excerpt - confirm one exists,
                // otherwise the first data row is dropped here.
                // The smallRNA category is written at both level 0 and level 1 above; the level 1 copy
                // is filtered out so each (sample, category) pair occurs only once.
                var data = (from line in File.ReadAllLines(catfile).Skip(1)
                            where !string.IsNullOrWhiteSpace(line)
                            let parts = line.Split('\t')
                                        let level = double.Parse(parts[2])
                                                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                                                    select new
                    SampleName = parts[0],
                    Category = parts[1],
                    Level = level,
                    Count = int.Parse(parts[3])

                // Step 3: wide table, categories as rows and samples as columns.
                var tablefile = catfile + ".tsv";
                using (var sw = new StreamWriter(tablefile))
                    var samples = (from d in data
                                   select d.SampleName).Distinct().OrderBy(m => m).ToList();

                    sw.WriteLine("Category\t{0}", samples.Merge("\t"));

                    // Fixed summary categories first, followed by the biotypes (Union removes duplicates).
                    var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();


                    // Two-level lookup: sample -> category -> row.
                    var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
                    foreach (var cat in categories)
                        // A sample without this category gets an empty cell.
                        sw.WriteLine("{0}\t{1}", cat,
                                     (from sample in samples
                                      let dic = map[sample]
                                                select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));

                // Step 4: generate an R script from the template (when it exists) and run it.
                var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
                if (File.Exists(rfile))
                    var targetrfile = catfile + ".r";
                    using (var sw = new StreamWriter(targetrfile))
                        // Variable definitions written at the top of the generated script.
                        sw.WriteLine("catfile<-\"{0}\"", catfile);
                        sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory);
                        sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");
                        // The whole template text is read once to test for the "#predefine_end" marker.
                        string line = File.ReadAllText(rfile);
                        using (var sr = new StreamReader(rfile))
                            if (line.Contains("#predefine_end"))
                                // NOTE(review): presumably advances the reader past the marker line so the
                                // template's own predefined section is not copied - loop body not visible; confirm.
                                while ((line = sr.ReadLine()) != null)
                                    if (line.Contains("#predefine_end"))

                            // NOTE(review): presumably copies the remaining template lines to sw - body not visible; confirm.
                            while ((line = sr.ReadLine()) != null)
                    SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
Exemple #3
        /// <summary>
        /// Streams the FASTQ reads of options.InputFile, clips a trailing "CCAA", "CCA" or
        /// (when options.CCAFile marks the read) "CC" from the sequence, tags the read reference
        /// with SmallRNAConsts.NTA_TAG and writes the reads to <paramref name="outputFile"/>
        /// through a temporary ".tmp" file. Per-length tallies are accumulated in <paramref name="dic"/>.
        /// </summary>
        /// <param name="accept">
        /// Predicate evaluated for every read. NOTE(review): the statement guarded by
        /// <c>!accept(seq)</c> is not visible in this excerpt; presumably rejected reads are skipped - confirm.
        /// </param>
        /// <param name="map">Supplies the count of each read by name; when map.HasCountFile is set a ".dupcount" file is written as well.</param>
        /// <param name="outputFile">Target FASTQ file; output is gz-compressed when the lower-cased name ends with ".gz".</param>
        /// <param name="dic">Tallies keyed by the sequence length before clipping; missing entries are created on demand.</param>
        private void DoProcess(Func <FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary <int, CountItem> dic)
            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

            // Read name -> boolean flag parsed from the second column of the CCA file.
            var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

            var parser = new FastqReader();
            var writer = new FastqWriter();

            StreamWriter swCount = null;

            if (map.HasCountFile)
                swCount = new StreamWriter(outputFile + ".dupcount");

                int readcount = 0;
                // Write to a temporary file first; it is moved onto outputFile after the readers/writers are done.
                var tmpFile   = outputFile + ".tmp";
                using (var sr = StreamUtils.GetReader(options.InputFile))
                    using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.ToLower().EndsWith(".gz")))
                        FastqSequence seq;
                        while ((seq = parser.Parse(sr)) != null)
                            // Progress message every 100000 reads.
                            // NOTE(review): the increment of readcount is not visible in this excerpt.
                            if (readcount % 100000 == 0)
                                Progress.SetMessage("{0} reads processed", readcount);

                            // NOTE(review): guard body not visible - presumably "continue".
                            if (!accept(seq))

                            var name        = seq.Name;
                            var sequence    = seq.SeqString;
                            var score       = seq.Score;
                            var len         = sequence.Length;
                            var description = seq.Description;
                            var count       = map.GetCount(seq.Name);

                            // NOTE(review): the count line is written here and again after the sequence has been
                            // rewritten further below - confirm that both writes are intended.
                            if (map.HasCountFile)
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);

                            // Tally bucket for the original (unclipped) sequence length.
                            CountItem item;
                            if (!dic.TryGetValue(sequence.Length, out item))
                                item = new CountItem();
                                dic[sequence.Length] = item;

                            // Test the longest suffix first: "CCAA", then "CCA", then "CC".
                            string clipped;
                            if (sequence.EndsWith("CCAA"))
                                clipped    = "CCAA";
                                sequence   = sequence.Substring(0, sequence.Length - 4);
                                item.CCAA += count;
                            else if (sequence.EndsWith("CCA"))
                                clipped   = "CCA";
                                sequence  = sequence.Substring(0, sequence.Length - 3);
                                item.CCA += count;
                            else if (sequence.EndsWith("CC"))
                                // A trailing "CC" is clipped only when the CCA file flags this read as true.
                                bool isCCA;
                                if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                                    clipped  = "CC";
                                    sequence = sequence.Substring(0, sequence.Length - 2);
                                    item.CC += count;
                                    // NOTE(review): the two assignment pairs below presumably belong to "else"
                                    // branches (no clipping, counted as notNTA) that were lost in this excerpt.
                                    clipped      = string.Empty;
                                    item.notNTA += count;
                                clipped      = string.Empty;
                                item.notNTA += count;

                            if (!string.IsNullOrEmpty(clipped))
                                // Trim the quality string to the clipped sequence length.
                                var newlen = sequence.Length;
                                seq.SeqString = sequence;
                                seq.Score     = score.Substring(0, newlen);
                                seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                                // NOTE(review): presumably the "else" case (nothing clipped: tag without suffix) - confirm.
                                seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                            writer.Write(sw, seq);
                            if (map.HasCountFile)
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);

                File.Move(tmpFile, outputFile);
                // NOTE(review): body not visible - presumably closes swCount; confirm.
                if (map.HasCountFile)
        /// <summary>
        /// Reads the gene count table from options.InputFile and rescales every count in place to
        /// count * 1000000000 / (gene length * sample read total). Gene lengths come from the
        /// "length" column of options.GeneLengthFile; sample read totals come from
        /// options.SampleReadsFile when that file exists, otherwise from the column sums of the table.
        /// </summary>
        /// <param name="sampleCounts">Receives the read total of each sample, in the order of the table's Samples.</param>
        /// <param name="geneLengths">Receives the length of each gene, in the order of the table's GeneValues.</param>
        /// <returns>
        /// NOTE(review): the return statement is not visible in this excerpt; presumably the
        /// rescaled <c>counts</c> table - confirm.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when the length column is missing, or when a sample or gene of the count table
        /// has no entry in the sample reads or gene length data.
        /// </exception>
        public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
            Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
            var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
            // Locate the length column by name, ignoring case.
            var lengthIndex = columnNames.ToList().FindIndex(m => m.ToLower().Equals("length"));

            if (lengthIndex < 0)
                throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
            // Gene name (column 0) -> gene length.
            var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

            Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
            var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

            // Optional key normalization: keep only capture group 1 of KeyRegex as the gene key.
            if (!string.IsNullOrEmpty(options.KeyRegex))
                var reg = new Regex(options.KeyRegex);
                geneLengthMap           = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);
                // NOTE(review): as written only GeneValues[0][0] (the first row's key) is rewritten -
                // confirm whether every row of GeneValues should be normalized.
                counts.GeneValues[0][0] = reg.Match(counts.GeneValues[0][0]).Groups[1].Value;

            Dictionary <string, double> sampleReads;

            if (File.Exists(options.SampleReadsFile))
                Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
                sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
            else //use total mapped reads as total reads
                sampleReads = new Dictionary <string, double>();
                for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
                    // Sum this sample's column over all genes.
                    double itotal = 0.0;
                    for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
                        itotal += counts.Count[iGene, iSample];

                    sampleReads[counts.Samples[iSample]] = itotal;

            // Fail early when a sample or gene of the table lacks reference data.
            foreach (var sample in counts.Samples)
                if (!sampleReads.ContainsKey(sample))
                    throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));

            foreach (var geneValues in counts.GeneValues)
                if (!geneLengthMap.ContainsKey(geneValues[0]))
                    throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));

            sampleCounts = (from sample in counts.Samples
                            select sampleReads[sample]).ToArray();

            geneLengths = (from geneValues in counts.GeneValues
                           select geneLengthMap[geneValues[0]]).ToArray();

            // Rescale every cell: count * 10^9 / (gene length * sample read total).
            for (int iGene = 0; iGene < geneLengths.Length; iGene++)
                for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
                    counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
    /// <summary>
    /// Combines the chromosome count files returned by options.GetCountFiles() into one count
    /// list written to options.OutputFile, and optionally writes a category-level table
    /// (when options.CategoryMapFile exists), a read table and a read contig table.
    /// </summary>
    /// <returns>
    /// The <c>result</c> list. In this excerpt the only visible addition is the contig
    /// ".details" file name.
    /// </returns>
    public override IEnumerable<string> Process()
      var result = new List<string>();

      var countFiles = options.GetCountFiles();
      countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

      var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);

      // First name of each count item -> item, accumulated over all files.
      var countMap = new Dictionary<string, ChromosomeCountSlimItem>();

      // NOTE(review): the increment of fileIndex is not visible in this excerpt.
      int fileIndex = 0;
      foreach (var file in countFiles)
        Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

        var curcounts = format.ReadFromFile(file.File);

        // Warn on the console when the first query of the first item carries no sequence.
        if (curcounts.Count > 0 && string.IsNullOrEmpty(curcounts[0].Queries[0].Sequence))
          Console.WriteLine("Didn't read in the sequence of query " + curcounts[0].Queries[0].Qname);
        // Label every query with the name of the file (sample) it came from.
        curcounts.ForEach(m =>
          foreach (var q in m.Queries)
            q.Sample = file.Name;

        foreach (var c in curcounts)
          var name = c.Names.First();
          ChromosomeCountSlimItem item;
          // NOTE(review): the branch bodies are incomplete in this excerpt - presumably an existing
          // item is merged with c and the assignment below belongs to the "else" branch; confirm.
          if (countMap.TryGetValue(name, out item))
            countMap[name] = c;

      var counts = countMap.Values.ToList();

      WriteOutput(options.OutputFile, countFiles, format, counts);


      // Optional category-level table: chromosome names are replaced by their mapped category.
      if (File.Exists(options.CategoryMapFile))
        Progress.SetMessage("Reading category map ...");
        var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);
        // Distinct set of all queries referenced by the count items.
        var queries = new HashSet<SAMChromosomeItem>(from c in counts
                                                     from q in c.Queries
                                                     select q);

        var dic = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var q in queries)
          // Replace the query's chromosome list in place by the distinct, sorted categories.
          q.Chromosomes = (from chrom in q.Chromosomes
                           select categoryMap[chrom].Value).Distinct().OrderBy(m => m).ToList();
          foreach (var chrom in q.Chromosomes)
            ChromosomeCountSlimItem item;
            // Create the per-category item on first use.
            // NOTE(review): how the query is attached to the item is not visible in this excerpt.
            if (!dic.TryGetValue(chrom, out item))
              item = new ChromosomeCountSlimItem();
              dic[chrom] = item;

        // Output name: the extension of OutputFile is replaced by ".category" plus the original extension.
        var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(catFile, countFiles, format, dic.Values.ToList());

      if (options.OutputReadTable || options.OutputReadContigTable)
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
          Progress.SetMessage("Saving read file...");
          var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
          new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readOutput, reads);

        if (options.OutputReadContigTable)
          Progress.SetMessage("Building sequence contig by similarity ...");
          var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase,  progress: Progress);

          Progress.SetMessage("Contig number = {0}", contigs.Count);

          Progress.SetMessage("Saving contig file...");
          var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
          new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);

          Progress.SetMessage("Saving sequence contig details...");
          new SmallRNASequenceContigDetailFormat().WriteToFile(contigOutput + ".details", contigs);
          result.Add(contigOutput + ".details");


      return result;
Exemple #9
        /// <summary>
        /// Builds a table with one row per unique key (written under the "Sequence" header) and one
        /// column per count file, ordered by descending total count, and writes it to options.OutputFile.
        /// Per-file counts are looked up in each file's AdditionalFile for the queries collected
        /// from its SAM file.
        /// </summary>
        /// <returns>A single-element array holding the full path of options.OutputFile.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending while a SAM file is scanned.</exception>
        public override IEnumerable <string> Process()
            var countFiles = options.GetCountFiles();

            countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

            // File name -> (key -> count).
            var countMap  = new Dictionary <string, Dictionary <string, int> >();
            // NOTE(review): the increment of fileIndex is not visible in this excerpt.
            int fileIndex = 0;

            foreach (var file in countFiles)
                Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

                // NOTE(review): the statements that add query names to this set are not visible in this
                // excerpt; the progress message below reports its size as the number of mapped reads.
                var queries = new HashSet <string>();
                using (var sr = SAMFactory.GetReader(file.File, true))
                    int    count = 0;
                    string line;
                    while ((line = sr.ReadLine()) != null)

                        // Poll for cancellation when count is a multiple of 1000.
                        // NOTE(review): the increment of count is not visible in this excerpt.
                        if (count % 1000 == 0)
                            if (Progress.IsCancellationPending())
                                throw new UserTerminatedException();

                        var parts = line.Split('\t');

                        SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

                        // NOTE(review): body not visible - presumably unmapped records are skipped; confirm.
                        if (flag.HasFlag(SAMFlags.UnmappedQuery))


                var countDic = new Dictionary <string, int>();
                countMap[file.Name] = countDic;
                // Lookup read from the additional file: key = column 0, value = column 1, information = column 2.
                var cm = new MapItemReader(0, 1, informationIndex: 2).ReadFromFile(file.AdditionalFile);
                foreach (var query in queries)
                    // The Information field is the dictionary key (treated as the sequence below);
                    // the Value field is parsed as the count.
                    var count = cm[query];
                    countDic[count.Information] = int.Parse(count.Value);

                Progress.SetMessage("{0} reads mapped.", queries.Count);

            // All distinct keys seen in any file.
            var uniques = (from c in countMap.Values
                           from seq in c.Keys
                           select seq).Distinct().ToArray();
            // Total count of each key over all files, highest total first.
            var uniqueCounts = (from seq in uniques
                                let totalCount = (from c in countMap.Values
                                                  where c.ContainsKey(seq)
                                                  select c[seq]).Sum()
                                                 select new { Sequence = seq, Count = totalCount }).OrderByDescending(m => m.Count).ToArray();

            using (var sw = new StreamWriter(options.OutputFile))
                sw.WriteLine("Sequence\t" + (from cf in countFiles select cf.Name).Merge("\t"));
                foreach (var uc in uniqueCounts)
                    var seq = uc.Sequence;
                    foreach (var cf in countFiles)
                        var map = countMap[cf.Name];
                        int count;
                        // NOTE(review): only the "found" case is visible; the row label, the cell written
                        // for a missing count and the line terminator are not visible in this excerpt.
                        if (map.TryGetValue(seq, out count))
                            sw.Write("\t{0}", count);


            return(new string[] { Path.GetFullPath(options.OutputFile) });
