//public void Step_09_BuildCommonProbeFile(string root)
        //{
        //  var celFiles = GetCelFiles(chipDir);

        //  var rmafiles = Directory.GetFiles(root, "Step_08_*_justRMA.tsv");

        //  var commonProbes = new HashSet<string>();
        //  foreach (var rmafile in rmafiles)
        //  {
        //    Console.WriteLine(rmafile);
        //    var map = new MapItemReader(0, 1).ReadFromFile(rmafile);
        //    //var curprobes = (from k in map.Keys select k.ToLower()).ToList();
        //    var curprobes = map.Keys.ToList();
        //    if (commonProbes.Count == 0)
        //    {
        //      commonProbes = new HashSet<string>(curprobes);
        //    }
        //    else
        //    {
        //      commonProbes.IntersectWith(curprobes);
        //    }
        //  }

        //  Console.WriteLine("Common probes = {0}", commonProbes.Count);
        //  var probes = (from p in commonProbes
        //                orderby p
        //                select p).ToList();

        //  var values = (from rmafile in rmafiles
        //                select ReadFile(rmafile, commonProbes)).ToList();

        //  using (var sw = new StreamWriter(root + "\\Step_09_expression_commonprobes.tsv"))
        //  {
        //    sw.WriteLine("Probe\t{0}", (from v in values
        //                                from s in v.Samples
        //                                select s).Merge("\t"));
        //    for (int i = 0; i < probes.Count; i++)
        //    {
        //      var p = probes[i];
        //      sw.Write(p);

        //      foreach (var v in values)
        //      {
        //        var vv = v.Values[i];
        //        sw.Write("\t{0}", v.Values[i].Second);
        //      }
        //      sw.WriteLine();
        //    }
        //  }

        //Assert.IsTrue(RHelper.ExtractCelData(celFiles, false), string.Format("Extrace tsv from cel failed"));

        //var tsvFiles = Directory.GetFiles(chipDir, "*.tsv");
        //var reader = new ExpressionDataRawReader(2, 1);
        //var datas = (from tsvfile in tsvFiles
        //             select reader.ReadFromFile(tsvfile)).ToList();
        //var commonGenes = datas.GetCommonGenes();

        //Console.WriteLine("Common genes = {0}", commonGenes.Count);

        ////get sample files
        //string[] files;
        //if (smallDataset)
        //{
        //  files = (from dir in Directory.GetDirectories(root)
        //           from file in Directory.GetFiles(dir, "*.tsv").Take(3)
        //           select file).ToArray();
        //}
        //else
        //{
        //  files = (from dir in Directory.GetDirectories(root)
        //           from file in Directory.GetFiles(dir, "*.tsv")
        //           select file).ToArray();
        //}

        ////get batch information
        //var list = (from tsvFile in files
        //            let dir = Path.GetDirectoryName(tsvFile)
        //            let file = FileUtils.ChangeExtension(tsvFile, "")
        //            select new BarInfo()
        //            {
        //              FileName = tsvFile,
        //              Dataset = Path.GetFileName(dir).Replace("-", ""),
        //              BarCode = "S_" + GetBarCode(file),
        //              ChipType = CelFile.GetChipType(file).Replace("-", "")
        //            }).ToList();

        //var dsmap = list.GroupBy(m => m.Dataset).ToList();
        //foreach (var ds in dsmap)
        //{
        //  var dss = ds.GroupBy(m => m.ChipType).ToList();
        //  if (dss.Count > 1)
        //  {
        //    foreach (var info in ds)
        //    {
        //      info.BatchName = info.Dataset + "_" + info.ChipType;
        //    }
        //  }
        //  else
        //  {
        //    foreach (var info in ds)
        //    {
        //      info.BatchName = info.Dataset;
        //    }
        //  }
        //}

        //var map = list.ToDictionary(m => m.FileName);

        ////output all information
        //var batchDefinitionFile = targetFile + ".batchdefinition";

        //Console.WriteLine("Total {0} files", files.Length);

        //var icount = 0;
        //var genes = commonGenes.ToList();
        //genes.Sort();

        //using (StreamWriter sw = new StreamWriter(targetFile))
        //{
        //  using (StreamWriter rw = new StreamWriter(batchDefinitionFile))
        //  {
        //    sw.Write("GENE");
        //    genes.ForEach(m => sw.Write("\tG_" + m));
        //    sw.WriteLine();

        //    foreach (var file in files)
        //    {
        //      icount++;

        //      BarInfo bi = map[file];
        //      rw.WriteLine(bi.BatchName + "\t" + bi.BarCode + "\t" + bi.FileName);

        //      Console.WriteLine("reading {0}/{1} : {2}", icount, files.Length, file);
        //      try
        //      {
        //        var data = reader.ReadFromFile(file);

        //        data.Values.RemoveAll(m => !commonGenes.Contains(m.Name));
        //        data.Values.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

        //        Assert.AreEqual(commonGenes.Count, data.Values.Count, "Gene count should equal to common gene count : " + Path.GetFileName(file));

        //        sw.Write(bi.BarCode);
        //        data.Values.ForEach(m =>
        //        {
        //          if (double.IsNaN(m.Value))
        //          {
        //            sw.Write("\tNA");
        //          }
        //          else
        //          {
        //            sw.Write("\t{0:0.00}", Math.Pow(2, m.Value));
        //          }
        //        });
        //        sw.WriteLine();
        //      }
        //      catch (Exception ex)
        //      {
        //        Console.Error.WriteLine(file + " : " + ex.Message);
        //        throw;
        //      }
        //    }
        //  }
        //}
        //Console.WriteLine("Finished!");
        //}

        /// <summary>
        /// Reads a tab-delimited file and keeps only the rows whose leading field
        /// (as returned by <c>StringBefore("\t")</c>) is contained in
        /// <paramref name="commonProbes"/>.
        /// </summary>
        /// <param name="rmafile">Path of the file to read; its first line is treated as a header.</param>
        /// <param name="commonProbes">Row names to retain; all other rows are skipped.</param>
        /// <returns>
        /// A <see cref="ProbeExpressionFile"/> whose <c>Samples</c> are the header columns after the
        /// first one, each passed through <see cref="Path.GetFileNameWithoutExtension(string)"/>, and
        /// whose <c>Values</c> are (name, <c>StringAfter("\t")</c>) pairs sorted by name.
        /// </returns>
        /// <exception cref="InvalidDataException">The file contains no header line.</exception>
        private ProbeExpressionFile ReadFile(string rmafile, HashSet<string> commonProbes)
        {
            var result = new ProbeExpressionFile();

            using (var sr = new StreamReader(rmafile))
            {
                var line = sr.ReadLine();
                if (line == null)
                {
                    // ReadLine returns null at end of stream; without this check an empty
                    // file surfaced as a NullReferenceException that did not name the file.
                    throw new InvalidDataException("No header line found in file : " + rmafile);
                }

                // The first header column labels the row-name column, so it is skipped.
                result.Samples = (from sample in line.Split('\t').Skip(1)
                                  select Path.GetFileNameWithoutExtension(sample)).ToArray();
                result.Values = new List<Pair<string, string>>();

                while ((line = sr.ReadLine()) != null)
                {
                    var name = line.StringBefore("\t");
                    if (commonProbes.Contains(name))
                    {
                        // NOTE(review): StringAfter("\t") presumably yields the rest of the line
                        // after the first tab - confirm against the extension's implementation.
                        result.Values.Add(new Pair<string, string>(name, line.StringAfter("\t")));
                    }
                }
            }

            // Comparison is deliberately left as CompareTo so the resulting order is unchanged
            // for any caller that lines rows up by index.
            result.Values.Sort((m1, m2) => m1.First.CompareTo(m2.First));
            return result;
        }
// Example #2
// 0
    //public void Step_09_BuildCommonProbeFile(string root)
    //{
    //  var celFiles = GetCelFiles(chipDir);

    //  var rmafiles = Directory.GetFiles(root, "Step_08_*_justRMA.tsv");

    //  var commonProbes = new HashSet<string>();
    //  foreach (var rmafile in rmafiles)
    //  {
    //    Console.WriteLine(rmafile);
    //    var map = new MapItemReader(0, 1).ReadFromFile(rmafile);
    //    //var curprobes = (from k in map.Keys select k.ToLower()).ToList();
    //    var curprobes = map.Keys.ToList();
    //    if (commonProbes.Count == 0)
    //    {
    //      commonProbes = new HashSet<string>(curprobes);
    //    }
    //    else
    //    {
    //      commonProbes.IntersectWith(curprobes);
    //    }
    //  }

    //  Console.WriteLine("Common probes = {0}", commonProbes.Count);
    //  var probes = (from p in commonProbes
    //                orderby p
    //                select p).ToList();

    //  var values = (from rmafile in rmafiles
    //                select ReadFile(rmafile, commonProbes)).ToList();

    //  using (var sw = new StreamWriter(root + "\\Step_09_expression_commonprobes.tsv"))
    //  {
    //    sw.WriteLine("Probe\t{0}", (from v in values
    //                                from s in v.Samples
    //                                select s).Merge("\t"));
    //    for (int i = 0; i < probes.Count; i++)
    //    {
    //      var p = probes[i];
    //      sw.Write(p);

    //      foreach (var v in values)
    //      {
    //        var vv = v.Values[i];
    //        sw.Write("\t{0}", v.Values[i].Second);
    //      }
    //      sw.WriteLine();
    //    }
    //  }

    //Assert.IsTrue(RHelper.ExtractCelData(celFiles, false), string.Format("Extrace tsv from cel failed"));

    //var tsvFiles = Directory.GetFiles(chipDir, "*.tsv");
    //var reader = new ExpressionDataRawReader(2, 1);
    //var datas = (from tsvfile in tsvFiles
    //             select reader.ReadFromFile(tsvfile)).ToList();
    //var commonGenes = datas.GetCommonGenes();

    //Console.WriteLine("Common genes = {0}", commonGenes.Count);

    ////get sample files
    //string[] files;
    //if (smallDataset)
    //{
    //  files = (from dir in Directory.GetDirectories(root)
    //           from file in Directory.GetFiles(dir, "*.tsv").Take(3)
    //           select file).ToArray();
    //}
    //else
    //{
    //  files = (from dir in Directory.GetDirectories(root)
    //           from file in Directory.GetFiles(dir, "*.tsv")
    //           select file).ToArray();
    //}

    ////get batch information
    //var list = (from tsvFile in files
    //            let dir = Path.GetDirectoryName(tsvFile)
    //            let file = FileUtils.ChangeExtension(tsvFile, "")
    //            select new BarInfo()
    //            {
    //              FileName = tsvFile,
    //              Dataset = Path.GetFileName(dir).Replace("-", ""),
    //              BarCode = "S_" + GetBarCode(file),
    //              ChipType = CelFile.GetChipType(file).Replace("-", "")
    //            }).ToList();

    //var dsmap = list.GroupBy(m => m.Dataset).ToList();
    //foreach (var ds in dsmap)
    //{
    //  var dss = ds.GroupBy(m => m.ChipType).ToList();
    //  if (dss.Count > 1)
    //  {
    //    foreach (var info in ds)
    //    {
    //      info.BatchName = info.Dataset + "_" + info.ChipType;
    //    }
    //  }
    //  else
    //  {
    //    foreach (var info in ds)
    //    {
    //      info.BatchName = info.Dataset;
    //    }
    //  }
    //}

    //var map = list.ToDictionary(m => m.FileName);

    ////output all information
    //var batchDefinitionFile = targetFile + ".batchdefinition";

    //Console.WriteLine("Total {0} files", files.Length);

    //var icount = 0;
    //var genes = commonGenes.ToList();
    //genes.Sort();

    //using (StreamWriter sw = new StreamWriter(targetFile))
    //{
    //  using (StreamWriter rw = new StreamWriter(batchDefinitionFile))
    //  {
    //    sw.Write("GENE");
    //    genes.ForEach(m => sw.Write("\tG_" + m));
    //    sw.WriteLine();

    //    foreach (var file in files)
    //    {
    //      icount++;

    //      BarInfo bi = map[file];
    //      rw.WriteLine(bi.BatchName + "\t" + bi.BarCode + "\t" + bi.FileName);

    //      Console.WriteLine("reading {0}/{1} : {2}", icount, files.Length, file);
    //      try
    //      {
    //        var data = reader.ReadFromFile(file);

    //        data.Values.RemoveAll(m => !commonGenes.Contains(m.Name));
    //        data.Values.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

    //        Assert.AreEqual(commonGenes.Count, data.Values.Count, "Gene count should equal to common gene count : " + Path.GetFileName(file));

    //        sw.Write(bi.BarCode);
    //        data.Values.ForEach(m =>
    //        {
    //          if (double.IsNaN(m.Value))
    //          {
    //            sw.Write("\tNA");
    //          }
    //          else
    //          {
    //            sw.Write("\t{0:0.00}", Math.Pow(2, m.Value));
    //          }
    //        });
    //        sw.WriteLine();
    //      }
    //      catch (Exception ex)
    //      {
    //        Console.Error.WriteLine(file + " : " + ex.Message);
    //        throw;
    //      }
    //    }
    //  }
    //}
    //Console.WriteLine("Finished!");
    //}

    /// <summary>
    /// Reads a tab-delimited file and keeps only the rows whose leading field
    /// (as returned by <c>StringBefore("\t")</c>) is contained in
    /// <paramref name="commonProbes"/>.
    /// </summary>
    /// <param name="rmafile">Path of the file to read; its first line is treated as a header.</param>
    /// <param name="commonProbes">Row names to retain; all other rows are skipped.</param>
    /// <returns>
    /// A <see cref="ProbeExpressionFile"/> whose <c>Samples</c> are the header columns after the
    /// first one, each passed through <see cref="Path.GetFileNameWithoutExtension(string)"/>, and
    /// whose <c>Values</c> are (name, <c>StringAfter("\t")</c>) pairs sorted by name.
    /// </returns>
    /// <exception cref="InvalidDataException">The file contains no header line.</exception>
    private ProbeExpressionFile ReadFile(string rmafile, HashSet<string> commonProbes)
    {
      var result = new ProbeExpressionFile();
      using (var sr = new StreamReader(rmafile))
      {
        var line = sr.ReadLine();
        if (line == null)
        {
          // ReadLine returns null at end of stream; without this check an empty
          // file surfaced as a NullReferenceException that did not name the file.
          throw new InvalidDataException("No header line found in file : " + rmafile);
        }

        // The first header column labels the row-name column, so it is skipped.
        result.Samples = (from sample in line.Split('\t').Skip(1)
                          select Path.GetFileNameWithoutExtension(sample)).ToArray();
        result.Values = new List<Pair<string, string>>();

        while ((line = sr.ReadLine()) != null)
        {
          var name = line.StringBefore("\t");
          if (commonProbes.Contains(name))
          {
            // NOTE(review): StringAfter("\t") presumably yields the rest of the line
            // after the first tab - confirm against the extension's implementation.
            result.Values.Add(new Pair<string, string>(name, line.StringAfter("\t")));
          }
        }
      }

      // Comparison is deliberately left as CompareTo so the resulting order is unchanged
      // for any caller that lines rows up by index.
      result.Values.Sort((m1, m2) => m1.First.CompareTo(m2.First));
      return result;
    }