Beispiel #1
0
        /// <summary>
        /// Configures a <c>TCGADatatableBuilder</c> for the given tumors and platforms and runs it,
        /// writing the resulting table(s) into <paramref name="targetDir"/>.
        /// </summary>
        /// <remarks>
        /// When the technology has count data, the file name ends in <c>_Count</c> or
        /// <c>_{ValueName}</c> and, unless <paramref name="outputCountDataOnly"/> is set, both
        /// variants are produced. Otherwise a single file without that suffix is produced.
        /// </remarks>
        /// <param name="tcgaDir">Assigned to the builder options' <c>TCGADirectory</c>.</param>
        /// <param name="targetDir">Directory that receives the output file(s).</param>
        /// <param name="prefix">Leading part of each output file name.</param>
        /// <param name="tumors">Tumor types to process. Must not be null.</param>
        /// <param name="datatype">Data type name; parsed with <c>TCGATechnology.Parse</c> and used in the file name.</param>
        /// <param name="platforms">Platforms to process; each one's <c>StringBefore("_")</c> value is joined with "_" for the file name. Must not be null.</param>
        /// <param name="sampleCodes">Optional sample codes; their <c>ShortLetterCode</c> values are passed to the builder. Null means an empty list.</param>
        /// <param name="outputCountDataOnly">When true, only the count pass is executed.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="tumors"/> or <paramref name="platforms"/> is null.</exception>
        /// <exception cref="Exception">Thrown when <c>PrepareOptions</c> fails; the message lists the parsing errors.</exception>
        public static void ExtractData(string tcgaDir, string targetDir, string prefix, string[] tumors, string datatype, string[] platforms, TCGASampleCode[] sampleCodes = null, bool outputCountDataOnly = false)
        {
            if (tumors == null)
            {
                throw new ArgumentNullException("tumors");
            }

            if (platforms == null)
            {
                throw new ArgumentNullException("platforms");
            }

            var tec           = TCGATechnology.Parse(datatype);
            var platforms_str = (from p in platforms select p.StringBefore("_")).Merge("_");

            var sampleCodeStrings = sampleCodes == null
                ? new List<string>()
                : sampleCodes.Select(m => m.ShortLetterCode).ToList();

            var counts = outputCountDataOnly ? new[] { true } : new[] { true, false };

            foreach (var count in counts)
            {
                var fileName = tec.HasCountData
                    ? string.Format("{0}_{1}_{2}_{3}.tsv", prefix, datatype, platforms_str, count ? "Count" : tec.ValueName)
                    : string.Format("{0}_{1}_{2}.tsv", prefix, datatype, platforms_str);

                // Path.Combine instead of a literal '\' so the path is valid on every platform.
                var resultFile = System.IO.Path.Combine(targetDir, fileName);

                var options = new TCGADatatableBuilderOptions
                {
                    DataType                    = datatype,
                    TCGADirectory               = tcgaDir,
                    TumorTypes                  = tumors.ToList(),
                    Platforms                   = platforms,
                    IsCount                     = count,
                    OutputFile                  = resultFile,
                    TCGASampleCodeStrings       = sampleCodeStrings,
                    WithClinicalInformationOnly = true
                };

                if (!options.PrepareOptions())
                {
                    throw new Exception("Error:\n" + options.ParsingErrors.Merge("\n"));
                }

                new TCGADatatableBuilder(options).Process();

                // Without count data the file name does not depend on the pass,
                // so a second pass would only rewrite the same file.
                if (!tec.HasCountData)
                {
                    break;
                }
            }
        }
 /// <summary>
 /// Creates a builder bound to the given options, runs <c>PrepareOptions</c> on them,
 /// and sets <c>TemplateDirectory</c> from <c>FileUtils.GetTemplateDir()</c>.
 /// </summary>
 /// <param name="options">Builder options; stored in <c>_options</c>. Must not be null.</param>
 /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="options"/> is null.</exception>
 public TCGADatatableBuilder(TCGADatatableBuilderOptions options)
 {
     if (options == null)
     {
         throw new System.ArgumentNullException("options");
     }

     _options = options;

     // The return value of PrepareOptions() is not inspected here.
     // NOTE(review): confirm that option errors are surfaced before the builder does real work.
     _options.PrepareOptions();
     this.TemplateDirectory = FileUtils.GetTemplateDir();
 }
        /// <summary>
        /// Reads every <c>*.somatic.maf</c> file located in the <c>*Level_2*</c> directories of each
        /// configured tumor type / platform and writes two tab-separated presence matrices
        /// (1 = participant has a record, 0 = none): one row per locus into <c>OutputFile</c>,
        /// and one row per gene name into a companion file whose name is obtained from
        /// <c>FileUtils.ChangeExtension(OutputFile, ".gene.tsv")</c>.
        /// </summary>
        /// <returns>The locus-level file path followed by the gene-level file path.</returns>
        /// <exception cref="Exception">Thrown when <c>PrepareOptions</c> fails; the message lists the parsing errors.</exception>
        public override IEnumerable<string> Process()
        {
            if (!_options.PrepareOptions())
            {
                throw new Exception(_options.ParsingErrors.Merge("\n"));
            }

            // NOTE(review): this barcode filter is constructed but never applied to the rows read
            // below, so the configured sample codes currently have no effect on the output.
            // Confirm whether rows should be filtered with acceptBarcode(TumorBarcode).
            HashSet<int>       sampleCodes   = new HashSet<int>(_options.GetTCGASampleCodes().ToList().ConvertAll(m => m.Code));
            Func<string, bool> acceptBarcode = m => sampleCodes.Contains(new BarInfo(m, null).Sample);
            var tec = _options.GetTechnology();

            var items = new List<MutationItem>();

            foreach (var tumor in _options.TumorTypes)
            {
                var dir = Path.Combine(_options.TCGADirectory, tumor);
                if (!Directory.Exists(dir))
                {
                    continue;
                }

                var tecdir = tec.GetTechnologyDirectory(dir);
                if (!Directory.Exists(tecdir))
                {
                    continue;
                }

                foreach (var platform in _options.Platforms)
                {
                    var platdir = Path.Combine(tecdir, platform);

                    // Skip platforms that are absent for this tumor, like missing tumor/technology
                    // directories above; Directory.GetDirectories would throw otherwise.
                    if (!Directory.Exists(platdir))
                    {
                        continue;
                    }

                    foreach (var datadir in Directory.GetDirectories(platdir, "*Level_2*"))
                    {
                        foreach (var maffile in Directory.GetFiles(datadir, "*.somatic.maf"))
                        {
                            using (var sr = new StreamReader(maffile))
                            {
                                string line;

                                // Skip leading comment lines; the first non-comment line is the header.
                                while ((line = sr.ReadLine()) != null && line.StartsWith("#", StringComparison.Ordinal))
                                {
                                }

                                // Nothing but comments (or an empty header line): no data to read.
                                if (string.IsNullOrEmpty(line))
                                {
                                    continue;
                                }

                                // Locate the columns of interest by header name.
                                // A column missing from the header yields -1 and makes row access throw.
                                var headers                    = line.Split('\t');
                                var nameIndex                  = Array.IndexOf(headers, "Hugo_Symbol");
                                var ncbiIndex                  = Array.IndexOf(headers, "NCBI_Build");
                                var chromosomeIndex            = Array.IndexOf(headers, "Chromosome");
                                var startIndex                 = Array.IndexOf(headers, "Start_position");
                                var endIndex                   = Array.IndexOf(headers, "End_position");
                                var strandIndex                = Array.IndexOf(headers, "Strand");
                                var variantClassificationIndex = Array.IndexOf(headers, "Variant_Classification");
                                var variantTypeIndex           = Array.IndexOf(headers, "Variant_Type");
                                var barcodeIndex               = Array.IndexOf(headers, "Tumor_Sample_Barcode");

                                while ((line = sr.ReadLine()) != null)
                                {
                                    var parts = line.Split('\t');
                                    var item  = new MutationItem()
                                    {
                                        Tumor                 = tumor,
                                        Platform              = platform,
                                        Name                  = parts[nameIndex],
                                        NcbiBuild             = parts[ncbiIndex],
                                        Chromosome            = parts[chromosomeIndex],
                                        Start                 = parts[startIndex],
                                        End                   = parts[endIndex],
                                        Strand                = parts[strandIndex],
                                        VariantClassification = parts[variantClassificationIndex],
                                        VariantType           = parts[variantTypeIndex],
                                        TumorBarcode          = parts[barcodeIndex]
                                    };

                                    // Presumably these populate item.Locus and item.Paticipant, which
                                    // are used as grouping keys below — confirm in MutationItem.
                                    item.InitLocus();
                                    item.InitPaticipant();

                                    items.Add(item);
                                }
                            }
                        }
                    }
                }
            }

            // Column order shared by both output files: distinct participants, sorted.
            var paticipants = (from item in items
                               select item.Paticipant).Distinct().OrderBy(m => m).ToList();

            // Locus-level matrix.
            using (var sw = new StreamWriter(_options.OutputFile))
            {
                var itemMap   = items.ToDoubleDictionaryGroup(m => m.Locus, m => m.Paticipant);
                var locusList = itemMap.Keys.ToList();

                // The selectors imply a locus key shaped like "<chromosome>:<start>-...".
                GenomeUtils.SortChromosome(locusList, m => m.StringBefore(":"), m => int.Parse(m.StringAfter(":").StringBefore("-")));

                sw.WriteLine("Hugo_Symbol\tNCBI_Build\tChromosome\tStart_position\tEnd_position\tStrand\tVariant_Classification\tVariant_Type\t{0}", paticipants.Merge("\t"));
                foreach (var locus in locusList)
                {
                    var dic  = itemMap[locus];
                    var item = dic.Values.First().First();

                    // The sixth column is Strand (previously Start was written here by mistake).
                    sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}",
                             item.Name,
                             item.NcbiBuild,
                             item.Chromosome,
                             item.Start,
                             item.End,
                             item.Strand,
                             item.VariantClassification,
                             item.VariantType);

                    foreach (var paticipant in paticipants)
                    {
                        sw.Write(dic.ContainsKey(paticipant) ? "\t1" : "\t0");
                    }

                    sw.WriteLine();
                }
            }

            var genefile = FileUtils.ChangeExtension(_options.OutputFile, ".gene.tsv");

            // Gene-level matrix.
            using (var sw = new StreamWriter(genefile))
            {
                var itemMap  = items.ToDoubleDictionaryGroup(m => m.Name, m => m.Paticipant);
                var nameList = itemMap.Keys.OrderBy(m => m).ToList();

                sw.WriteLine("Hugo_Symbol\t{0}", paticipants.Merge("\t"));
                foreach (var name in nameList)
                {
                    var dic  = itemMap[name];
                    var item = dic.Values.First().First();
                    sw.Write("{0}", item.Name);

                    foreach (var paticipant in paticipants)
                    {
                        sw.Write(dic.ContainsKey(paticipant) ? "\t1" : "\t0");
                    }

                    sw.WriteLine();
                }
            }

            return(new[] { _options.OutputFile, genefile });
        }
 /// <summary>
 /// Creates a builder bound to the given options, runs <c>PrepareOptions</c> on them,
 /// and sets <c>TemplateDirectory</c> from <c>FileUtils.GetTemplateDir()</c>.
 /// </summary>
 /// <param name="options">Builder options; stored in <c>_options</c>. Must not be null.</param>
 /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="options"/> is null.</exception>
 public TCGADatatableBuilder(TCGADatatableBuilderOptions options)
 {
   if (options == null)
   {
     throw new System.ArgumentNullException("options");
   }

   _options = options;

   // The return value of PrepareOptions() is not inspected here.
   // NOTE(review): confirm that option errors are surfaced before the builder does real work.
   _options.PrepareOptions();
   this.TemplateDirectory = FileUtils.GetTemplateDir();
 }