/// <summary>
        /// Makes sure no cel file is kept twice across datasets. For every dataset named in
        /// <paramref name="datasetByPriority"/> (processed in the given order), each still
        /// existing cel file of any OTHER dataset whose decompressed MD5 hash matches one of
        /// its files is handed to ExcludeFile.
        /// </summary>
        /// <param name="datasetByPriority">
        /// Dataset (directory) names; datasets listed earlier are processed first. A name
        /// for which no cel file was found is reported on the console and skipped.
        /// </param>
        public void FilterDatasetByMD5(string[] datasetByPriority)
        {
            // One entry per cel file, grouped by the name of the dataset directory it lives in.
            var dsMap = (from dir in GetDatasetDirectories()
                         let dataset = Path.GetFileName(dir)
                         from file in CelFile.GetCelFiles(dir)
                         select new { Dataset = dataset, File = file, MD5 = HashUtils.GetDecompressedMD5Hash(file) }).ToGroupDictionary(m => m.Dataset);

            foreach (var dataset in datasetByPriority)
            {
                if (!dsMap.ContainsKey(dataset))
                {
                    // Nothing to compare against; the indexer below would throw otherwise.
                    Console.WriteLine("No cel file found for dataset " + dataset);
                    continue;
                }

                // A lookup (unlike a dictionary) tolerates two files of the same dataset
                // sharing one hash, which is exactly the situation this method deals with.
                var priorityHashes = dsMap[dataset].ToLookup(m => m.MD5);

                foreach (var ds in dsMap)
                {
                    if (ds.Key.Equals(dataset))
                    {
                        continue;
                    }

                    foreach (var cel in ds.Value)
                    {
                        // Files that no longer exist (presumably excluded in an earlier pass) are skipped.
                        if (File.Exists(cel.File) && priorityHashes.Contains(cel.MD5))
                        {
                            Console.WriteLine("Excluding " + cel.File);
                            ExcludeFile(cel.File);
                        }
                    }
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Builds a map from GSM name to cel file for the cel files of a directory.
        /// When several files share one GSM name, the group is reported on the console
        /// and the first file of the group is the one put into the map.
        /// </summary>
        /// <param name="directory">Directory whose cel files are listed via CelFile.GetCelFiles.</param>
        /// <returns>GSM name as key, one cel file as value.</returns>
        public static Dictionary <string, string> GetGsmNameFileMap(string directory)
        {
            Console.WriteLine(directory);

            // Materialize once: the groups are walked for the report and again for the
            // result, and a deferred GroupBy would redo the grouping on each pass.
            var groups = CelFile.GetCelFiles(directory).GroupBy(m => GetGsmName(m)).ToList();

            foreach (var group in groups.Where(g => g.Count() > 1))
            {
                Console.WriteLine("{0} : {1}", group.Key, group.Select(m => Path.GetFileName(m)).Merge(", "));
            }

            return groups.ToDictionary(m => m.Key, m => m.First());
        }
        /// <summary>
        /// Filters the cel files of one GSE dataset with an accept function. Every rejected
        /// file is moved into the sub-directory of the exclusion directory named after the
        /// dataset, replacing a file of the same name already there.
        /// Does nothing when the dataset directory does not exist.
        /// </summary>
        /// <param name="gseName">Name of the dataset sub-directory below DataDir.</param>
        /// <param name="accept">Returns true for samples whose cel file stays in place.</param>
        /// <exception cref="Exception">A cel file has no matching entry in the sample information.</exception>
        public void FilterGseSource(string gseName, Func <SampleItem, bool> accept)
        {
            var gseDirectory = Path.Combine(DataDir, gseName);

            if (!Directory.Exists(gseDirectory))
            {
                return;
            }

            var gseExcluded = Path.Combine(ExclusionDir, gseName);

            // Sample names are matched ignoring case, so the lookup does not depend on
            // which casing GeoUtils.GetGsmName happens to return.
            var sg = ReadSampleItem(gseDirectory).ToDictionary(m => m.Sample, StringComparer.OrdinalIgnoreCase);

            foreach (var cel in CelFile.GetCelFiles(gseDirectory))
            {
                var gsm = GeoUtils.GetGsmName(cel);

                if (!sg.ContainsKey(gsm))
                {
                    throw new Exception("Cannot find sample information of " + gsm + " in " + gseDirectory);
                }

                var sample = sg[gsm];
                if (accept(sample))
                {
                    continue;
                }

                // CreateDirectory is a no-op for an existing directory, so no Exists check is needed.
                Directory.CreateDirectory(gseExcluded);

                var target = Path.Combine(gseExcluded, Path.GetFileName(cel));
                Console.WriteLine("\t{0}\t{1}\t{2}\t{3}", Path.GetFileName(cel), sample.Sample, sample.SourceName, sample.SampleTitle);

                // File.Move fails when the target exists, so clear it first.
                if (File.Exists(target))
                {
                    File.Delete(target);
                }
                File.Move(cel, target);
            }
        }
Пример #4
0
        /// <summary>
        /// Reads the first *.sdrf.txt file of a dataset directory and creates one sample
        /// item for every annotation row whose sample file name (without extension)
        /// matches a cel file found for that directory.
        /// </summary>
        /// <param name="datasetDirectory">Directory holding the cel files and the sdrf file.</param>
        /// <returns>The sample items, in the order of the annotation rows.</returns>
        /// <exception cref="ArgumentException">The directory contains no *.sdrf.txt file.</exception>
        public List <BreastCancerSampleItem> ParseDataset(string datasetDirectory)
        {
            var celNames = new HashSet <string>(CelFile.GetCelFiles(datasetDirectory, false).Select(f => Path.GetFileNameWithoutExtension(f)));

            var sdrfFiles = Directory.GetFiles(datasetDirectory, "*.sdrf.txt");
            if (sdrfFiles.Length == 0)
            {
                throw new ArgumentException("Cannot find sdrf file in directory " + datasetDirectory);
            }

            var rows   = new AnnotationFormat("^#").ReadFromFile(sdrfFiles[0]);
            var result = new List <BreastCancerSampleItem>();

            foreach (var row in rows)
            {
                var sampleName = Path.GetFileNameWithoutExtension(FindValue(row, ColumnName.Sample));
                if (!celNames.Contains(sampleName))
                {
                    // Annotation row without a cel file in this directory.
                    continue;
                }

                var item = new BreastCancerSampleItem();
                item.Dataset         = Path.GetFileName(datasetDirectory);
                item.Sample          = sampleName;
                item.Age             = FindValue(row, ColumnName.Age);
                item.ER              = new StatusValue(FindValue(row, ColumnName.ER)).Value;
                item.PR              = new StatusValue(FindValue(row, ColumnName.PR)).Value;
                item.HER2            = new StatusValue(FindValue(row, ColumnName.HER2)).Value;
                item.Stage           = FindValue(row, ColumnName.Stage);
                item.TumorStatus     = FindValue(row, ColumnName.TumorStage);
                item.Grade           = FindValue(row, ColumnName.Grade);
                item.NodalStatus     = FindValue(row, ColumnName.NodalStatus);
                item.PCR             = FindValue(row, ColumnName.PCR);
                item.DFS             = FindValue(row, ColumnName.DFS);
                item.DFSTime         = FindValue(row, ColumnName.DFSTime);
                item.RFS             = FindValue(row, ColumnName.RFS);
                item.RFSTime         = FindValue(row, ColumnName.RFSTime);
                item.DMFS            = FindValue(row, ColumnName.DMFS);
                item.DMFSTime        = FindValue(row, ColumnName.DMFSTime);
                item.OverallSurvival = FindValue(row, ColumnName.OverallServive);
                item.DeadOfDisease   = FindValue(row, ColumnName.DeadOfDisease);
                result.Add(item);
            }

            return result;
        }
 /// <summary>
 /// Writes Step_07_CelFileList.tsv into <paramref name="root"/>. The file has one row
 /// per cel file found for each immediate sub-directory of root, with the columns
 /// File (path relative to root, forward slashes), Type (chip type), Name and Dataset
 /// (sub-directory name).
 /// </summary>
 /// <param name="root">Directory whose sub-directories are scanned; a trailing separator is tolerated.</param>
 public void Step_07_CelListFile(string root)
 {
     // Strip trailing separators so that "root.Length + 1" below always skips exactly
     // the root prefix plus one separator, whichever way the caller spelled root.
     var rootPrefix = root.TrimEnd('\\', '/');

     using (var sw = new StreamWriter(Path.Combine(root, "Step_07_CelFileList.tsv")))
     {
         sw.WriteLine("File\tType\tName\tDataset");
         foreach (var dir in Directory.GetDirectories(root))
         {
             Console.WriteLine(dir);
             foreach (var cel in CelFile.GetCelFiles(dir))
             {
                 // Name column: file name without extension, '-' replaced by '.',
                 // and prefixed with 'X' when it starts with a number character.
                 var name = Path.GetFileNameWithoutExtension(cel).Replace("-", ".");
                 if (name.Length > 0 && char.IsNumber(name[0]))
                 {
                     name = "X" + name;
                 }

                 var relativePath = cel.Substring(rootPrefix.Length + 1).Replace("\\", "/");
                 sw.WriteLine("{0}\t{1}\t{2}\t{3}", relativePath, CelFile.GetChipType(cel), name, Path.GetFileName(dir));
             }
         }
     }
 }
        /// <summary>
        /// Collects the chip type of the cel files below DataDir and writes a summary with
        /// one row per dataset and chip type, holding the number of cel files of that type.
        /// The dataset name is written only on the first row of each dataset.
        /// </summary>
        /// <param name="tableFileName">File name handed to CelFile.GetChipTypes; a relative name is resolved against DataDir.</param>
        /// <param name="summaryFileName">Summary file to write; a relative name is resolved against DataDir.</param>
        public void BuildChipTypeTable(string tableFileName, string summaryFileName)
        {
            tableFileName = FileUtils.IsAbsolutePath(tableFileName)
                ? tableFileName
                : Path.Combine(DataDir, tableFileName);

            summaryFileName = FileUtils.IsAbsolutePath(summaryFileName)
                ? summaryFileName
                : Path.Combine(DataDir, summaryFileName);

            var chipTypes = CelFile.GetChipTypes(this.rExecute, DataDir, true, tableFileName);

            using (var writer = new StreamWriter(summaryFileName))
            {
                writer.WriteLine("Dataset\tChiptype\tSample");

                // Group the entries by the name of the directory that contains each file.
                var byDataset = chipTypes.ToGroupDictionary(m => Path.GetFileName(Path.GetDirectoryName(m.Key)));

                foreach (var subdir in GetDatasetDirectories())
                {
                    var dsName = Path.GetFileName(subdir);
                    if (!byDataset.ContainsKey(dsName))
                    {
                        continue;
                    }

                    var isFirstRow = true;
                    foreach (var chipGroup in byDataset[dsName].GroupBy(m => m.Value))
                    {
                        var label = string.Empty;
                        if (isFirstRow)
                        {
                            label      = new DirectoryInfo(subdir).Name;
                            isFirstRow = false;
                        }

                        writer.WriteLine("{0}\t{1}\t{2}", label, chipGroup.Key, chipGroup.Count());
                    }
                }
            }
        }
        /// <summary>
        /// Lists all cel files found for <paramref name="root"/> in "celfiles.tsv" inside
        /// root, runs the frma.r template on that list and finally calls
        /// CelFile.GetChipTypes for root with <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="root">Directory whose cel files are processed.</param>
        /// <param name="outputFile">File name handed to CelFile.GetChipTypes.</param>
        /// <returns>
        /// <paramref name="outputFile"/>, or an empty string when no cel file was found
        /// (in which case nothing is written or executed).
        /// </returns>
        public string Normalization(string root, string outputFile)
        {
            var cels = CelFile.GetCelFiles(root);

            if (cels.Count == 0)
            {
                // Report the directory that was actually searched.
                Progress.SetMessage("No cel file found in directory " + root);
                return string.Empty;
            }

            var inputFile = Path.Combine(root, "celfiles.tsv");

            using (var sw = new StreamWriter(inputFile))
            {
                foreach (var cel in cels)
                {
                    sw.WriteLine(FileUtils.ToLinuxFormat(cel));
                }
            }

            // NOTE(review): OutputFile is set to the input list together with
            // NoResultFile = true; kept as is - confirm this is intended.
            var roptions = new RTemplateProcessorOptions()
            {
                RExecute       = rExecute,
                InputFile      = inputFile,
                OutputFile     = inputFile,
                NoResultFile   = true,
                RTemplate      = Path.Combine(FileUtils.GetTemplateDir(), "frma.r"),
                CreateNoWindow = true
            };

            new RTemplateProcessor(roptions)
            {
                Progress = this.Progress
            }.Process();

            CelFile.GetChipTypes(this.rExecute, root, true, outputFile);
            return outputFile;
        }
        /// <summary>
        /// Looks for cel files that occur more than once across the dataset directories
        /// (same decompressed MD5 hash) and writes a report: the number of duplicated
        /// hashes and files, followed by a table with one row per duplicated hash and one
        /// column per dataset that owns at least one duplicated file.
        /// </summary>
        /// <param name="outputFile">Report file name; passed through CheckFileName and then combined with DataDir.</param>
        /// <remarks>
        /// When hashing a file fails, the file name is written to the error stream, and
        /// for files whose name starts with "gsm" a download of the corresponding .gz
        /// from NCBI is attempted next to the file, before the exception is rethrown.
        /// </remarks>
        public void CheckDuplication(string outputFile)
        {
            outputFile = CheckFileName(outputFile);

            // md5 -> "dataset:filename" of every cel file having that hash
            var cels = new Dictionary <string, HashSet <string> >();

            foreach (var subdir in GetDatasetDirectories())
            {
                Console.Out.WriteLine(subdir);
                foreach (var file in CelFile.GetCelFiles(subdir))
                {
                    try
                    {
                        string md5 = HashUtils.GetDecompressedMD5Hash(file);

                        var name = Path.GetFileName(subdir) + ":" + Path.GetFileName(file);

                        HashSet <string> names;
                        if (!cels.TryGetValue(md5, out names))
                        {
                            names     = new HashSet <string>();
                            cels[md5] = names;
                        }
                        names.Add(name);
                    }
                    catch (Exception)
                    {
                        Console.Error.WriteLine(file);
                        if (Path.GetFileName(file).StartsWith("gsm", StringComparison.OrdinalIgnoreCase))
                        {
                            var gsm      = GeoUtils.GetGsmName(file).ToUpper();
                            var filename = Path.GetFileNameWithoutExtension(file);

                            var uri = string.Format(@"http://www.ncbi.nlm.nih.gov/geosuppl/?acc={0}&file={1}%2ECEL%2Egz", gsm, filename);
                            Console.WriteLine(uri);

                            // WebClient is disposable; release it as soon as the download is done.
                            using (var webClient = new WebClient())
                            {
                                webClient.DownloadFile(uri, file + ".gz");
                            }
                        }

                        throw;
                    }
                }
            }

            var dupfile = Path.Combine(DataDir, outputFile);

            using (var sw = new StreamWriter(dupfile))
            {
                sw.WriteLine("Duplicated entries = " + cels.Count(m => m.Value.Count > 1).ToString());
                sw.WriteLine("Duplicated cels = " + cels.Values.Where(v => v.Count > 1).Sum(v => v.Count).ToString());

                // Total number of cel files per dataset (duplicated or not).
                var dscount = (from c in cels.Values
                               from v in c
                               select v).GroupBy(m => m.StringBefore(":")).ToDictionary(m => m.Key, m => m.Count());

                // All duplicated files, grouped by dataset; datasets with most duplicates first.
                var sets = (from c in cels
                            where c.Value.Count > 1
                            from v in c.Value
                            select new { MD5 = c.Key, Dataset = v.StringBefore(":"), FileName = v.StringAfter(":") });
                var grp = sets.GroupBy(m => m.Dataset).OrderByDescending(m => m.Count()).ToList();

                // One md5 lookup per dataset column, so that filling a table cell does
                // not have to scan the whole dataset group again.
                var columns = grp.Select(g => g.ToLookup(n => n.MD5)).ToList();

                sw.WriteLine();
                sw.Write("md5");
                foreach (var g in grp)
                {
                    sw.Write(string.Format("\t{0}({1}/{2})", g.Key, g.Count(), dscount[g.Key]));
                }
                sw.WriteLine();

                foreach (var md5 in cels.Keys.OrderBy(m => m).ToList())
                {
                    if (cels[md5].Count <= 1)
                    {
                        continue;
                    }

                    sw.Write(md5);
                    foreach (var column in columns)
                    {
                        // First file of this dataset with the hash; empty cell when there is none.
                        var m = column[md5].FirstOrDefault();
                        sw.Write("\t");
                        if (m != null)
                        {
                            sw.Write(m.FileName);
                        }
                    }
                    sw.WriteLine();
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Merges the first *.sdrf.txt annotation file of a dataset directory into
        /// <paramref name="sampleMap"/>. For every annotation row whose sample file name
        /// (without extension) matches a cel file found for the directory, the sample item
        /// is created on first sight and each column that FindValue reports as found
        /// overwrites the corresponding property.
        /// </summary>
        /// <param name="datasetDirectory">Directory holding the cel files and the sdrf file.</param>
        /// <param name="sampleMap">Sample items keyed by sample file name; updated in place.</param>
        /// <exception cref="ArgumentException">The directory contains no *.sdrf.txt file.</exception>
        public void ParseDataset(string datasetDirectory, Dictionary <string, BreastCancerSampleItem> sampleMap)
        {
            var celNames = new HashSet <string>(CelFile.GetCelFiles(datasetDirectory, false).Select(f => Path.GetFileNameWithoutExtension(f)));

            var sdrfFiles = Directory.GetFiles(datasetDirectory, "*.sdrf.txt");
            if (sdrfFiles.Length == 0)
            {
                throw new ArgumentException("Cannot find sdrf file in directory " + datasetDirectory);
            }

            var rows        = new AnnotationFormat("^#").ReadFromFile(sdrfFiles[0]);
            var datasetName = Path.GetFileName(datasetDirectory);

            foreach (var row in rows)
            {
                var sampleName = Path.GetFileNameWithoutExtension(FindValue(row, ColumnName.Sample));
                if (!celNames.Contains(sampleName))
                {
                    continue;
                }

                // Reuse the item of an earlier row/dataset pass, or register a new one.
                BreastCancerSampleItem record;
                if (!sampleMap.TryGetValue(sampleName, out record))
                {
                    record = new BreastCancerSampleItem();
                    sampleMap[sampleName] = record;
                    record.Dataset = datasetName;
                    record.Sample  = sampleName;
                }

                string text;

                if (FindValue(row, ColumnName.Age, out text)) { record.Age = text; }

                // Receptor status columns go through StatusValue.TransferStatus.
                if (FindValue(row, ColumnName.ER, out text)) { record.ER = StatusValue.TransferStatus(text); }
                if (FindValue(row, ColumnName.PR, out text)) { record.PR = StatusValue.TransferStatus(text); }
                if (FindValue(row, ColumnName.HER2, out text)) { record.HER2 = StatusValue.TransferStatus(text); }

                if (FindValue(row, ColumnName.Stage, out text)) { record.Stage = text; }
                if (FindValue(row, ColumnName.TumorStage, out text)) { record.TumorStatus = text; }
                if (FindValue(row, ColumnName.Grade, out text)) { record.Grade = text; }
                if (FindValue(row, ColumnName.NodalStatus, out text)) { record.NodalStatus = text; }
                if (FindValue(row, ColumnName.PCR, out text)) { record.PCR = text; }

                if (FindValue(row, ColumnName.DFS, out text)) { record.DFS = text; }
                if (FindValue(row, ColumnName.DFSTime, out text)) { record.DFSTime = text; }
                if (FindValue(row, ColumnName.RFS, out text)) { record.RFS = text; }
                if (FindValue(row, ColumnName.RFSTime, out text)) { record.RFSTime = text; }
                if (FindValue(row, ColumnName.DMFS, out text)) { record.DMFS = text; }
                if (FindValue(row, ColumnName.DMFSTime, out text)) { record.DMFSTime = text; }

                if (FindValue(row, ColumnName.OverallServive, out text)) { record.OverallSurvival = text; }
                if (FindValue(row, ColumnName.DeadOfDisease, out text)) { record.DeadOfDisease = text; }
            }
        }
Пример #10
0
        /// <summary>
        /// Rebuilds the item list from the sample descriptions of a dataset directory.
        /// An existing *.siformat file (the first one found) is loaded first; its
        /// "TumorStage"/"Metastasis" property names are renamed to
        /// "TumorStatus"/"MetastasisStatus" and written back when anything changed.
        /// Then one item per annotation name is created from the descriptions of all
        /// samples that have a cel file, property names and default values are carried
        /// over from the loaded definition, and the data source and Text are refreshed.
        /// </summary>
        /// <param name="subdir">Dataset directory to read from.</param>
        public void NewFromData(string subdir)
        {
            var siformat = Directory.GetFiles(subdir, "*.siformat");

            TextFileDefinition prefile = new TextFileDefinition();

            if (siformat.Length > 0)
            {
                prefile.ReadFromFile(siformat[0]);

                bool bFound = false;
                prefile.ForEach(m =>
                {
                    // "==" instead of PropertyName.Equals(...): entries without a
                    // property name (see the IsNullOrEmpty check further down) must not throw.
                    if (m.PropertyName == "TumorStage")
                    {
                        m.PropertyName = "TumorStatus";
                        bFound         = true;
                    }
                    else if (m.PropertyName == "Metastasis")
                    {
                        m.PropertyName = "MetastasisStatus";
                        bFound         = true;
                    }
                });

                if (bFound)
                {
                    prefile.WriteToFile(siformat[0]);
                }
            }

            var map = new RawSampleInfoReader().ReadDescriptionFromDirectory(subdir);

            lastDirectory = subdir;
            lastFile      = String.Empty;

            // GSM names are compared ignoring case, so the match does not depend on the
            // casing returned by GeoUtils.GetGsmName or used for the keys of map.
            var gsmNames = from f in CelFile.GetCelFiles(subdir, false)
                           select GeoUtils.GetGsmName(f);
            var files = new HashSet <string>(gsmNames, StringComparer.OrdinalIgnoreCase);

            // annotation name -> all distinct values seen for it
            Dictionary <string, HashSet <string> > headers = new Dictionary <string, HashSet <string> >();

            foreach (var m in map)
            {
                if (!files.Contains(m.Key))
                {
                    // Description without a cel file in this directory.
                    continue;
                }

                foreach (var entry in m.Value)
                {
                    HashSet <string> values;
                    if (!headers.TryGetValue(entry.Key, out values))
                    {
                        values             = new HashSet <string>();
                        headers[entry.Key] = values;
                    }
                    values.UnionWith(entry.Value);
                }
            }

            ClearDataSource();

            items.Clear();
            foreach (var part in headers)
            {
                items.Add(new FileDefinitionItem()
                {
                    AnnotationName = part.Key,
                    Example        = (from v in part.Value
                                      orderby v
                                      select v).Merge(";")
                });
            }

            // Carry over the property names assigned in the previously saved definition.
            foreach (var olditem in prefile)
            {
                if (!string.IsNullOrEmpty(olditem.PropertyName))
                {
                    var newitem = items.Find(m => m.AnnotationName.Equals(olditem.AnnotationName));
                    if (newitem != null)
                    {
                        newitem.PropertyName = olditem.PropertyName;
                    }
                }
            }

            // Keep only default values whose property name is listed in propertyNames.
            items.DefaultValues.Clear();
            foreach (var olddv in prefile.DefaultValues)
            {
                if (propertyNames.Contains(olddv.PropertyName))
                {
                    items.DefaultValues.Add(new DefaultValue()
                    {
                        PropertyName = olddv.PropertyName,
                        Value        = olddv.Value
                    });
                }
            }

            items.Sort((m1, m2) => m1.AnnotationName.CompareTo(m2.AnnotationName));

            UpdateDataSource();

            this.Text = title + " - " + Path.GetFileName(subdir);
        }
Пример #11
0
        /// <summary>
        /// Rebuilds the item list from the sample descriptions of a dataset directory.
        /// One item per annotation name is created from the descriptions of all samples
        /// that have a cel file; property names are carried over from the first existing
        /// *.siformat file, default values are cleared, and the data source, the label
        /// and the open/save dialogs are pointed at the directory.
        /// Any exception is shown in a message box instead of being propagated.
        /// </summary>
        /// <param name="subdir">Dataset directory to read from.</param>
        public void NewFromData(string subdir)
        {
            try
            {
                var siformat = Directory.GetFiles(subdir, "*.siformat");

                TextFileDefinition prefile = new TextFileDefinition();
                if (siformat.Length > 0)
                {
                    prefile.ReadFromFile(siformat[0]);
                }

                var map = new RawSampleInfoReader().ReadDescriptionFromDirectory(subdir);

                LastDirectory = subdir;
                lastFile      = String.Empty;

                // GSM names are compared ignoring case, so the match does not depend on the
                // casing returned by GeoUtils.GetGsmName or used for the keys of map.
                var gsmNames = from f in CelFile.GetCelFiles(subdir, false)
                               select GeoUtils.GetGsmName(f);
                var files = new HashSet <string>(gsmNames, StringComparer.OrdinalIgnoreCase);

                // annotation name -> all distinct values seen for it
                Dictionary <string, HashSet <string> > headers = new Dictionary <string, HashSet <string> >();
                foreach (var m in map)
                {
                    if (!files.Contains(m.Key))
                    {
                        // Description without a cel file in this directory.
                        continue;
                    }

                    foreach (var entry in m.Value)
                    {
                        HashSet <string> values;
                        if (!headers.TryGetValue(entry.Key, out values))
                        {
                            values             = new HashSet <string>();
                            headers[entry.Key] = values;
                        }
                        values.UnionWith(entry.Value);
                    }
                }

                ClearDataSource();

                items.Clear();
                foreach (var part in headers)
                {
                    items.Add(new FileDefinitionItem()
                    {
                        AnnotationName = part.Key,
                        Example        = (from v in part.Value
                                          orderby v
                                          select v).Merge(";")
                    });
                }

                // Carry over the property names assigned in the previously saved definition.
                foreach (var olditem in prefile)
                {
                    if (!string.IsNullOrEmpty(olditem.PropertyName))
                    {
                        var newitem = items.Find(m => m.AnnotationName.Equals(olditem.AnnotationName));
                        if (newitem != null)
                        {
                            newitem.PropertyName = olditem.PropertyName;
                        }
                    }
                }

                items.DefaultValues.Clear();

                items.Sort((m1, m2) => m1.AnnotationName.CompareTo(m2.AnnotationName));

                UpdateDataSource();

                label1.Text = "Annotation/property mapping - " + Path.GetFileName(subdir);

                dlgOpenDirectory.SelectedPath = subdir;
                dlgSaveFormatFile.FileName    = Path.Combine(subdir, Path.GetFileName(subdir) + ".siformat");
            }
            catch (Exception ex)
            {
                MessageBox.Show(this, ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }