/// <summary>
/// Makes sure no cel file content is duplicated across datasets. Files are
/// compared by the MD5 hash of their decompressed content; for every dataset in
/// <paramref name="datasetByPriority"/>, any file in another dataset with the same
/// hash is passed to ExcludeFile.
/// </summary>
/// <param name="datasetByPriority">
/// Dataset (directory) names in the order they are processed. Copies held by an
/// earlier dataset cause the matching copies in all other datasets to be excluded.
/// </param>
public void FilterDatasetByMD5(string[] datasetByPriority)
{
    // Hash every cel file once, grouped by the dataset directory it lives in.
    var dsMap = (from dir in GetDatasetDirectories()
                 let dataset = Path.GetFileName(dir)
                 from file in CelFile.GetCelFiles(dir)
                 select new
                 {
                     Dataset = dataset,
                     File = file,
                     MD5 = HashUtils.GetDecompressedMD5Hash(file)
                 }).ToGroupDictionary(m => m.Dataset);

    foreach (var dataset in datasetByPriority)
    {
        // A prioritized dataset without any cel file has nothing to compare against.
        if (!dsMap.ContainsKey(dataset))
        {
            continue;
        }

        // Only files that are still in place may act as the reference copy.
        // dsMap was built before any exclusion, so without this filter a file
        // excluded in an earlier pass would still "claim" its hash and knock out
        // the surviving copy in a higher-priority dataset.
        // NOTE(review): assumes ExcludeFile moves the file away from its path, as
        // the File.Exists check below implies - confirm.
        // A lookup (rather than a dictionary) tolerates identical files inside
        // the same dataset.
        var keptHashes = dsMap[dataset]
            .Where(m => File.Exists(m.File))
            .ToLookup(m => m.MD5);

        foreach (var ds in dsMap)
        {
            if (ds.Key.Equals(dataset))
            {
                continue;
            }

            foreach (var cel in ds.Value)
            {
                // Skip files that are no longer at their original location.
                if (!File.Exists(cel.File))
                {
                    continue;
                }

                if (keptHashes.Contains(cel.MD5))
                {
                    Console.WriteLine("Excluding " + cel.File);
                    ExcludeFile(cel.File);
                }
            }
        }
    }
}
/// <summary>
/// Builds a map from GSM name to cel file for the cel files found in
/// <paramref name="directory"/>. When several files share one GSM name, the
/// group is printed to the console and the first file of the group is used.
/// </summary>
/// <param name="directory">Directory whose cel files are listed.</param>
/// <returns>Dictionary keyed by GSM name; the value is the first file of that name.</returns>
public static Dictionary<string, string> GetGsmNameFileMap(string directory)
{
    Console.WriteLine(directory);

    var celsByGsm = CelFile.GetCelFiles(directory).GroupBy(file => GetGsmName(file));

    foreach (var sameGsm in celsByGsm)
    {
        if (sameGsm.Count() <= 1)
        {
            continue;
        }

        // Report every GSM name that is backed by more than one file.
        var fileNames = sameGsm.Select(file => Path.GetFileName(file)).ToList();
        Console.WriteLine("{0} : {1}", sameGsm.Key, fileNames.Merge(", "));
    }

    return celsByGsm.ToDictionary(g => g.Key, g => g.First());
}
/// <summary>
/// Filters the cel files of one GSE dataset with the <paramref name="accept"/>
/// function. Every rejected file is moved into the dataset's folder below the
/// exclusion directory, replacing a file of the same name that is already there.
/// </summary>
/// <param name="gseName">Dataset folder name below DataDir; nothing happens if the folder does not exist.</param>
/// <param name="accept">Returns true for samples whose cel file should stay in place.</param>
/// <exception cref="Exception">A cel file has no matching sample information.</exception>
public void FilterGseSource(string gseName, Func<SampleItem, bool> accept)
{
    var gseDirectory = Path.Combine(DataDir, gseName);
    if (!Directory.Exists(gseDirectory))
    {
        return;
    }

    var gseExcluded = Path.Combine(ExclusionDir, gseName);

    // Sample information indexed by lower-cased sample name.
    var samplesByName = ReadSampleItem(gseDirectory).ToDictionary(m => m.Sample.ToLower());

    foreach (var cel in CelFile.GetCelFiles(gseDirectory))
    {
        var gsm = GeoUtils.GetGsmName(cel);
        if (!samplesByName.ContainsKey(gsm))
        {
            throw new Exception("Cannot find sample information of " + gsm + " in " + gseDirectory);
        }

        var sample = samplesByName[gsm];
        if (accept(sample))
        {
            continue;
        }

        // The exclusion folder is created on demand, when the first file is rejected.
        if (!Directory.Exists(gseExcluded))
        {
            Directory.CreateDirectory(gseExcluded);
        }

        var celName = Path.GetFileName(cel);
        var target = Path.Combine(gseExcluded, celName);
        Console.WriteLine("\t{0}\t{1}\t{2}\t{3}", celName, sample.Sample, sample.SourceName, sample.SampleTitle);

        // File.Move does not overwrite, so clear a stale copy first.
        if (File.Exists(target))
        {
            File.Delete(target);
        }

        File.Move(cel, target);
    }
}
/// <summary>
/// Reads the first "*.sdrf.txt" annotation file of a dataset directory and
/// creates one <see cref="BreastCancerSampleItem"/> for every annotation entry
/// whose sample file name (without extension) matches a cel file found in the
/// directory.
/// </summary>
/// <param name="datasetDirectory">Directory holding the cel files and the sdrf file.</param>
/// <returns>The sample items, in annotation order.</returns>
/// <exception cref="ArgumentException">The directory contains no sdrf file.</exception>
public List<BreastCancerSampleItem> ParseDataset(string datasetDirectory)
{
    // Names (without extension) of the cel files present in the dataset.
    var celNames = new HashSet<string>(
        CelFile.GetCelFiles(datasetDirectory, false).Select(f => Path.GetFileNameWithoutExtension(f)));

    var sdrfFiles = Directory.GetFiles(datasetDirectory, "*.sdrf.txt");
    if (sdrfFiles.Length == 0)
    {
        throw new ArgumentException("Cannot find sdrf file in directory " + datasetDirectory);
    }

    var annotations = new AnnotationFormat("^#").ReadFromFile(sdrfFiles[0]);

    var result = new List<BreastCancerSampleItem>();
    foreach (var a in annotations)
    {
        var filename = Path.GetFileNameWithoutExtension(FindValue(a, ColumnName.Sample));
        if (!celNames.Contains(filename))
        {
            continue;
        }

        result.Add(new BreastCancerSampleItem()
        {
            Dataset = Path.GetFileName(datasetDirectory),
            Sample = filename,
            Age = FindValue(a, ColumnName.Age),
            ER = new StatusValue(FindValue(a, ColumnName.ER)).Value,
            PR = new StatusValue(FindValue(a, ColumnName.PR)).Value,
            HER2 = new StatusValue(FindValue(a, ColumnName.HER2)).Value,
            Stage = FindValue(a, ColumnName.Stage),
            TumorStatus = FindValue(a, ColumnName.TumorStage),
            Grade = FindValue(a, ColumnName.Grade),
            NodalStatus = FindValue(a, ColumnName.NodalStatus),
            PCR = FindValue(a, ColumnName.PCR),
            DFS = FindValue(a, ColumnName.DFS),
            DFSTime = FindValue(a, ColumnName.DFSTime),
            RFS = FindValue(a, ColumnName.RFS),
            RFSTime = FindValue(a, ColumnName.RFSTime),
            DMFS = FindValue(a, ColumnName.DMFS),
            DMFSTime = FindValue(a, ColumnName.DMFSTime),
            OverallSurvival = FindValue(a, ColumnName.OverallServive),
            DeadOfDisease = FindValue(a, ColumnName.DeadOfDisease)
        });
    }

    return result;
}
/// <summary>
/// Writes "Step_07_CelFileList.tsv" into <paramref name="root"/>, listing every
/// cel file found in the sub directories of root with its path relative to root
/// (forward slashes), its chip type, a derived name and the dataset (sub
/// directory) name.
/// </summary>
/// <param name="root">Directory whose sub directories are scanned; may end with a directory separator.</param>
public void Step_07_CelListFile(string root)
{
    // Length of root without trailing separators, so the relative path below is
    // cut at the right position whether or not the caller passed "dir\" or "dir".
    var rootLength = root.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar).Length;

    using (var sw = new StreamWriter(Path.Combine(root, "Step_07_CelFileList.tsv")))
    {
        sw.WriteLine("File\tType\tName\tDataset");

        foreach (var dir in Directory.GetDirectories(root))
        {
            Console.WriteLine(dir);

            foreach (var cel in CelFile.GetCelFiles(dir))
            {
                // Derived name: '-' becomes '.', and a leading number gets an "X" prefix.
                var name = Path.GetFileNameWithoutExtension(cel).Replace("-", ".");
                if (name.Length > 0 && char.IsNumber(name[0]))
                {
                    name = "X" + name;
                }

                var relativePath = cel.Substring(rootLength + 1).Replace("\\", "/");
                sw.WriteLine("{0}\t{1}\t{2}\t{3}",
                    relativePath,
                    CelFile.GetChipType(cel),
                    name,
                    Path.GetFileName(dir));
            }
        }
    }
}
/// <summary>
/// Builds the chip type table for DataDir and writes a summary file with the
/// number of cel files per chip type for each dataset directory.
/// </summary>
/// <param name="tableFileName">Chip type table file; combined with DataDir when not absolute.</param>
/// <param name="summaryFileName">Summary file to write; combined with DataDir when not absolute.</param>
public void BuildChipTypeTable(string tableFileName, string summaryFileName)
{
    if (!FileUtils.IsAbsolutePath(tableFileName))
    {
        tableFileName = Path.Combine(DataDir, tableFileName);
    }

    if (!FileUtils.IsAbsolutePath(summaryFileName))
    {
        summaryFileName = Path.Combine(DataDir, summaryFileName);
    }

    var chipTypes = CelFile.GetChipTypes(this.rExecute, DataDir, true, tableFileName);

    using (StreamWriter writer = new StreamWriter(summaryFileName))
    {
        writer.WriteLine("Dataset\tChiptype\tSample");

        // Group the entries by the name of the directory that contains each key.
        var byDataset = chipTypes.ToGroupDictionary(m => Path.GetFileName(Path.GetDirectoryName(m.Key)));

        foreach (var subdir in GetDatasetDirectories())
        {
            var dsName = Path.GetFileName(subdir);
            if (!byDataset.ContainsKey(dsName))
            {
                continue;
            }

            // The dataset name is printed on the first row of a dataset only.
            var isFirstRow = true;
            foreach (var chipGroup in byDataset[dsName].GroupBy(m => m.Value))
            {
                var label = isFirstRow ? new DirectoryInfo(subdir).Name : string.Empty;
                isFirstRow = false;
                writer.WriteLine("{0}\t{1}\t{2}", label, chipGroup.Key, chipGroup.Count());
            }
        }
    }
}
/// <summary>
/// Normalizes the cel files found in <paramref name="root"/> by running the
/// "frma.r" R template on a list of those files, then builds the chip type
/// table into <paramref name="outputFile"/>.
/// </summary>
/// <param name="root">Directory that is searched for cel files; "celfiles.tsv" is written here.</param>
/// <param name="outputFile">File passed to CelFile.GetChipTypes as the table file.</param>
/// <returns><paramref name="outputFile"/>, or an empty string when no cel file was found.</returns>
public string Normalization(string root, string outputFile)
{
    var cels = CelFile.GetCelFiles(root);
    if (cels.Count == 0)
    {
        // Report the directory that was actually searched.
        Progress.SetMessage("No cel file found in directory " + root);
        return string.Empty;
    }

    // One cel file path per line, in the format produced by FileUtils.ToLinuxFormat.
    var inputFile = Path.Combine(root, "celfiles.tsv");
    using (var sw = new StreamWriter(inputFile))
    {
        foreach (var cel in cels)
        {
            sw.WriteLine(FileUtils.ToLinuxFormat(cel));
        }
    }

    var roptions = new RTemplateProcessorOptions();
    roptions.RExecute = rExecute;
    roptions.InputFile = inputFile;
    // NOTE(review): OutputFile is set to the input file while NoResultFile is
    // true - presumably the template produces no result file of its own; confirm.
    roptions.OutputFile = inputFile;
    roptions.NoResultFile = true;
    roptions.RTemplate = Path.Combine(FileUtils.GetTemplateDir(), "frma.r");
    roptions.CreateNoWindow = true;

    new RTemplateProcessor(roptions) { Progress = this.Progress }.Process();

    CelFile.GetChipTypes(this.rExecute, root, true, outputFile);

    return outputFile;
}
/// <summary>
/// Makes sure no data is duplicated: hashes the decompressed content of every
/// cel file in all dataset directories and writes a report of the hashes shared
/// by more than one file.
/// </summary>
/// <param name="outputFile">
/// Report file name; it is passed through CheckFileName and then combined with DataDir.
/// </param>
/// <remarks>
/// When processing a file fails, the file name is written to the error stream.
/// For files whose name starts with "gsm", a fresh copy is downloaded from
/// NCBI GEO next to the file (with a ".gz" suffix). The original exception is
/// then rethrown, so the check has to be run again.
/// </remarks>
public void CheckDuplication(string outputFile)
{
    outputFile = CheckFileName(outputFile);

    // MD5 of decompressed content -> set of "dataset:filename" entries with that content.
    var cels = new Dictionary<string, HashSet<string>>();

    var dirs = GetDatasetDirectories();
    foreach (var subdir in dirs)
    {
        Console.Out.WriteLine(subdir);

        var files = CelFile.GetCelFiles(subdir);
        foreach (var file in files)
        {
            try
            {
                string md5 = HashUtils.GetDecompressedMD5Hash(file);
                var name = Path.GetFileName(subdir) + ":" + Path.GetFileName(file);

                HashSet<string> names;
                if (!cels.TryGetValue(md5, out names))
                {
                    names = new HashSet<string>();
                    cels[md5] = names;
                }

                names.Add(name);
            }
            catch (Exception)
            {
                Console.Error.WriteLine(file);

                // Ordinal comparison: file names are identifiers, not linguistic text.
                if (Path.GetFileName(file).StartsWith("gsm", StringComparison.OrdinalIgnoreCase))
                {
                    var gsm = GeoUtils.GetGsmName(file).ToUpper();
                    var filename = Path.GetFileNameWithoutExtension(file);
                    var uri = string.Format(@"http://www.ncbi.nlm.nih.gov/geosuppl/?acc={0}&file={1}%2ECEL%2Egz", gsm, filename);
                    Console.WriteLine(uri);

                    // WebClient is IDisposable; release it as soon as the download ends.
                    using (var webClient = new WebClient())
                    {
                        webClient.DownloadFile(uri, file + ".gz");
                    }
                }

                throw;
            }
        }
    }

    var dupfile = Path.Combine(DataDir, outputFile);
    using (StreamWriter sw = new StreamWriter(dupfile))
    {
        sw.WriteLine("Duplicated entries = " + cels.Count(m => m.Value.Count > 1).ToString());
        sw.WriteLine("Duplicated cels = " + (from cel in cels
                                             where cel.Value.Count > 1
                                             select cel.Value.Count).Sum().ToString());

        // Total number of hashed entries per dataset (duplicated or not).
        var dscount = (from c in cels.Values
                       from v in c
                       select v).GroupBy(m => m.StringBefore(":")).ToDictionary(m => m.Key, m => m.Count());

        var keys = cels.Keys.OrderBy(m => m).ToList();

        // Entries of duplicated hashes only, split back into dataset and file name.
        var sets = (from c in cels
                    where c.Value.Count > 1
                    from v in c.Value
                    select new
                    {
                        MD5 = c.Key,
                        Dataset = v.StringBefore(":"),
                        FileName = v.StringAfter(":")
                    });

        // One column per dataset, datasets with most duplicated entries first.
        var grp = sets.GroupBy(m => m.Dataset).OrderByDescending(m => m.Count()).ToList();

        sw.WriteLine();
        sw.Write("md5");
        foreach (var g in grp)
        {
            sw.Write(string.Format("\t{0}({1}/{2})", g.Key, g.Count(), dscount[g.Key]));
        }
        sw.WriteLine();

        foreach (var md5 in keys)
        {
            if (cels[md5].Count > 1)
            {
                sw.Write(md5);
                foreach (var g in grp)
                {
                    // First file of this dataset with the hash; empty cell when there is none.
                    var m = g.FirstOrDefault(n => n.MD5.Equals(md5));
                    if (m != null)
                    {
                        sw.Write("\t" + m.FileName);
                    }
                    else
                    {
                        sw.Write("\t");
                    }
                }
                sw.WriteLine();
            }
        }
    }
}
/// <summary>
/// Reads the first "*.sdrf.txt" annotation file of a dataset directory and
/// merges its values into <paramref name="sampleMap"/>. Only annotation entries
/// whose sample file name (without extension) matches a cel file in the
/// directory are used. A property is overwritten only when FindValue reports a
/// value for its column.
/// </summary>
/// <param name="datasetDirectory">Directory holding the cel files and the sdrf file.</param>
/// <param name="sampleMap">Sample items keyed by sample file name; missing entries are created.</param>
/// <exception cref="ArgumentException">The directory contains no sdrf file.</exception>
public void ParseDataset(string datasetDirectory, Dictionary<string, BreastCancerSampleItem> sampleMap)
{
    // Names (without extension) of the cel files present in the dataset.
    var celNames = new HashSet<string>(
        CelFile.GetCelFiles(datasetDirectory, false).Select(f => Path.GetFileNameWithoutExtension(f)));

    var sdrfFiles = Directory.GetFiles(datasetDirectory, "*.sdrf.txt");
    if (sdrfFiles.Length == 0)
    {
        throw new ArgumentException("Cannot find sdrf file in directory " + datasetDirectory);
    }

    var annotations = new AnnotationFormat("^#").ReadFromFile(sdrfFiles[0]);
    var dataset = Path.GetFileName(datasetDirectory);

    foreach (var a in annotations)
    {
        var filename = Path.GetFileNameWithoutExtension(FindValue(a, ColumnName.Sample));
        if (!celNames.Contains(filename))
        {
            continue;
        }

        // Reuse the existing item of this sample, or register a new one.
        BreastCancerSampleItem item;
        if (!sampleMap.TryGetValue(filename, out item))
        {
            item = new BreastCancerSampleItem();
            sampleMap[filename] = item;
            item.Dataset = dataset;
            item.Sample = filename;
        }

        string value;

        if (FindValue(a, ColumnName.Age, out value)) { item.Age = value; }

        // Receptor statuses are normalized through StatusValue.TransferStatus.
        if (FindValue(a, ColumnName.ER, out value)) { item.ER = StatusValue.TransferStatus(value); }
        if (FindValue(a, ColumnName.PR, out value)) { item.PR = StatusValue.TransferStatus(value); }
        if (FindValue(a, ColumnName.HER2, out value)) { item.HER2 = StatusValue.TransferStatus(value); }

        if (FindValue(a, ColumnName.Stage, out value)) { item.Stage = value; }
        if (FindValue(a, ColumnName.TumorStage, out value)) { item.TumorStatus = value; }
        if (FindValue(a, ColumnName.Grade, out value)) { item.Grade = value; }
        if (FindValue(a, ColumnName.NodalStatus, out value)) { item.NodalStatus = value; }
        if (FindValue(a, ColumnName.PCR, out value)) { item.PCR = value; }

        if (FindValue(a, ColumnName.DFS, out value)) { item.DFS = value; }
        if (FindValue(a, ColumnName.DFSTime, out value)) { item.DFSTime = value; }
        if (FindValue(a, ColumnName.RFS, out value)) { item.RFS = value; }
        if (FindValue(a, ColumnName.RFSTime, out value)) { item.RFSTime = value; }
        if (FindValue(a, ColumnName.DMFS, out value)) { item.DMFS = value; }
        if (FindValue(a, ColumnName.DMFSTime, out value)) { item.DMFSTime = value; }

        if (FindValue(a, ColumnName.OverallServive, out value)) { item.OverallSurvival = value; }
        if (FindValue(a, ColumnName.DeadOfDisease, out value)) { item.DeadOfDisease = value; }
    }
}
/// <summary>
/// Rebuilds the annotation/property item list from the raw sample descriptions
/// found in <paramref name="subdir"/>. If the directory already holds a
/// "*.siformat" file, its property assignments and default values are carried
/// over, after renaming two legacy property names in that file.
/// </summary>
/// <param name="subdir">Dataset directory containing the cel files and sample descriptions.</param>
public void NewFromData(string subdir)
{
    var siformat = Directory.GetFiles(subdir, "*.siformat");
    TextFileDefinition prefile = new TextFileDefinition();
    if (siformat.Length > 0)
    {
        prefile.ReadFromFile(siformat[0]);

        // Upgrade legacy property names; the file is rewritten only if something changed.
        bool bFound = false;
        prefile.ForEach(m =>
        {
            // Static string.Equals is null-safe: items without a property name
            // (see the IsNullOrEmpty check further down) must not throw here.
            if (string.Equals(m.PropertyName, "TumorStage"))
            {
                m.PropertyName = "TumorStatus";
                bFound = true;
            }

            if (string.Equals(m.PropertyName, "Metastasis"))
            {
                m.PropertyName = "MetastasisStatus";
                bFound = true;
            }
        });

        if (bFound)
        {
            prefile.WriteToFile(siformat[0]);
        }
    }

    var map = new RawSampleInfoReader().ReadDescriptionFromDirectory(subdir);

    lastDirectory = subdir;
    lastFile = String.Empty;

    // GSM names of the cel files that are present in the directory.
    var files = new HashSet<string>(from f in CelFile.GetCelFiles(subdir, false)
                                    select GeoUtils.GetGsmName(f));

    // Annotation name -> all distinct values seen for it across the present samples.
    Dictionary<string, HashSet<string>> headers = new Dictionary<string, HashSet<string>>();
    foreach (var m in map)
    {
        var gsm = m.Key.ToLower();
        if (!files.Contains(gsm))
        {
            continue;
        }

        var curmap = m.Value;
        foreach (var entry in curmap)
        {
            if (!headers.ContainsKey(entry.Key))
            {
                headers[entry.Key] = new HashSet<string>();
            }
            headers[entry.Key].UnionWith(entry.Value);
        }
    }

    ClearDataSource();

    items.Clear();
    foreach (var part in headers)
    {
        items.Add(new FileDefinitionItem()
        {
            AnnotationName = part.Key,
            Example = (from v in part.Value
                       orderby v
                       select v).Merge(";")
        });
    }

    // Carry over the property assignments of the previous format file.
    foreach (var olditem in prefile)
    {
        if (!string.IsNullOrEmpty(olditem.PropertyName))
        {
            var newitem = items.Find(m => m.AnnotationName.Equals(olditem.AnnotationName));
            if (newitem != null)
            {
                newitem.PropertyName = olditem.PropertyName;
            }
        }
    }

    // Keep only the previous default values whose property name is in propertyNames.
    items.DefaultValues.Clear();
    foreach (var olddv in prefile.DefaultValues)
    {
        if (propertyNames.Contains(olddv.PropertyName))
        {
            items.DefaultValues.Add(new DefaultValue()
            {
                PropertyName = olddv.PropertyName,
                Value = olddv.Value
            });
        }
    }

    items.Sort((m1, m2) => m1.AnnotationName.CompareTo(m2.AnnotationName));

    UpdateDataSource();

    this.Text = title + " - " + Path.GetFileName(subdir);
}
/// <summary>
/// Rebuilds the annotation/property item list from the raw sample descriptions
/// found in <paramref name="subdir"/>, carrying over the property assignments of
/// an existing "*.siformat" file, and points the dialogs and the caption at the
/// directory. Any exception is shown to the user in an error message box.
/// </summary>
/// <param name="subdir">Dataset directory containing the cel files and sample descriptions.</param>
public void NewFromData(string subdir)
{
    try
    {
        var formatFiles = Directory.GetFiles(subdir, "*.siformat");
        TextFileDefinition previous = new TextFileDefinition();
        if (formatFiles.Length > 0)
        {
            previous.ReadFromFile(formatFiles[0]);
        }

        var descriptions = new RawSampleInfoReader().ReadDescriptionFromDirectory(subdir);

        LastDirectory = subdir;
        lastFile = String.Empty;

        // GSM names of the cel files that are present in the directory.
        var gsmNames = new HashSet<string>(
            CelFile.GetCelFiles(subdir, false).Select(f => GeoUtils.GetGsmName(f)));

        // Annotation name -> all distinct values seen for it across the present samples.
        var headers = new Dictionary<string, HashSet<string>>();
        foreach (var description in descriptions)
        {
            if (!gsmNames.Contains(description.Key.ToUpper()))
            {
                continue;
            }

            foreach (var entry in description.Value)
            {
                HashSet<string> values;
                if (!headers.TryGetValue(entry.Key, out values))
                {
                    values = new HashSet<string>();
                    headers[entry.Key] = values;
                }

                values.UnionWith(entry.Value);
            }
        }

        ClearDataSource();

        items.Clear();
        foreach (var header in headers)
        {
            var item = new FileDefinitionItem();
            item.AnnotationName = header.Key;
            item.Example = header.Value.OrderBy(v => v).Merge(";");
            items.Add(item);
        }

        // Carry over the property assignments of the previous format file.
        foreach (var olditem in previous)
        {
            if (string.IsNullOrEmpty(olditem.PropertyName))
            {
                continue;
            }

            var match = items.Find(m => m.AnnotationName.Equals(olditem.AnnotationName));
            if (match != null)
            {
                match.PropertyName = olditem.PropertyName;
            }
        }

        items.DefaultValues.Clear();

        items.Sort((m1, m2) => m1.AnnotationName.CompareTo(m2.AnnotationName));

        UpdateDataSource();

        var datasetName = Path.GetFileName(subdir);
        label1.Text = "Annotation/property mapping - " + datasetName;
        dlgOpenDirectory.SelectedPath = subdir;
        dlgSaveFormatFile.FileName = Path.Combine(subdir, datasetName + ".siformat");
    }
    catch (Exception ex)
    {
        MessageBox.Show(this, ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}