public void ParseDataset(string datasetDirectory, Dictionary<string, BreastCancerSampleItem> sampleMap) { var files = GeoUtils.GetGsmNameFileMap(datasetDirectory); var dirname = Path.GetFileName(datasetDirectory); //The status of ER, PR is on the sample title var samples = new GseSeriesMatrixReader().ReadFromDirectory(datasetDirectory); foreach (var a in samples) { var filename = a.Key.ToLower(); if (files.ContainsKey(filename.ToLower())) { var title = a.Value[GsmConsts.SampleTitle]; var m = r.Match(title.First()); var er = m.Groups[1].Value.Equals("p") ? "pos" : "neg"; var pr = m.Groups[2].Value.Equals("p") ? "pos" : "neg"; var ts = m.Groups[3].Value; var n = m.Groups[4].Value; var grade = m.Groups[5].Value; var key = filename.ToUpper(); if (!sampleMap.ContainsKey(key)) { sampleMap[key] = new BreastCancerSampleItem(dirname, filename.ToUpper()); } BreastCancerSampleItem item = sampleMap[key]; item.ER = er; item.PR = pr; item.TumorStatus = ts; item.Grade = grade; } } }
public override IEnumerable <string> Process() { var gses = File.ReadAllLines(options.GseListFile).Where(m => !string.IsNullOrWhiteSpace(m)).ToList().ConvertAll(m => m.Split(new[] { '\t', ' ' })[0]).ToArray(); var gseInfoMap = new Dictionary <string, Dictionary <string, Dictionary <string, List <string> > > >(); for (int gseIndex = 0; gseIndex < gses.Length; gseIndex++) { var gse = gses[gseIndex]; Func <GsmRecord, bool> accept; if (!options.AcceptMap.TryGetValue(gse, out accept)) { accept = m => true; } Func <Dictionary <string, List <string> >, bool> acceptDescription; if (!options.AcceptDescriptionMap.TryGetValue(gse, out acceptDescription)) { acceptDescription = null; } var sql = string.Format(@"select gse.gse, gsm.supplementary_file, gsm.title, gsm.gpl, gsm.source_name_ch1 from gse JOIN gse_gsm ON gse.gse=gse_gsm.gse JOIN gsm ON gse_gsm.gsm=gsm.gsm where gse.gse == '{0}' and (gsm.supplementary_file like '%CEL.gz%' or gsm.supplementary_file like '%cel.gz%') group by gse.gse, gsm.supplementary_file ", gse); SQLiteDBHelper sqlite = new SQLiteDBHelper(options.GeoMetaDatabase); Progress.SetMessage("{0}/{1}: querying files ...", gseIndex + 1, gses.Length); var data = sqlite.ExecuteDataTable(sql, null); var records = new List <GsmRecord>(); for (int row = 0; row < data.Rows.Count; row++) { var rec = new GsmRecord() { GSE = data.Rows[row].ItemArray[0].ToString(), Url = data.Rows[row].ItemArray[1].ToString(), Title = data.Rows[row].ItemArray[2].ToString(), GPL = data.Rows[row].ItemArray[3].ToString(), SourceName = data.Rows[row].ItemArray[4].ToString() }; records.Add(rec); } var totalValids = records.Where(m => accept(m)).Count(); int currentValid = 0; foreach (var rec in records) { var file = rec.Url.Split(';').Where(m => m.Trim().ToLower().EndsWith("cel.gz")).First(); var dataDir = builder.DataDir + "/"; var dir = dataDir + rec.GSE; var gzipped = dir + "/" + Path.GetFileName(file); var tmp = gzipped + "tmp"; if (!accept(rec)) { DeleteGzippedFile(gzipped); continue; } currentValid++; var prefix = string.Format("{0}/{1} ~ {2} : {3}/{4}", gseIndex + 1, gses.Length, rec.GSE, currentValid, totalValids); Progress.SetMessage(prefix + " ~ " + file); if (!Directory.Exists(dataDir)) { Directory.CreateDirectory(dataDir); } if (!Directory.Exists(dir)) { Directory.CreateDirectory(dir); } if (acceptDescription != null) { Dictionary <string, Dictionary <string, List <string> > > curInfoMap; if (!gseInfoMap.TryGetValue(rec.GSE, out curInfoMap)) { if (!GseSeriesMatrixReader.HasMatrixFiles(dir)) { new GseMatrixDownloader(new GseMatrixDownloaderOptions() { InputDirectory = dir }).Process(); } if (!GseSeriesMatrixReader.HasMatrixFiles(dir)) { throw new Exception("Failed to download matrix file for " + gse); } curInfoMap = new GseSeriesMatrixReader().ReadDescriptionFromDirectory(dir); gseInfoMap[rec.GSE] = curInfoMap; } var gsmName = Path.GetFileName(file).StringBefore(".cel.gz").StringBefore(".CEL.gz"); var sampleInfoMap = curInfoMap[gsmName]; if (!acceptDescription(sampleInfoMap)) { DeleteGzippedFile(gzipped); continue; } } if (File.Exists(gzipped) && new FileInfo(gzipped).Length == 0) { File.Delete(gzipped); } if (!File.Exists(gzipped)) { Progress.SetMessage(prefix + " ~ downloading " + file + " ..."); if (!WebUtils.DownloadFile(file, tmp)) { Console.Error.WriteLine("Download {0} failed.", file); break; } File.Move(tmp, gzipped); } } } var nocels = (from gse in gses let dir = builder.DataDir + "/" + gse where !Directory.Exists(dir) select gse).Merge("\n"); if (!string.IsNullOrEmpty(nocels)) { throw new Exception("No cel file found for\n" + nocels); } return(new[] { options.RootDirectory }); }
public override IEnumerable<string> Process() { var gses = File.ReadAllLines(options.GseListFile).Where(m => !string.IsNullOrWhiteSpace(m)).ToList().ConvertAll(m => m.Split(new[] { '\t', ' ' })[0]).ToArray(); var gseInfoMap = new Dictionary<string, Dictionary<string, Dictionary<string, List<string>>>>(); for (int gseIndex = 0; gseIndex < gses.Length; gseIndex++) { var gse = gses[gseIndex]; Func<GsmRecord, bool> accept; if (!options.AcceptMap.TryGetValue(gse, out accept)) { accept = m => true; } Func<Dictionary<string, List<string>>, bool> acceptDescription; if (!options.AcceptDescriptionMap.TryGetValue(gse, out acceptDescription)) { acceptDescription = null; } var sql = string.Format(@"select gse.gse, gsm.supplementary_file, gsm.title, gsm.gpl, gsm.source_name_ch1 from gse JOIN gse_gsm ON gse.gse=gse_gsm.gse JOIN gsm ON gse_gsm.gsm=gsm.gsm where gse.gse == '{0}' and (gsm.supplementary_file like '%CEL.gz%' or gsm.supplementary_file like '%cel.gz%') group by gse.gse, gsm.supplementary_file ", gse); SQLiteDBHelper sqlite = new SQLiteDBHelper(options.GeoMetaDatabase); Progress.SetMessage("{0}/{1}: querying files ...", gseIndex + 1, gses.Length); var data = sqlite.ExecuteDataTable(sql, null); var records = new List<GsmRecord>(); for (int row = 0; row < data.Rows.Count; row++) { var rec = new GsmRecord() { GSE = data.Rows[row].ItemArray[0].ToString(), Url = data.Rows[row].ItemArray[1].ToString(), Title = data.Rows[row].ItemArray[2].ToString(), GPL = data.Rows[row].ItemArray[3].ToString(), SourceName = data.Rows[row].ItemArray[4].ToString() }; records.Add(rec); } var totalValids = records.Where(m => accept(m)).Count(); int currentValid = 0; foreach (var rec in records) { var file = rec.Url.Split(';').Where(m => m.Trim().ToLower().EndsWith("cel.gz")).First(); var dataDir = builder.DataDir + "/"; var dir = dataDir + rec.GSE; var gzipped = dir + "/" + Path.GetFileName(file); var tmp = gzipped + "tmp"; if (!accept(rec)) { DeleteGzippedFile(gzipped); continue; } currentValid++; var prefix = string.Format("{0}/{1} ~ {2} : {3}/{4}", gseIndex + 1, gses.Length, rec.GSE, currentValid, totalValids); Progress.SetMessage(prefix + " ~ " + file); if (!Directory.Exists(dataDir)) { Directory.CreateDirectory(dataDir); } if (!Directory.Exists(dir)) { Directory.CreateDirectory(dir); } if (acceptDescription != null) { Dictionary<string, Dictionary<string, List<string>>> curInfoMap; if (!gseInfoMap.TryGetValue(rec.GSE, out curInfoMap)) { if (!GseSeriesMatrixReader.HasMatrixFiles(dir)) { new GseMatrixDownloader(new GseMatrixDownloaderOptions() { InputDirectory = dir }).Process(); } if (!GseSeriesMatrixReader.HasMatrixFiles(dir)) { throw new Exception("Failed to download matrix file for " + gse); } curInfoMap = new GseSeriesMatrixReader().ReadDescriptionFromDirectory(dir); gseInfoMap[rec.GSE] = curInfoMap; } var gsmName = Path.GetFileName(file).StringBefore(".cel.gz").StringBefore(".CEL.gz"); var sampleInfoMap = curInfoMap[gsmName]; if (!acceptDescription(sampleInfoMap)) { DeleteGzippedFile(gzipped); continue; } } if (File.Exists(gzipped) && new FileInfo(gzipped).Length == 0) { File.Delete(gzipped); } if (!File.Exists(gzipped)) { Progress.SetMessage(prefix + " ~ downloading " + file + " ..."); if (!WebUtils.DownloadFile(file, tmp)) { Console.Error.WriteLine("Download {0} failed.", file); break; } File.Move(tmp, gzipped); } } } var nocels = (from gse in gses let dir = builder.DataDir + "/" + gse where !Directory.Exists(dir) select gse).Merge("\n"); if (!string.IsNullOrEmpty(nocels)) { throw new Exception("No cel file found for\n" + nocels); } return new[] { options.RootDirectory }; }