/// <summary>
/// Load the CSV data file for a single PubChem assay into the PubChem warehouse table.
/// PubChem assay data files are downloaded from the PubChem site:
///  http://pubchem.ncbi.nlm.nih.gov/ using a program like SmartFTP.
/// The files are in GNU Zip (.gz) format and can be unzipped with
/// the following gzip commands:
///  c:\gzip\gzip -d c:\pubchem\bioassay\csv\description\*.gz
///  c:\gzip\gzip -d c:\pubchem\bioassay\csv\data\*.gz
/// After downloading and decompression this method can be called on the files.
/// </summary>
/// <param name="aid">Assay id. Selects the metatable named "PubChem_aid_" + aid and the
/// data file PubChemAssayDirectory + "\CSV\Data\" + aid + ".csv".</param>
/// <returns>A status message: the number of rows loaded (and new CIDs added) on
/// completion or cancel, or a description of why the load could not start.</returns>
public static string LoadData(
    string aid)
{
    int recCount = 0;

    string mtName = "PubChem_aid_" + aid;
    MetaTable mt = MetaTableCollection.Get(mtName);
    if (mt == null)
    {
        return("Failed to get metatable");
    }

    string fileName = PubChemAssayDirectory + @"\CSV\Data\" + aid + ".csv";
    StreamReader sr;
    try
    {
        sr = new StreamReader(fileName);
    }
    catch (Exception)
    {
        return("File not found: " + fileName);
    }

    string header = sr.ReadLine(); // read headers line
    if (header == null)
    { // empty file: there is no header line to search for the CID column
        sr.Close();
        return("PUBCHEM_CID column not found in data headers");
    }

    List<string> headers = Csv.SplitCsvString(header);
    int cidIdx;
    for (cidIdx = 0; cidIdx < headers.Count; cidIdx++)
    {
        if (headers[cidIdx].ToUpper() == "PUBCHEM_CID")
        {
            break;
        }
    }
    if (cidIdx >= headers.Count)
    {
        sr.Close();
        return("PUBCHEM_CID column not found in data headers");
    }

    Dictionary<string, MetaColumn> mcd = new Dictionary<string, MetaColumn>();
    foreach (MetaColumn mc2 in mt.MetaColumns)
    {
        mcd[mc2.Name.ToUpper()] = mc2; // build dict for quick metacolumn lookup
    }

    // Resolve each header position to its metacolumn once instead of once per value.
    // A null entry marks a column to skip: the CID column itself, or a header that
    // has no matching metacolumn.
    MetaColumn[] columnMap = new MetaColumn[headers.Count];
    for (int hi = 0; hi < headers.Count; hi++)
    {
        string mcName = headers[hi].ToUpper();
        if (mcName.Length > 26)
        {
            mcName = mcName.Substring(0, 26); // limit length to 26
        }
        if (mcName == "PUBCHEM_CID")
        {
            continue;
        }
        if (Lex.IsInteger(mcName))
        {
            mcName = "R_" + mcName; // result number
        }

        // TryGetValue rather than the indexer: the indexer throws KeyNotFoundException
        // for a header with no metacolumn, which would abort the whole load.
        MetaColumn mapped;
        if (mcd.TryGetValue(mcName, out mapped))
        {
            columnMap[hi] = mapped;
        }
    }

    DbConnectionMx conn = DbConnectionMx.MapSqlToConnection(ref PubChemWarehouseTable);
    conn.BeginTransaction(); // do multiple updates per transaction

    GenericDwDao dao = new GenericDwDao(
        PubChemWarehouseTable, // table for results
        PubChemWarehouseSeq);  // sequence to use
    dao.BufferInserts(true); // buffer inserts for better speed

    SequenceDao.SetCacheSize(PubChemWarehouseSeq, 100); // number of ids to cache locally from sequence

    // Method version id is the same for every row, so parse it once
    int methodVersionId = Int32.Parse(mt.TableFilterValues[0]);
    dao.DeleteTable(methodVersionId, true); // remove any existing data for this method

    recCount = 0;
    int t1 = 0;

    while (true)
    {
        // Check for cancel and refresh the progress display about once a second
        int t2 = TimeOfDay.Milliseconds();
        if (t2 - t1 > 1000)
        {
            if (Progress.CancelRequested)
            { // keep what has been loaded so far, then release resources as on the normal path
                dao.ExecuteBufferedInserts();
                conn.Commit();
                conn.Close();
                dao.Dispose();
                sr.Close();
                Progress.Hide();
                return(recCount.ToString() + " rows loaded");
            }
            Progress.Show("Loading file (" + recCount.ToString() + ") ...");
            t1 = t2;
        }

        string rec = sr.ReadLine();
        if (rec == null)
        {
            break;
        }
        List<string> vals = Csv.SplitCsvString(rec);

        // Get compound id. A row too short to contain the CID column is treated the
        // same as an unparsable CID: logged and skipped.
        int cid;
        string txtCid = (cidIdx < vals.Count) ? vals[cidIdx] : null;
        if (!Int32.TryParse(txtCid, out cid))
        {
            DebugLog.Message("Load PubChem bad CID " + (txtCid ?? "") + ", AID = " + aid);
            continue;
        }

        long rslt_grp_id = dao.GetNextIdLong(); // id to hold row together

        // Values beyond the last header have no column definition and are ignored
        int fieldCount = Math.Min(vals.Count, columnMap.Length);
        for (int vi = 0; vi < fieldCount; vi++)
        {
            string s = vals[vi];
            if (s == "")
            {
                continue;
            }

            MetaColumn mc = columnMap[vi];
            if (mc == null)
            {
                continue;
            }

            AnnotationVo vo = new AnnotationVo();
            vo.rslt_grp_id = rslt_grp_id;

            if (mc.DataType == MetaColumnType.String)
            {
                vo.rslt_val_txt = s;
            }
            else if (mc.DataType == MetaColumnType.Number || mc.DataType == MetaColumnType.Integer)
            {
                try
                {
                    vo.rslt_val_nbr = Convert.ToDouble(s);
                }
                catch (Exception)
                {
                    continue; // just continue if bad
                }
            }
            else if (mc.DataType == MetaColumnType.Date)
            {
                s = DateTimeMx.Normalize(s);
                if (s == null)
                {
                    continue;
                }
                vo.rslt_val_dt = DateTimeMx.NormalizedToDateTime(s);
            }
            else if (mc.Name == "PUBCHEM_ACTIVITY_OUTCOME") // activity outcome is a dict value stored as an integer
            {
                try
                {
                    vo.rslt_val_nbr = Convert.ToInt32(s);
                }
                catch (Exception)
                {
                    continue; // just continue if bad
                }
            }
            else if (mc.DataType == MetaColumnType.Hyperlink ||
                     mc.DataType == MetaColumnType.DictionaryId)
            {
                vo.rslt_val_txt = s;
            }
            else
            {
                continue;
            }

            vo.ext_cmpnd_id_nbr = cid;
            vo.ext_cmpnd_id_txt = cid.ToString();
            vo.mthd_vrsn_id = methodVersionId;
            vo.rslt_typ_id = Int32.Parse(mc.PivotValues[0]);
            vo.chng_op_cd = "I";
            vo.chng_usr_id = Security.UserInfo.UserName;

            dao.Insert(vo);
        } // end of field loop

        recCount++;
        if (recCount % 100 == 0)
        { // commit after group of updates
            dao.ExecuteBufferedInserts();
            conn.Commit();
            conn.BeginTransaction(); // do multiple updates per transaction
        }
    } // end of record loop

    dao.ExecuteBufferedInserts();
    conn.Commit();
    conn.Close();
    dao.Dispose();
    sr.Close();

    // Add any missing CIDs under method 1000000. Each execution inserts at most
    // 10000 rows (rownum limit), so the statement is repeated until it adds nothing.
    // NOTE(review): aid is concatenated directly into this SQL text; confirm callers
    // only ever pass a numeric assay id.

    Progress.Show("Updating CID table...");
    string sql =
        "INSERT INTO " + PubChemWarehouseTable + "(ext_cmpnd_id_nbr,rslt_id,mthd_vrsn_id,rslt_typ_id,rslt_grp_id) " +
        "SELECT ext_cmpnd_id_nbr, " + PubChemWarehouseSeq + ".NEXTVAL,1000000,0,0 " +
        "FROM ( " +
        "SELECT UNIQUE ext_cmpnd_id_nbr " +
        "FROM " + PubChemWarehouseTable + " r1 " +
        "WHERE mthd_vrsn_id = " + aid + " " +
        "AND NOT EXISTS ( " +
        " SELECT * " +
        "FROM " + PubChemWarehouseTable + " r2 " +
        "WHERE mthd_vrsn_id = 1000000 " +
        "AND r2.ext_cmpnd_id_nbr = r1.ext_cmpnd_id_nbr) " +
        "and rownum <= 10000)";

    DbCommandMx drd = new DbCommandMx();
    drd.Prepare(sql);
    drd.BeginTransaction();

    int newCids = 0;
    while (true)
    {
        int addedCids = drd.ExecuteNonReader();
        if (addedCids == 0)
        {
            break;
        }
        newCids += addedCids;
        drd.Commit();
        drd.BeginTransaction(); // do multiple updates per transaction
        Progress.Show("Updating CID table (" + newCids.ToString() + ")...");
    }

    drd.Dispose();

    Progress.Hide();
    return(recCount.ToString() + " rows loaded for AID " + aid + " plus " + newCids.ToString() + " new CIDs");
}
/// <summary>
/// Build an Alert header from a UserObject stored in one of the older formats.
/// When the object's Description is non-blank it is read as a tab-separated record:
/// owner, query object id, interval, mail-to, last check, new compounds,
/// changed compounds, total compounds, new rows, total rows, and optionally the
/// check-tables-with-criteria-only flag and the last-new-data date.
/// Otherwise only partial header information is derived from the object's
/// Owner, Name and (unless fast is set) Content.
/// </summary>
/// <param name="uo">User object holding the serialized alert</param>
/// <param name="fast">If true, skip reading the associated query and the stored
/// last-check date for alerts that have a blank Description</param>
/// <returns>The Alert populated with the header values that could be read</returns>
private static Alert DeserializeHeaderOld(
    UserObject uo,
    bool fast)
{
    Alert alert = new Alert();
    alert.Id = uo.Id;

    bool haveDescription = uo.Description.Trim().Length > 0;

    if (!haveDescription)
    { // oldest style alert, only partial header info is available
        alert.Owner = uo.Owner;

        // Query object id is the part of the name following the first underscore
        string objectName = uo.Name;
        int idStart = objectName.IndexOf("_") + 1;
        alert.QueryObjId = Int32.Parse(objectName.Substring(idStart));

        alert.Interval = 1; // assume just 1 day

        if (!fast)
        {
            try
            { // pick up the interval from the query and the old last check date
                Query query = QbUtil.ReadQuery(alert.QueryObjId);
                if (query.AlertInterval > 0)
                {
                    alert.Interval = query.AlertInterval;
                }

                string[] contentFields = uo.Content.Split('\t');
                alert.LastCheck = DateTimeMx.NormalizedToDateTime(contentFields[4]);
            }
            catch (Exception ex)
            { // best effort only, keep the defaults set above
                string msg = ex.Message;
            }
        }

        alert.MailTo = Security.GetUserEmailAddress(alert.Owner); // need to get email address
        return(alert);
    }

    // Description holds the tab-separated header values
    try
    {
        string[] fields = uo.Description.Split('\t');

        alert.Owner = fields[0];
        alert.QueryObjId = Int32.Parse(fields[1]);
        alert.Interval = Int32.Parse(fields[2]);

        alert.MailTo = fields[3];
        string mailTo = alert.MailTo;
        if (mailTo == null || mailTo.Trim().Length == 0)
        { // no address stored, look it up from the owner
            alert.MailTo = Security.GetUserEmailAddress(alert.Owner);
        }

        alert.LastCheck = DateTimeMx.NormalizedToDateTime(fields[4]);
        alert.NewCompounds = Int32.Parse(fields[5]);
        alert.ChangedCompounds = Int32.Parse(fields[6]);
        alert.TotalCompounds = Int32.Parse(fields[7]);
        alert.NewRows = Int32.Parse(fields[8]);
        alert.TotalRows = Int32.Parse(fields[9]);

        // The trailing two fields are optional
        if (fields.Length >= 11)
        {
            alert.CheckTablesWithCriteriaOnly = bool.Parse(fields[10]);
        }
        if (fields.Length >= 12)
        {
            alert.LastNewData = DateTimeMx.NormalizedToDateTime(fields[11]);
        }
    }
    catch (Exception ex)
    {
        throw new Exception(ex.Message, ex); // debug
    }

    return(alert);
}