/// <summary> /// Get list of CIDS that are in the oracle database but not the Mobius fingerprint files /// </summary> /// <returns></returns> static List <string> GetMissingCidList() { string sql = "", cid; int readCnt = 0; HashSet <string> knownCidsSet = FpDao.GetExistingCidSet(); // get list of cids in DB ExistingUndefinedStructureCids = FpDao.ReadUndefinedStructuresCids(); // cids that have undefined structures and aren't in DB knownCidsSet.UnionWith(ExistingUndefinedStructureCids); List <string> cidList = new List <string>(); if (CorpDatabase) { sql = SelectAllCorpIds; //sql = Lex.Replace(sql, "s.corp_nbr = m.corp_nbr", "s.corp_nbr = m.corp_nbr and s.corp_nbr = 3431641"); // debug //sql = Lex.Replace(sql, "s.corp_nbr = m.corp_nbr", "s.corp_nbr = m.corp_nbr and s.corp_nbr between 1000000 and 1100000"); // debug } else // chembl { sql = SelectAllChemblIds; } DbCommandMx rdr = DbCommandMx.PrepareAndExecuteReader(sql); while (rdr.Read()) { readCnt++; if (CorpDatabase) { int corpId = rdr.GetInt(0); // corp_nbr cid = corpId.ToString(); cid = CompoundId.NormalizeForDatabase(cid); } else { cid = rdr.GetString(0); // chembl } if (!knownCidsSet.Contains(cid)) { cidList.Add(cid); } } rdr.CloseReader(); return(cidList); }
/// <summary> /// Get a list of CorpIds that have been added or modified since the checkpoint date /// </summary> /// <returns></returns> static List <string> GetNewAndModifiedCorpIdList( out Dictionary <string, DateTime> cidUpdateDateDict) { int readCnt = 0; List <string> cidList = new List <string>(); cidUpdateDateDict = new Dictionary <string, DateTime>(); CheckpointDateTime = FpDao.ReadFingerPrintSimMxDataUpdateCheckpointDate(); // , "20-mar-2016 000000"); if (Lex.IsUndefined(CheckpointDateTime)) { throw new Exception("Mobius UpdateSimMxCorpIdDataCheckpointDate is not defined"); } //CheckpointDateTime = "20-mar-2016 000000"; // debug string sql = Lex.Replace(SelectCorpIdsByDateRange, "1-jan-1900 000000", CheckpointDateTime); DbCommandMx rdr = DbCommandMx.PrepareAndExecuteReader(sql); while (rdr.Read()) { readCnt++; int corpId = rdr.GetInt(0); // corp_nbr string cid = corpId.ToString(); cid = CompoundId.NormalizeForDatabase(cid); cidList.Add(cid); DateTime dt = rdr.GetDateTime(1); cidUpdateDateDict[cid] = dt; } rdr.CloseReader(); return(cidList); }
static ICdkMol CdkMolUtil => StaticCdkMol.I; // static molecule shortcut for utility methods /// <summary> /// UpdateCorpFingerprintDatabaseMx /// /////////////////////////////////////////////////////// /// Syntax: Update FingerprintDatabaseMx [Corp | ChEMBL] [MACCS | ECFP4] [Load | ByCidRange | SinceLastCheckpoint | LoadMissing | <SingleCorpId>] /// /// Corp Examples: /// Update FingerprintDatabaseMx Corp MACCS Load /// Update FingerprintDatabaseMx Corp MACCS LoadMissing /// Update FingerprintDatabaseMx Corp MACCS SinceLastCheckpoint /// /// Update FingerprintDatabaseMx Corp ECFP4 Load /// Update FingerprintDatabaseMx Corp ECFP4 LoadMissing /// Update FingerprintDatabaseMx Corp ECFP4 SinceLastCheckpoint /// /// ChEMBL Examples: /// Update FingerprintDatabaseMx Chembl MACCS Load /// Update FingerprintDatabaseMx Chembl MACCS LoadMissing /// /// Update FingerprintDatabaseMx Chembl ECFP4 Load /// Update FingerprintDatabaseMx Chembl ECFP4 LoadMissing /// /////////////////////////////////////////////////////// /// </summary> /// <param name="argString"></param> /// <returns></returns> static public string Update( string argString) { MoleculeMx mol; double mw; string chime, smiles, molString, molFile = ""; string msg = "", sql = "", chemblId, cid = "", maxCorpIdSql, maxIdSql2, mf, missingFixCriteria = "", CorpIdList = ""; int storeChunkCount = 0, CorpId, molregno, molId, lowId = 0, highId = 0, maxDestId = 0, maxSrcId = 0; int readCount = 0, storeCount = 0; ByCheckpoint = ByCidRange = ByCidList = LoadIfMissing = false; ReadChunkSize = DefaultReadChunkSize; WriteChunkSize = DefaultWriteChunkSize; Failures = new Dictionary <string, string>(); NewUndefinedStructureCids = new List <string>(); LastFailure = ""; FailureCount = 0; // global try loop try { //////////////////////// /// Parse Parameters /// //////////////////////// // See which database argString = argString.Trim(); if (Lex.StartsWith(argString, "Corp")) { Database = "Corp"; argString = argString.Substring(5).Trim(); } else if (Lex.StartsWith(argString, "Chembl")) { Database = "ChEMBL"; argString = argString.Substring(6).Trim(); } else { return(SyntaxMsg); } // See which fingerprint type FingerprintType = FingerprintType.MACCS; // default to MACCS if type not defined if (Lex.TryReplace(ref argString, "MACCS", "")) { FingerprintType = FingerprintType.MACCS; argString = argString.Trim(); } else if (Lex.TryReplace(ref argString, "ECFP4", "")) { FingerprintType = FingerprintType.Circular; // (i.e. ECFP4) argString = argString.Trim(); } FpDao = new FingerprintDao(Database, FingerprintType); List <FingerprintRec> fpRecList = new List <FingerprintRec>(); string args = argString.Trim(); string initialMsg = "Update FingerprintDatabase started: " + args; CidList = new List <string>(); // init empty list ////////////////////// /// Corp Database /// ////////////////////// if (CorpDatabase) { if (Lex.Eq(args, "Load")) { ByCidRange = true; ShowProgress("Getting range of CorpIds to insert..."); maxCorpIdSql = SelectMaxCorpId; // get highest id in source db maxSrcId = SelectSingleValueDao.SelectInt(maxCorpIdSql); if (maxSrcId < 0) { maxSrcId = 0; } maxDestId = GetMaxDestMolId(); //maxIdSql2 = "select max(src_compound_id_nbr) from dev_mbs_owner.corp_uc_xref where src_id = 0"; // get highest id in UniChemDb db //highCorpId = SelectSingleValueDao.SelectInt(maxIdSql2); if (maxDestId < 0) { maxDestId = 0; } } else if (Lex.Eq(args, "SinceLastCheckpoint")) { ByCheckpoint = true; ShowProgress("Getting list of CorpIds updated since last checkpoint..."); CidList = GetNewAndModifiedCorpIdList(out CidUpdateDateDict); //CidUpdateList = new List<string>(); // debug with single cmpd //CidUpdateList.Add("03435269"); if (CidList.Count == 0) { return("There have been no updates since the last checkpoint"); } initialMsg += ", CorpIds to add/update: " + CidList.Count; } else if (Lex.StartsWith(args, "ByCorpIdList")) { ByCidList = true; CorpIdList = args.Substring("ByCorpIdList".Length).Trim(); if (Lex.IsUndefined(CorpIdList)) { throw new Exception("Undefined CorpId list"); } } else if (Lex.StartsWith(args, "LoadMissing")) { LoadIfMissing = true; if (args.Contains(" ")) { missingFixCriteria = args.Substring("LoadMissing".Length).Trim(); } ShowProgress("Getting list of missing CorpIds..."); CidList = GetMissingCidList(); if (CidList.Count == 0) { return("There are no missing CorpIds"); } initialMsg += ", Missing CorpIds: " + CidList.Count; } else if (int.TryParse(args, out maxSrcId)) // single CorpId { ByCidRange = true; maxDestId = maxSrcId - 1; // say 1 less is the max we have } else { return(SyntaxMsg); } } /////////////////////// /// ChEMBL Database /// /////////////////////// else if (ChemblDatabase) { if (Lex.Eq(args, "Load")) { ByCidRange = true; ShowProgress("Getting range of MolRegNos to insert..."); sql = "select max(molregno) from chembl_owner.compound_struct_xxxxxx"; maxSrcId = SelectSingleValueDao.SelectInt(sql); if (maxSrcId < 0) { maxSrcId = 0; } maxDestId = GetMaxDestMolId(); if (maxDestId < 0) { maxDestId = 0; } } else if (Lex.StartsWith(args, "LoadMissing")) { LoadIfMissing = true; ShowProgress("Getting list of missing ChEMBL Ids..."); CidList = GetMissingCidList(); if (CidList.Count == 0) { return("There are no missing Ids"); } initialMsg += ", Missing Chembl Ids: " + CidList.Count; } else { return(SyntaxMsg); } } else { return(SyntaxMsg); } CidListOriginalCount = CidList.Count; Log(initialMsg); ///////////////////////////// // Loop over chunks of data ///////////////////////////// for (int chunk = 1; ; chunk++) { ////////////////////// /// Corp Database /// ////////////////////// if (CorpDatabase) { if (ByCheckpoint) // single chunk { string cidList = GetNextListChunk(); if (Lex.IsUndefined(cidList)) { break; } sql = SelectByCorpIdCriteria; sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")"); string matchString = "order by m.corp_nbr"; if (!Lex.Contains(sql, matchString)) { throw new Exception(matchString + " not found"); } sql = Lex.Replace(sql, matchString, "order by m.molecule_date"); msg = "Processing " + CidListOriginalCount + " updates since " + CheckpointDateTime; // + " (" + Mobius.Data.CidList.FormatCidListForDisplay(null, chunkCidList) + ")"; } else if (ByCidRange) // by CorpId range { if (maxDestId >= maxSrcId) { break; // done } lowId = maxDestId + 1; // start of next chunk highId = lowId + ReadChunkSize; maxDestId = highId; //lowCorpId = highCorpId = 12345; // debug if (highId >= maxSrcId) { highId = maxSrcId; } sql = SelectByCorpIdCriteria; sql = Lex.Replace(sql, "<CorpIdCriteria>", "between " + lowId + " and " + highId); msg = "Processing CorpId range: " + lowId + " to " + highId; } else if (ByCidList) // by single user-supplied CorpId list { if (chunk > 1) { break; // break 2nd time through } sql = SelectByCorpIdCriteria; sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + CorpIdList + ")"); msg = "Processing CorpId list: " + CorpIdList; } else if (LoadIfMissing) { string cidList = GetNextListChunk(); if (Lex.IsUndefined(cidList)) { break; // all done } sql = SelectByCorpIdCriteria; sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")"); msg = "Processing missing CorpId Chunk: " + Mobius.Data.CidList.FormatAbbreviatedCidListForDisplay(null, cidList) + ", Total Ids: " + CidListOriginalCount; Log(msg); } else { return(SyntaxMsg); } } /////////////////////// /// ChEMBL Database /// /////////////////////// else if (ChemblDatabase) { if (ByCidRange) // by CID range { if (maxDestId >= maxSrcId) { break; // done } lowId = maxDestId + 1; // start of next chunk highId = lowId + ReadChunkSize; maxDestId = highId; //lowId = highId = 12345; // debug if (maxDestId >= maxSrcId) { maxDestId = maxSrcId; } sql = SelectChemblSql; sql = Lex.Replace(sql, "<molregnoCriteria>", "between " + lowId + " and " + highId); msg = "Processing ChEMBL MolRegNo range: " + lowId + " to " + highId; } else if (LoadIfMissing) { string cidList = GetNextListChunk(); if (Lex.IsUndefined(cidList)) { break; // all done } sql = SelectByCorpIdCriteria; sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")"); msg = "Processing missing ChEMBL Id Chunk: " + Mobius.Data.CidList.FormatAbbreviatedCidListForDisplay(null, cidList) + ", Total Ids: " + CidListOriginalCount; } else { return(SyntaxMsg); } } else { return(SyntaxMsg); } ShowProgress(msg); // Execute the SQL to get the rows for the chunk DbCommandMx rdr = DbCommandMx.PrepareAndExecuteReader(sql); DateTime lastShowProgressTime = DateTime.MinValue; /////////////////////////////////////////// /// Loop over rows in the current chunk /// /////////////////////////////////////////// while (true) { // Update progress display if (DateTime.Now.Subtract(lastShowProgressTime).TotalSeconds > 1) // show progress { int storeTotalCount = storeCount + storeChunkCount; string msg2 = msg + "\r\n" + "Reads: " + readCount + "\r\n" + "Undefined: " + NewUndefinedStructureCids.Count + "\r\n" + "Insert/Updates: " + storeTotalCount + "\r\n" + "Failures: " + FailureCount + "\r\n" + "Failure Types: " + Failures.Count + "\r\n" + "Last Failure: " + LastFailure; ShowProgress(msg2); lastShowProgressTime = DateTime.Now; } // Read and process next compound bool readOk = rdr.Read(); if (readOk) { readCount++; try { double t1 = 0, t2 = 0, t3 = 0, t4 = 0; DateTime t0 = DateTime.Now; mol = null; //t2 = TimeOfDay.Delta(ref t0); ////////////////////// /// Corp Database /// ////////////////////// if (CorpDatabase) { CorpId = rdr.GetInt(0); // corp_nbr //Log("CorpId: " + CorpId); // debug molId = CorpId; cid = CorpId.ToString(); cid = CompoundId.NormalizeForDatabase(cid); if (!rdr.IsNull(1)) // be sure chime field isn't null { chime = rdr.GetClob(1); if (Lex.IsDefined(chime)) { molFile = MoleculeMx.ChimeStringToMolfileString(chime); // convert Chime to MolFile mol = new MoleculeMx(MoleculeFormat.Molfile, molFile); } } MoleculeDateTime = rdr.GetDateTimeByName("Molecule_Date"); // Date molecule was updated in the CorpDB cartridge DB } /////////////////////// /// ChEMBL Database /// /////////////////////// else // chembl { molId = molregno = rdr.GetInt(0); cid = chemblId = rdr.GetString(1); smiles = rdr.GetString(2); if (Lex.IsDefined(smiles)) { mol = new MoleculeMx(MoleculeFormat.Smiles, smiles); } } if (MoleculeMx.IsUndefined(mol) || mol.AtomCount <= 1) { NewUndefinedStructureCids.Add(cid); continue; //mol = new AtomContainer(); // write empty structure } bool includeOverallFingerprint = true; List <BitSetFingerprint> fps = CdkMol.BuildBitSetFingerprints(mol.MolfileString, includeOverallFingerprint, FingerprintType); //t3 = TimeOfDay.Delta(ref t0); foreach (BitSetFingerprint fp in fps) { FingerprintRec fpr = new FingerprintRec(); fpr.molId = molId; fpr.SrcId = SrcDbId; fpr.Cid = cid; fpr.Cardinality = fp.cardinality(); fpr.Fingerprint = fp.asBitSet().toLongArray(); fpRecList.Add(fpr); } //t4 = TimeOfDay.Delta(ref t0); t4 = t4; } catch (Exception ex) { if (!Failures.ContainsKey(ex.Message)) { Failures.Add(ex.Message, cid); } else { Failures[ex.Message] += ", " + cid; } LastFailure = "Cid: " + cid + " - " + ex.Message; Log(LastFailure); //ShowProgress(ex.Message + "\r\n" + ex.StackTrace.ToString()); // debug FailureCount++; continue; } storeChunkCount++; } bool commitTransaction = (storeChunkCount >= WriteChunkSize || (!readOk && storeChunkCount > 0)); if (commitTransaction) // end of chunk of data to store? { // if updating by CheckPoint date range then merge existing data with new/updated data if (ByCheckpoint) { if (readCount > 0 && (storeCount > 0 || FailureCount == 0)) // make sure not everything has failed) { MergeRecordsIntoFiles(fpRecList); } } // Simple append of records to files else { FpDao.OpenWriters("bin", FileMode.Append); // open bin files for append foreach (FingerprintRec fpr in fpRecList) // write out buffered recs { FpDao.WriteFingerprintRec(fpr); } FpDao.CloseWriters(); int cnt = fpRecList.Count; if (cnt > 0) { string cid1 = fpRecList[0].Cid; string cid2 = fpRecList[cnt - 1].Cid; Log("Records Appended: " + cnt + ", CIDS: " + cid1 + " - " + cid2); } else { Log("Records Appended: 0"); } } fpRecList.Clear(); storeCount += storeChunkCount; storeChunkCount = 0; } if (!readOk) { break; } } // end of read loop for rows in a chunk rdr.Dispose(); } // end for loop of chunks DeleteTempFiles(); if (LoadIfMissing) // update list of cids with missing structures { ExistingUndefinedStructureCids.UnionWith(NewUndefinedStructureCids); FpDao.WriteUndefinedStructuresCids(ExistingUndefinedStructureCids); } msg = "*** Update Complete ***\r\n\r\n" + msg; ShowProgress(msg); System.Threading.Thread.Sleep(100); string logMsg = "UpdateFingerprintDb - CIDs stored: " + storeCount + ", Undefined structures: " + NewUndefinedStructureCids.Count + ", failures: " + FailureCount + "\r\n"; foreach (string key in Failures.Keys) { logMsg += key + " - CIDs: " + Failures[key] + "\r\n"; } Log(logMsg); return(logMsg); } // end of main try loop catch (Exception ex) { Log(DebugLog.FormatExceptionMessage(ex)); throw new Exception(ex.Message, ex); } }
/// <summary> /// Reorder the rows in all annotation tables that haven't been reordered since last call /// </summary> public static string ReorderRows(string singleAnnotTableId) { long methodId = 0, rowCnt = 0, rowCnt2 = 0, totalRows = 0; int failCnt = 0, skipCnt = 0; Lex lex = new Lex(); lex.OpenString(singleAnnotTableId); string singleMvId = lex.Get(); // item to set string sql = @" select mthd_vrsn_id, count from ( select mthd_vrsn_id, count(mthd_vrsn_id) count FROM MBS_OWNER.mbs_adw_rslt WHERE mthd_vrsn_id > 0 group by mthd_vrsn_id) order by mthd_vrsn_id " ; if (Lex.IsDefined(singleMvId)) // single annot table { if (!Lex.IsLong(singleMvId)) { throw new ArgumentException(); } sql = Lex.Replace(sql, "> 0", "= " + singleMvId); } Mobius.UAL.Progress.Show("Getting list of annotation tables to update..."); DbCommandMx cmd = DbCommandMx.PrepareAndExecuteReader(sql); List <long> methodIds = new List <long>(); List <long> methodRowCounts = new List <long>(); while (cmd.Read()) { methodId = cmd.GetLong(0); methodIds.Add(methodId); rowCnt = cmd.GetLong(1); methodRowCounts.Add(rowCnt); } cmd.Dispose(); totalRows = 0; int vmIdCnt = 0; for (int mi = 0; mi < methodIds.Count; mi++) { methodId = methodIds[mi]; //if (methodId <= 442290) continue; // continue where left off rowCnt = methodRowCounts[mi]; string msg = "Table Id: " + methodId + " (" + (mi + 1) + " / " + methodIds.Count + "), Rows: " + rowCnt + ", Total Rows Reordered: " + totalRows + ", Failed: " + failCnt + ", Skipped: " + skipCnt; if (rowCnt > 1000000 && Lex.IsUndefined(singleMvId)) // skip if > 1000000 rows and not a specific single annot table { skipCnt++; DebugLog.Message("Skipping " + msg); continue; } Mobius.UAL.Progress.Show(msg); DebugLog.Message(msg); try { rowCnt2 = ReorderRows(methodId); } catch (Exception ex) { DebugLog.Message(DebugLog.FormatExceptionMessage(ex)); failCnt++; continue; } vmIdCnt++; totalRows += rowCnt; } return("Annotation tables reordered: " + vmIdCnt + ", Failed: " + failCnt + ", Skipped: " + skipCnt + ", Total rows reordered: " + totalRows); }