/// <summary>
/// GetMaxDestMolId - Scan the fingerprint data files for the current
/// Database / FingerprintType and return the largest molecule id found.
/// For the Corp database the id is parsed from the record's Cid string;
/// otherwise the record's molId field is used.
/// </summary>
/// <returns>
/// Largest id seen, or 0 if the data files don't exist or no record
/// carries an id greater than zero
/// </returns>

public static int GetMaxDestMolId()
{
	int maxMolId = 0;

	FingerprintDao fpd = new FingerprintDao(Database, FingerprintType);
	if (!fpd.DataFilesExist())
	{
		return maxMolId;
	}

	fpd.OpenReaders();

	try // be sure the readers are closed even if a read throws
	{
		while (true)
		{
			FingerprintRec rec = fpd.ReadFingerprintRec();
			if (rec == null)
			{
				break; // end of data
			}

			int molId;
			if (CorpDatabase)
			{
				int.TryParse(rec.Cid, out molId); // molId is set to 0 if the Cid isn't an integer
			}
			else
			{
				molId = rec.molId;
			}

			if (molId > maxMolId)
			{
				maxMolId = molId;
			}
		}
	}

	finally
	{
		fpd.CloseReaders();
	}

	return maxMolId;
}
/// <summary>
/// ExecuteSearch - Run a fingerprint similarity search for the query molecule
/// against each database listed in FingerprintDbMx.Databases.
/// Databases whose name contains "corp" or "chembl" are skipped when the
/// corresponding GetCorpSim / GetChemblSim flag is off, as are databases
/// with no data files. Per-database hits are optionally filtered by
/// KeysToExclude / SearchKeySubset, accumulated in MatchList, sorted by
/// decreasing MatchScore and truncated to MaxHits (when MaxHits > 0).
/// </summary>
/// <param name="queryMol">Query structure; the fingerprint is built from its largest fragment</param>
/// <returns>The combined match list (also stored in MatchList)</returns>
/// <exception cref="Exception">
/// Thrown after all databases have been processed if ThreadException was set during the search
/// </exception>

public List<StructSearchMatch> ExecuteSearch(
	IAtomContainer queryMol)
{
	AssertMx.IsTrue(FingerprintType == FingerprintType.MACCS || FingerprintType == FingerprintType.Circular,
		"Invalid FingerprintType: " + FingerprintType);

	QueryMol = queryMol;

	BitSetFingerprint fp = // generate a fingerprint
		CdkMol.BuildBitSetFingerprintForLargestFragment(queryMol, FingerprintType);

	QueryFpCardinality = fp.cardinality();
	QueryFpLongArray = fp.asBitSet().toLongArray();

	MatchList = new List<StructSearchMatch>();
	ThreadException = null;

	foreach (string databaseName in FingerprintDbMx.Databases) // loop on all databases
	{
		if (Lex.Contains(databaseName, "corp"))
		{
			if (!GetCorpSim)
			{
				continue;
			}
		}

		else if (Lex.Contains(databaseName, "chembl"))
		{
			if (!GetChemblSim)
			{
				continue;
			}
		}

		if (Debug)
		{
			DebugLog.Message("Starting sim search on " + databaseName + " database");
		}

		FpDao = new FingerprintDao(databaseName, FingerprintType);
		if (!FpDao.DataFilesExist())
		{
			continue; // no files for this database
		}

		double et;
		FileStreamReaders = FpDao.OpenReaders();

		try // be sure the readers are closed even if the search throws
		{
			FileMatchLists = new List<StructSearchMatch>[FileStreamReaders.Length];
			for (int fi = 0; fi < FileMatchLists.Length; fi++)
			{
				FileMatchLists[fi] = new List<StructSearchMatch>();
			}

			DateTime t0 = DateTime.Now;

			if (UseMultipleThreads)
			{
				ExecuteMultiThreadSearch();
			}
			else
			{
				ExecuteSingleThreadSearch();
			}

			et = TimeOfDay.Delta(ref t0);
		}

		finally
		{
			FpDao.CloseReaders();
		}

		List<StructSearchMatch> matchList = MergeIndividualFileMatchLists();

		if (KeysToExclude != null || SearchKeySubset != null) // filter by any allowed/disallowed keys
		{
			matchList = matchList.FindAll(m =>
				(KeysToExclude == null || !KeysToExclude.Contains(m.SrcCid)) &&
				(SearchKeySubset == null || SearchKeySubset.Contains(m.SrcCid)));
		}

		matchList.Sort(StructSearchMatch.CompareByMatchQuality);

		MatchList.AddRange(matchList);

		if (Debug) // build the complete summary (including the leading hits) before logging it
		{
			// NOTE(review): the summary is taken from the first per-file list only, as before - confirm that is intended
			List<StructSearchMatch> firstFileMatches =
				(FileMatchLists.Length > 0) ? FileMatchLists[0] : new List<StructSearchMatch>();

			string msg =
				"Search complete (" + databaseName + ").Time : " + string.Format("{0:0.00} ", et) +
				string.Format("{0} Hits: ", firstFileMatches.Count);

			for (int hi = 0; hi < 5 && hi < firstFileMatches.Count; hi++)
			{
				StructSearchMatch sm = firstFileMatches[hi];
				msg += sm.SrcCid + string.Format(" = {0:0.00}, ", sm.MatchScore);
			}

			DebugLog.Message(msg);
		}
	} // database loop

	if (ThreadException != null)
	{
		throw new Exception(ThreadException.Message, ThreadException);
	}

	MatchList.Sort((p1, p2) => p2.MatchScore.CompareTo(p1.MatchScore)); // sort by decreasing sim value

	if (MaxHits > 0 && MatchList.Count > MaxHits) // remove hits beyond maximum if defined
	{
		MatchList.RemoveRange(MaxHits, MatchList.Count - MaxHits);
	}

	return MatchList;
}
static ICdkMol CdkMolUtil => StaticCdkMol.I; // static molecule shortcut for utility methods

/// <summary>
/// Update - Load or update the fingerprint files for the Corp or ChEMBL database.
///
/// Syntax: Update FingerprintDatabaseMx [Corp | ChEMBL] [MACCS | ECFP4]
///   [Load | SinceLastCheckpoint | ByCorpIdList &lt;list&gt; | LoadMissing | &lt;SingleCorpId&gt;]
///
/// SinceLastCheckpoint, ByCorpIdList and &lt;SingleCorpId&gt; are accepted for Corp only.
/// The fingerprint type defaults to MACCS if not specified.
///
/// Corp Examples:
///  Update FingerprintDatabaseMx Corp MACCS Load
///  Update FingerprintDatabaseMx Corp MACCS LoadMissing
///  Update FingerprintDatabaseMx Corp MACCS SinceLastCheckpoint
///
///  Update FingerprintDatabaseMx Corp ECFP4 Load
///  Update FingerprintDatabaseMx Corp ECFP4 LoadMissing
///  Update FingerprintDatabaseMx Corp ECFP4 SinceLastCheckpoint
///
/// ChEMBL Examples:
///  Update FingerprintDatabaseMx Chembl MACCS Load
///  Update FingerprintDatabaseMx Chembl MACCS LoadMissing
///
///  Update FingerprintDatabaseMx Chembl ECFP4 Load
///  Update FingerprintDatabaseMx Chembl ECFP4 LoadMissing
/// </summary>
/// <param name="argString">Arguments beginning with the database name (i.e. the text after "Update FingerprintDatabaseMx")</param>
/// <returns>
/// Summary of the update (stored / undefined / failure counts plus the failures by message),
/// SyntaxMsg if the arguments are not recognized, or a short message if there is nothing to do
/// </returns>
/// <exception cref="Exception">Any failure outside of per-compound processing is logged and rethrown wrapped in an Exception</exception>

public static string Update(
	string argString)
{
	string msg = "", sql = "", cid = "", corpIdList = "";
	int storeChunkCount = 0, lowId = 0, highId = 0, maxDestId = 0, maxSrcId = 0;
	int readCount = 0, storeCount = 0;

	ByCheckpoint = ByCidRange = ByCidList = LoadIfMissing = false;

	ReadChunkSize = DefaultReadChunkSize;
	WriteChunkSize = DefaultWriteChunkSize;

	Failures = new Dictionary<string, string>();
	NewUndefinedStructureCids = new List<string>();
	LastFailure = "";
	FailureCount = 0;

	// global try loop

	try
	{
		////////////////////////
		/// Parse Parameters ///
		////////////////////////

		// See which database

		argString = (argString ?? "").Trim(); // a null argument falls through to the syntax message

		if (Lex.StartsWith(argString, "Corp"))
		{
			Database = "Corp";
			argString = argString.Substring("Corp".Length).Trim(); // prefix length, so a bare "Corp" doesn't throw
		}

		else if (Lex.StartsWith(argString, "Chembl"))
		{
			Database = "ChEMBL";
			argString = argString.Substring("Chembl".Length).Trim();
		}

		else
		{
			return SyntaxMsg;
		}

		// See which fingerprint type

		FingerprintType = FingerprintType.MACCS; // default to MACCS if type not defined

		if (Lex.TryReplace(ref argString, "MACCS", ""))
		{
			FingerprintType = FingerprintType.MACCS;
			argString = argString.Trim();
		}

		else if (Lex.TryReplace(ref argString, "ECFP4", ""))
		{
			FingerprintType = FingerprintType.Circular; // (i.e. ECFP4)
			argString = argString.Trim();
		}

		FpDao = new FingerprintDao(Database, FingerprintType);

		List<FingerprintRec> fpRecList = new List<FingerprintRec>(); // records buffered for the next write

		string args = argString.Trim();
		string initialMsg = "Update FingerprintDatabase started: " + args;

		CidList = new List<string>(); // init empty list

		//////////////////////
		/// Corp Database ///
		//////////////////////

		if (CorpDatabase)
		{
			if (Lex.Eq(args, "Load"))
			{
				ByCidRange = true;

				ShowProgress("Getting range of CorpIds to insert...");

				maxSrcId = SelectSingleValueDao.SelectInt(SelectMaxCorpId); // get highest id in source db
				if (maxSrcId < 0)
				{
					maxSrcId = 0;
				}

				maxDestId = GetMaxDestMolId(); // highest id already stored
				if (maxDestId < 0)
				{
					maxDestId = 0;
				}
			}

			else if (Lex.Eq(args, "SinceLastCheckpoint"))
			{
				ByCheckpoint = true;

				ShowProgress("Getting list of CorpIds updated since last checkpoint...");

				CidList = GetNewAndModifiedCorpIdList(out CidUpdateDateDict);
				if (CidList.Count == 0)
				{
					return "There have been no updates since the last checkpoint";
				}

				initialMsg += ", CorpIds to add/update: " + CidList.Count;
			}

			else if (Lex.StartsWith(args, "ByCorpIdList"))
			{
				ByCidList = true;
				corpIdList = args.Substring("ByCorpIdList".Length).Trim();
				if (Lex.IsUndefined(corpIdList))
				{
					throw new Exception("Undefined CorpId list");
				}
			}

			else if (Lex.StartsWith(args, "LoadMissing")) // any text following LoadMissing is ignored
			{
				LoadIfMissing = true;

				ShowProgress("Getting list of missing CorpIds...");

				CidList = GetMissingCidList();
				if (CidList.Count == 0)
				{
					return "There are no missing CorpIds";
				}

				initialMsg += ", Missing CorpIds: " + CidList.Count;
			}

			else if (int.TryParse(args, out maxSrcId)) // single CorpId
			{
				ByCidRange = true;
				maxDestId = maxSrcId - 1; // say 1 less is the max we have
			}

			else
			{
				return SyntaxMsg;
			}
		}

		///////////////////////
		/// ChEMBL Database ///
		///////////////////////

		else if (ChemblDatabase)
		{
			if (Lex.Eq(args, "Load"))
			{
				ByCidRange = true;

				ShowProgress("Getting range of MolRegNos to insert...");

				sql = "select max(molregno) from chembl_owner.compound_struct_xxxxxx";
				maxSrcId = SelectSingleValueDao.SelectInt(sql);
				if (maxSrcId < 0)
				{
					maxSrcId = 0;
				}

				maxDestId = GetMaxDestMolId();
				if (maxDestId < 0)
				{
					maxDestId = 0;
				}
			}

			else if (Lex.StartsWith(args, "LoadMissing"))
			{
				LoadIfMissing = true;

				ShowProgress("Getting list of missing ChEMBL Ids...");

				CidList = GetMissingCidList();
				if (CidList.Count == 0)
				{
					return "There are no missing Ids";
				}

				initialMsg += ", Missing Chembl Ids: " + CidList.Count;
			}

			else
			{
				return SyntaxMsg;
			}
		}

		else
		{
			return SyntaxMsg;
		}

		CidListOriginalCount = CidList.Count;
		Log(initialMsg);

		/////////////////////////////
		// Loop over chunks of data
		/////////////////////////////

		for (int chunk = 1; ; chunk++)
		{
			//////////////////////
			/// Corp Database ///
			//////////////////////

			if (CorpDatabase)
			{
				if (ByCheckpoint)
				{
					string cidList = GetNextListChunk();
					if (Lex.IsUndefined(cidList))
					{
						break; // all done
					}

					sql = SelectByCorpIdCriteria;
					sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")");

					string matchString = "order by m.corp_nbr"; // process in molecule-date order instead of id order
					if (!Lex.Contains(sql, matchString))
					{
						throw new Exception(matchString + " not found");
					}

					sql = Lex.Replace(sql, matchString, "order by m.molecule_date");

					msg = "Processing " + CidListOriginalCount + " updates since " + CheckpointDateTime;
				}

				else if (ByCidRange) // by CorpId range
				{
					if (maxDestId >= maxSrcId)
					{
						break; // done
					}

					lowId = maxDestId + 1; // start of next chunk
					highId = lowId + ReadChunkSize;
					maxDestId = highId; // unclamped, so the loop ends once the source maximum is passed

					if (highId >= maxSrcId)
					{
						highId = maxSrcId;
					}

					sql = SelectByCorpIdCriteria;
					sql = Lex.Replace(sql, "<CorpIdCriteria>", "between " + lowId + " and " + highId);

					msg = "Processing CorpId range: " + lowId + " to " + highId;
				}

				else if (ByCidList) // by single user-supplied CorpId list
				{
					if (chunk > 1)
					{
						break; // break 2nd time through
					}

					// NOTE(review): the operator-supplied list is concatenated directly into the SQL text - confirm the caller is trusted
					sql = SelectByCorpIdCriteria;
					sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + corpIdList + ")");

					msg = "Processing CorpId list: " + corpIdList;
				}

				else if (LoadIfMissing)
				{
					string cidList = GetNextListChunk();
					if (Lex.IsUndefined(cidList))
					{
						break; // all done
					}

					sql = SelectByCorpIdCriteria;
					sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")");

					msg = "Processing missing CorpId Chunk: " + Mobius.Data.CidList.FormatAbbreviatedCidListForDisplay(null, cidList) +
						", Total Ids: " + CidListOriginalCount;

					Log(msg);
				}

				else
				{
					return SyntaxMsg;
				}
			}

			///////////////////////
			/// ChEMBL Database ///
			///////////////////////

			else if (ChemblDatabase)
			{
				if (ByCidRange) // by CID range
				{
					if (maxDestId >= maxSrcId)
					{
						break; // done
					}

					lowId = maxDestId + 1; // start of next chunk
					highId = lowId + ReadChunkSize;
					maxDestId = highId; // unclamped, so the loop ends once the source maximum is passed

					if (highId >= maxSrcId) // clamp the range actually queried and reported (same as the Corp range logic)
					{
						highId = maxSrcId;
					}

					sql = SelectChemblSql;
					sql = Lex.Replace(sql, "<molregnoCriteria>", "between " + lowId + " and " + highId);

					msg = "Processing ChEMBL MolRegNo range: " + lowId + " to " + highId;
				}

				else if (LoadIfMissing)
				{
					string cidList = GetNextListChunk();
					if (Lex.IsUndefined(cidList))
					{
						break; // all done
					}

					// NOTE(review): this uses the Corp SQL template although the rows are read below with the ChEMBL column layout - confirm
					sql = SelectByCorpIdCriteria;
					sql = Lex.Replace(sql, "<CorpIdCriteria>", "in (" + cidList + ")");

					msg = "Processing missing ChEMBL Id Chunk: " + Mobius.Data.CidList.FormatAbbreviatedCidListForDisplay(null, cidList) +
						", Total Ids: " + CidListOriginalCount;
				}

				else
				{
					return SyntaxMsg;
				}
			}

			else
			{
				return SyntaxMsg;
			}

			ShowProgress(msg);

			// Execute the SQL to get the rows for the chunk

			DbCommandMx rdr = DbCommandMx.PrepareAndExecuteReader(sql);

			try // be sure the reader is disposed even if storing the chunk throws
			{
				DateTime lastShowProgressTime = DateTime.MinValue;

				///////////////////////////////////////////
				/// Loop over rows in the current chunk ///
				///////////////////////////////////////////

				while (true)
				{
					// Update progress display (at most about once per second)

					if (DateTime.Now.Subtract(lastShowProgressTime).TotalSeconds > 1)
					{
						int storeTotalCount = storeCount + storeChunkCount;

						string msg2 = msg + "\r\n" +
							"Reads: " + readCount + "\r\n" +
							"Undefined: " + NewUndefinedStructureCids.Count + "\r\n" +
							"Insert/Updates: " + storeTotalCount + "\r\n" +
							"Failures: " + FailureCount + "\r\n" +
							"Failure Types: " + Failures.Count + "\r\n" +
							"Last Failure: " + LastFailure;

						ShowProgress(msg2);
						lastShowProgressTime = DateTime.Now;
					}

					// Read and process next compound

					bool readOk = rdr.Read();

					if (readOk)
					{
						readCount++;

						try
						{
							MoleculeMx mol = null;
							int molId;

							//////////////////////
							/// Corp Database ///
							//////////////////////

							if (CorpDatabase)
							{
								int corpId = rdr.GetInt(0); // corp_nbr
								molId = corpId;

								cid = corpId.ToString();
								cid = CompoundId.NormalizeForDatabase(cid);

								if (!rdr.IsNull(1)) // be sure chime field isn't null
								{
									string chime = rdr.GetClob(1);
									if (Lex.IsDefined(chime))
									{
										string molFile = MoleculeMx.ChimeStringToMolfileString(chime); // convert Chime to MolFile
										mol = new MoleculeMx(MoleculeFormat.Molfile, molFile);
									}
								}

								MoleculeDateTime = rdr.GetDateTimeByName("Molecule_Date"); // Date molecule was updated in the CorpDB cartridge DB
							}

							///////////////////////
							/// ChEMBL Database ///
							///////////////////////

							else // chembl
							{
								molId = rdr.GetInt(0); // molregno
								cid = rdr.GetString(1); // ChEMBL id
								string smiles = rdr.GetString(2);
								if (Lex.IsDefined(smiles))
								{
									mol = new MoleculeMx(MoleculeFormat.Smiles, smiles);
								}
							}

							if (MoleculeMx.IsUndefined(mol) || mol.AtomCount <= 1)
							{
								NewUndefinedStructureCids.Add(cid); // remember it but store nothing
								continue;
							}

							bool includeOverallFingerprint = true;
							List<BitSetFingerprint> fps = CdkMol.BuildBitSetFingerprints(mol.MolfileString, includeOverallFingerprint, FingerprintType);

							foreach (BitSetFingerprint fp in fps)
							{
								FingerprintRec fpr = new FingerprintRec();
								fpr.molId = molId;
								fpr.SrcId = SrcDbId;
								fpr.Cid = cid;
								fpr.Cardinality = fp.cardinality();
								fpr.Fingerprint = fp.asBitSet().toLongArray();
								fpRecList.Add(fpr);
							}
						}

						catch (Exception ex) // record the failure against its message and keep going
						{
							if (!Failures.ContainsKey(ex.Message))
							{
								Failures.Add(ex.Message, cid);
							}
							else
							{
								Failures[ex.Message] += ", " + cid;
							}

							LastFailure = "Cid: " + cid + " - " + ex.Message;

							Log(LastFailure);

							FailureCount++;

							continue;
						}

						storeChunkCount++;
					}

					// Store when the write buffer is full or the end of the rows is reached with something pending

					bool commitTransaction = (storeChunkCount >= WriteChunkSize || (!readOk && storeChunkCount > 0));
					if (commitTransaction)
					{
						// if updating by CheckPoint date range then merge existing data with new/updated data

						if (ByCheckpoint)
						{
							if (readCount > 0 && (storeCount > 0 || FailureCount == 0)) // make sure not everything has failed
							{
								MergeRecordsIntoFiles(fpRecList);
							}
						}

						// Simple append of records to files

						else
						{
							FpDao.OpenWriters("bin", FileMode.Append); // open bin files for append

							try // be sure the writers are closed even if a write throws
							{
								foreach (FingerprintRec fpr in fpRecList) // write out buffered recs
								{
									FpDao.WriteFingerprintRec(fpr);
								}
							}

							finally
							{
								FpDao.CloseWriters();
							}

							int cnt = fpRecList.Count;
							if (cnt > 0)
							{
								string cid1 = fpRecList[0].Cid;
								string cid2 = fpRecList[cnt - 1].Cid;
								Log("Records Appended: " + cnt + ", CIDS: " + cid1 + " - " + cid2);
							}
							else
							{
								Log("Records Appended: 0");
							}
						}

						fpRecList.Clear();

						storeCount += storeChunkCount;
						storeChunkCount = 0;
					}

					if (!readOk)
					{
						break;
					}
				} // end of read loop for rows in a chunk
			}

			finally
			{
				rdr.Dispose();
			}
		} // end for loop of chunks

		DeleteTempFiles();

		if (LoadIfMissing) // update list of cids with missing structures
		{
			ExistingUndefinedStructureCids.UnionWith(NewUndefinedStructureCids);
			FpDao.WriteUndefinedStructuresCids(ExistingUndefinedStructureCids);
		}

		msg = "*** Update Complete ***\r\n\r\n" + msg;
		ShowProgress(msg);
		System.Threading.Thread.Sleep(100);

		string logMsg = "UpdateFingerprintDb - CIDs stored: " + storeCount + ", Undefined structures: " + NewUndefinedStructureCids.Count + ", failures: " + FailureCount + "\r\n";

		foreach (string key in Failures.Keys)
		{
			logMsg += key + " - CIDs: " + Failures[key] + "\r\n";
		}

		Log(logMsg);

		return logMsg;
	} // end of main try loop

	catch (Exception ex)
	{
		Log(DebugLog.FormatExceptionMessage(ex));
		throw new Exception(ex.Message, ex);
	}
}