// ----------------- private methods -----------------

/// <summary>
/// Makes sure that, for a single molecule, the way the hashes are created and folded
/// is consistent with a reference.
/// </summary>
/// <param name="molstr">SD-file text; only the first record read from it is used</param>
/// <param name="classType">fingerprint class handed to the <see cref="Bayesian"/> model</param>
/// <param name="folding">folding size handed to the <see cref="Bayesian"/> model</param>
/// <param name="refHash">expected hash codes, compared element by element with the calculated ones</param>
/// <exception cref="CDKException">the calculated hashes differ from <paramref name="refHash"/> in length or content</exception>
private static void CheckFP(string molstr, CircularFingerprinterClass classType, int folding, int[] refHash)
{
    // the label is only used for the log line; anything that is not ECFP6 is reported as FCFP6
    var strType = classType == CircularFingerprinterClass.ECFP6 ? "ECFP6" : "FCFP6";
    WriteLine($"Comparing hash codes for {strType}/folding={folding}");

    // dispose the reader (and the StringReader it wraps) once the first record has been consumed
    using (var rdr = new EnumerableSDFReader(new StringReader(molstr), ChemObjectBuilder.Instance))
    {
        var mol = rdr.First();

        var model = new Bayesian(classType, folding);
        model.AddMolecule(mol, false);

        var calcHash = model.Training[0];
        var same = calcHash.Length == refHash.Length;
        for (int n = 0; same && n < calcHash.Length; n++)
        {
            same = calcHash[n] == refHash[n];
        }

        if (!same)
        {
            WriteLine($" ** calculated: {ArrayStr(calcHash)}");
            WriteLine($" ** reference: {ArrayStr(refHash)}");
            throw new CDKException("Hashes differ.");
        }
    }
}
/// <summary>
/// Walks every record of an embedded SD file and requires the folded fingerprint computed for it
/// to be literally identical to the reference string stored in one of the record's properties.
/// </summary>
/// <param name="sdfile">resource name, resolved under <c>NCDK.Data.CDD.</c></param>
/// <param name="fpField">name of the record property holding the reference hash string</param>
/// <param name="classType">fingerprint class handed to the <see cref="Bayesian"/> model</param>
/// <param name="folding">folding size handed to the <see cref="Bayesian"/> model</param>
private static void CompareFolding(string sdfile, string fpField, CircularFingerprinterClass classType, int folding)
{
    WriteLine($"[{sdfile}] calculation of: {fpField}");

    using (var ins = ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"))
    {
        int position = 0;
        foreach (var mol in new EnumerableSDFReader(ins, ChemObjectBuilder.Instance))
        {
            position += 1;

            // a fresh model per record, so Training[0] always belongs to the current molecule
            var single = new Bayesian(classType, folding);
            single.AddMolecule(mol, false);

            var actual = ArrayStr(single.Training[0]);
            var expected = (string)mol.GetProperties()[fpField];
            Assert.AreEqual(expected, actual, $"Folded hashes do not match reference at {position}.");
        }
    }
}
/// <summary>
/// Creates a Bayesian model that holds no data yet.
/// </summary>
/// <param name="classType">one of the <see cref="CircularFingerprinterClass"/> enum values</param>
/// <param name="folding">the maximum number of fingerprint bits, which must be a power of 2 (e.g. 1024, 2048) or 0 for no folding</param>
/// <exception cref="ArithmeticException"><paramref name="folding"/> is negative, or positive but not a power of 2</exception>
public Bayesian(CircularFingerprinterClass classType, int folding)
{
    ClassType = classType;
    Folding = folding;

    // A positive power of two has exactly one bit set, so clearing its lowest set bit
    // (folding & (folding - 1)) leaves zero; any other positive value leaves a remainder.
    bool isNegative = folding < 0;
    bool hasExtraBits = folding > 0 && (folding & (folding - 1)) != 0;

    if (isNegative || hasExtraBits)
    {
        throw new ArithmeticException("Fingerprint folding " + folding + " invalid: must be 0 or power of 2.");
    }
}
/// <summary>
/// Bulk test: loads an embedded SD file, trains and validates a Bayesian model with the given
/// parameters, then compares the result with a previously serialised reference model.
/// </summary>
/// <param name="sdfile">training set resource name, resolved under <c>NCDK.Data.CDD.</c></param>
/// <param name="actvField">record property holding the activity: "true", "false", or an integer that must be 0 or 1</param>
/// <param name="classType">fingerprint class handed to the model</param>
/// <param name="folding">folding size handed to the model</param>
/// <param name="xval">3 selects three-fold validation, 5 selects five-fold, anything else leave-one-out</param>
/// <param name="modelFN">reference model resource name, resolved under <c>NCDK.Data.CDD.</c></param>
/// <param name="perceiveStereo">value assigned to the model's <c>PerceiveStereo</c> property</param>
/// <exception cref="CDKException">the activity value is invalid, the comparison fails, or any other exception occurs (wrapped as "Test failed")</exception>
private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN, bool perceiveStereo)
{
    WriteLine("[" + modelFN + "]");
    WriteLine(" Loading " + sdfile);

    try
    {
        var model = new Bayesian(classType, folding);
        model.PerceiveStereo = perceiveStereo;

        // ---- feed every record of the SD file into the model ----
        int total = 0;
        int actives = 0;
        using (var rdr = new EnumerableSDFReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"), ChemObjectBuilder.Instance))
        {
            foreach (var mol in rdr)
            {
                total++;

                var text = (string)mol.GetProperties()[actvField];
                int flag;
                if (text.Equals("true", StringComparison.Ordinal))
                {
                    flag = 1;
                }
                else if (text.Equals("false", StringComparison.Ordinal))
                {
                    flag = 0;
                }
                else
                {
                    flag = int.Parse(text, NumberFormatInfo.InvariantInfo);
                }

                bool isBinary = flag == 0 || flag == 1;
                if (!isBinary)
                {
                    throw new CDKException("Activity field not found or invalid");
                }

                model.AddMolecule(mol, flag == 1);
                actives += flag;
            }
        }
        WriteLine($" Training with {total} rows, {actives} actives, {(total - actives)} inactives");

        // ---- build and validate ----
        model.Build();
        switch (xval)
        {
            case 3:
                model.ValidateThreeFold();
                break;
            case 5:
                model.ValidateFiveFold();
                break;
            default:
                model.ValidateLeaveOneOut();
                break;
        }
        WriteLine($" Validation: ROC AUC={model.RocAUC}");

        // ---- load the serialised reference ----
        WriteLine($" Parsing reference model");
        Bayesian reference;
        using (var mrdr = new StreamReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{modelFN}")))
        {
            reference = Bayesian.Deserialise(mrdr);
        }

        // ---- compare the details; every mismatch is logged before the test fails ----
        const double thresholdTolerance = 0.00000000000001;
        bool failed = false;

        if (model.Folding != reference.Folding)
        {
            WriteLine($" ** reference folding size={reference.Folding}");
            failed = true;
        }
        if (model.TrainingSize != reference.TrainingSize)
        {
            WriteLine($" ** reference training size={reference.TrainingSize}");
            failed = true;
        }
        if (model.TrainingActives != reference.TrainingActives)
        {
            WriteLine($" ** reference training actives={reference.TrainingActives}");
            failed = true;
        }
        if (model.RocType != reference.RocType)
        {
            WriteLine($" ** reference ROC type={reference.RocType}");
            failed = true;
        }
        if (!DblEqual(model.RocAUC, reference.RocAUC))
        {
            WriteLine($" ** reference ROC AUC={reference.RocAUC}");
            failed = true;
        }
        if (Math.Abs(model.LowThreshold - reference.LowThreshold) > thresholdTolerance)
        {
            WriteLine($" ** reference lowThresh={reference.LowThreshold} different to calculated {model.LowThreshold}");
            failed = true;
        }
        if (Math.Abs(model.HighThreshold - reference.HighThreshold) > thresholdTolerance)
        {
            WriteLine($" ** reference highThresh={reference.HighThreshold} different to calculated {model.HighThreshold}");
            failed = true;
        }

        // individual hash bit contributions must match as well
        var mbits = model.Contributions;
        var rbits = reference.Contributions;
        if (mbits.Count != rbits.Count)
        {
            WriteLine($" ** model has {mbits.Count} contribution bits, reference has {rbits.Count}");
            failed = true;
        }

        // in each of the scans below, the first offender is reported and the scan stops
        foreach (var bit in mbits.Keys)
        {
            if (rbits.ContainsKey(bit))
            {
                continue;
            }
            WriteLine($" ** model hash bit {bit} not found in reference");
            failed = true;
            break;
        }
        foreach (var bit in rbits.Keys)
        {
            if (mbits.ContainsKey(bit))
            {
                continue;
            }
            WriteLine($" ** reference hash bit {bit} not found in model");
            failed = true;
            break;
        }
        foreach (var bit in mbits.Keys)
        {
            if (!rbits.ContainsKey(bit))
            {
                continue;
            }
            double modelValue = mbits[bit];
            double referenceValue = rbits[bit];
            if (DblEqual(modelValue, referenceValue))
            {
                continue;
            }
            WriteLine($" ** contribution for bit {bit}: model={modelValue}, reference={referenceValue}");
            failed = true;
            break;
        }

        if (failed)
        {
            throw new CDKException("Comparison to reference failed");
        }
    }
    catch (Exception ex) when (!(ex is CDKException))
    {
        // CDKExceptions pass through untouched; everything else is wrapped
        throw new CDKException("Test failed", ex);
    }
}
/// <summary>
/// Performs a bulk test: loads an SD file, builds a model with the given parameters, and compares it
/// to a reference model that has been previously serialised. Forwards to the seven-argument overload
/// with <c>perceiveStereo</c> set to <c>false</c>.
/// </summary>
private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN)
    => RunTest(sdfile, actvField, classType, folding, xval, modelFN, perceiveStereo: false);
// ----------------- public methods -----------------

/// <summary>
/// Creates a Bayesian model that holds no data yet; only the fingerprint class is recorded.
/// </summary>
/// <param name="classType">one of the <see cref="CircularFingerprinterClass"/> enum values</param>
public Bayesian(CircularFingerprinterClass classType)
{
    ClassType = classType;
}
/// <summary>
/// Ad-hoc debugging routine: reads the structures in a fixed local file and, for each one, runs the
/// circular fingerprinter's count, per-fingerprint and bit-fingerprint calls on its largest fragment.
/// Nothing is returned; the intermediate values exist so they can be inspected in a debugger.
/// </summary>
public static void BuildTest()
{
    const int fpLength = 2048; // folded binary fp length

    // ECFP4 is the class the original code settled on for debugging
    var cfp = new CircularFingerprinter(CircularFingerprinterClass.ECFP4, fpLength);

    // 'using' guarantees the file and the SDF reader are released even if a fingerprint call throws
    using (var reader = new StreamReader(@"C:\Download\CorpId-12345.mol"))
    using (var rdr = new EnumerableSDFReader(reader.BaseStream, ChemObjectBuilder.Instance))
    {
        rdr.ReaderMode = ChemObjectReaderMode.Relaxed;

        foreach (IAtomContainer entry in rdr)
        {
            IAtomContainer mol = CdkMol.GetLargestMoleculeFragment(entry);

            // hash values and counts for each
            ICountFingerprint countFp = cfp.GetCountFingerprint(mol);

            cfp.Calculate(mol);
            int fpCount = cfp.FPCount;
            for (int fpi = 0; fpi < fpCount; fpi++)
            {
                // hash, iteration and lists of atoms (dups appear multiple times)
                CircularFingerprint fp = cfp.GetFP(fpi);
            }

            IBitFingerprint bfp = cfp.GetBitFingerprint(mol);
            BitArray bs = bfp.AsBitSet();
            int bsCard = bfp.Cardinality;
            long bsSize = bfp.Length;
        }
    }
}
/// <summary>
/// Builds a bit-set fingerprint of the requested type for a molecule.
/// </summary>
/// <param name="mol">molecule to fingerprint; must not be null</param>
/// <param name="fpType">which fingerprinter to use</param>
/// <param name="fpSubtype">
/// only read for <c>FingerprintType.Circular</c>: cast to <see cref="CircularFingerprinterClass"/>;
/// values outside ECFP0..ECFP6 are replaced by <c>CircularFingerprintType.DefaultCircularClass</c>
/// </param>
/// <param name="fpLen">
/// only read for <c>FingerprintType.Circular</c>: fingerprint length; a negative value is replaced by
/// <c>CircularFingerprintType.DefaultCircularLength</c>
/// </param>
/// <returns>the fingerprint produced by the selected fingerprinter, cast to <see cref="BitSetFingerprint"/></returns>
/// <exception cref="ArgumentNullException"><paramref name="mol"/> is null</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="fpType"/> is not one of the handled types</exception>
public static BitSetFingerprint BuildBitSetFingerprint(
    IAtomContainer mol,
    FingerprintType fpType,
    int fpSubtype = -1,
    int fpLen = -1)
{
    if (mol == null)
    {
        throw new ArgumentNullException(nameof(mol));
    }

    // Data for Tanimoto similarity using various fingerprint types for CorpId 123456 query.
    // Cart - Standard MDL Oracle Cartridge scores
    //
    //                        Similarity Score
    //         ------------------------------------------------
    // Size ->        192    896   1024  1024  128     1024   320
    // CorpId  Cart   MACCS  PbChm ECFP4 EXT   EState  Basic  Sbstr
    // ------  ----   ----   ----  ----  ----  ----    ----   ----
    // 123456  0.99   1.00   1.00  1.00  1.00  1.00    1.00   1.00
    // 123456  0.99   0.98   0.96  0.77  0.95  1.00    0.95   1.00
    // 123456  0.99   0.98   0.96  0.77  0.95  1.00    0.94   1.00
    // 123456  0.99   1.00   1.00  1.00  1.00  1.00    1.00   1.00
    // 123456  0.99   1.00   1.00  1.00  1.00  1.00    1.00   1.00
    // 123456  0.99   0.91   1.00  0.81  1.00  1.00    1.00   1.00
    // 123456  0.98   0.95   1.00  0.74  0.92  1.00    0.93   0.94
    // 123456  0.98   1.00   1.00  1.00  1.00  1.00    1.00   1.00
    // 123456  0.98   1.00   1.00  1.00  1.00  1.00    1.00   1.00
    // 123456  0.98   1.00   0.83  0.76  0.77  0.90    0.76   0.94

    // LSH Bin Count - The number of LSH bins (of 25) that match the query bin values
    // --------------
    // CorpId  MAC  PbC  ECFP  EX
    // ------  ---  ---  ---   ---
    // 123456  25   25   25    25
    // 123456  25   20   7     16
    // 123456  25   20   9     19
    // 123456  25   25   25    25
    // 123456  25   25   25    25
    // 123456  20   25   9     25
    // 123456  21   25   11    17
    // 123456  25   25   25    25
    // 123456  25   25   25    25
    // 123456  25   9    6     11

    // Data for Tanimoto similarity using various Circular fingerprint types.
    // Using 2 molecules where the 2nd just has an added methyl group.
    //
    // Measure   Score
    // --------  -----
    // ECFP0     1.00
    // ECFP2     .88
    // ECFP4     .75
    // ECFP6     .64
    // FCFP0     1.00
    // FCFP2     .92
    // FCFP4     .84
    // FCFP6     .74

    IFingerprinter ifptr;

    if (fpType == FingerprintType.Basic) // size = 1024
    {
        ifptr = new Fingerprinter();
    }
    else if (fpType == FingerprintType.Circular) // size variable
    {
        CircularFingerprinterClass cfpClass = (CircularFingerprinterClass)fpSubtype;

        // NOTE(review): only values between ECFP0 and ECFP6 are accepted here. If the FCFP classes
        // are declared outside that range they silently fall back to the default class - confirm
        // that this is intended.
        if (cfpClass < CircularFingerprinterClass.ECFP0 || cfpClass > CircularFingerprinterClass.ECFP6)
        {
            cfpClass = (CircularFingerprinterClass)CircularFingerprintType.DefaultCircularClass; // default class
        }

        if (fpLen < 0)
        {
            fpLen = CircularFingerprintType.DefaultCircularLength; // default length
        }

        ifptr = new CircularFingerprinter(cfpClass, fpLen);
    }
    else if (fpType == FingerprintType.Extended) // size = 1024
    {
        ifptr = new ExtendedFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
    }
    else if (fpType == FingerprintType.EState) // size = 128
    {
        ifptr = new EStateFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
    }
    else if (fpType == FingerprintType.MACCS) // size = 192
    {
        // the MACCS fingerprinter is created once and then reused through the MACCSFp field
        if (MACCSFp == null)
        {
            MACCSFp = new MACCSFingerprinter();
        }
        ifptr = MACCSFp;
    }
    else if (fpType == FingerprintType.PubChem) // size = 896
    {
        ifptr = new PubchemFingerprinter();
    }
    else if (fpType == FingerprintType.ShortestPath)
    {
        ifptr = new ShortestPathFingerprinter(); // fails with atom type issue for many structures (e.g. 123456)
    }
    else if (fpType == FingerprintType.Signature)
    {
        ifptr = new SignatureFingerprinter(); // can't convert array fingerprint to bitsetfingerprint
    }
    else if (fpType == FingerprintType.Substructure) // size = 320
    {
        ifptr = new SubstructureFingerprinter();
    }
    else
    {
        throw new ArgumentOutOfRangeException(nameof(fpType), "Invalid CdkFingerprintType: " + fpType);
    }

    // the cast is kept as a hard cast so an unexpected fingerprint implementation fails loudly
    return (BitSetFingerprint)ifptr.GetBitFingerprint(mol);
}