// Asserts that every SMARTS string generated from the circular-fingerprint atom sets of
// the molecule parsed from moleculeSmiles is one of the strings listed in expectedFPSmarts.
// expectedFPSmarts is an array of arrays because several equivalent spellings of the same
// SMARTS may be supplied for each fragment (e.g. CCC and C(C)C); all spellings are accepted.
private static void CheckFPSmartsForMolecule(string moleculeSmiles, string[][] expectedFPSmarts)
{
    // Flatten every accepted spelling into a single lookup set.
    var acceptable = new HashSet<string>();
    for (int group = 0; group < expectedFPSmarts.Length; group++)
    {
        string[] variants = expectedFPSmarts[group];
        for (int v = 0; v < variants.Length; v++)
        {
            acceptable.Add(variants[v]);
        }
    }

    var molecule = parser.ParseSmiles(moleculeSmiles);

    var fingerprinter = new CircularFingerprinter();
    fingerprinter.Calculate(molecule);

    var extractor = new SmartsFragmentExtractor(molecule);
    extractor.SetMode(SubstructureSelectionMode.JCompoundMapper);

    // Generate one SMARTS per fingerprint, from the atoms that fingerprint covers.
    var generated = new HashSet<string>();
    int fingerprintCount = fingerprinter.FPCount;
    int index = 0;
    while (index < fingerprintCount)
    {
        var fingerprint = fingerprinter.GetFP(index);
        generated.Add(extractor.Generate(fingerprint.Atoms));
        index++;
    }

    // Nothing may be generated that was not listed as acceptable.
    Assert.IsTrue(generated.IsSubsetOf(acceptable));
}
/// <summary>
/// Renders the fingerprints currently held by a circular fingerprinter as tab-separated text.
/// </summary>
/// <param name="cfp">Fingerprinter whose fingerprints, indexed 0 to FPCount - 1, are listed.</param>
/// <returns>
/// A header row ("fp", "hashCode", "iteration", "atoms") followed by one row per fingerprint
/// holding its index, hash, iteration and a parenthesised, comma-separated atom list.
/// Every row, including the header, ends with CR LF.
/// </returns>
public static string CircularFpToString(CircularFingerprinter cfp)
{
    // A builder avoids re-copying the whole accumulated text on every row,
    // which the previous string concatenation in the loop did.
    var text = new System.Text.StringBuilder("fp\thashCode\titeration\tatoms\r\n");

    int fpCount = cfp.FPCount;
    for (int fpi = 0; fpi < fpCount; fpi++)
    {
        CircularFingerprint fp = cfp.GetFP(fpi);
        text.Append(fpi.ToString())
            .Append("\t").Append(fp.Hash)
            .Append("\t").Append(fp.Iteration)
            .Append("\t(").Append(string.Join(", ", fp.Atoms)).Append(")\r\n");
    }

    return text.ToString();
}
/// <summary> /// Appends a new row to the model source data, which consists of a molecule and whether or not it /// is considered active. /// </summary> /// <param name="mol">molecular structure, which must be non-blank</param> /// <param name="active">whether active or not</param> public void AddMolecule(IAtomContainer mol, bool active) { if (mol == null || mol.Atoms.Count == 0) { throw new CDKException("Molecule cannot be blank or null."); } var circ = new CircularFingerprinter(ClassType) { PerceiveStereo = this.PerceiveStereo }; circ.Calculate(mol); // gather all of the (folded) fingerprints into a sorted set int AND_BITS = Folding - 1; // e.g. 1024/0x400 -> 1023/0x3FF: chop off higher order bits var hashset = new SortedSet <int>(); for (int n = circ.FPCount - 1; n >= 0; n--) { int code = circ.GetFP(n).Hash; if (Folding > 0) { code &= AND_BITS; } hashset.Add(code); } // convert the set into a sorted primitive array var hashes = new int[hashset.Count]; int p = 0; foreach (var h in hashset) { hashes[p++] = h; } // record the processed information for model building purposes if (active) { numActive++; } training.Add(hashes); activity.Add(active); foreach (var h in hashes) { if (!inHash.TryGetValue(h, out int[] stash))
// Checks that the bit fingerprint of trivialMol, produced by a default-constructed
// CircularFingerprinter, has exactly the bits listed in REQUIRE_BITS set.
// Throws CDKException naming both bit sets when they differ.
public void TestGetBitFingerprint()
{
    Assert.IsTrue(trivialMol != null);

    var circ = new CircularFingerprinter();
    var result = circ.GetBitFingerprint(trivialMol);

    BitArray wantBits = new BitArray(0), gotBits = result.AsBitSet();
    int[] REQUIRE_BITS = { 19, 152, 293, 340, 439, 480, 507, 726, 762, 947, 993 };
    foreach (var b in REQUIRE_BITS)
    {
        BitArrays.SetValue(wantBits, b, true);
    }

    if (!BitArrays.Equals(wantBits, gotBits))
    {
        // BitArray does not override ToString(), so concatenating it directly would only
        // print the type name; list the set bit indices so the failure is diagnosable.
        throw new CDKException("Got " + FormatSetBits(gotBits) + ", wanted " + FormatSetBits(wantBits));
    }
}

// Formats the indices of the set bits of a BitArray as "{i, j, k}" ("null" for a null array).
private static string FormatSetBits(BitArray bits)
{
    if (bits == null)
    {
        return "null";
    }

    var text = new System.Text.StringBuilder("{");
    for (int i = 0; i < bits.Length; i++)
    {
        if (!bits[i])
        {
            continue;
        }
        if (text.Length > 1)
        {
            text.Append(", ");
        }
        text.Append(i);
    }
    return text.Append('}').ToString();
}
// Compares fingerprints of the same structure obtained from two SMILES strings and from a
// molfile, first with PerceiveStereo switched on and then with it switched off.
// With it on, the two SMILES-derived fingerprints must match each other but differ from the
// molfile-derived one; with it off, all three must match.
public void TestUseStereoElements()
{
    const string firstSmiles = "CC[C@@H](C)O";
    const string secondSmiles = "CC[C@H](O)C";

    // NOTE(review): V2000 molfiles are column-sensitive - confirm the spacing inside these
    // literals is intact before relying on this input.
    const string molfile =
        "\n"
        + " CDK 10121722462D \n"
        + "\n"
        + " 5 4 0 0 0 0 999 V2000\n"
        + " -4.1837 2.6984 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n"
        + " -3.4692 3.1109 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n"
        + " -2.7547 2.6984 0.0000 C 0 0 1 0 0 0 0 0 0 0 0 0\n"
        + " -2.0403 3.1109 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n"
        + " -2.7547 1.8734 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n"
        + " 1 2 1 0 0 0 0\n"
        + " 2 3 1 0 0 0 0\n"
        + " 3 4 1 0 0 0 0\n"
        + " 3 5 1 1 0 0 0\n"
        + "M END\n";

    IChemObjectBuilder builder = CDK.Builder;
    var molReader = new MDLV2000Reader(new StringReader(molfile));
    var smilesParser = new SmilesParser(builder);

    var fromFirstSmiles = smilesParser.ParseSmiles(firstSmiles);
    var fromSecondSmiles = smilesParser.ParseSmiles(secondSmiles);
    IAtomContainer fromMolfile = molReader.Read(builder.NewAtomContainer());

    var fingerprinter = new CircularFingerprinter();

    // When stereo-chemistry is perceived we don't have coordinates from the SMILES
    // and so get a different fingerprint than for the molfile.
    fingerprinter.PerceiveStereo = true;
    Assert.IsTrue(Compares.AreEqual(
        fingerprinter.GetFingerprint(fromFirstSmiles),
        fingerprinter.GetFingerprint(fromSecondSmiles)));
    Assert.IsFalse(Compares.AreEqual(
        fingerprinter.GetFingerprint(fromSecondSmiles),
        fingerprinter.GetFingerprint(fromMolfile)));

    // Without stereo perception all three inputs are expected to agree.
    fingerprinter.PerceiveStereo = false;
    Assert.IsTrue(Compares.AreEqual(
        fingerprinter.GetFingerprint(fromFirstSmiles),
        fingerprinter.GetFingerprint(fromSecondSmiles)));
    Assert.IsTrue(Compares.AreEqual(
        fingerprinter.GetFingerprint(fromSecondSmiles),
        fingerprinter.GetFingerprint(fromMolfile)));
}
// Checks the count fingerprint of trivialMol against a fixed answer key of (hash, count)
// pairs: the number of populated bins must equal the number of pairs, every reported hash
// must appear in the key, and its count must equal the keyed count.
// Any mismatch is reported by throwing CDKException.
public void TestGetCountFingerprint()
{
    Assert.IsTrue(trivialMol != null);

    var fingerprinter = new CircularFingerprinter();
    var counts = fingerprinter.GetCountFingerprint(trivialMol);

    // Flat list of pairs: even positions hold the hash, the following odd position its count.
    int[] expectedPairs =
    {
        -414937772, 1,
        -1027418143, 1,
        1627608083, 1,
        -868007456, 1,
        -1006701866, 1,
        -1059145289, 1,
        -801752141, 1,
        790592664, 1,
        -289109509, 1,
        -1650154758, 1,
        1286833445, 1
    };
    int expectedBins = expectedPairs.Length >> 1;

    if (counts.GetNumberOfPopulatedBins() != expectedBins)
    {
        throw new CDKException("Hash values do not match.");
    }

    int bin = 0;
    while (bin < counts.GetNumberOfPopulatedBins())
    {
        int actualHash = counts.GetHash(bin);
        int actualCount = counts.GetCount(bin);

        bool matched = false;
        for (int k = 0; k < expectedBins * 2; k += 2)
        {
            if (expectedPairs[k] != actualHash)
            {
                continue;
            }

            matched = true;
            int expectedCount = expectedPairs[k + 1];
            if (actualCount != expectedCount)
            {
                throw new CDKException("For hash " + actualHash + " got count " + actualCount + " but wanted " + expectedCount);
            }
        }

        if (!matched)
        {
            throw new CDKException("Hash values do not match.");
        }

        bin++;
    }
}
/// <summary>
/// Ad-hoc debugging harness: reads the structures in a hard-coded file and, for each one,
/// builds the count fingerprint, the individual circular fingerprints and the folded bit
/// fingerprint so the intermediate values can be inspected in a debugger.
/// </summary>
/// <remarks>
/// Nothing is returned or asserted. The input path (C:\Download\CorpId-12345.mol) is hard-coded,
/// and the fingerprinter is configured as ECFP4 folded to 2048 bits.
/// </remarks>
public static void BuildTest()
{
    CircularFingerprinterClass FpClass = CircularFingerprinterClass.ECFP4; // FP diameter (debug setting)
    int FpLen = 2048; // folded binary fp length

    CircularFingerprinter cfp = new CircularFingerprinter(FpClass, FpLen);

    // "using" closes the file even when reading or fingerprinting throws;
    // previously it was closed only on the normal path.
    using (StreamReader reader = new StreamReader(@"C:\Download\CorpId-12345.mol"))
    {
        EnumerableSDFReader rdr = new EnumerableSDFReader(reader.BaseStream, ChemObjectBuilder.Instance);
        rdr.ReaderMode = ChemObjectReaderMode.Relaxed;

        // foreach also disposes the enumerator, which the manual MoveNext loop did not.
        foreach (IAtomContainer entry in rdr)
        {
            IAtomContainer mol = CdkMol.GetLargestMoleculeFragment(entry);

            ICountFingerprint cfp1 = cfp.GetCountFingerprint(mol); // get hash values and counts for each

            cfp.Calculate(mol);
            int fpCount = cfp.FPCount;
            for (int fpi = 0; fpi < fpCount; fpi++)
            {
                CircularFingerprint cfp2 = cfp.GetFP(fpi); // gets hash, iteration and lists of atoms (dups appear multiple times)
            }

            IBitFingerprint bfp = cfp.GetBitFingerprint(mol);
            BitArray bs = bfp.AsBitSet();
            int bsCard = bfp.Cardinality;
            long bsSize = bfp.Length;
        }
    }
}
/// <summary>
/// Builds a bit-set fingerprint for a molecule using the fingerprinter selected by
/// <paramref name="fpType"/>.
/// </summary>
/// <param name="mol">Molecule to fingerprint; must not be null.</param>
/// <param name="fpType">Which fingerprinter to use.</param>
/// <param name="fpSubtype">
/// Used only for <c>FingerprintType.Circular</c>: the circular fingerprint class as an integer.
/// Values outside the ECFP0..ECFP6 range (including the default of -1) select
/// <c>CircularFingerprintType.DefaultCircularClass</c>.
/// </param>
/// <param name="fpLen">
/// Used only for <c>FingerprintType.Circular</c>: the fingerprint length passed to the
/// fingerprinter. A negative value (the default) selects
/// <c>CircularFingerprintType.DefaultCircularLength</c>.
/// </param>
/// <returns>The fingerprint returned by the selected fingerprinter's <c>GetBitFingerprint</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mol"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="fpType"/> is not a handled type.</exception>
/// <exception cref="InvalidCastException">
/// The selected fingerprinter returned a fingerprint that is not a <c>BitSetFingerprint</c>.
/// </exception>
public static BitSetFingerprint BuildBitSetFingerprint(
    IAtomContainer mol,
    FingerprintType fpType,
    int fpSubtype = -1,
    int fpLen = -1)
{
    // Data for Tanimoto similarity using various fingerprint types for CorpId 123456 query.
    // Cart - Standard MDL Oracle Cartridge scores
    //
    //                            Similarity Score
    //         ------------------------------------------------------
    // Size ->        192    896    1024   1024   128     1024   320
    // CorpId  Cart   MACCS  PbChm  ECFP4  EXT    EState  Basic  Sbstr
    // ------  ----   ----   ----   ----   ----   ----    ----   ----
    // 123456  0.99   1.00   1.00   1.00   1.00   1.00    1.00   1.00
    // 123456  0.99   0.98   0.96   0.77   0.95   1.00    0.95   1.00
    // 123456  0.99   0.98   0.96   0.77   0.95   1.00    0.94   1.00
    // 123456  0.99   1.00   1.00   1.00   1.00   1.00    1.00   1.00
    // 123456  0.99   1.00   1.00   1.00   1.00   1.00    1.00   1.00
    // 123456  0.99   0.91   1.00   0.81   1.00   1.00    1.00   1.00
    // 123456  0.98   0.95   1.00   0.74   0.92   1.00    0.93   0.94
    // 123456  0.98   1.00   1.00   1.00   1.00   1.00    1.00   1.00
    // 123456  0.98   1.00   1.00   1.00   1.00   1.00    1.00   1.00
    // 123456  0.98   1.00   0.83   0.76   0.77   0.90    0.76   0.94
    //
    // LSH Bin Count - The number of LSH bins (of 25) that match the query bin values
    // --------------
    // CorpId  MAC  PbC  ECFP  EX
    // ------  ---  ---  ---   ---
    // 123456  25   25   25    25
    // 123456  25   20   7     16
    // 123456  25   20   9     19
    // 123456  25   25   25    25
    // 123456  25   25   25    25
    // 123456  20   25   9     25
    // 123456  21   25   11    17
    // 123456  25   25   25    25
    // 123456  25   25   25    25
    // 123456  25   9    6     11
    //
    // Data for Tanimoto similarity using various Circular fingerprint types.
    // Using 2 molecules where the 2nd just has an added methyl group.
    //
    // Measure   Score
    // --------  -----
    // ECFP0     1.00
    // ECFP2      .88
    // ECFP4      .75
    // ECFP6      .64
    // FCFP0     1.00
    // FCFP2      .92
    // FCFP4      .84
    // FCFP6      .74

    if (mol == null)
    {
        throw new ArgumentNullException(nameof(mol));
    }

    IFingerprinter ifptr = null;
    IBitFingerprint ibfp = null;
    BitSetFingerprint bfp = null;

    // Timings are kept only for inspection while debugging; they are not returned.
    DateTime t0 = DateTime.Now;
    double getFptrTime = 0, buildFpTime = 0;

    if (fpType == FingerprintType.Basic) // size = 1024
    {
        ifptr = new Fingerprinter();
    }
    else if (fpType == FingerprintType.Circular) // size variable
    {
        // NOTE(review): only classes between ECFP0 and ECFP6 pass this check. If the enum
        // orders the FCFP classes after ECFP6 they are silently replaced by the default
        // class - confirm that is intended.
        CircularFingerprinterClass cfpClass = (CircularFingerprinterClass)fpSubtype;
        if (cfpClass < CircularFingerprinterClass.ECFP0 || cfpClass > CircularFingerprinterClass.ECFP6)
        {
            cfpClass = (CircularFingerprinterClass)CircularFingerprintType.DefaultCircularClass; // default class
        }

        if (fpLen < 0)
        {
            fpLen = CircularFingerprintType.DefaultCircularLength; // default length
        }

        ifptr = new CircularFingerprinter(cfpClass, fpLen);
    }
    else if (fpType == FingerprintType.Extended) // size = 1024
    {
        ifptr = new ExtendedFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
    }
    else if (fpType == FingerprintType.EState) // size = 128
    {
        ifptr = new EStateFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
    }
    else if (fpType == FingerprintType.MACCS) // size = 192
    {
        // The MACCS fingerprinter is created once and reused on later calls.
        if (MACCSFp == null)
        {
            MACCSFp = new MACCSFingerprinter();
        }
        ifptr = MACCSFp;
    }
    else if (fpType == FingerprintType.PubChem) // size = 896
    {
        ifptr = new PubchemFingerprinter();
    }
    else if (fpType == FingerprintType.ShortestPath) // size =
    {
        ifptr = new ShortestPathFingerprinter(); // fails with atom type issue for many structures (e.g. 123456)
    }
    else if (fpType == FingerprintType.Signature) // size =
    {
        ifptr = new SignatureFingerprinter(); // can't convert array fingerprint to bitsetfingerprint
    }
    else if (fpType == FingerprintType.Substructure) // size = 320
    {
        ifptr = new SubstructureFingerprinter();
    }
    else
    {
        // ArgumentException derives from Exception, so existing catch blocks still apply;
        // the message text is unchanged.
        throw new ArgumentException("Invalid CdkFingerprintType: " + fpType);
    }

    getFptrTime = TimeOfDay.Delta(ref t0);

    ibfp = ifptr.GetBitFingerprint(mol);

    // Same exception type as the direct cast would raise, but the message names the
    // fingerprint type and the class that was actually returned.
    bfp = ibfp as BitSetFingerprint;
    if (bfp == null && ibfp != null)
    {
        throw new InvalidCastException(
            "Fingerprinter for " + fpType + " returned " + ibfp.GetType().Name
            + ", which cannot be used as a BitSetFingerprint.");
    }

    buildFpTime = TimeOfDay.Delta(ref t0);

    return bfp;
}