/// <summary>
/// Verifies CosineDistance.GetDistance against precomputed expected values for
/// the point pairs (p0,q0)..(p5,q5) declared elsewhere in this fixture.
/// </summary>
public void CosineDistanceTest()
{
    CosineDistance dist = new CosineDistance();

    // TODO: Re-enable argument checking only for debugging
    // Assert.Throws<ArgumentException>(() => dist.GetDistance(p0, q4));

    // Assert.AreEqual(expected, actual, delta): the expected value comes first,
    // so failure messages report expected/actual the right way round.
    double result = dist.GetDistance(p0, q0);
    Assert.AreEqual(.2, result, 0.00001);

    result = dist.GetDistance(p1, q1);
    Assert.AreEqual(0.029857, result, 0.00001);

    // Compare with a tolerance here too — exact equality on doubles is fragile.
    result = dist.GetDistance(p2, q2);
    Assert.AreEqual(1, result, 0.00001);

    result = dist.GetDistance(p3, q3);
    Assert.AreEqual(0, result, 0.00001);

    result = dist.GetDistance(p4, q4);
    Assert.AreEqual(0.039354, result, 0.00001);

    result = dist.GetDistance(p5, q5);
    Assert.AreEqual(0.031026, result, 0.00001);
}
/// <summary>
/// Shogun example: builds cosine distance matrices for the bundled real-valued
/// train/test data and writes every element to the console.
/// </summary>
public static void Main()
{
    modshogun.init_shogun_with_defaults();

    // Load the example train/test matrices from the data directory.
    double[,] trainMatrix = Load.load_numbers("../data/fm_train_real.dat");
    double[,] testMatrix = Load.load_numbers("../data/fm_test_real.dat");

    RealFeatures trainFeatures = new RealFeatures(trainMatrix);
    RealFeatures testFeatures = new RealFeatures(testMatrix);

    // Distances among the training vectors themselves.
    CosineDistance cosine = new CosineDistance(trainFeatures, trainFeatures);
    double[,] trainDistances = cosine.get_distance_matrix();

    // Re-initialise the same distance object against the test vectors.
    cosine.init(trainFeatures, testFeatures);
    double[,] testDistances = cosine.get_distance_matrix();

    // Emit every element in row-major order with no separators, as before.
    foreach (double cell in trainDistances)
    {
        Console.Write(cell);
    }
    foreach (double cell in testDistances)
    {
        Console.Write(cell);
    }

    modshogun.exit_shogun();
}
/// <summary>
/// Shogun example program: computes cosine distance matrices (train vs train,
/// then train vs test) over real-valued features and prints their elements.
/// </summary>
public static void Main()
{
    modshogun.init_shogun_with_defaults();

    double[,] trainingData = Load.load_numbers("../data/fm_train_real.dat");
    double[,] testingData = Load.load_numbers("../data/fm_test_real.dat");

    RealFeatures featuresTrain = new RealFeatures(trainingData);
    RealFeatures featuresTest = new RealFeatures(testingData);

    CosineDistance dist = new CosineDistance(featuresTrain, featuresTrain);
    double[,] dmTrain = dist.get_distance_matrix();

    dist.init(featuresTrain, featuresTest);
    double[,] dmTest = dist.get_distance_matrix();

    // Indexed row-major traversal matches the iteration order of foreach over a
    // rectangular array, so the printed stream is identical to before.
    for (int row = 0; row < dmTrain.GetLength(0); row++)
    {
        for (int col = 0; col < dmTrain.GetLength(1); col++)
        {
            Console.Write(dmTrain[row, col]);
        }
    }
    for (int row = 0; row < dmTest.GetLength(0); row++)
    {
        for (int col = 0; col < dmTest.GetLength(1); col++)
        {
            Console.Write(dmTest[row, col]);
        }
    }

    modshogun.exit_shogun();
}
/// <summary>
/// Verifies CosineDistance.GetDistance against precomputed expected values for
/// the point pairs (p0,q0)..(p5,q5) declared elsewhere in this fixture, and
/// that the pair (p0, q4) is rejected (presumably mismatched dimensions —
/// confirm against the fixture's field definitions).
/// </summary>
public void CosineDistanceTest()
{
    CosineDistance dist = new CosineDistance();

    Assert.Throws<ArgumentException>(() => dist.GetDistance(p0, q4));

    // AreApproximatelyEqual(expected, actual, delta): expected value comes
    // first, so failure messages report expected/actual the right way round.
    double result = dist.GetDistance(p0, q0);
    Assert.AreApproximatelyEqual(.2, result, 0.00001);

    result = dist.GetDistance(p1, q1);
    Assert.AreApproximatelyEqual(0.029857, result, 0.00001);

    // Compare with a tolerance here too — exact equality on doubles is fragile.
    result = dist.GetDistance(p2, q2);
    Assert.AreApproximatelyEqual(1, result, 0.00001);

    result = dist.GetDistance(p3, q3);
    Assert.AreApproximatelyEqual(0, result, 0.00001);

    result = dist.GetDistance(p4, q4);
    Assert.AreApproximatelyEqual(0.039354, result, 0.00001);

    result = dist.GetDistance(p5, q5);
    Assert.AreApproximatelyEqual(0.031026, result, 0.00001);
}
/// <summary>
/// Shogun example: computes cosine distance matrices (train vs train and
/// train vs test) over real-valued feature files and prints both matrices.
/// </summary>
static void Main(string[] argv)
{
    modshogun.init_shogun_with_defaults();

    // Example data files shipped alongside the shogun examples.
    DoubleMatrix trainInput = Load.load_numbers("../data/fm_train_real.dat");
    DoubleMatrix testInput = Load.load_numbers("../data/fm_test_real.dat");

    RealFeatures trainFeatures = new RealFeatures(trainInput);
    RealFeatures testFeatures = new RealFeatures(testInput);

    // Pairwise distances among the training vectors first...
    CosineDistance cosine = new CosineDistance(trainFeatures, trainFeatures);
    DoubleMatrix trainDistances = cosine.get_distance_matrix();

    // ...then between the training and test vectors.
    cosine.init(trainFeatures, testFeatures);
    DoubleMatrix testDistances = cosine.get_distance_matrix();

    Console.WriteLine(trainDistances.ToString());
    Console.WriteLine(testDistances.ToString());

    modshogun.exit_shogun();
}
// SWIG plumbing: expose the native pointer backing a CosineDistance wrapper.
// A null wrapper maps to a HandleRef around IntPtr.Zero.
internal static HandleRef getCPtr(CosineDistance obj)
{
    if (obj == null)
    {
        return new HandleRef(null, IntPtr.Zero);
    }
    return obj.swigCPtr;
}
// Entry point: builds "similar blog post" recommendations by training a
// FastText PVDM model over the post bodies and nearest-neighbour searching the
// resulting document vectors with an HNSW small-world graph.
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var posts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    // Catalyst model storage backed by a local "catalyst-models" cache folder.
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var language = Language.English;
    var pipeline = Pipeline.For(language);
    var postsWithDocuments = posts
        .Select(post =>
        {
            // The 128-bit hash of the title doubles as the document UID so the
            // trained vectors can be mapped back to posts afterwards.
            var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
            {
                UID = post.Title.Hash128()
            };
            pipeline.ProcessSingle(document);
            return (Post: post, Document: document);
        })
        .ToArray(); // Call ToArray to force evaluation of the document processing now

    Console.WriteLine("Training FastText model..");
    var fastText = new FastText(language, version: 0, tag: "");
    fastText.Data.Type = FastText.ModelType.PVDM;
    fastText.Data.Loss = FastText.LossType.NegativeSampling;
    fastText.Data.IgnoreCase = true;
    // NOTE(review): the hyper-parameters below look hand-tuned for a ~100-post
    // corpus (see the comment further down) — confirm before reusing elsewhere.
    fastText.Data.Epoch = 50;
    fastText.Data.Dimensions = 512;
    fastText.Data.MinimumCount = 1;
    fastText.Data.ContextWindow = 10;
    fastText.Data.NegativeSamplingCount = 20;
    fastText.Train(
        postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
        trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
    );

    Console.WriteLine("Building recommendations..");

    // Combine the blog post data with the FastText-generated vectors
    var results = fastText
        .GetDocumentVectors()
        .Select(result =>
        {
            // Each document vector instance will include a "token" string that may be mapped back to the
            // UID of the document for each blog post. If there were a large number of posts to deal with
            // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
            // have a 100+ and so a LINQ "First" scan over the list will suffice.
            var uid = UID128.Parse(result.Token);
            var postForResult = postsWithDocuments.First(
                postWithDocument => postWithDocument.Document.UID == uid
            );
            return (UID: uid, result.Vector, postForResult.Post);
        })
        .ToArray(); // ToArray since we enumerate multiple times below

    // Construct a graph to search over, as described at
    // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(results);

    // For every post, use the "KNNSearch" method on the graph to find the three most similar posts
    const int maximumNumberOfResultsToReturn = 3;
    var postsWithSimilarResults = results
        .Select(result =>
        {
            // Request one result too many from the KNNSearch call because it's expected that the original
            // post will come back as the best match and we'll want to exclude that
            var similarResults = graph
                .KNNSearch(result, maximumNumberOfResultsToReturn + 1)
                .Where(similarResult => similarResult.Item.UID != result.UID)
                .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included
            return (new
            {
                result.Post,
                Similar = similarResults
                    .Select(similarResult => new { similarResult.Item.Post, similarResult.Distance })
                    .ToArray()
            });
        })
        .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    // Report: each post title followed by its nearest neighbours, closest first.
    foreach (var postWithSimilarResults in postsWithSimilarResults)
    {
        Console.WriteLine();
        Console.WriteLine(postWithSimilarResults.Post.Title);
        foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
// SWIG plumbing: hands back the native pointer wrapped by a CosineDistance
// proxy, or a HandleRef around IntPtr.Zero when the proxy itself is null.
internal static HandleRef getCPtr(CosineDistance obj)
{
    return obj != null ? obj.swigCPtr : new HandleRef(null, IntPtr.Zero);
}
// Entry point: builds "similar blog post" recommendations by combining
// FastText PVDM document vectors (searched via an HNSW graph) with a TF-IDF
// title-proximity score used to re-rank the candidates.
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var posts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    // Catalyst model storage backed by a local "catalyst-models" cache folder.
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var language = Language.English;
    var pipeline = Pipeline.For(language);
    var postsWithDocuments = posts
        .Select(post =>
        {
            // The 128-bit hash of the title doubles as the document UID so the
            // trained vectors can be mapped back to posts afterwards.
            var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
            {
                UID = post.Title.Hash128()
            };
            pipeline.ProcessSingle(document);
            return (Post: post, Document: document);
        })
        .ToArray(); // Call ToArray to force evaluation of the document processing now

    Console.WriteLine("Training FastText model..");
    var fastText = new FastText(language, version: 0, tag: "");
    fastText.Data.Type = FastText.ModelType.PVDM;
    fastText.Data.Loss = FastText.LossType.NegativeSampling;
    fastText.Data.IgnoreCase = true;
    // NOTE(review): hyper-parameters below appear tuned for a ~100-post corpus
    // (see the comment further down) — confirm before reusing elsewhere.
    fastText.Data.Epoch = 50;
    fastText.Data.Dimensions = 512;
    fastText.Data.MinimumCount = 1;
    fastText.Data.ContextWindow = 10;
    fastText.Data.NegativeSamplingCount = 20;
    fastText.Train(
        postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
        trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
    );

    Console.WriteLine("Training TF-IDF model..");
    var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
    await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

    Console.WriteLine("Getting average TF-IDF weights per word..");
    // Collect every TF-IDF weight observed for each token value (case-insensitive),
    // across all sentences of all documents.
    var tokenValueTFIDF = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);
    foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
    {
        tfidf.Process(doc);
        foreach (var sentence in doc)
        {
            foreach (var token in sentence)
            {
                if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                {
                    freqs = new();
                    tokenValueTFIDF.Add(token.Value, freqs);
                }
                freqs.Add(token.Frequency);
            }
        }
    }
    // Reduce each token's observed weights to a single average.
    var averagedTokenValueTFIDF =
        tokenValueTFIDF.ToDictionary(
            entry => entry.Key,
            entry => entry.Value.Average(),
            StringComparer.OrdinalIgnoreCase
        );

    Console.WriteLine("Building recommendations..");

    // Combine the blog post data with the FastText-generated vectors
    var results = fastText
        .GetDocumentVectors()
        .Select(result =>
        {
            // Each document vector instance will include a "token" string that may be mapped back to the
            // UID of the document for each blog post. If there were a large number of posts to deal with
            // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
            // have a 100+ and so a LINQ "First" scan over the list will suffice.
            var uid = UID128.Parse(result.Token);
            var postForResult = postsWithDocuments.First(
                postWithDocument => postWithDocument.Document.UID == uid
            );
            return (UID: uid, result.Vector, postForResult.Post);
        })
        .ToArray(); // ToArray since we enumerate multiple times below

    // Construct a graph to search over, as described at
    // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(results);

    const int maximumNumberOfResultsToReturn = 3;
    var postsWithSimilarResults = results
        .Select(result =>
        {
            // Request that the KNNSearch operate over all documents because we can't take the top {n}
            // until we've combined the ordering with the title TFIDF proximity values
            var similarResults = graph
                .KNNSearch(result, postsWithDocuments.Length)
                .Where(similarResult => similarResult.Item.UID != result.UID);
            // Token values of the current post's title, used to score candidate
            // titles for overlap weighted by average TF-IDF.
            var tokenValuesInTitle =
                GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);
            return (new
            {
                result.Post,
                Similar = similarResults
                    .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance,
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                            NormaliseSomeCommonTerms(similarResult.Item.Post.Title),
                            tokenValuesInTitle,
                            averagedTokenValueTFIDF,
                            pipeline
                        )
                    })
                    // Rank by title proximity first, then by vector distance,
                    // and only then keep the top few.
                    .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                    .ThenBy(similarResult => similarResult.Distance)
                    .Take(maximumNumberOfResultsToReturn)
                    .ToArray()
            });
        })
        .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    // Report: each post title followed by its re-ranked neighbours.
    foreach (var postWithSimilarResults in postsWithSimilarResults)
    {
        Console.WriteLine();
        Console.WriteLine(postWithSimilarResults.Post.Title);
        foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
/// <summary>
/// Factory for DistanceMeasure implementations. Profile-based measures
/// (HAMMING, TANIMOTO, COSINE) accept either an alignment file or a directory;
/// structure-based measures (RMSD, MAXSUB, GDT_TS) require a structure
/// directory and cannot be used with aligned profiles.
/// </summary>
/// <param name="processName">Not used by this method; kept for signature compatibility with callers.</param>
/// <param name="dirName">Directory of input structures/profiles; required for RMSD, MAXSUB and GDT_TS.</param>
/// <param name="measure">Which distance measure to construct.</param>
/// <param name="atoms">Atom selection mode, used by RMSD only.</param>
/// <param name="jury1d">Forwarded to every measure's constructor.</param>
/// <param name="alignFileName">Alignment file; when non-null it takes precedence over dirName for profile-based measures.</param>
/// <param name="profileName">Optional profile name for profile-based measures.</param>
/// <param name="refJuryProfile">Optional reference jury profile.</param>
/// <returns>The configured measure, or null for an unrecognised measure value (preserved behavior).</returns>
/// <exception cref="InvalidOperationException">A structure-based measure was requested without a directory.</exception>
private DistanceMeasure CreateMeasure(string processName, string dirName, DistanceMeasures measure, PDB.PDBMODE atoms, bool jury1d, string alignFileName, string profileName = null, string refJuryProfile = null)
{
    // Message now also names GDT_TS: the guard below applies to it as well.
    const string structureDirRequired = "RMSD, MAXSUB and GDT_TS measures cannot be used for aligned profiles!";

    DistanceMeasure dist = null;
    switch (measure)
    {
        case DistanceMeasures.HAMMING:
            if (alignFileName != null)
            {
                dist = new JuryDistance(alignFileName, jury1d, profileName, refJuryProfile);
            }
            else
            {
                // alignFileName is null here; the directory overload receives it as-is.
                dist = new JuryDistance(dirName, alignFileName, jury1d, profileName, refJuryProfile);
            }
            break;
        case DistanceMeasures.TANIMOTO:
            if (alignFileName != null)
            {
                dist = new Tanimoto(alignFileName, jury1d, profileName, refJuryProfile);
            }
            else
            {
                dist = new Tanimoto(dirName, alignFileName, jury1d, profileName, refJuryProfile);
            }
            break;
        case DistanceMeasures.COSINE:
            if (alignFileName != null)
            {
                dist = new CosineDistance(alignFileName, jury1d, profileName, refJuryProfile);
            }
            else
            {
                dist = new CosineDistance(dirName, alignFileName, jury1d, profileName, refJuryProfile);
            }
            break;
        case DistanceMeasures.RMSD:
            if (dirName == null)
            {
                throw new InvalidOperationException(structureDirRequired);
            }
            dist = new Rmsd(dirName, alignFileName, jury1d, atoms, refJuryProfile);
            break;
        case DistanceMeasures.MAXSUB:
            if (dirName == null)
            {
                throw new InvalidOperationException(structureDirRequired);
            }
            dist = new MaxSub(dirName, alignFileName, jury1d, refJuryProfile);
            break;
        case DistanceMeasures.GDT_TS:
            if (dirName == null)
            {
                throw new InvalidOperationException(structureDirRequired);
            }
            dist = new GDT_TS(dirName, alignFileName, jury1d, refJuryProfile);
            break;
    }
    // Unrecognised enum values fall through to null, matching the original behavior.
    return dist;
}