/// <summary>
/// Loads the blog posts, trains a FastText model over their processed content, builds a
/// small-world graph over the resulting document vectors and writes, for every post, up to
/// three other posts returned by the graph's nearest-neighbour search.
/// </summary>
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var blogPosts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var english = Language.English;
    var nlp = Pipeline.For(english);

    // ToArray materialises the sequence so every document is processed here, before training
    var parsedPosts = blogPosts
        .Select(blogPost =>
        {
            var parsed = new Document(NormaliseSomeCommonTerms(blogPost.PlainTextContent), english);
            // The UID is derived from the title; it is used later to match vectors back to posts
            parsed.UID = blogPost.Title.Hash128();
            nlp.ProcessSingle(parsed);
            return (Post: blogPost, Document: parsed);
        })
        .ToArray();

    Console.WriteLine("Training FastText model..");
    var model = new FastText(english, version: 0, tag: "");
    model.Data.Type = FastText.ModelType.PVDM;
    model.Data.Loss = FastText.LossType.NegativeSampling;
    model.Data.IgnoreCase = true;
    model.Data.Epoch = 50;
    model.Data.Dimensions = 512;
    model.Data.MinimumCount = 1;
    model.Data.ContextWindow = 10;
    model.Data.NegativeSamplingCount = 20;

    var trainingDocuments = parsedPosts.Select(entry => entry.Document);
    model.Train(
        trainingDocuments,
        trainingStatus: status => Console.WriteLine($" Progress: {status.Progress}, Epoch: {status.Epoch}")
    );

    Console.WriteLine("Building recommendations..");

    // Pair each generated document vector with its blog post. The vector's Token is parsed as
    // the document UID and the owning post is found with a linear First scan, which is fine for
    // a small number of posts (a dictionary keyed on UID would suit a large collection better).
    // The result is materialised because it is enumerated more than once below.
    var vectorisedPosts =
        (
            from documentVector in model.GetDocumentVectors()
            let uid = UID128.Parse(documentVector.Token)
            let owner = parsedPosts.First(entry => entry.Document.UID == uid)
            select (UID: uid, documentVector.Vector, owner.Post)
        )
        .ToArray();

    // Graph construction follows https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (target, source) => CosineDistance.NonOptimized(source.Vector, target.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(vectorisedPosts);

    const int maximumNumberOfResultsToReturn = 3;
    var recommendations = vectorisedPosts
        .Select(current =>
        {
            // One extra neighbour is requested because the post itself is expected to come back
            // as a match; it is filtered out by UID. The Take guards the case where it did not.
            var neighbours = graph.KNNSearch(current, maximumNumberOfResultsToReturn + 1);
            var closestOthers = neighbours
                .Where(neighbour => neighbour.Item.UID != current.UID)
                .Take(maximumNumberOfResultsToReturn)
                .Select(neighbour => new { neighbour.Item.Post, neighbour.Distance })
                .ToArray();

            return new { current.Post, Similar = closestOthers };
        })
        .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    foreach (var recommendation in recommendations)
    {
        Console.WriteLine();
        Console.WriteLine(recommendation.Post.Title);

        // Similar posts are listed in ascending distance order
        foreach (var similar in recommendation.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similar.Distance:0.000} {similar.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
/// <summary>
/// Loads the blog posts, trains a FastText model and a TF-IDF model over their processed
/// content, then writes up to three suggested posts for every post. Candidates are ranked
/// first by a title-based TF-IDF proximity score (highest first) and then by the distance
/// reported by the vector graph (lowest first).
/// </summary>
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var blogPosts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var english = Language.English;
    var nlp = Pipeline.For(english);

    // Builds and processes the document for one post; the UID is derived from the post title
    (BlogPost Post, Document Document) Parse(BlogPost blogPost)
    {
        var parsed = new Document(NormaliseSomeCommonTerms(blogPost.PlainTextContent), english);
        parsed.UID = blogPost.Title.Hash128();
        nlp.ProcessSingle(parsed);
        return (blogPost, parsed);
    }

    // ToArray materialises the sequence so every document is processed here, before training
    var parsedPosts = blogPosts
        .Select(blogPost => Parse(blogPost))
        .ToArray();
    var trainingDocuments = parsedPosts.Select(entry => entry.Document);

    Console.WriteLine("Training FastText model..");
    var model = new FastText(english, version: 0, tag: "");
    model.Data.Type = FastText.ModelType.PVDM;
    model.Data.Loss = FastText.LossType.NegativeSampling;
    model.Data.IgnoreCase = true;
    model.Data.Epoch = 50;
    model.Data.Dimensions = 512;
    model.Data.MinimumCount = 1;
    model.Data.ContextWindow = 10;
    model.Data.NegativeSamplingCount = 20;
    model.Train(
        trainingDocuments,
        trainingStatus: status => Console.WriteLine($" Progress: {status.Progress}, Epoch: {status.Epoch}")
    );

    Console.WriteLine("Training TF-IDF model..");
    var tfidf = new TFIDF(nlp.Language, version: 0, tag: "");
    await tfidf.Train(trainingDocuments);

    Console.WriteLine("Getting average TF-IDF weights per word..");

    // Collect, per token text (case-insensitively), every Frequency value seen after the
    // TF-IDF model has processed the document that contains the token
    var weightsByToken = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);
    foreach (var parsedPost in parsedPosts)
    {
        var document = parsedPost.Document;
        tfidf.Process(document);
        foreach (var sentence in document)
        {
            foreach (var token in sentence)
            {
                var tokenText = token.Value;
                if (!weightsByToken.TryGetValue(tokenText, out var weights))
                {
                    weights = new List<float>();
                    weightsByToken[tokenText] = weights;
                }
                weights.Add(token.Frequency);
            }
        }
    }

    // Reduce each token's collected values to their mean
    var averageWeightByToken = new Dictionary<string, float>(StringComparer.OrdinalIgnoreCase);
    foreach (var entry in weightsByToken)
    {
        averageWeightByToken.Add(entry.Key, entry.Value.Average());
    }

    Console.WriteLine("Building recommendations..");

    // Pair each generated document vector with its blog post. The vector's Token is parsed as
    // the document UID and the owning post is found with a linear First scan, which is fine for
    // a small number of posts (a dictionary keyed on UID would suit a large collection better).
    // The result is materialised because it is enumerated more than once below.
    var vectorisedPosts = model
        .GetDocumentVectors()
        .Select(documentVector =>
        {
            var uid = UID128.Parse(documentVector.Token);
            var owner = parsedPosts.First(entry => entry.Document.UID == uid);
            return (UID: uid, documentVector.Vector, owner.Post);
        })
        .ToArray();

    // Graph construction follows https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (target, source) => CosineDistance.NonOptimized(source.Vector, target.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(vectorisedPosts);

    const int maximumNumberOfResultsToReturn = 3;
    var recommendations = vectorisedPosts
        .Select(current =>
        {
            // Ask the graph for as many neighbours as there are documents: the top results
            // cannot be cut off until vector distance has been combined with the title score
            var otherPosts = graph
                .KNNSearch(current, parsedPosts.Length)
                .Where(candidate => candidate.Item.UID != current.UID);

            var titleTokens = GetAllTokensForText(NormaliseSomeCommonTerms(current.Post.Title), nlp)
                .Select(token => token.Value)
                .ToHashSet(StringComparer.OrdinalIgnoreCase);

            // Highest title proximity first; ties are broken by the smaller vector distance
            var rankedCandidates =
                from candidate in otherPosts
                let titleProximity = GetProximityByTitleTFIDF(
                    NormaliseSomeCommonTerms(candidate.Item.Post.Title),
                    titleTokens,
                    averageWeightByToken,
                    nlp
                )
                orderby titleProximity descending, candidate.Distance
                select new
                {
                    candidate.Item.Post,
                    candidate.Distance,
                    ProximityByTitleTFIDF = titleProximity
                };

            return new
            {
                current.Post,
                Similar = rankedCandidates.Take(maximumNumberOfResultsToReturn).ToArray()
            };
        })
        .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    foreach (var recommendation in recommendations)
    {
        Console.WriteLine();
        Console.WriteLine(recommendation.Post.Title);

        // The selected posts are displayed in ascending distance order, which is independent
        // of the ranking that was used to select them
        foreach (var similar in recommendation.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similar.ProximityByTitleTFIDF:0.000} {similar.Distance:0.000} {similar.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}