static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

    // Configures the model storage to use the online repository backed by the local folder ./catalyst-models/
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    // Download the Reuters corpus if necessary
    var (train, test) = await Corpus.Reuters.GetAsync();

    // Parse the documents using the English pipeline, as the text data is untokenized so far
    var nlp = Pipeline.For(Language.English);
    var trainDocs = nlp.Process(train).ToArray();
    var testDocs = nlp.Process(test).ToArray();

    // Train a FastText supervised classifier with a multi-label loss (OneVsAll)
    var fastText = new FastText(Language.English, 0, "Reuters-Classifier");
    fastText.Data.Type = FastText.ModelType.Supervised;
    fastText.Data.Loss = FastText.LossType.OneVsAll;
    fastText.Data.LearningRate = 1f;
    fastText.Data.Dimensions = 256;
    fastText.Data.Epoch = 100;
    fastText.Data.MinimumWordNgramsCounts = 5;
    fastText.Data.MaximumWordNgrams = 3;
    fastText.Data.MinimumCount = 5;
    fastText.Train(trainDocs);

    // You can also auto-tune the model using the algorithm from
    // https://ai.facebook.com/blog/fasttext-blog-post-open-source-in-brief/
    fastText.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

    // Compute predictions
    Dictionary<IDocument, Dictionary<string, float>> predTrain, predTest;
    using (new Measure(Logger, "Computing train-set predictions", trainDocs.Length))
    {
        predTrain = trainDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);
    }
    using (new Measure(Logger, "Computing test set predictions", testDocs.Length))
    {
        predTest = testDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);
    }

    var resultsTrain = ComputeStats(predTrain);
    var resultsTest = ComputeStats(predTest);

    Console.WriteLine("\n\n\n--- Results ---\n\n\n");
    foreach (var res in resultsTrain.Zip(resultsTest))
    {
        Console.WriteLine($"\tScore cutoff: {res.First.Cutoff:n2} Train: F1={res.First.F1:n2} P={res.First.Precision:n2} R={res.First.Recall:n2} Test: F1={res.Second.F1:n2} P={res.Second.Precision:n2} R={res.Second.Recall:n2}");
    }
    Console.ReadLine();
}
public new static async Task<bool> DeleteAsync(Language language, int version, string tag)
{
    var a = new FastTextLanguageDetector(version);
    bool deleted = false;
    deleted |= await FastText.DeleteAsync(Language.Any, version, "language-detector");
    deleted |= await a.DeleteDataAsync();
    return deleted;
}
public new static async Task<bool> DeleteAsync(Language language, int version, string tag)
{
    var a = new FastText(language, version, tag);
    bool deleted = false;
    deleted |= await DataStore.DeleteAsync(language, nameof(StarSpaceModel) + "-Matrix", version, tag + "-lhs");
    deleted |= await DataStore.DeleteAsync(language, nameof(StarSpaceModel) + "-Matrix", version, tag + "-rhs");
    deleted |= await a.DeleteDataAsync();
    return deleted;
}
public static async Task TxtClassification()
{
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var (train, test) = await Corpus.Reuters.GetAsync();
    var nlp = Pipeline.For(Language.English);
    var trainDocs = nlp.Process(train).ToArray();
    var testDocs = nlp.Process(test).ToArray();

    var fastText = new FastText(Language.English, 0, "Reuters-Classifier");
    fastText.Data.Type = FastText.ModelType.Supervised;
    fastText.Data.Loss = FastText.LossType.OneVsAll;
    fastText.Data.LearningRate = 1f;
    fastText.Data.Dimensions = 256;
    fastText.Data.Epoch = 100;
    fastText.Data.MinimumWordNgramsCounts = 5;
    fastText.Data.MaximumWordNgrams = 3;
    fastText.Data.MinimumCount = 5;
    fastText.Train(trainDocs);
    fastText.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

    Dictionary<IDocument, Dictionary<string, float>> predTrain, predTest;
    using (new Measure(Logger, "Computing train-set predictions", trainDocs.Length))
    {
        predTrain = trainDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);
    }
    using (new Measure(Logger, "Computing test set predictions", testDocs.Length))
    {
        predTest = testDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);
    }

    var resultsTrain = ComputeStats(predTrain);
    var resultsTest = ComputeStats(predTest);

    Console.WriteLine("\n\n\n--- Results ---\n\n\n");
    foreach (var res in resultsTrain.Zip(resultsTest))
    {
        Console.WriteLine($"\tScore cutoff: {res.First.Cutoff:n2} Train: F1={res.First.F1:n2} P={res.First.Precision:n2} R={res.First.Recall:n2} Test: F1={res.Second.F1:n2} P={res.Second.Precision:n2} R={res.Second.Recall:n2}");
    }
    Console.ReadLine();
}
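// The ComputeStats helper called in the two classification examples above isn't shown in
// these snippets. Below is a minimal sketch of what it plausibly does, under two assumptions:
// that each document carries its gold labels in doc.Labels, and that the helper sweeps a
// range of score cutoffs, computing micro-averaged precision/recall/F1 at each one (which
// matches how the results are printed above).
private static List<(float Cutoff, float Precision, float Recall, float F1)> ComputeStats(Dictionary<IDocument, Dictionary<string, float>> predictions)
{
    var results = new List<(float Cutoff, float Precision, float Recall, float F1)>();
    for (float cutoff = 0.05f; cutoff <= 0.95f; cutoff += 0.05f)
    {
        int truePositives = 0, falsePositives = 0, falseNegatives = 0;
        foreach (var (doc, scores) in predictions)
        {
            // A label is predicted if its score clears the cutoff
            var predicted = scores.Where(kv => kv.Value >= cutoff).Select(kv => kv.Key).ToHashSet();
            var gold = doc.Labels.ToHashSet(); // assumption: gold labels are stored on the document
            truePositives += predicted.Count(gold.Contains);
            falsePositives += predicted.Count(label => !gold.Contains(label));
            falseNegatives += gold.Count(label => !predicted.Contains(label));
        }
        var precision = truePositives + falsePositives == 0 ? 0f : (float)truePositives / (truePositives + falsePositives);
        var recall = truePositives + falseNegatives == 0 ? 0f : (float)truePositives / (truePositives + falseNegatives);
        var f1 = precision + recall == 0 ? 0f : 2 * precision * recall / (precision + recall);
        results.Add((cutoff, precision, recall, f1));
    }
    return results;
}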
public FastTextLanguageDetector(int version) : base(Language.Any, version, nameof(FastTextLanguageDetector), compress: false)
{
    Model = new FastText(Language.Any, version, "language-detector");
    Model.Data.Type = FastText.ModelType.Supervised;
    Model.Data.MaximumWordNgrams = 0;
    Model.Data.MinimumNgrams = 2;
    Model.Data.MaximumNgrams = 5;
    Model.Data.VectorQuantization = QuantizationType.None;
    Model.Data.LearningRate = 0.1f;
    Model.Data.Epoch = 50;
    Model.Data.Dimensions = 16;
    Model.Data.IgnoreCase = false;
    Model.Data.Loss = FastText.LossType.NegativeSampling;
    Model.Data.MinimumCount = 5;
    Tokenizer = new SpaceTokenizer();
}
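// A hedged usage sketch for the detector configured above: load the pre-trained model from
// the online repository and let it assign a language to a document. FromStoreAsync follows
// the signature shown further down in this section; the Process call is an assumption based
// on the library's usual Process(IDocument) pattern.
Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
var detector = await FastTextLanguageDetector.FromStoreAsync(Language.Any, version: 0, tag: "");
var doc = new Document("In the beginning God created the heavens and the earth.");
detector.Process(doc);
Console.WriteLine(doc.Language); // expected to print Language.English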
public void ComputeAlignment(FastText source, FastText target)
{
    // Find words that exist in both models
    int dim = source.Data.Dimensions;
    if (target.Data.Dimensions != dim)
    {
        throw new Exception("Source and Target models must have the same dimensions!");
    }

    int N = 10_000;
    var dict = TranslationDictionary.GetDictionary(source.Language, target.Language, N * 3);

    var A = new Matrix(dict.Count, dim);
    var B = new Matrix(dict.Count, dim);

    int k = 0;
    foreach (var kv in dict)
    {
        if ((source.GetWordIndex(kv.Key) > -1) && (target.GetWordIndex(kv.Value) > -1))
        {
            Dictionary.Add(kv.Key, kv.Value);
            A[k] = source.GetVector(kv.Key, source.Language);
            B[k] = target.GetVector(kv.Value, target.Language);
            k++;
        }
        if (k == N) { break; }
    }

    A.ResizeAndFillRows(k, 0);
    B.ResizeAndFillRows(k, 0);

    // Orthogonal Procrustes alignment:
    //   M = B' * A
    //   U, S, V = SVD(M)
    //   W = U * V'
    // The SVD is computed in place, so after CalculateSVD the matrix U holds the left
    // singular vectors, w the singular values, and v the right singular vectors.
    var U = B.Transpose().Multiply(A);
    CalculateSVD(U.ToArray(), out float[] w, out float[][] v);
    TranslationMatrix = U.Multiply(new Matrix(v).Transpose());
}
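// Hypothetical usage sketch for ComputeAlignment: the enclosing aligner type isn't shown in
// these snippets, so the EmbeddingAligner name below is invented, and the FromStoreAsync
// calls and "wiki" tags are assumptions. The idea: load two embedding models of equal
// dimensionality, fit the orthogonal mapping on a seed translation dictionary, then use the
// resulting TranslationMatrix to move source-space vectors into the target space.
var english = await FastText.FromStoreAsync(Language.English, version: 0, tag: "wiki"); // assumed call/tag
var spanish = await FastText.FromStoreAsync(Language.Spanish, version: 0, tag: "wiki"); // assumed call/tag
var aligner = new EmbeddingAligner(); // hypothetical name for the type that owns ComputeAlignment
aligner.ComputeAlignment(english, spanish);
// aligner.Dictionary now holds the word pairs the mapping was fitted on, and
// aligner.TranslationMatrix the rotation between the two vector spaces.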
public new static async Task<FastTextLanguageDetector> FromStoreAsync(Language language, int version, string tag)
{
    var a = new FastTextLanguageDetector(version);

    // Because we use the model name as the tag of this model, we have to check for former names here
    try
    {
        await a.LoadDataAsync();
    }
    catch (FileNotFoundException)
    {
        if (ObjectStore.TryGetFormerNames(nameof(FastTextLanguageDetector), out var formerNames))
        {
            var correctTag = a.Tag;
            foreach (var formerName in formerNames)
            {
                try
                {
                    a.Tag = formerName;
                    await a.LoadDataAsync();
                    a.Tag = correctTag;
                    break;
                }
                catch (FileNotFoundException)
                {
                    // Ignore and try the next former name
                }
            }
        }
    }

    a.Model = await FastText.FromStoreAsync_Internal(Language.Any, version, "language-detector");
    a.Model?.CompactSupervisedModel();
    return a;
}
private static async Task Main() { Console.WriteLine("Reading posts from GitHub repo.."); var posts = await GetBlogPosts(); Console.WriteLine("Parsing documents.."); Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models")); var language = Language.English; var pipeline = Pipeline.For(language); var postsWithDocuments = posts .Select(post => { var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language) { UID = post.Title.Hash128() }; pipeline.ProcessSingle(document); return(Post: post, Document: document); }) .ToArray(); // Call ToArray to force evaluation of the document processing now Console.WriteLine("Training FastText model.."); var fastText = new FastText(language, version: 0, tag: ""); fastText.Data.Type = FastText.ModelType.PVDM; fastText.Data.Loss = FastText.LossType.NegativeSampling; fastText.Data.IgnoreCase = true; fastText.Data.Epoch = 50; fastText.Data.Dimensions = 512; fastText.Data.MinimumCount = 1; fastText.Data.ContextWindow = 10; fastText.Data.NegativeSamplingCount = 20; fastText.Train( postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document), trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}") ); Console.WriteLine("Building recommendations.."); // Combine the blog post data with the FastText-generated vectors var results = fastText .GetDocumentVectors() .Select(result => { // Each document vector instance will include a "token" string that may be mapped back to the // UID of the document for each blog post. If there were a large number of posts to deal with // then a dictionary to match UIDs to blog posts would be sensible for performance but I only // have a 100+ and so a LINQ "First" scan over the list will suffice. 
var uid = UID128.Parse(result.Token); var postForResult = postsWithDocuments.First( postWithDocument => postWithDocument.Document.UID == uid ); return(UID: uid, result.Vector, postForResult.Post); }) .ToArray(); // ToArray since we enumerate multiple times below // Construct a graph to search over, as described at // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>( distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector), DefaultRandomGenerator.Instance, new() { M = 15, LevelLambda = 1 / Math.Log(15) } ); graph.AddItems(results); // For every post, use the "KNNSearch" method on the graph to find the three most similar posts const int maximumNumberOfResultsToReturn = 3; var postsWithSimilarResults = results .Select(result => { // Request one result too many from the KNNSearch call because it's expected that the original // post will come back as the best match and we'll want to exclude that var similarResults = graph .KNNSearch(result, maximumNumberOfResultsToReturn + 1) .Where(similarResult => similarResult.Item.UID != result.UID) .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included return(new { result.Post, Similar = similarResults .Select(similarResult => new { similarResult.Item.Post, similarResult.Distance }) .ToArray() }); }) .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase) .ToArray(); foreach (var postWithSimilarResults in postsWithSimilarResults) { Console.WriteLine(); Console.WriteLine(postWithSimilarResults.Post.Title); foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance)) { Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}"); } } Console.WriteLine(); Console.WriteLine("Done! Press [Enter] to terminate.."); Console.ReadLine(); }
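// NormaliseSomeCommonTerms is referenced above but not included in these snippets. Below is
// a hypothetical sketch of the kind of thing it likely does: collapse spelling variants of
// common domain terms onto a single token before tokenisation, so the model treats them as
// one word. The actual replacement list in the original code may differ.
private static string NormaliseSomeCommonTerms(string text) => text
    // Replace longer variants first so that e.g. "ASP.NET" isn't mangled by the ".NET" rule
    .Replace("ASP.NET", " aspnet ", StringComparison.OrdinalIgnoreCase)
    .Replace(".NET", " dotnet ", StringComparison.OrdinalIgnoreCase)
    .Replace("Web API", " webapi ", StringComparison.OrdinalIgnoreCase);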
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var posts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var language = Language.English;
    var pipeline = Pipeline.For(language);
    var postsWithDocuments = posts
        .Select(post =>
        {
            var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
            {
                UID = post.Title.Hash128()
            };
            pipeline.ProcessSingle(document);
            return (Post: post, Document: document);
        })
        .ToArray(); // Call ToArray to force evaluation of the document processing now

    Console.WriteLine("Training FastText model..");
    var fastText = new FastText(language, version: 0, tag: "");
    fastText.Data.Type = FastText.ModelType.PVDM;
    fastText.Data.Loss = FastText.LossType.NegativeSampling;
    fastText.Data.IgnoreCase = true;
    fastText.Data.Epoch = 50;
    fastText.Data.Dimensions = 512;
    fastText.Data.MinimumCount = 1;
    fastText.Data.ContextWindow = 10;
    fastText.Data.NegativeSamplingCount = 20;
    fastText.Train(
        postsWithDocuments.Select(postWithDocument => postWithDocument.Document),
        trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
    );

    Console.WriteLine("Training TF-IDF model..");
    var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
    await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

    Console.WriteLine("Getting average TF-IDF weights per word..");
    var tokenValueTFIDF = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);
    foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
    {
        tfidf.Process(doc);
        foreach (var sentence in doc)
        {
            foreach (var token in sentence)
            {
                if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                {
                    freqs = new();
                    tokenValueTFIDF.Add(token.Value, freqs);
                }
                freqs.Add(token.Frequency);
            }
        }
    }
    var averagedTokenValueTFIDF = tokenValueTFIDF.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Average(),
        StringComparer.OrdinalIgnoreCase
    );

    Console.WriteLine("Building recommendations..");

    // Combine the blog post data with the FastText-generated vectors. Each document vector
    // instance will include a "token" string that may be mapped back to the UID of the
    // document for each blog post. If there were a large number of posts to deal with then
    // a dictionary to match UIDs to blog posts would be sensible for performance, but I
    // only have 100+ and so a LINQ "First" scan over the list will suffice.
    var results = fastText
        .GetDocumentVectors()
        .Select(result =>
        {
            var uid = UID128.Parse(result.Token);
            var postForResult = postsWithDocuments.First(
                postWithDocument => postWithDocument.Document.UID == uid
            );
            return (UID: uid, result.Vector, postForResult.Post);
        })
        .ToArray(); // ToArray since we enumerate multiple times below

    // Construct a graph to search over, as described at
    // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(results);

    const int maximumNumberOfResultsToReturn = 3;
    var postsWithSimilarResults = results
        .Select(result =>
        {
            // Request that the KNNSearch operate over all documents because we can't take
            // the top {n} until we've combined the ordering with the title TF-IDF proximity values
            var similarResults = graph
                .KNNSearch(result, postsWithDocuments.Length)
                .Where(similarResult => similarResult.Item.UID != result.UID);

            var tokenValuesInTitle =
                GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

            return new
            {
                result.Post,
                Similar = similarResults
                    .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance,
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                            NormaliseSomeCommonTerms(similarResult.Item.Post.Title),
                            tokenValuesInTitle,
                            averagedTokenValueTFIDF,
                            pipeline
                        )
                    })
                    .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                    .ThenBy(similarResult => similarResult.Distance)
                    .Take(maximumNumberOfResultsToReturn)
                    .ToArray()
            };
        })
        .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    foreach (var postWithSimilarResults in postsWithSimilarResults)
    {
        Console.WriteLine();
        Console.WriteLine(postWithSimilarResults.Post.Title);
        foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
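// Neither GetAllTokensForText nor GetProximityByTitleTFIDF appears in these snippets. Below
// is a minimal sketch consistent with how they're called above, assuming a document
// enumerates sentences of tokens (as in the TF-IDF loop earlier): the proximity score is
// the sum of the averaged TF-IDF weights of the tokens that the candidate title shares with
// the source title, so a higher value means more (and more distinctive) shared title words.
private static IEnumerable<IToken> GetAllTokensForText(string text, Pipeline pipeline)
{
    var doc = new Document(text, pipeline.Language);
    pipeline.ProcessSingle(doc);
    return doc.SelectMany(sentence => sentence); // assumption: documents enumerate sentences of tokens
}

private static float GetProximityByTitleTFIDF(string similarPostTitle, HashSet<string> tokenValuesInSourceTitle, Dictionary<string, float> averagedTokenValueTFIDF, Pipeline pipeline)
{
    // Sum the averaged TF-IDF weights of title tokens common to both posts; unseen tokens count as zero
    return GetAllTokensForText(similarPostTitle, pipeline)
        .Select(token => token.Value)
        .Where(tokenValuesInSourceTitle.Contains)
        .Sum(tokenValue => averagedTokenValueTFIDF.TryGetValue(tokenValue, out var weight) ? weight : 0f);
}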