/// <summary>
/// Sample entry point: loads the CLD2-derived and FastText-derived language detectors from the
/// local model store, runs both over the short and long sample texts, and prints the actual
/// language next to each model's prediction. Finishes by printing the ten highest-scoring
/// FastText predictions for the Spanish long sample.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static async Task Main(string[] args)
{
    // The samples contain non-ASCII text, so the console is switched to UTF-8 before printing.
    Console.OutputEncoding = Encoding.UTF8;

    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

    // Need to register the languages we want to use first.
    Catalyst.Models.English.Register();

    // This example shows the two language detection models available on Catalyst.
    // The first is derived from Chrome's former language detection code, Compact Language Detector 2 (https://github.com/CLD2Owners/cld2),
    // and the newer model is derived from Facebook's FastText language detection dataset (see: https://fasttext.cc/blog/2017/10/02/blog-post.html).

    // Configures the model storage to use the local folder ./catalyst-models/
    Storage.Current = new DiskStorage("catalyst-models");

    var cld2LanguageDetector = await LanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");
    var fastTextLanguageDetector = await FastTextLanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");

    // Runs both detectors on one sample and prints the expected language next to both predictions.
    // Each detector gets its own Document so that one model's result cannot leak into the other's.
    void PrintDetection<TLabel>(TLabel actual, string text)
    {
        var fastTextDoc = new Document(text);
        fastTextLanguageDetector.Process(fastTextDoc);

        var cld2Doc = new Document(text);
        cld2LanguageDetector.Process(cld2Doc);

        Console.WriteLine(text);
        Console.WriteLine($"Actual:\t{actual}\nFT:\t{fastTextDoc.Language}\nCLD2:\t{cld2Doc.Language}");
        Console.WriteLine();
    }

    // We show below the detection on short and longer samples.
    // You can expect lower precision on shorter texts, as there is less information for the model to work with.
    // It's also interesting to see the kind of mistakes these models make, such as detecting Welsh as Gaelic_Scottish_Gaelic.
    foreach (var (lang, text) in Data.ShortSamples)
    {
        PrintDetection(lang, text);
    }

    foreach (var (lang, text) in Data.LongSamples)
    {
        PrintDetection(lang, text);
    }

    // You can also access all predictions via the Predict method:
    var allPredictions = fastTextLanguageDetector.Predict(new Document(Data.LongSamples[Language.Spanish]));

    Console.WriteLine("\n\nTop 10 predictions and scores for the Spanish sample:");
    foreach (var kv in allPredictions.OrderByDescending(kv => kv.Value).Take(10))
    {
        Console.WriteLine($"{kv.Key.ToString().PadRight(40)}\tScore: {kv.Value:n2}");
    }
}
/// <summary>
/// Evaluates the stored FastText language detector against a labeled sentence file and prints
/// F1, precision, recall, elapsed time and throughput to the console.
/// </summary>
/// <param name="pathToSentences">
/// Path to a text file read line by line. Only lines made of exactly three tab-separated fields
/// whose second field is a valid 3-letter language code are used; the third field is the text
/// and the second field becomes the expected label. All other lines are skipped.
/// </param>
public static void Test(string pathToSentences)
{
    // Test is synchronous, so the async model load is waited on here.
    var detector = FastTextLanguageDetector.FromStoreAsync(Language.Any, 0, null).WaitResult();

    var pipe = new Pipeline();
    pipe.Add(new SpaceTokenizer());

    var docs = File.ReadAllLines(pathToSentences).Shuffle()
                   .Where(txt => !string.IsNullOrWhiteSpace(txt))
                   .Select(txt => txt.Split('\t'))
                   .Where(s => s.Length == 3 && Languages.IsValid3LetterCode(s[1]))
                   // Optional filter (disabled): keep only languages with more than 10,000 samples.
                   //.GroupBy(l => l[1])
                   //.Where(g => g.Count() > 10_000)
                   //.SelectMany(g => g)
                   .Select(s =>
                   {
                       var doc = new Document(s[2]);
                       // Store the expected language as the document's first label so it can be
                       // compared with the detector's prediction below.
                       doc.Labels.Add(Languages.EnumToCode(Languages.ThreeLetterCodeToEnum(s[1])));
                       return (IDocument)doc;
                   });

    // Materialize once so the parallel loop below iterates a fixed list instead of re-running the query.
    docs = pipe.Process(docs).WithCaching(Language.Any, 0, "language-detector-corpus", 100_000).ToList();

    int truePositives = 0, falsePositives = 0, falseNegatives = 0;

    var sw = Stopwatch.StartNew();

    Parallel.ForEach(docs, doc =>
    {
        detector.Process(doc);

        // Counters are shared across worker threads, so every update goes through Interlocked.
        if (doc.Language == Languages.CodeToEnum(doc.Labels.First()))
        {
            Interlocked.Increment(ref truePositives);
        }
        else
        {
            // A wrong prediction is a false positive for the predicted language
            // and a false negative for the expected one.
            Interlocked.Increment(ref falsePositives);
            Interlocked.Increment(ref falseNegatives);
        }
    });

    sw.Stop();

    // Every document lands in exactly one branch above, so this is the number of processed
    // documents, obtained without an extra (and previously unsynchronized) shared counter.
    int processed = truePositives + falsePositives;

    // Returns 0 instead of NaN when the denominator is 0 (e.g. empty corpus or no correct prediction).
    double Ratio(double numerator, double denominator) => denominator > 0 ? numerator / denominator : 0d;

    var precision = Ratio(truePositives, truePositives + falsePositives);
    var recall = Ratio(truePositives, truePositives + falseNegatives);
    var f1 = Ratio(2 * (precision * recall), precision + recall);

    Console.WriteLine($"F1= {f1 * 100:0.0}% P= {precision * 100:0.0}% R={recall * 100:0.0}% in {sw.Elapsed.TotalSeconds:0.00}s or {processed / sw.Elapsed.TotalSeconds} doc/s");
}
/// <summary>
/// Trains a new FastText language detector from a labeled sentence file and stores the result.
/// </summary>
/// <param name="pathToSentences">
/// Path to a text file read line by line. Only lines made of exactly three tab-separated fields
/// whose second field is a valid 3-letter language code are used; the third field is the text
/// and the second field becomes the training label. All other lines are skipped.
/// </param>
public static void Train(string pathToSentences)
{
    // Builds a document from the text field and attaches the language code as its label.
    IDocument ToLabeledDocument(string[] fields)
    {
        var document = new Document(fields[2]);
        var language = Languages.ThreeLetterCodeToEnum(fields[1]);
        document.Labels.Add(Languages.EnumToCode(language));
        return document as IDocument;
    }

    var shuffledLines = File.ReadAllLines(pathToSentences).Shuffle();

    var corpus = shuffledLines
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line => line.Split('\t'))
        .Where(fields => fields.Length == 3 && Languages.IsValid3LetterCode(fields[1]))
        .Select(fields => ToLabeledDocument(fields));

    var tokenizerPipeline = new Pipeline();
    tokenizerPipeline.Add(new SpaceTokenizer());

    var detector = new FastTextLanguageDetector(0);

    // The tokenized corpus is wrapped with WithCaching before being handed to the trainer.
    var cachedCorpus = tokenizerPipeline.Process(corpus)
                                        .WithCaching(Language.Any, 0, "language-detector-corpus", 100_000);

    detector.Train(cachedCorpus);

    // Train is synchronous, so the async store operation is waited on before returning.
    detector.StoreAsync().Wait();
}