Example 1
        //Note: this listing assumes the usual using directives of the Catalyst samples, among them:
        //System, System.Diagnostics, System.IO, System.Linq, System.Text, System.Threading, System.Threading.Tasks,
        //Catalyst, Catalyst.Models, Microsoft.Extensions.Logging and Mosaik.Core (with Version = Mosaik.Core.Version)
        public static async Task Main(string[] args)
        {
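            //Ensure the console can print the non-ASCII sample texts, and route Catalyst's internal log messages to the console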
            Console.OutputEncoding = Encoding.UTF8;
            ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

            //Need to register the languages we want to use first
            Catalyst.Models.English.Register();

            //This example shows the two language detection models available in Catalyst.
            //The first is derived from Chrome's former language detection code, Compact Language Detector 2 (https://github.com/CLD2Owners/cld2),
            //and the newer model is derived from Facebook's FastText language detection dataset (see: https://fasttext.cc/blog/2017/10/02/blog-post.html)

            //Configures the model storage to use the local folder ./catalyst-models/
            Storage.Current = new DiskStorage("catalyst-models");

            var cld2LanguageDetector = await LanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");

            var fastTextLanguageDetector = await FastTextLanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");

            //Below we run the detection on short and longer samples. Expect lower precision on shorter texts, as there is less information for the model to work with.
            //It's also interesting to see the kind of mistakes these models make, such as detecting Welsh as Gaelic_Scottish_Gaelic

            foreach (var (lang, text) in Data.ShortSamples)
            {
                var doc = new Document(text);
                fastTextLanguageDetector.Process(doc);

                var doc2 = new Document(text);
                cld2LanguageDetector.Process(doc2);

                Console.WriteLine(text);
                Console.WriteLine($"Actual:\t{lang}\nFT:\t{doc.Language}\nCLD2\t{doc2.Language}");
                Console.WriteLine();
            }

            foreach (var (lang, text) in Data.LongSamples)
            {
                var doc = new Document(text);
                fastTextLanguageDetector.Process(doc);

                var doc2 = new Document(text);
                cld2LanguageDetector.Process(doc2);

                Console.WriteLine(text);
                Console.WriteLine($"Actual:\t{lang}\nFT:\t{doc.Language}\nCLD2\t{doc2.Language}");
                Console.WriteLine();
            }

            // You can also access all predictions via the Predict method:
            var allPredictions = fastTextLanguageDetector.Predict(new Document(Data.LongSamples[Language.Spanish]));
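            //Predict returns a score for every language the model knows; below we print the ten highest-scoring ones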

            Console.WriteLine($"\n\nTop 10 predictions and scores for the Spanish sample:");
            foreach (var kv in allPredictions.OrderByDescending(kv => kv.Value).Take(10))
            {
                Console.WriteLine($"{kv.Key.ToString().PadRight(40)}\tScore: {kv.Value:n2}");
            }
        }
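
        //Evaluates the pre-trained FastText language detector on a tab-separated sentences file
        //(assumed here to follow the Tatoeba export format of id<TAB>three-letter language code<TAB>sentence, which matches how the lines are parsed below)
        //and reports precision, recall and F1 over the whole corpus.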
        public static void Test(string pathToSentences)
        {
            var detector = FastTextLanguageDetector.FromStoreAsync(Language.Any, 0, null).WaitResult();

            var pipe = new Pipeline();

            pipe.Add(new SpaceTokenizer());


            var docs = File.ReadAllLines(pathToSentences).Shuffle()
                       .Where(txt => !string.IsNullOrWhiteSpace(txt))
                       .Select(txt => txt.Split('\t'))
                       .Where(s => s.Length == 3 && Languages.IsValid3LetterCode(s[1]))
                       //.GroupBy(l => l[1])
                       //.Where(g => g.Count() > 10_000)
                       //.SelectMany(g => g)
                       .Select(s =>
                       {
                           var doc = new Document(s[2]);
                           doc.Labels.Add(Languages.EnumToCode(Languages.ThreeLetterCodeToEnum(s[1])));
                           return (doc as IDocument);
                       });


            docs = pipe.Process(docs).WithCaching(Language.Any, 0, "language-detector-corpus", 100_000).ToList();


            int TP = 0, FP = 0, FN = 0;
            int k  = 0;
            var sw = Stopwatch.StartNew();

            Parallel.ForEach(docs, (doc) =>
            {
                Interlocked.Increment(ref k); //k is shared across threads, so increment it atomically
                detector.Process(doc);
                if (doc.Language == Languages.CodeToEnum(doc.Labels.First()))
                {
                    Interlocked.Increment(ref TP);
                }
                else
                {
                    Interlocked.Increment(ref FP);
                    Interlocked.Increment(ref FN);
                }
            });

            sw.Stop();

            //Every misclassified document is counted as both a false positive and a false negative above,
            //so precision, recall and F1 all reduce to the overall accuracy of the detector.
            var precision = (double)TP / (TP + FP);
            var recall    = (double)TP / (TP + FN);

            var f1 = 2 * (precision * recall) / (precision + recall);

            Console.WriteLine($"F1= {f1 * 100:0.0}% P= {precision * 100:0.0}% R={recall * 100:0.0}% in {sw.Elapsed.TotalSeconds:0.00}s or {k / sw.Elapsed.TotalSeconds} doc/s");
        }
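
        //Trains a new FastTextLanguageDetector from scratch on the same tab-separated sentences file used by Test above,
        //and persists the resulting model through the configured Storage.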
        public static void Train(string pathToSentences)
        {
            var docs = File.ReadAllLines(pathToSentences).Shuffle()
                       .Where(txt => !string.IsNullOrWhiteSpace(txt))
                       .Select(txt => txt.Split('\t'))
                       .Where(s => s.Length == 3 && Languages.IsValid3LetterCode(s[1]))
                       .Select(s =>
                       {
                           var doc = new Document(s[2]);
                           doc.Labels.Add(Languages.EnumToCode(Languages.ThreeLetterCodeToEnum(s[1])));
                           return (doc as IDocument);
                       });


            var pipe = new Pipeline();

            pipe.Add(new SpaceTokenizer());

            var ldm = new FastTextLanguageDetector(0);

            ldm.Train(pipe.Process(docs).WithCaching(Language.Any, 0, "language-detector-corpus", 100_000));
            ldm.StoreAsync().Wait();
        }