Ejemplo n.º 1
0
        // Loads the artifact collection stored under "ListOfArtifacts", runs it through the
        // Snowball stemmer for the configured language, and stores the stemmed collection
        // back under the same workspace key.
        public override void Compute()
        {
            const string workspaceKey = "ListOfArtifacts";

            var source = (TLArtifactsCollection)Workspace.Load(workspaceKey);

            Logger.Info("Using " + _config.Language + " stemmer");
            var stemmed = SnowballStemmer.ProcessArtifacts(source, _config.Language);

            // Same key as the load above: the stemmed collection replaces the loaded one.
            Workspace.Store(workspaceKey, stemmed);
        }
Ejemplo n.º 2
0
        // Verifies that the English stemmer reduces every inflection of "jump" to the stem "jump"
        // and returns exactly one stem per input term.
        public void EnglishStemmerTest()
        {
            string terms = "jump jumping jumps jumped";

            string[] stem = SnowballStemmer.ProcessText(terms, SnowballStemmerEnum.English).Split();
            #if Verbose
            Console.WriteLine("SnowballStemmerTest.EnglishStemmerTest()");
            Console.WriteLine("Original: {0}", terms);
            Console.WriteLine("Stemmed:  {0}", String.Join(" ", stem));
            #endif

            // Without this check a stemmer that dropped or merged terms would still pass the
            // per-term loop below, because the loop only inspects whatever came back.
            Assert.AreEqual(terms.Split().Length, stem.Length);

            foreach (string term in stem)
            {
                Assert.AreEqual("jump", term);
            }
        }
Ejemplo n.º 3
0
        // Verifies the German stemmer on four forms of "mochte": the first three are expected
        // to stem to "mocht", while the last one ("mochtet") is expected to come back unchanged.
        public void GermanStemmerTest()
        {
            string terms = "mochte mochtest mochten mochtet";

            string[] stem = SnowballStemmer.ProcessText(terms, SnowballStemmerEnum.German).Split();
            #if Verbose
            Console.WriteLine("SnowballStemmerTest.GermanStemmerTest()");
            Console.WriteLine("Original: {0}", terms);
            Console.WriteLine("Stemmed:  {0}", String.Join(" ", stem));
            #endif

            // Index of the one term that is expected to be left unstemmed.
            const int unchangedIndex = 3;

            int index = 0;
            while (index < unchangedIndex)
            {
                Assert.AreEqual("mocht", stem[index]);
                index++;
            }

            Assert.AreEqual("mochtet", stem[unchangedIndex]);
        }
Ejemplo n.º 4
0
        // Trains one multiclass SVM per label file found in the folder passed as the single
        // command-line argument and packs everything into "<folder name>.cl", written next to
        // that folder.
        //
        // Expected folder layout, as read by the code below:
        //   text.txt  - one training document per line (required)
        //   dic.txt   - optional seed terms for the dictionary, one per line
        //   *.txt     - every other .txt file is a classifier; its lines are the labels that
        //               are paired with the lines of text.txt
        //
        // The archive receives "dic.txt" plus, per classifier, "<name>.svm" (the saved machine)
        // and "<name>.txt" (the distinct labels, in first-seen order).
        static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                WriteLine("Usage:\n\n\ttrain.exe <folder path>");
                return;
            }

            WriteLine("Platform: " + IntPtr.Size);

            var stemmer = new SnowballStemmer();

            // Strip trailing separators first: for "data/" GetFileName would return "" and the
            // archive would be written as "data/.cl" instead of "data.cl".
            var folder = Path.GetFullPath(args[0])
                             .TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
            var archivePath = Combine(GetDirectoryName(folder), GetFileName(folder) + ".cl");

            // All inputs are read before the output file is created, so a missing or empty
            // text.txt does not clobber an archive left by a previous run.
            WriteLine("Reading dictionary...");
            var dicPath = Combine(folder, "dic.txt");
            var dicText = File.Exists(dicPath) ? ReadAllLines(dicPath) : new string[0];

            WriteLine("Building dictionary...");
            var text = ReadAllLines(Combine(folder, "text.txt"));

            if (text.Length == 0)
            {
                // The feature count below is taken from input[0], which needs at least one document.
                WriteLine("Nothing to train on: text.txt is empty.");
                return;
            }

            // Upper bound on the number of dictionary terms (seed terms included).
            const int maxDictionaryTerms = 5000;

            // Seed terms come first, followed by the stems found in the text ordered by
            // descending frequency.
            var dic = new Dictionary(dicText.Concat(text
                                                    .SelectMany(jd => jd.StemText(stemmer))
                                                    .ToLookup(w => w.Stemmed)
                                                    .Select(l => new { Stemmed = l.Key, Count = l.Count() })
                                                    .OrderByDescending(w => w.Count)
                                                    .Select(w => w.Stemmed))
                                     .Take(maxDictionaryTerms)
                                     .ToArray());

            // File.Create truncates an existing file. File.OpenWrite does not, so overwriting a
            // longer archive from an earlier run would leave stale bytes after the new content.
            using (var zip = new ZipOutputStream(File.Create(archivePath)))
            {
                zip.PutNextEntry(new ZipEntry("dic.txt"));
                // The writer is created with leaveOpen = true so disposing it keeps the zip stream open.
                using (var writer = new StreamWriter(zip, UTF8, 4096, true))
                {
                    dic.WriteTo(writer);
                }
                zip.CloseEntry();

                WriteLine("Vectorizing...");
                var input = new double[text.Length][];
                for (int i = 0; i < text.Length; i++)
                {
                    input[i] = dic.Vectorize(new Document(text[i], stemmer));
                }

                // One classifier per remaining .txt file. Labels are the distinct lines of the
                // file; Output maps every line to the index of its label.
                var classifiers = from f in GetFiles(folder)
                                  where GetFileName(f) != "text.txt" && GetFileName(f) != "dic.txt"
                                  where GetExtension(f) == ".txt"
                                  let ll = ReadAllLines(f)
                                  let ld = ll.Distinct().ToArray()
                                  let li = ld
                                           .Select((l, i) => new { l, i })
                                           .ToDictionary(x => x.l, x => x.i)
                                  select new
                                  {
                                      Name   = GetFileNameWithoutExtension(f),
                                      Labels = ld,
                                      Output = ll.Select(l => li[l]).ToArray()
                                  };

                foreach (var classifier in classifiers)
                {
                    Write($"Training: {classifier.Name}");
                    IKernel kernel  = new Linear();
                    var     machine = new MulticlassSupportVectorMachine(input[0].Length, kernel, classifier.Labels.Length);
                    var     teacher = new MulticlassSupportVectorLearning(machine, input, classifier.Output);
                    teacher.Algorithm = (svm, classInputs, classOutputs, i, j) =>
                    {
                        // Fit the binary machine first; the calibration object is what gets
                        // returned to the teacher as the learning algorithm.
                        var linearCoordinateDescent = new LinearCoordinateDescent(svm, classInputs, classOutputs);
                        linearCoordinateDescent.Run();

                        return new ProbabilisticOutputCalibration(svm, classInputs, classOutputs);
                    };
                    WriteLine($", error = {teacher.Run()}");

                    zip.PutNextEntry(new ZipEntry(classifier.Name + ".svm"));
                    // The machine is saved to memory first and then copied into the zip entry.
                    using (var stream = new MemoryStream())
                    {
                        machine.Save(stream);
                        stream.Position = 0;
                        stream.CopyTo(zip);
                    }
                    zip.CloseEntry();

                    zip.PutNextEntry(new ZipEntry(classifier.Name + ".txt"));
                    using (var writer = new StreamWriter(zip, UTF8, 4096, true))
                    {
                        foreach (var label in classifier.Labels)
                        {
                            writer.WriteLine(label);
                        }
                    }
                    zip.CloseEntry();
                }

                WriteLine("Done.");
            }
        }