Esempio n. 1
0
        /// <summary>
        /// Connects to the corpus store on the local disk.
        /// The store directory is created when it does not exist yet; every
        /// sub-directory found in the store is then loaded as a corpus, and every
        /// file inside a sub-directory as a text file of that corpus.
        /// </summary>
        public void Connect()
        {
            var storeRoot = new DirectoryInfo(connectorString + "CorporaStore");
            if (storeRoot.Exists == false)
            {
                storeRoot.Create();
            }

            // One corpus per sub-directory, titled after the directory name.
            foreach (DirectoryInfo corpusDir in storeRoot.GetDirectories())
            {
                var loaded = new Corpus { Title = corpusDir.Name };

                // Each file becomes a TextFile: Info holds the full path, Title the file name.
                foreach (FileInfo entry in corpusDir.GetFiles())
                {
                    loaded.Add(new TextFile { Info = entry.FullName, Title = entry.Name });
                }

                corpora.Add(loaded);
            }
        }
Esempio n. 2
0
 /// <summary>
 /// Adds three distinct single-word strings one at a time and expects
 /// the corpus to report three tokens.
 /// </summary>
 public void Basic_Add()
 {
     var corpus = new Corpus();

     foreach (string word in new[] { "one", "two", "three" })
     {
         corpus.Add(word);
     }

     var tokenCount = corpus.Tokens.Count;
     Assert.AreEqual(3, tokenCount);
 }
Esempio n. 3
0
        /// <summary>
        /// Reads "sample.txt", splits it into words, runs Brown clustering over them,
        /// prints the resulting clusters to the console and saves the clustering
        /// as "BrownClusteringResult.xml".
        /// </summary>
        public void TestClustering()
        {
            var vocabulary = new Corpus();
            var document = new List<string>();

            char[] separators = { ' ', '?', ',', ':', '"', '\n', '\t' };

            using (var input = new StreamReader("sample.txt"))
            {
                string[] rawTokens = input.ReadToEnd().Split(separators, StringSplitOptions.RemoveEmptyEntries);

                foreach (string rawToken in rawTokens)
                {
                    string token = rawToken.Trim();

                    // A lone full stop is not a word.
                    if (token == ".")
                    {
                        continue;
                    }

                    // Strip a single trailing full stop from the word.
                    if (token.EndsWith("."))
                    {
                        token = token.Substring(0, token.Length - 1);
                    }

                    // NOTE(review): the length check is applied to the raw (untrimmed) token,
                    // not to the cleaned one - confirm this is intended.
                    if (string.IsNullOrEmpty(token) || rawToken.Length <= 1)
                    {
                        continue;
                    }

                    document.Add(token);
                    vocabulary.Add(token);
                }
            }

            int m = 70;

            Console.WriteLine("M: {0}", m);
            Console.WriteLine("Corpus Size: {0}", vocabulary.Count);
            Console.WriteLine("Document Size: {0}", document.Count);

            var clustering = new BrownClustering(m);
            clustering.Cluster(vocabulary, document);

            Dictionary<string, List<string>> byCodeword = clustering.GetClustersWithCodewordsOfLength(10);

            // Print every cluster: its codeword, then its members on one line.
            foreach (KeyValuePair<string, List<string>> cluster in byCodeword)
            {
                Console.WriteLine("In Cluster {0}", cluster.Key);
                foreach (string member in cluster.Value)
                {
                    Console.Write("{0}, ", member);
                }
                Console.WriteLine();
            }

            // Serialize the clustering into an XML document and write it to disk.
            var xml = new XmlDocument();
            xml.AppendChild(clustering.ToXml(xml));
            xml.Save("BrownClusteringResult.xml");
        }
Esempio n. 4
0
        /// <summary>
        /// Copies a file into the on-disk folder of the given corpus and registers
        /// it in the corpus as a <c>TextFile</c>.
        /// </summary>
        /// <param name="corpus">Corpus that receives the file; its <c>Title</c> names the target folder.</param>
        /// <param name="fileName">Path of the source file to copy. Only its file name is kept in the store.</param>
        public void AddFile(Corpus corpus, string fileName)
        {
            var title = Path.GetFileName(fileName);
            var path = connectorString + @"CorporaStore\" + corpus.Title + @"\" + title;

            // Copy first: if the copy throws, the corpus must not be left holding an
            // entry that points at a file which was never written.
            // An existing file at the destination is overwritten.
            var source = new FileInfo(fileName);
            source.CopyTo(path, true);

            corpus.Add(new TextFile()
            {
                Title = title, Info = path
            });
        }
Esempio n. 5
0
        /// <summary>
        /// Checks the per-token probability computed for tokens that occur in both
        /// the good and the bad corpus.
        /// </summary>
        public void Test_Token_Probability()
        {
            // Computed doubles are compared with a tolerance rather than bit-for-bit,
            // so the test does not depend on the exact order of floating-point operations.
            const double tolerance = 1e-12;

            Corpus good = new Corpus();
            Corpus bad = new Corpus();

            good.Add("the chicken jumped over the moon", 3);
            bad.Add("the cow ran threw the moon", 3);

            Calculator c = new Calculator(Calculator.Defaults);

            Assert.AreEqual(0.3333333333333333, c.CalculateTokenProbability("the", good, bad), tolerance);
            Assert.AreEqual(0.3333333333333333, c.CalculateTokenProbability("moon", good, bad), tolerance);

            // NOTE(review): expectations for tokens that occur only in the bad corpus are
            // disabled below - re-enable or delete once the intended score is confirmed.
            //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, c.CalculateTokenProbability("ran", good, bad));
            //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, c.CalculateTokenProbability("cow", good, bad));
        }
Esempio n. 6
0
        /// <summary>
        /// Checks the probability table computed from a good and a bad corpus for
        /// tokens that occur in both.
        /// </summary>
        public void Test_Calculate_Probability()
        {
            // Computed doubles are compared with a tolerance rather than bit-for-bit,
            // so the test does not depend on the exact order of floating-point operations.
            const double tolerance = 1e-12;

            Corpus good = new Corpus();
            Corpus bad = new Corpus();

            good.Add("the chicken jumped over the moon", 3);
            bad.Add("the cow ran threw the moon", 3);

            Calculator c = new Calculator(Calculator.Defaults);
            Probability prob = c.CalculateProbabilities(good, bad);

            Assert.AreEqual(0.3333333333333333, prob.Prob["the"], tolerance);
            Assert.AreEqual(0.3333333333333333, prob.Prob["moon"], tolerance);

            // NOTE(review): expectations for tokens that occur only in the bad corpus are
            // disabled below - re-enable or delete once the intended score is confirmed.
            //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, prob.Prob["ran"]);
            //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, prob.Prob["cow"]);
        }
Esempio n. 7
0
        /// <summary>
        /// Builds a filter from a good and a bad corpus and checks the scores it
        /// gives to the bad sentence itself and to a sentence mixing both corpora.
        /// </summary>
        public void Test_Matching()
        {
            var good = new Corpus();
            good.Add("the chicken jumped over the moon", 3);

            var bad = new Corpus();
            bad.Add("the cow ran threw the moon", 3);

            var calculator = new Calculator(Calculator.Defaults);
            Probability table = calculator.CalculateProbabilities(good, bad);
            var filter = new Filter(table);

            // Result intentionally discarded; the call itself is kept as in the original.
            filter.Test("the cow ran over the moon", 3);

            var badScore = filter.Test("the cow ran threw the moon", 3);
            Assert.IsTrue(badScore > 0.98);

            var mixedScore = filter.Test("the cow ran over the moon", 3);
            Assert.IsTrue(mixedScore > 0.25);
        }
Esempio n. 8
0
        // TODO: connect to a corpus rather than to individual texts.
        /// <summary>
        /// Opens a SQL connection, reads every row of the <c>text</c> table and adds
        /// one <c>TextFile</c> per row to the corpus (Title = <c>name</c> column,
        /// Info = <c>text_Id</c> column). The connection is disposed before returning.
        /// </summary>
        public void Connect()
        {
            using (connection = new SqlConnection(connectionString))
            {
                connection.Open();

                // Command and reader hold unmanaged resources; dispose both deterministically.
                using (var command = new SqlCommand("select * from text", connection))
                using (var reader = command.ExecuteReader())
                {
                    // Read() returns false straight away on an empty result set,
                    // so no separate HasRows check is needed.
                    while (reader.Read())
                    {
                        var id    = reader["text_Id"].ToString();
                        var title = reader["name"].ToString();

                        corpus.Add(new TextFile()
                        {
                            Title = title, Info = id
                        });
                    }
                }
            }
        }
Esempio n. 9
0
 /// <summary>
 /// Adds a whole sentence through the two-argument <c>Add</c> overload and
 /// expects the corpus to end up with four tokens.
 /// </summary>
 public void Basic_Builder_Add()
 {
     string sentence = "one two three a 333 3adsf a123";
     var corpus = new Corpus();

     corpus.Add(sentence, 3);

     var tokenCount = corpus.Tokens.Count;
     Assert.AreEqual(4, tokenCount);
 }
Esempio n. 10
0
 /// <summary>
 /// Adds three distinct words as a single string array and expects the
 /// corpus to report three tokens.
 /// </summary>
 public void Basic_List_Add()
 {
     string[] words = { "one", "two", "three" };
     var corpus = new Corpus();

     corpus.Add(words);

     var tokenCount = corpus.Tokens.Count;
     Assert.AreEqual(3, tokenCount);
 }