/// <summary>
/// Connects to the corpora store on the local disk.
/// Creates the store directory when it does not exist yet, then loads
/// every sub-directory as a corpus and every file inside it as a text file.
/// </summary>
public void Connect()
{
    var storeRoot = new DirectoryInfo(connectorString + "CorporaStore");
    if (!storeRoot.Exists)
    {
        storeRoot.Create();
    }

    // Each sub-directory of the store is one corpus, titled by the directory name.
    foreach (DirectoryInfo corpusDirectory in storeRoot.GetDirectories())
    {
        var loaded = new Corpus { Title = corpusDirectory.Name };

        // Each file becomes a TextFile entry: Info holds the full path, Title the file name.
        foreach (FileInfo entry in corpusDirectory.GetFiles())
        {
            loaded.Add(new TextFile { Info = entry.FullName, Title = entry.Name });
        }

        corpora.Add(loaded);
    }
}
/// <summary>
/// Adds three distinct single-word strings one by one and expects
/// the corpus to report three tokens.
/// </summary>
public void Basic_Add()
{
    var corpus = new Corpus();

    foreach (string token in new[] { "one", "two", "three" })
    {
        corpus.Add(token);
    }

    Assert.AreEqual(3, corpus.Tokens.Count);
}
/// <summary>
/// Runs Brown clustering end to end: tokenizes "sample.txt", clusters the
/// tokens, prints the clusters to the console and saves the result as
/// "BrownClusteringResult.xml".
/// </summary>
public void TestClustering()
{
    var documentTokens = new List<string>();
    var corpus = new Corpus();

    // Read the whole sample file up front; the reader is not needed afterwards.
    string text;
    using (var reader = new StreamReader("sample.txt"))
    {
        text = reader.ReadToEnd();
    }

    char[] separators = { ' ', '?', ',', ':', '"', '\n', '\t' };
    foreach (string fragment in text.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        string token = fragment.Trim();

        // A lone period carries no word.
        if (token == ".")
        {
            continue;
        }

        // Drop a single trailing period (end of sentence).
        if (token.EndsWith("."))
        {
            token = token.Remove(token.Length - 1);
        }

        // NOTE(review): the length check looks at the raw fragment, not the
        // trimmed token - confirm that this is intended.
        if (string.IsNullOrEmpty(token) || fragment.Length <= 1)
        {
            continue;
        }

        documentTokens.Add(token);
        corpus.Add(token);
    }

    int M = 70;
    Console.WriteLine("M: {0}", M);
    Console.WriteLine("Corpus Size: {0}", corpus.Count);
    Console.WriteLine("Document Size: {0}", documentTokens.Count);

    var clustering = new BrownClustering(M);
    clustering.Cluster(corpus, documentTokens);

    // Print every cluster: its codeword followed by the member words.
    Dictionary<string, List<string>> clusters = clustering.GetClustersWithCodewordsOfLength(10);
    foreach (KeyValuePair<string, List<string>> cluster in clusters)
    {
        Console.WriteLine("In Cluster {0}", cluster.Key);
        foreach (string member in cluster.Value)
        {
            Console.Write("{0}, ", member);
        }
        Console.WriteLine();
    }

    // Persist the clustering result as XML.
    var document = new XmlDocument();
    XmlElement rootElement = clustering.ToXml(document);
    document.AppendChild(rootElement);
    document.Save("BrownClusteringResult.xml");
}
/// <summary>
/// Copies a file into the corpus folder of the local store and registers
/// it in the given corpus.
/// </summary>
/// <param name="corpus">Corpus that receives the file; its <c>Title</c> names the target sub-folder.</param>
/// <param name="fileName">Path of the source file to copy.</param>
public void AddFile(Corpus corpus, string fileName)
{
    var title = Path.GetFileName(fileName);
    var path = connectorString + @"CorporaStore\" + corpus.Title + @"\" + title;

    // Copy first: if the copy fails, the corpus must not end up with an
    // entry that points at a file which was never stored.
    // The source file is left in place; an existing target is overwritten.
    var fileInfo = new FileInfo(fileName);
    fileInfo.CopyTo(path, true);

    corpus.Add(new TextFile() { Title = title, Info = path });
}
/// <summary>
/// Checks the per-token probability computed by <c>Calculator</c> for
/// tokens that occur in both the good and the bad corpus ("the", "moon").
/// </summary>
public void Test_Token_Probability()
{
    // Floating-point results are compared with a tolerance instead of exact equality.
    const double Tolerance = 1e-12;

    Corpus good = new Corpus();
    Corpus bad = new Corpus();
    good.Add("the chicken jumped over the moon", 3);
    bad.Add("the cow ran threw the moon", 3);

    Calculator c = new Calculator(Calculator.Defaults);

    Assert.AreEqual(0.3333333333333333, c.CalculateTokenProbability("the", good, bad), Tolerance);
    Assert.AreEqual(0.3333333333333333, c.CalculateTokenProbability("moon", good, bad), Tolerance);

    // NOTE(review): the checks below were already disabled; the reason is not visible here.
    //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, c.CalculateTokenProbability("ran", good, bad));
    //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, c.CalculateTokenProbability("cow", good, bad));
}
/// <summary>
/// Checks the probability table produced by
/// <c>Calculator.CalculateProbabilities</c> for tokens that occur in both
/// the good and the bad corpus ("the", "moon").
/// </summary>
public void Test_Calculate_Probability()
{
    // Floating-point results are compared with a tolerance instead of exact equality.
    const double Tolerance = 1e-12;

    Corpus good = new Corpus();
    Corpus bad = new Corpus();
    good.Add("the chicken jumped over the moon", 3);
    bad.Add("the cow ran threw the moon", 3);

    Calculator c = new Calculator(Calculator.Defaults);
    Probability prob = c.CalculateProbabilities(good, bad);

    Assert.AreEqual(0.3333333333333333, prob.Prob["the"], Tolerance);
    Assert.AreEqual(0.3333333333333333, prob.Prob["moon"], Tolerance);

    // NOTE(review): the checks below were already disabled; the reason is not visible here.
    //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, prob.Prob["ran"]);
    //Assert.AreEqual<double>(Calculator.Defaults.LikelySpamScore, prob.Prob["cow"]);
}
/// <summary>
/// Builds a filter from one good and one bad sample sentence and checks
/// the scores it assigns to the bad sentence itself and to a close variant.
/// </summary>
public void Test_Matching()
{
    var goodCorpus = new Corpus();
    var badCorpus = new Corpus();
    goodCorpus.Add("the chicken jumped over the moon", 3);
    badCorpus.Add("the cow ran threw the moon", 3);

    var calculator = new Calculator(Calculator.Defaults);
    Probability probabilities = calculator.CalculateProbabilities(goodCorpus, badCorpus);
    var filter = new Filter(probabilities);

    // The result of this first call is discarded, as in the original test.
    filter.Test("the cow ran over the moon", 3);

    // The exact bad sentence must score above 0.98.
    var exactScore = filter.Test("the cow ran threw the moon", 3);
    Assert.IsTrue(exactScore > 0.98);

    // The variant with "over" in place of "threw" must score above 0.25.
    var variantScore = filter.Test("the cow ran over the moon", 3);
    Assert.IsTrue(variantScore > 0.25);
}
// TODO: change this to connect to the corpus rather than to the texts.
/// <summary>
/// Opens a SQL connection, reads every row of the <c>text</c> table and
/// adds one <c>TextFile</c> per row to the corpus (Title = "name" column,
/// Info = "text_Id" column).
/// </summary>
public void Connect()
{
    // NOTE(review): the connection is stored in the `connection` member and
    // disposed when this block ends, so the member refers to a disposed
    // connection afterwards - confirm nothing else relies on it.
    using (connection = new SqlConnection(connectionString))
    {
        connection.Open();

        // Command and reader hold resources of their own; dispose both deterministically.
        using (var command = new SqlCommand("select * from text", connection))
        using (var reader = command.ExecuteReader())
        {
            // Read() returns false right away for an empty result set,
            // so no separate HasRows check is needed.
            while (reader.Read())
            {
                var id = reader["text_Id"].ToString();
                var title = reader["name"].ToString();
                corpus.Add(new TextFile() { Title = title, Info = id });
            }
        }
    }
}
/// <summary>
/// Adds a space-separated sentence with a second argument of 3
/// (presumably a minimum token length - confirm against <c>Corpus.Add</c>)
/// and expects the corpus to report four tokens.
/// </summary>
public void Basic_Builder_Add()
{
    var corpus = new Corpus();

    corpus.Add("one two three a 333 3adsf a123", 3);

    int tokenCount = corpus.Tokens.Count;
    Assert.AreEqual(4, tokenCount);
}
/// <summary>
/// Adds three distinct words as a single string array and expects
/// the corpus to report three tokens.
/// </summary>
public void Basic_List_Add()
{
    string[] words = { "one", "two", "three" };
    var corpus = new Corpus();

    corpus.Add(words);

    Assert.AreEqual(3, corpus.Tokens.Count);
}