/// <summary>
/// Builds one data tree for every "*TEXT"-delimited entry of TIME.ALL and saves each of them
/// under the "datatrees" folder of <c>testpath</c>. Entries are named 1, 2, 3, ... in the
/// order they are extracted; that number is used both as the tree name and the file name.
/// </summary>
/// <param name="contentTreeName">File name, appended to <c>testpath</c>, of the base tree to load.</param>
public static void MakeTrees(string contentTreeName)
{
    IIO reader = new FileIO();
    IEnumerable<string> sourceLines = reader.ReadSourceIterable(testpath + "TIME.ALL");
    ITextExtractor extractor = new BeginMarkerExtraction(sourceLines, "*TEXT");
    ITreeIO treeIo = new TreeIO();
    IBaseTree baseTree = treeIo.LoadBaseTree(testpath + contentTreeName);

    for (int index = 1; extractor.HasNextContent(); index++)
    {
        // The entry is passed through Helpers.ConsumeName before mapping
        // (presumably to strip its leading name - confirm against Helpers).
        string body = Helpers.ConsumeName(extractor.FindNextContent());
        string treeName = index.ToString();

        IDataTree docTree = DataTreeBuilder.CreateDocumentMappedTree(baseTree);
        DataTreeBuilder.AddToDataTree(docTree, body);
        docTree.Name = treeName;

        treeIo.SaveDataTree(docTree, testpath + @"\datatrees\" + treeName + ".dtree");
    }
}
/// <summary>
/// Feeds every "*TEXT"-delimited entry of TIME.ALL into a new <c>WordSuggestor</c> and saves
/// the resulting object as "WordSuggestions" under <c>testpath</c>. Progress is reported on
/// the console before each of the two phases.
/// </summary>
public static void BuildSuggestor()
{
    IIO reader = new FileIO();
    IEnumerable<string> sourceLines = reader.ReadSourceIterable(testpath + "TIME.ALL");
    ITextExtractor extractor = new BeginMarkerExtraction(sourceLines, "*TEXT");

    // Phase 1: populate the suggestor from the extracted entries.
    Console.WriteLine("Building suggestion base");
    WordSuggestor suggestor = new WordSuggestor();
    suggestor.addAll(extractor);

    // Phase 2: persist it next to the test data.
    Console.WriteLine("Saving tree");
    ITreeIO treeIo = new TreeIO();
    treeIo.SaveObject(suggestor, testpath + "WordSuggestions");
}
/// <summary>
/// Builds a stemmed <c>WordSuggestor</c> from the "*TEXT"-delimited entries of TIME.ALL and
/// prints every string returned by <c>Suggestions(0.2)</c>, one per line.
/// </summary>
public static void TestSuggestions()
{
    IIO reader = new FileIO();
    IEnumerable<string> sourceLines = reader.ReadSourceIterable(testpath + "TIME.ALL");
    ITextExtractor extractor = new BeginMarkerExtraction(sourceLines, "*TEXT");

    WordSuggestor suggestor = new WordSuggestor();
    suggestor.AddAllStemmed(extractor);

    // 0.2 is handed to Suggestions unchanged; its meaning is defined by WordSuggestor.
    foreach (string suggestion in suggestor.Suggestions(0.2))
    {
        Console.WriteLine(suggestion);
    }
}
/// <summary>
/// Builds a stemmed <c>WordSuggestor</c> from the "*TEXT"-delimited entries of TIME.ALL and
/// prints what <c>WordSuggestions</c> returns for the fixed probe word "embassi".
/// </summary>
public static void TestWordSuggestions()
{
    // Fixed probe word used for this manual check.
    const string probe = "embassi";

    IIO reader = new FileIO();
    IEnumerable<string> sourceLines = reader.ReadSourceIterable(testpath + "TIME.ALL");
    ITextExtractor extractor = new BeginMarkerExtraction(sourceLines, "*TEXT");

    Console.WriteLine("Building suggestion base");
    WordSuggestor suggestor = new WordSuggestor();
    suggestor.AddAllStemmed(extractor);

    Console.WriteLine("Determining suggestions for '" + probe + "':");
    // 0.2 is handed to WordSuggestions unchanged; its meaning is defined by WordSuggestor.
    foreach (string suggestion in suggestor.WordSuggestions(probe, 0.2))
    {
        Console.WriteLine(suggestion);
    }
}
/// <summary>
/// Runs every "*FIND"-delimited query of TIME.QUE against the data trees stored in the
/// "datatrees" folder of <c>testpath</c> and prints, per query, the matching documents together
/// with precision and recall, followed by the average, best and worst values over all queries.
/// The expected documents for each query are read from the next non-empty line of TIME.REL.
/// </summary>
/// <param name="contentTreeName">File name, appended to <c>testpath</c>, of the base tree used to map each query.</param>
/// <exception cref="InvalidOperationException">
/// TIME.REL has no non-empty line left for a query that was read from TIME.QUE.
/// </exception>
public static void Compare(string contentTreeName)
{
    Console.SetBufferSize(100, 20000);
    IIO io = new FileIO();
    IEnumerable<string> file = io.ReadSourceIterable(testpath + "TIME.QUE");
    IEnumerable<string> expectedResults = io.ReadSourceIterable(testpath + "TIME.REL");
    ITextExtractor it = new BeginMarkerExtraction(file, "*FIND");
    ITreeIO tio = new TreeIO();
    IBaseTree tree = tio.LoadBaseTree(testpath + contentTreeName);

    double totalRecall = 0;
    double totalPrecision = 0;
    // Sentinel start values; the first evaluated query replaces all four.
    double bestRecall = -1;
    double worstRecall = 2;
    double bestPrecision = -1;
    double worstPrecision = 2;
    int count = 0;

    // The enumerator is disposed when the method leaves this scope, including on exceptions.
    using (IEnumerator<string> resultsEnum = expectedResults.GetEnumerator())
    {
        while (it.HasNextContent())
        {
            string query = it.FindNextContent();
            Console.WriteLine("---------------------------------");
            string queryName = Helpers.GetNameWhenFirst(query);
            Console.WriteLine("Query: " + queryName);
            query = Helpers.ConsumeName(query);
            Console.WriteLine(query);

            IDataTree queryTree = DataTreeBuilder.CreateDocumentMappedTree(tree);
            DataTreeBuilder.AddToDataTree(queryTree, query);
            queryTree.PrintDataTree();

            Console.WriteLine("Expected Results: ");
            // Advance to the next non-empty line. MoveNext() is always called before Current is
            // read, and its result is checked so an exhausted file cannot cause an endless loop.
            bool hasExpectedLine;
            do
            {
                hasExpectedLine = resultsEnum.MoveNext();
            } while (hasExpectedLine && string.IsNullOrEmpty(resultsEnum.Current));

            if (!hasExpectedLine)
            {
                throw new InvalidOperationException(
                    "TIME.REL contains no expected results for query '" + queryName + "'.");
            }

            string expected = Helpers.ConsumeName(resultsEnum.Current);
            Console.WriteLine(expected);
            // Empty entries are dropped so that repeated spaces (or an empty list) do not
            // count as expected documents and distort the recall denominator.
            string[] expectedArray = expected.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            int relevant = 0;
            int totalRetrieved = 0;
            Console.WriteLine("Actual Results: ");
            foreach (string s in Directory.EnumerateFiles(testpath + @"\datatrees"))
            {
                IDataTree docTree = tio.LoadDataTree(s);
                if (queryTree.CompareTo(docTree))
                {
                    Console.WriteLine(" Found: " + docTree.Name);
                    totalRetrieved++;
                    if (expectedArray.Contains(docTree.Name))
                    {
                        relevant++;
                    }
                }
                else if (expectedArray.Contains(docTree.Name))
                {
                    // Expected by TIME.REL but not matched by the query tree.
                    Console.WriteLine(" Expected: " + docTree.Name);
                }
            }

            Console.WriteLine();
            Console.WriteLine("Precision: " + relevant + "/" + totalRetrieved);
            Console.WriteLine("Recall: " + relevant + "/" + (expectedArray.Length));
            Console.WriteLine();
            count++;

            // Both ratios fall back to 0 when their denominator is 0.
            double recall = expectedArray.Length > 0 ? (double)relevant / expectedArray.Length : 0;
            double precision = totalRetrieved > 0 ? (double)relevant / totalRetrieved : 0;

            totalPrecision += precision;
            totalRecall += recall;
            if (precision > bestPrecision)
            {
                bestPrecision = precision;
            }
            if (precision < worstPrecision)
            {
                worstPrecision = precision;
            }
            if (recall > bestRecall)
            {
                bestRecall = recall;
            }
            if (recall < worstRecall)
            {
                worstRecall = recall;
            }
        }
    }

    Console.WriteLine("-------------------");
    if (count == 0)
    {
        // Without any query the averages would be 0/0 and the best/worst values mere sentinels.
        Console.WriteLine("No queries were evaluated.");
        return;
    }

    Console.WriteLine("Average Precision: " + totalPrecision / count);
    Console.WriteLine("Average Recall: " + totalRecall / count);
    Console.WriteLine("Worst Precision: " + worstPrecision);
    Console.WriteLine("Worst Recall: " + worstRecall);
    Console.WriteLine("Best Precision: " + bestPrecision);
    Console.WriteLine("Best Recall: " + bestRecall);
}
/// <summary>
/// Diagnostic run for the first "*FIND"-delimited query of TIME.QUE only: maps the query onto
/// a stemmed document map tree, compares it with every data tree in the "datatrees" folder of
/// <c>testpath</c>, and prints the data tree of each document that was found and of each
/// document listed for the query in TIME.REL, followed by precision and recall counts.
/// </summary>
/// <param name="contentTreeName">File name, appended to <c>testpath</c>, of the base tree used to map the query.</param>
/// <exception cref="InvalidOperationException">TIME.REL contains no non-empty line.</exception>
public static void getDocsForQuery(string contentTreeName)
{
    Console.SetBufferSize(100, 2000);
    IIO io = new FileIO();
    IEnumerable<string> file = io.ReadSourceIterable(testpath + "TIME.QUE");
    IEnumerable<string> expectedResults = io.ReadSourceIterable(testpath + "TIME.REL");
    ITextExtractor it = new BeginMarkerExtraction(file, "*FIND");
    ITreeIO tio = new TreeIO();
    IBaseTree tree = tio.LoadBaseTree(testpath + contentTreeName);

    // Same protocol as the other extraction loops in this file: ask before fetching.
    if (!it.HasNextContent())
    {
        Console.WriteLine("No query found in TIME.QUE.");
        return;
    }

    string query = it.FindNextContent();
    Console.WriteLine("---------------------------------");
    string queryName = Helpers.GetNameWhenFirst(query);
    Console.WriteLine("Query: " + queryName);
    query = Helpers.ConsumeName(query);
    Console.WriteLine(query);

    // NOTE(review): the query is mapped with the stemmed/Boyer-Moore builder variants here -
    // confirm the stored data trees were built in a compatible way.
    IDataTree queryTree = DataTreeBuilder.CreateStemmedDocumentMapTree(tree);
    DataTreeBuilder.AddToDataTreeBoyerMoore(queryTree, query);
    queryTree.PrintDataTree();

    Console.WriteLine("Expected Results: ");
    string expected;
    // The enumerator is disposed as soon as the single expected-results line has been read.
    using (IEnumerator<string> resultsEnum = expectedResults.GetEnumerator())
    {
        // Advance to the first non-empty line. MoveNext() is always called before Current is
        // read, and its result is checked so an empty file cannot cause an endless loop.
        bool hasExpectedLine;
        do
        {
            hasExpectedLine = resultsEnum.MoveNext();
        } while (hasExpectedLine && string.IsNullOrEmpty(resultsEnum.Current));

        if (!hasExpectedLine)
        {
            throw new InvalidOperationException(
                "TIME.REL contains no expected results for query '" + queryName + "'.");
        }

        expected = Helpers.ConsumeName(resultsEnum.Current);
    }
    Console.WriteLine(expected);
    // Empty entries are dropped so that repeated spaces do not count as expected documents.
    string[] expectedArray = expected.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    int relevant = 0;
    int totalRetrieved = 0;
    Console.WriteLine("Actual Results: ");
    foreach (string s in Directory.EnumerateFiles(testpath + @"\datatrees"))
    {
        IDataTree docTree = tio.LoadDataTree(s);
        if (queryTree.CompareTo(docTree))
        {
            Console.Write(" " + docTree.Name);
            totalRetrieved++;
            if (expectedArray.Contains(docTree.Name))
            {
                relevant++;
            }
            Console.WriteLine("Found---");
            Console.WriteLine(docTree.Name);
            docTree.PrintDataTree();
            Console.WriteLine("---");
        }
        // Deliberately not an "else": a document that is both found and expected is printed
        // under both headings, exactly as before.
        if (expectedArray.Contains(docTree.Name))
        {
            Console.WriteLine("Expected---");
            Console.WriteLine(docTree.Name);
            docTree.PrintDataTree();
            Console.WriteLine("---");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Precision: " + relevant + "/" + totalRetrieved);
    Console.WriteLine("Recall: " + relevant + "/" + (expectedArray.Length));
    Console.WriteLine();
    Console.WriteLine("---------------------------------");

    // Pause kept from the original implementation (10 seconds before returning);
    // presumably so the output can be read - confirm before removing.
    Thread.Sleep(10000);
}
/// <summary>
/// Click handler: validates the popup inputs, lets the user pick a source file, builds and
/// saves one data tree per extracted document, shows the document names in
/// <c>documentList</c>, and finally closes <c>buildDataTreePopup</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
/// <exception cref="InvalidOperationException">
/// The selected index of <c>formatBox</c> is neither 0 nor 1.
/// </exception>
private void formatOkay_Click(object sender, RoutedEventArgs e)
{
    // Input validation: stop at the first missing input and leave the popup open.
    if (baseTree == null)
    {
        MessageBox.Show("Please select a content tree for the data tree.");
        return;
    }

    if (formatBox.SelectedIndex == -1)
    {
        formatBox.BorderBrush = Brushes.Red;
        return;
    }

    if (string.IsNullOrEmpty(documentFormatBox.Text))
    {
        documentFormatBox.BorderBrush = Brushes.Red;
        return;
    }

    OpenFileDialog picker = new OpenFileDialog { FileName = "Tree", DefaultExt = ".txt" };
    bool? picked = picker.ShowDialog();

    if (picked == true)
    {
        string sourcePath = picker.FileName;
        // NOTE(review): "datatrees" is appended directly to the selected file's full path,
        // here and in the save path below - confirm this is intended rather than the
        // file's directory.
        documentLabel.Content = sourcePath + "datatrees";

        using (Ookii.Dialogs.Wpf.ProgressDialog progress = new ProgressDialog())
        {
            progress.ProgressBarStyle = ProgressBarStyle.MarqueeProgressBar;
            progress.Show();
            progress.Description = "Analyzing text...";

            IIO reader = new FileIO();
            ITextExtractor extractor;

            // Index 0 selects the XML extractor, index 1 the begin-marker extractor.
            int formatIndex = formatBox.SelectedIndex;
            if (formatIndex == 0)
            {
                string wholeText = reader.ReadSource(sourcePath);
                extractor = new XMLTextExtractor(wholeText, documentFormatBox.Text);
            }
            else if (formatIndex == 1)
            {
                var sourceLines = reader.ReadSourceIterable(sourcePath);
                extractor = new BeginMarkerExtraction(sourceLines, documentFormatBox.Text);
            }
            else
            {
                throw new InvalidOperationException();
            }

            documents = new ObservableCollection<string>();
            while (extractor.HasNextContent())
            {
                string content = extractor.FindNextContent();
                string docName = Helpers.GetNameWhenFirst(content);
                documents.Add(docName);

                IDataTree docTree = DataTreeBuilder.CreateDocumentMappedTree(baseTree);
                DataTreeBuilder.AddToDataTree(docTree, content);

                ITreeIO treeIo = new TreeIO();
                treeIo.SaveDataTree(docTree, sourcePath + @"datatrees\" + docName + ".dtree");
            }

            documentList.ItemsSource = documents;
        }
    }

    // Reached both after a completed build and after a cancelled file dialog.
    buildDataTreePopup.IsOpen = false;
}