/// <summary>
/// Reads every file in <c>fileList</c>, extracts its words and rebuilds both the
/// unsorted and the sorted word list, reporting progress through <c>lblFiles</c>.
/// </summary>
/// <remarks>
/// All control access goes through <c>Invoke</c>: per the original author's note this
/// method is run on a separate thread, while the controls live on the main thread.
/// </remarks>
private void LoadContent()
{
    // Reset the previous results to clear the way for the new ones.
    dataSearchResults.Invoke((MethodInvoker) delegate { dataSearchResults.Rows.Clear(); });
    lblFiles.Invoke((MethodInvoker) delegate { lblFiles.Text = "Added Files (Analyzing files...)"; });

    extractor = new WordExtractor();

    // Stopwatch is monotonic; DateTime.Now can jump (clock sync, DST) and would
    // then report a wrong or even negative duration.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    for (int i = 0; i < fileList.Count; i++)
    {
        string fileContent = InputOutput.ReadFile(fileList[i]);
        extractor.ExtractWordsFromTextFile(fileContent, fileList[i]);
    }

    // Create one unsorted list and one sorted list.
    unsortedWordsList = extractor.GetCompoundedList();
    List<Word> temporaryList = extractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(temporaryList, 0, temporaryList.Count - 1);
    sortedWordsList = temporaryList;

    stopwatch.Stop();
    TimeSpan span = stopwatch.Elapsed;

    // This method is called on a separate thread, so we need to invoke this code
    // on the main thread, because that is where lblFiles lives.
    lblFiles.Invoke((MethodInvoker) delegate
    {
        lblFiles.Text = $"Files analyzed and sorted... It took {span.TotalSeconds.ToString("F")} seconds";
    });
}
/// <summary>
/// Checks that "Word95.doc" is rejected by <c>WordExtractor</c> with an
/// <c>OldWordFileFormatException</c> and can be read with <c>Word6Extractor</c>.
/// </summary>
public void TestWord95()
{
    // The default extractor must refuse this old format.
    try
    {
        var rejectedStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word95.doc");
        extractor = new WordExtractor(rejectedStream);
        Assert.Fail();
    }
    catch (OldWordFileFormatException)
    {
        // Expected.
    }

    // The dedicated extractor can read it.
    var sampleStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word95.doc");
    Word6Extractor oldFormatExtractor = new Word6Extractor(sampleStream);

    String fullText = oldFormatExtractor.Text;
    String[] expectedFragments =
    {
        "The quick brown fox jumps over the lazy dog",
        "Paragraph 2",
        "Paragraph 3. Has some RED text and some BLUE BOLD text in it",
        "Last (4th) paragraph"
    };
    foreach (String fragment in expectedFragments)
    {
        Assert.IsTrue(fullText.Contains(fragment));
    }

    String[] paragraphs = oldFormatExtractor.ParagraphText;
    String[] expectedParagraphs =
    {
        "The quick brown fox jumps over the lazy dog\r\n",
        "\r\n",
        "Paragraph 2\r\n",
        "\r\n",
        "Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n",
        "\r\n",
        "Last (4th) paragraph.\r\n"
    };
    Assert.AreEqual(7, paragraphs.Length);
    for (int index = 0; index < expectedParagraphs.Length; index++)
    {
        Assert.AreEqual(expectedParagraphs[index], paragraphs[index]);
    }
}
/// <summary>
/// Entry point: builds a <c>WordExtractor</c> from <c>ReutersBig</c>, starts it,
/// then waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var reutersExtractor = new WordExtractor(ReutersBig);
    reutersExtractor.Start();

    // Keep the console window open until the user presses a key.
    Console.ReadKey();
}
/// <summary>
/// Loads four test files, sorts the compounded word list and checks that a binary
/// search for "is" yields the expected per-file counts.
/// </summary>
public void BinarySearch_SeveralMatchesExists_FindsAll()
{
    WordExtractor extractor = new WordExtractor();

    string[] paths =
    {
        $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\TextFile1.txt",
        $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\TextFile2.txt",
        $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\TextFile3.txt",
        $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\TextFile4.txt"
    };

    // Read every file first, then feed them to the extractor in the same order.
    string[] contents = new string[paths.Length];
    for (int index = 0; index < paths.Length; index++)
    {
        contents[index] = InputOutput.ReadFile(paths[index]);
    }
    for (int index = 0; index < paths.Length; index++)
    {
        extractor.ExtractWordsFromTextFile(contents[index], paths[index]);
    }

    List<Word> words = extractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(words, 0, words.Count - 1);
    var result = SearchEngine<Word>.BinarySearch(words, true, "is");

    Dictionary<string, int> expected = new Dictionary<string, int>
    {
        { paths[0], 1 },
        { paths[1], 2 },
        { paths[2], 1 },
        { paths[3], 1 }
    };

    Assert.AreEqual(expected, result);
}
/// <summary>
/// Assembles a <c>WordExtractor</c> whose pre- and post-processors are selected
/// from the given configuration.
/// </summary>
/// <param name="config">
/// Supplies the <c>KeepComments</c>, <c>KeepStrings</c> and <c>KeepNamespaces</c>
/// switches as well as <c>Blacklist</c> and <c>MinWordLength</c>.
/// </param>
/// <returns>The configured extractor.</returns>
private static WordExtractor BuildWordExtractor(InputConfiguration config)
{
    // Line endings are always normalized; the removers are opt-out.
    var before = new List<ITextPreProcessor>();
    before.Add(new LineEndingNormalizer());

    if (!config.KeepComments)
    {
        before.Add(new CommentRemover());
    }

    if (!config.KeepStrings)
    {
        before.Add(new StringRemover());
    }

    if (!config.KeepNamespaces)
    {
        before.Add(new NamespaceCleaner());
    }

    var after = new ITextPostProcessor[]
    {
        new WordBlacklist(config.Blacklist),
        new ShortWordRemover(config.MinWordLength)
    };

    return new WordExtractor(before, after);
}
/// <summary>
/// <c>WordExtractor.ExtractWithoutProcessing</c> must return an empty result for
/// an empty input string.
/// </summary>
public void ShouldReturnEmptyDictionaryWhenInputIsEmpty()
{
    // Arrange
    var emptyInput = string.Empty;

    // Act
    var extracted = WordExtractor.ExtractWithoutProcessing(emptyInput);

    // Assert
    Assert.Empty(extracted);
}
/// <summary>
/// <c>WordExtractor.ExtractWithoutProcessing</c> must return an empty result for
/// the supplied input.
/// </summary>
/// <param name="input">Text handed to the extractor by the test data source.</param>
public void ShouldRemoveDigitOnlyWords(string input)
{
    // Act
    var extracted = WordExtractor.ExtractWithoutProcessing(input);

    // Assert
    Assert.Empty(extracted);
}
/// <summary>
/// Initializes the tile: shows a letter obtained from <c>letterGen</c> on the child
/// text component and caches the board and word extractor found by tag.
/// </summary>
private void Start()
{
    // Show the letter on the tile.
    tileText = GetComponentInChildren<TextMeshProUGUI>();
    alphabet = letterGen.ReturnLetter();
    tileText.text = alphabet.ToString();

    // Resolve scene objects by their tags.
    GameObject boardObject = GameObject.FindGameObjectWithTag("MainBoard");
    mainBoard = boardObject.GetComponent<Board>();

    GameObject extractorObject = GameObject.FindGameObjectWithTag("WordExtractor");
    wordExtractor = extractorObject.GetComponent<WordExtractor>();
}
/// <summary>
/// Checks that the text extracted from "Bug48075.doc" starts with the expected
/// Cyrillic word.
/// </summary>
public void TestFirstParagraphFix()
{
    var sampleStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Bug48075.doc");
    extractor = new WordExtractor(sampleStream);

    String extractedText = extractor.Text;
    bool startsAsExpected = extractedText.StartsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435");

    Assert.IsTrue(startsAsExpected);
}
/// <summary>
/// Smoke test: reading all three text views of "ProblemExtracting.doc" must not throw.
/// </summary>
public void TestProblemMetadata()
{
    var sampleStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("ProblemExtracting.doc");
    WordExtractor problemExtractor = new WordExtractor(sampleStream);

    // The values are not inspected; only the absence of an exception matters.
    string fullText = problemExtractor.Text;
    string[] paragraphs = problemExtractor.ParagraphText;
    string pieceText = problemExtractor.TextFromPieces;
}
/// <summary>
/// Checks that the text extracted from "rasp.doc" contains both expected
/// Cyrillic character runs.
/// </summary>
public void TestFastSaved()
{
    var sampleStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("rasp.doc");
    extractor = new WordExtractor(sampleStream);

    String extractedText = extractor.Text;

    bool hasFirstRun = extractedText.Contains("\u0425\u0425\u0425\u0425\u0425");
    Assert.IsTrue(hasFirstRun);

    bool hasSecondRun = extractedText.Contains("\u0423\u0423\u0423\u0423\u0423");
    Assert.IsTrue(hasSecondRun);
}
/// <summary>
/// A default-constructed <c>WordExtractor</c> must return an empty result when
/// extracting from an empty string.
/// </summary>
public void ShouldReturnEmptyDictionaryWhenInputIsEmpty()
{
    // Arrange
    const string emptyInput = "";
    var extractorUnderTest = new WordExtractor();

    // Act
    var extracted = extractorUnderTest.Extract(emptyInput);

    // Assert
    Assert.Empty(extracted);
}
/// <summary>
/// Every pre-processor handed to the extractor must receive a <c>Process</c> call
/// during <c>Extract</c>.
/// </summary>
public void ShouldCallPreProcessors()
{
    // Arrange
    var first = Substitute.For<ITextPreProcessor>();
    var second = Substitute.For<ITextPreProcessor>();
    var preProcessors = new[] { first, second };
    var extractorUnderTest = new WordExtractor(preProcessors);

    // Act
    var _ = extractorUnderTest.Extract("hello, world");

    // Assert
    foreach (var preProcessor in preProcessors)
    {
        preProcessor.Received().Process(Arg.Any<string>());
    }
}
/// <summary>
/// Every post-processor handed to the extractor must receive a <c>Process</c> call
/// during <c>Extract</c>.
/// </summary>
public void ShouldCallPostProcessors()
{
    // Arrange
    var first = Substitute.For<ITextPostProcessor>();
    var second = Substitute.For<ITextPostProcessor>();
    var postProcessors = new[] { first, second };
    var extractorUnderTest = new WordExtractor(Enumerable.Empty<ITextPreProcessor>(), postProcessors);

    // Act
    var _ = extractorUnderTest.Extract("hello, world");

    // Assert
    foreach (var postProcessor in postProcessors)
    {
        postProcessor.Received().Process(Arg.Any<Dictionary<string, int>>());
    }
}
/// <summary>
/// Words differing only in letter case must be counted together under their
/// lower-case form.
/// </summary>
public void ShouldContainWordsWithCorrectFrequencyIgnoringCase()
{
    // Arrange
    const string mixedCaseInput = "one TWO two three thRee ThrEE";
    var expectedCounts = new Dictionary<string, int>();
    expectedCounts.Add("one", 1);
    expectedCounts.Add("two", 2);
    expectedCounts.Add("three", 3);

    // Act
    var extracted = WordExtractor.ExtractWithoutProcessing(mixedCaseInput);

    // Assert
    Assert.Equal(expectedCounts, extracted);
}
/// <summary>
/// After sorting one of two lists obtained from <c>GetCompoundedList</c>, the
/// sorted list must not be equal to the untouched one.
/// </summary>
public void GetCompoundedList_SortedAndUnsortedLists_AreNotEqual()
{
    WordExtractor extractor = new WordExtractor();

    string filePath = $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\TextFile1.txt";
    string fileText = InputOutput.ReadFile(filePath);
    extractor.ExtractWordsFromTextFile(fileText, filePath);

    // Fetch the list twice; only the second copy is sorted.
    var untouched = extractor.GetCompoundedList();
    var sorted = extractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(sorted, 0, sorted.Count - 1);

    Assert.AreNotEqual(untouched, sorted);
}
/// <summary>
/// Special characters around words ('#', '!', '?') must not appear in the
/// extracted keys.
/// </summary>
public void ShouldNotContainSpecialCharactersInWords()
{
    // Arrange
    const string decoratedInput = "#region !not nullable?";
    var expectedCounts = new Dictionary<string, int>();
    expectedCounts.Add("region", 1);
    expectedCounts.Add("not", 1);
    expectedCounts.Add("nullable", 1);

    // Act
    var extracted = WordExtractor.ExtractWithoutProcessing(decoratedInput);

    // Assert
    Assert.Equal(expectedCounts, extracted);
}
/// <summary>
/// Each distinct word of the input must appear once in the extracted result.
/// </summary>
public void ShouldContainAllWordsFromInput()
{
    // Arrange
    const string plainInput = "one two three";
    var expectedCounts = new Dictionary<string, int>();
    expectedCounts.Add("one", 1);
    expectedCounts.Add("two", 1);
    expectedCounts.Add("three", 1);

    // Act
    var extracted = WordExtractor.ExtractWithoutProcessing(plainInput);

    // Assert
    Assert.Equal(expectedCounts, extracted);
}
/// <summary>
/// Checks that the concatenated comments text of the <c>filename6</c> sample
/// contains "TestComment".
/// </summary>
public void TestComments()
{
    HWPFDocument sampleDocument = HWPFTestDataSamples.OpenSampleFile(filename6);
    extractor = new WordExtractor(sampleDocument);

    String[] commentParts = extractor.CommentsText;

    // Join all comment fragments into one string before searching.
    StringBuilder joined = new StringBuilder();
    foreach (String part in commentParts)
    {
        joined.Append(part);
    }

    String allComments = joined.ToString();
    Assert.IsTrue(allComments.Contains("TestComment"));
}
/// <summary>
/// Opens the PDF at <paramref name="path"/>: resets the current context and UI,
/// analyzes the file and extracts its words while a progress form is shown, then
/// lists the words and loads the saved annotations.
/// </summary>
/// <param name="path">Path of the PDF file to open.</param>
private void OpenPdf(string path)
{
    ClearContextAndUi();

    using (var progressDialog = new ProgressForm())
    {
        _ctx = new EditContext(new PdfFile { Path = path });
        IReadOnlyList<IWord> extractedWords = null;

        progressDialog.ShowWhile(async () =>
        {
            // Hash the file in the background while the analysis runs.
            var hashTask = Task.Run(() => _ctx.OpenFile.ComputeMd5());
            progressDialog.Report("Extracting words...");

            // Only every 25th page is reported.
            var pageProgress = new Progress<int>(pageNumber =>
            {
                if (pageNumber % 25 == 0)
                {
                    progressDialog.Report($"Page {pageNumber} loaded.");
                }
            });

            var documentAnalyzer = new Analyzer();
            IAnalysis documentAnalysis;
            try
            {
                documentAnalysis = await documentAnalyzer.AnalyzeAsync(_ctx.OpenFile.Path, pageProgress).ConfigureAwait(true);
                await hashTask.ConfigureAwait(true);
            }
            catch (Exception ex)
            {
                var errorText = $"An error occured while opening the selected file: {ex.Message}";
                MessageBox.Show(this, errorText, "Could not open file", MessageBoxButtons.OK, MessageBoxIcon.Exclamation);
                ClearContextAndUi();
                return;
            }

            progressDialog.Report("Document loaded. Analyzing words...");
            var pdfWordExtractor = new WordExtractor();
            extractedWords = await pdfWordExtractor.ExtractAsync(documentAnalysis).ConfigureAwait(true);
        }, this);

        // NOTE(review): on the failure path above extractedWords is still null here and
        // ClearContextAndUi() has already run - confirm the calls below tolerate that.
        ListWordsInOpenDocument(extractedWords);
        _ctx.Annotations = new Dictionary<IWord, Annotation.Annotation>();
        annotationsListView.Items.Clear();
    }

    LoadSavedAnnotationsForOpenFile();
}
/// <summary>
/// Returns the plain text of a Word document. Files whose extension contains
/// "docx" are read with <c>WordprocessingDocument</c>; everything else is read as
/// a binary document through <c>HWPFDocument</c> and <c>WordExtractor</c>.
/// </summary>
/// <param name="path">Path of the document to read.</param>
/// <returns>The document text.</returns>
private static string GetWordContent(string path)
{
    // Invariant lower-casing keeps the extension check independent of the current culture.
    string fileExtension = Path.GetExtension(path).ToLowerInvariant();

    if (fileExtension.Contains("docx"))
    {
        // Dispose the package so the file handle is released even if reading throws.
        using (WordprocessingDocument docx = WordprocessingDocument.Open(path, false))
        {
            return docx.MainDocumentPart.Document.InnerText;
        }
    }

    // The file is only read, so request read access; the default (read/write) fails
    // on read-only files. The using block closes the stream on every path, including
    // when the HWPFDocument constructor throws.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        HWPFDocument doc = new HWPFDocument(fs);
        WordExtractor extractor = new WordExtractor(doc);
        return extractor.Text;
    }
}
/// <summary>
/// Checks the header/footer values reported for "HeaderFooterProblematic.doc" by
/// both <c>HeaderStories</c> and <c>WordExtractor</c>.
/// </summary>
public void TestProblemHeaderStories49936()
{
    HWPFDocument sampleDocument = HWPFTestDataSamples.OpenSampleFile("HeaderFooterProblematic.doc");

    // Raw header/footer stories.
    HeaderStories stories = new HeaderStories(sampleDocument);
    Assert.AreEqual("", stories.FirstHeader);
    Assert.AreEqual("\r", stories.EvenHeader);
    Assert.AreEqual("", stories.OddHeader);
    Assert.AreEqual("", stories.FirstFooter);
    Assert.AreEqual("", stories.EvenFooter);
    Assert.AreEqual("", stories.OddFooter);

    // The same document seen through the extractor.
    WordExtractor headerExtractor = new WordExtractor(sampleDocument);
    Assert.AreEqual("\n", headerExtractor.HeaderText);
    Assert.AreEqual("", headerExtractor.FooterText);
}
/// <summary>
/// Feeds a null path through reading, extraction, sorting and searching, and
/// expects the search for "could" to yield an empty dictionary.
/// </summary>
public void BinarySearch_LoadsNull_ReturnsEmptyDictionary()
{
    WordExtractor extractor = new WordExtractor();

    string missingPath = null;
    string loadedText = InputOutput.ReadFile(missingPath);
    extractor.ExtractWordsFromTextFile(loadedText, missingPath);

    List<Word> words = extractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(words, 0, words.Count - 1);
    var result = SearchEngine<Word>.BinarySearch(words, true, "could");

    var expected = new Dictionary<string, int>();
    Assert.AreEqual(expected, result);
}
/// <summary>
/// Pre-processors must be chained in order: the second one has to receive the
/// output of the first.
/// </summary>
public void ShouldApplyProcessorsInCorrectOrder()
{
    // Arrange: "zero" -> "one" -> "two"
    var firstStage = Substitute.For<ITextPreProcessor>();
    firstStage.Process("zero").Returns("one");

    var secondStage = Substitute.For<ITextPreProcessor>();
    secondStage.Process("one").Returns("two");

    var extractorUnderTest = new WordExtractor(new[] { firstStage, secondStage });

    // Act
    var _ = extractorUnderTest.Extract("zero");

    // Assert
    firstStage.Received(1).Process("zero");
    secondStage.Received(1).Process("one");
}
/// <summary>
/// Loads "EmptyTextFile.txt", sorts the resulting word list and expects a search
/// for the empty string to yield an empty dictionary.
/// </summary>
public void BinarySearch_LoadsEmptyTextFile_ReturnsEmptyDictionary()
{
    WordExtractor extractor = new WordExtractor();

    string emptyFilePath = $"{AppDomain.CurrentDomain.BaseDirectory}TestFiles\\EmptyTextFile.txt";
    string emptyFileText = InputOutput.ReadFile(emptyFilePath);
    extractor.ExtractWordsFromTextFile(emptyFileText, emptyFilePath);

    List<Word> words = extractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(words, 0, words.Count - 1);
    var result = SearchEngine<Word>.BinarySearch(words, true, "");

    var expected = new Dictionary<string, int>();
    Assert.AreEqual(expected, result);
}
/// <summary>
/// Test set-up: assigns the sample file names, opens the two default extractors
/// and builds the concatenated form of <c>p_text1</c>.
/// </summary>
public void SetUp()
{
    String filename = "test2.doc";
    String filename2 = "test.doc";
    filename3 = "excel_with_embeded.xls";
    filename4 = "ThreeColHeadFoot.doc";
    filename5 = "HeaderFooterUnicode.doc";
    filename6 = "footnote.doc";

    POIDataSamples docTests = POIDataSamples.GetDocumentInstance();
    extractor = new WordExtractor(docTests.OpenResourceAsStream(filename));
    extractor2 = new WordExtractor(docTests.OpenResourceAsStream(filename2));

    // Build the splat'd out text version from scratch. Appending directly to the
    // field would make it grow each time set-up runs again on the same fixture
    // instance, so the block is assembled locally and assigned once.
    var blockBuilder = new System.Text.StringBuilder();
    for (int i = 0; i < p_text1.Length; i++)
    {
        blockBuilder.Append(p_text1[i]);
    }
    p_text1_block = blockBuilder.ToString();
}
/// <summary>
/// Checks the header text, and its presence in the full text, for the
/// <c>filename4</c> (non-Unicode) and <c>filename5</c> (Unicode) samples.
/// </summary>
public void TestWithHeader()
{
    // Non-Unicode sample
    HWPFDocument sampleDocument = HWPFTestDataSamples.OpenSampleFile(filename4);
    extractor = new WordExtractor(sampleDocument);

    Assert.AreEqual("First header column!\tMid header Right header!\n", extractor.HeaderText);

    String fullText = extractor.Text;
    Assert.IsTrue(fullText.IndexOf("First header column!") >= 0);

    // Unicode sample
    sampleDocument = HWPFTestDataSamples.OpenSampleFile(filename5);
    extractor = new WordExtractor(sampleDocument);

    Assert.AreEqual("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor.HeaderText);

    fullText = extractor.Text;
    Assert.IsTrue(fullText.IndexOf("This is a simple header") >= 0);
}
/// <summary>
/// Checks the footer text, and its presence in the full text, for the
/// <c>filename4</c> (non-Unicode) and <c>filename5</c> (Unicode) samples.
/// </summary>
public void TestWithFooter()
{
    // Non-Unicode sample
    HWPFDocument sampleDocument = HWPFTestDataSamples.OpenSampleFile(filename4);
    extractor = new WordExtractor(sampleDocument);

    Assert.AreEqual("Footer Left\tFooter Middle Footer Right\n", extractor.FooterText);

    String fullText = extractor.Text;
    Assert.IsTrue(fullText.IndexOf("Footer Left") >= 0);

    // Unicode sample
    sampleDocument = HWPFTestDataSamples.OpenSampleFile(filename5);
    extractor = new WordExtractor(sampleDocument);

    Assert.AreEqual("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor.FooterText);

    fullText = extractor.Text;
    Assert.IsTrue(fullText.IndexOf("The footer, with") >= 0);
}
/// <summary>
/// Prompts for a file path on the console, reads the file and, when reading
/// succeeded, extracts its words and returns the extractor's compounded list
/// sorted with <c>SearchEngine&lt;Word&gt;.QuickSort</c>.
/// </summary>
/// <param name="wordExtractor">Extractor that receives the content of the file.</param>
/// <param name="compoundedList">Current word list; returned unchanged when the file could not be read.</param>
/// <returns>The freshly sorted compounded list, or <paramref name="compoundedList"/> on a read failure.</returns>
static List<Word> LoadFileToMain(WordExtractor wordExtractor, List<Word> compoundedList)
{
    Console.WriteLine("Enter a file from a catalogue using the following syntax: \"C\\User\\admin\\text.txt\"");
    Console.Write(">: ");
    string filePath = Console.ReadLine();

    string fileContent = InputOutput.ReadFile(filePath);

    // ReadFile reports failures through these sentinel messages instead of content.
    bool readFailed = fileContent == "Could not read file"
        || fileContent == "You don't have access, your authority level is to low";
    if (readFailed)
    {
        Console.WriteLine(fileContent);
        Console.WriteLine();
        return compoundedList;
    }

    wordExtractor.ExtractWordsFromTextFile(fileContent, filePath);
    List<Word> refreshedList = wordExtractor.GetCompoundedList();
    SearchEngine<Word>.QuickSort(refreshedList, 0, refreshedList.Count - 1);

    Console.WriteLine($"{filePath} has been loaded.");
    Console.WriteLine();
    return refreshedList;
}
/// <summary>
/// Checks that "Word6.doc" is rejected by <c>WordExtractor</c> with an
/// <c>OldWordFileFormatException</c> and can be read with <c>Word6Extractor</c>.
/// </summary>
public void TestWord6()
{
    // The default extractor must refuse this old format.
    try
    {
        var rejectedStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word6.doc");
        extractor = new WordExtractor(rejectedStream);
        Assert.Fail();
    }
    catch (OldWordFileFormatException)
    {
        // Expected.
    }

    var sampleStream = POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word6.doc");
    Word6Extractor oldFormatExtractor = new Word6Extractor(sampleStream);

    String fullText = oldFormatExtractor.Text;
    Assert.IsTrue(fullText.Contains("The quick brown fox jumps over the lazy dog"));

    String[] paragraphs = oldFormatExtractor.ParagraphText;
    Assert.AreEqual(1, paragraphs.Length);
    Assert.AreEqual("The quick brown fox jumps over the lazy dog\r\n", paragraphs[0]);
}