// Extract searchable text from a file using IFilterTextReader.
// Each line read from the document is trimmed; commas are dropped, every character that is
// not a letter, digit, apostrophe, colon or white space is replaced by a space, and runs of
// white space are collapsed into a single space. Non-empty lines are appended to the result,
// each followed by a single space (so a non-empty result ends with a trailing space).
// If IFilterTextReader fails (for example, old Office document; or unknown document type),
// the exception is logged and the text collected so far is returned: at minimum the prefix,
// or null when no prefix was given and no text had been read yet.
// Prefix is optional text to prepend to the result - such as document filename, metadata
// properties, anything else to include in search text.
private static String ExtractTextFromFile(ILogger log, String inputFile, String prefix = null)
{
    // A StringBuilder avoids re-copying the whole accumulated text for every line read.
    var collected = new System.Text.StringBuilder(prefix);

    // Tracks whether any line was appended, so that "no prefix and no text" still yields null.
    bool appendedAnyLine = false;

    try
    {
        FilterReaderOptions options = new FilterReaderOptions() { };

        using (var reader = new FilterReader(inputFile, string.Empty, options))
        {
            String line;
            while ((line = reader.ReadLine()) != null)
            {
                line = line.Trim();
                if (String.IsNullOrEmpty(line))
                {
                    continue;
                }

                // A comma followed by white space becomes a single separator...
                line = System.Text.RegularExpressions.Regex.Replace(line, @"[,]\s+", " ");
                // ...while any remaining comma (e.g. inside "1,000") is removed outright.
                line = System.Text.RegularExpressions.Regex.Replace(line, @"[,]", "");
                // Everything except letters, digits, apostrophes, colons and white space becomes a space.
                line = System.Text.RegularExpressions.Regex.Replace(line, @"[^a-zA-Z'\d\s:]", " ");
                // Collapse the white space runs produced above.
                line = System.Text.RegularExpressions.Regex.Replace(line, @"\s+", " ");

                collected.Append(line).Append(' ');
                appendedAnyLine = true;
            }
        } // end reader
    }
    catch (Exception ex)
    {
        // Use a constant message template: exception text may contain '{' / '}' which would
        // otherwise be interpreted as logging placeholders. Passing the exception itself
        // also preserves its type and stack trace for the log sink.
        log.LogError(ex, "ExtractTextFromFile: {Message}", ex.Message);
    }

    if (prefix == null && !appendedAnyLine)
    {
        return null;
    }

    return collected.ToString();
}
// Opens the given file and reads its complete content through a FilterReader, discarding the text.
// Any exception raised while creating the reader or while reading is written to the console
// (message only) and not rethrown. The file stream is opened before the try block, so a
// failure to open the file itself propagates to the caller.
// The reader (when it was created) and the stream are closed in all cases.
private static void TryReadFile(FileInfo file)
{
    FilterReader filterReader = null;
    var input = file.OpenRead();

    try
    {
        FilterReaderOptions readerOptions = new FilterReaderOptions();
        filterReader = new FilterReader(input, file.Extension, readerOptions);

        // Only the act of reading matters here; the extracted text is not used.
        filterReader.ReadToEnd();
    }
    catch (Exception readFailure)
    {
        Console.WriteLine(readFailure.Message);
    }
    finally
    {
        // Close the reader first, then the underlying stream.
        if (filterReader != null)
        {
            filterReader.Close();
        }

        if (input != null)
        {
            input.Close();
        }
    }
}
// Click handler for the select button.
// Lets the user pick a file, then reads it line by line with a FilterReader configured from
// the option controls on the form, appending every line to FilterTextBox and reporting the
// elapsed time. Every line is also appended to a temporary file, which is deleted again when
// processing ends. If anything fails, the stack trace and the result of GetInnerException
// are shown in FilterTextBox instead. Input controls are re-enabled in all cases.
private void SelectButton_Click(object sender, EventArgs e)
{
    // Create an instance of the open file dialog box. The dialog is a disposable component,
    // so it is wrapped in a using block to release it when the handler finishes.
    using (var openFileDialog1 = new OpenFileDialog
    {
        // ReSharper disable once LocalizableElement
        Filter = "Alle files (*.*)|*.*",
        FilterIndex = 1,
        Multiselect = false
    })
    {
        // Process input only if the user clicked OK.
        if (openFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        FileLabel.Text = openFileDialog1.FileName;
        FindTextButton.Enabled = true;
        TextToFindTextBox.Enabled = true;
        FindWithRegexButton.Enabled = true;
        TextToFindWithRegexTextBox.Enabled = true;

        // Remembered outside the try block so the temporary file can be removed in finally.
        string tempFileName = null;

        try
        {
            DisableInput();
            FilterTextBox.AppendText("*** Processing file '" + openFileDialog1.FileName + "' ***" + Environment.NewLine + Environment.NewLine);
            Application.DoEvents();

            var stopWatch = new Stopwatch();

            // Map the combo box selection onto the reader timeout mode; anything else means no timeout.
            var timeoutOption = FilterReaderTimeout.NoTimeout;

            switch (TimeoutOptionsComboBox.SelectedIndex)
            {
                case 0:
                    timeoutOption = FilterReaderTimeout.NoTimeout;
                    break;
                case 1:
                    timeoutOption = FilterReaderTimeout.TimeoutOnly;
                    break;
                case 2:
                    timeoutOption = FilterReaderTimeout.TimeoutWithException;
                    break;
            }

            // int.Parse throws on non-numeric input; that is reported by the catch block below.
            var options = new FilterReaderOptions()
            {
                DisableEmbeddedContent = DisableEmbeddedContentCheckBox.Checked,
                IncludeProperties = IncludePropertiesCheckBox.Checked,
                ReadIntoMemory = ReadIntoMemoryCheckBox.Checked,
                ReaderTimeout = timeoutOption,
                Timeout = int.Parse(TimeoutTextBox.Text)
            };

            using (var reader = new FilterReader(openFileDialog1.FileName, string.Empty, options))
            {
                stopWatch.Start();

                string line;
                tempFileName = Path.GetTempFileName();

                while ((line = reader.ReadLine()) != null)
                {
                    FilterTextBox.AppendText(line + Environment.NewLine);
                    // Keep the UI responsive while a large file is being processed.
                    Application.DoEvents();
                    System.IO.File.AppendAllLines(tempFileName, new[] { line });
                }

                stopWatch.Stop();
                FilterTextBox.AppendText(Environment.NewLine + "*** DONE IN " + stopWatch.Elapsed + " ***" + Environment.NewLine);
                Application.DoEvents();
            }
        }
        catch (Exception exception)
        {
            DisableInput();
            FilterTextBox.Text = exception.StackTrace + Environment.NewLine +
                                 GetInnerException(exception);
        }
        finally
        {
            // Path.GetTempFileName creates a real file on disk; remove it so that temporary
            // files do not pile up with every processed document.
            // NOTE(review): assumes nothing else reads this file - its name is never exposed here.
            if (tempFileName != null)
            {
                try
                {
                    System.IO.File.Delete(tempFileName);
                }
                catch (System.IO.IOException)
                {
                    // Best effort: a locked temp file must not prevent re-enabling the UI.
                }
                catch (UnauthorizedAccessException)
                {
                    // Best effort: see above.
                }
            }

            EnableInput();
        }
    }
}