/// <summary>
/// Entry point: creates a <c>DocumentProcessor</c>, merges documents, and
/// disposes the processor when done.
/// </summary>
static void Main(string[] args)
{
    using (var processor = new DocumentProcessor())
    {
        processor.MergeDocuments();
    }
}
/// <summary>
/// Runs test002.docx through the processor with the PassThrough action and
/// asserts that the copy written to disk is equal to the input as a zip file.
/// </summary>
public void TestPassThrough()
{
    string sCopyFile = TESTFILE_DIR + "copy.docx";

    // Clear any leftover from a previous run; FileMode.CreateNew below fails if the file exists.
    if (File.Exists(sCopyFile))
    {
        File.Delete(sCopyFile);
    }

    try
    {
        using (Stream str = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
        // The processor is disposable (the external-rels test in this file wraps it
        // in a using block), so release it here as well.
        using (DocumentProcessor dp = new DocumentProcessor(str))
        using (dp.Output = File.Open(sCopyFile, FileMode.CreateNew))
        {
            dp.Process(DocumentProcessingActions.PassThrough);
        }

        // Compare only after all streams are closed so the copy is fully written.
        Assert.IsTrue(CommonTestUtilities.AreZipFilesEqual(TESTFILE_DIR + "test002.docx", sCopyFile));
    }
    finally
    {
        File.Delete(sCopyFile);
    }
}
/// <summary>
/// Builds a <c>DocumentProcessor</c> and registers its processes in order:
/// spellcheck, repaginate, a French translation guarded by a predicate,
/// one further process, and trademark highlighting.
/// </summary>
/// <returns>The configured processor.</returns>
static DocumentProcessor Configure()
{
    DocumentProcessor processor = new DocumentProcessor();

    processor.AddProcess(DocumentProcesses.Spellcheck);
    processor.AddProcess(DocumentProcesses.Repaginate);

    // The predicate is true only for documents whose text contains no question mark.
    processor.AddProcess(
        DocumentProcesses.TransaleIntoFrench,
        (Document doc) => !doc.Text.Contains("?"));

    processor.AddProcess(DocumentProcesses.DoingSomethingElseCool);

    TrademarkFilter filter = new TrademarkFilter();
    filter.TradeMarks.Add("O'Reilly");
    filter.TradeMarks.Add("millennium");
    processor.AddProcess(filter.HighlightTrademarks);

    return processor;
}
/// <summary>
/// Asserts that unknown content types resolve to a <c>DefaultPartFilter</c>, and
/// that this filter returns the input stream for Clean, PassThrough and
/// DiscoverAndClean but returns null for Discover.
/// </summary>
public void TestCreatePartFilterForContentType()
{
    using (Stream str = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
    // The processor is disposable (another test in this file wraps it in a using
    // block), so release it here as well.
    using (DocumentProcessor dp = new DocumentProcessor(str))
    {
        Assert.IsTrue(dp.GetPartFilter("cheese/sandwiches", DocumentProcessingActions.Discover) is DefaultPartFilter);

        IPartFilter pf = dp.GetPartFilter("toast/xml", DocumentProcessingActions.Discover);
        Assert.IsInstanceOf(typeof(DefaultPartFilter), pf);

        Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.Clean), "expected the default part filter pass back the input stream when cleaning");
        Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.PassThrough), "expected the default part filter pass back the input stream when passing through");
        Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.DiscoverAndClean), "expected the default part filter pass back the input stream when cleaning and discovering");
        Assert.AreEqual(null, pf.ProcessPart(str, null, null, DocumentProcessingActions.Discover), "expected the default part filter pass back null when discovering");
    }
}
/// <summary>
/// Reads every file, feeds the processed documents into a <c>DocsStatistics</c>
/// instance, and derives one IDF value per word as
/// <c>Math.Log(docCount / wordRefCount)</c>.
/// </summary>
/// <param name="files">Paths of the files to read (decoded with <c>Encoding.Default</c>).</param>
/// <returns>An <c>IDF</c> whose <c>idf</c> table holds one entry per key of <c>wordsCount</c>.</returns>
public static IDF fromFiles(string[] files)
{
    DocsStatistics docStats = new DocsStatistics();
    DocumentProcessor docProcessor = new DocumentProcessor();

    foreach (string file in files)
    {
        string fileContent = File.ReadAllText(file, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        docStats.addDocument(doc);
    }

    IDF idf = new IDF();
    foreach (string word in docStats.wordsCount.Keys)
    {
        // Look the entry up once; a missing entry counts as zero references.
        var refs = docStats.wordRefsCount[word];
        double wordRefCount = refs == null ? 0 : ((int)refs);

        // NOTE(review): a zero reference count makes this a division by zero, so the
        // stored value is then non-finite — confirm callers tolerate that.
        double wordIdf = Math.Log(docStats.docCount / wordRefCount);
        idf.idf[word] = wordIdf;
    }

    return idf;
}
/// <summary>
/// Processes every document file, accumulates the results in a
/// <c>BigramStatisticsModel</c>, and writes that model to a file.
/// </summary>
/// <param name="documentFiles">Paths of the files to read (decoded with <c>Encoding.Default</c>).</param>
/// <param name="bigramFilePath">Path passed to <c>BigramStatisticsModel.toFile</c>.</param>
public void preprocessLanguageModel(string[] documentFiles, string bigramFilePath)
{
    // No need for stop-word removal here: install the null handler before processing.
    StopWordsHandler.setInstance(new NullStopWordsHandler());

    DocumentProcessor docProcessor = new DocumentProcessor();
    BigramStatisticsModel bigramStats = new BigramStatisticsModel();

    foreach (string filename in documentFiles)
    {
        string fileContent = File.ReadAllText(filename, Encoding.Default);
        Document doc = docProcessor.process(fileContent);
        bigramStats.addDocument(doc);
    }

    bigramStats.toFile(bigramFilePath);
}
/// <summary>
/// Runs the processor with a single document action that uses
/// <c>ThrowError</c> and asserts that the document-deleter callback was invoked.
/// </summary>
public async Task Deletes_document_after_error()
{
    var deleterWasCalled = false;

    // Stand-in deleter: only records that it was invoked.
    Task RecordDeletion(WaivesDocument document)
    {
        deleterWasCalled = true;
        return Task.CompletedTask;
    }

    var actions = FakeDocumentAction.AListOfDocumentActions(1);
    var throwingActions = actions
        .Select<FakeDocumentAction, Func<WaivesDocument, CancellationToken, Task<WaivesDocument>>>(
            action => action.ThrowError);

    var processor = new DocumentProcessor(
        _documentCreator,
        throwingActions,
        RecordDeletion,
        _onDocumentException);

    await processor.RunAsync(_testDocument);

    Assert.True(deleterWasCalled);
}
/// <summary>
/// Entry point: enables verbose tracing to the console, binds config.json to a
/// <c>RootConfig</c>, runs the document processor, then waits for a key line.
/// </summary>
static void Main(string[] args)
{
    // Send verbose trace output to standard out.
    Wyam.Common.Tracing.Trace.Level = SourceLevels.Verbose;
    Wyam.Common.Tracing.Trace.AddListener(new TextWriterTraceListener(Console.Out));

    var builder = new ConfigurationBuilder();
    builder.AddJsonFile("config.json");

    var rootConfig = new RootConfig();
    builder.Build().Bind(rootConfig);

    new DocumentProcessor(rootConfig).Process();

    // Keep the console window open until the user presses ENTER.
    Console.ReadLine();
}
/// <summary>
/// Entry point: processes the same text document three ways — with the
/// non-generic processor, with the generic processor, and by calling a
/// generic translator directly.
/// </summary>
static void Main(string[] args)
{
    TextDocument document = new TextDocument("Test", "blaaaaaaa");

    // Without generics.
    ITextDocumentTranslator plainTranslator = new TextDocumentTranslatorwithoutGenerics();
    TextDocumentProcessor plainProcessor = new TextDocumentProcessor(plainTranslator);
    plainProcessor.Process(document);

    // With generics.
    var genericProcessor = new DocumentProcessor<TextDocument>(new TextDocumentTranslator());
    genericProcessor.Process(document);

    // With more generics: the translator itself is generic over the document type.
    var contentExtractor = new TextContentExtractor();
    var genericTranslator = new GenericDocumentTranslator<TextDocument>(contentExtractor);
    genericTranslator.Translate(document, "EN");
}
/// <summary>
/// Interactive loop: reads one line of code at a time from the console, parses
/// it, and prints the resulting namespaces. A <c>SyntaxException</c> is printed
/// in red with a marker under the failing position. The loop ends when the
/// user types "exit" (any casing) or when the input ends.
/// </summary>
static void InteractiveConsoleInput()
{
    Console.WriteLine("Type 'exit' to exit.");
    Console.WriteLine("Type any code and press ENTER to parse it.");
    Console.WriteLine();

    var charReader = new VarStringCharReader();
    var tokenizer = new Tokenizer(charReader);
    var interpreter = new DocumentProcessor(tokenizer, LoadBuiltinTypes(), new InstructionInterpreterFactory());

    while (true)
    {
        Console.WriteLine();
        Console.ForegroundColor = ConsoleColor.White;
        string line = Console.ReadLine();

        // Console.ReadLine returns null once the input stream is exhausted
        // (e.g. redirected stdin); treat that like "exit" instead of crashing.
        // The ordinal comparison keeps "EXIT" working regardless of the current culture.
        if (line == null || string.Equals(line, "exit", StringComparison.OrdinalIgnoreCase))
        {
            return;
        }

        charReader.SetString(line);
        Console.WriteLine();
        Console.ForegroundColor = ConsoleColor.Gray;

        try
        {
            var namespaces = interpreter.Read();
            OutputNamespaces(namespaces);
            Console.WriteLine();
        }
        catch (SyntaxException ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;

            // Draw "-----^" so the caret lands in the column given by ex.Position.
            Console.WriteLine("^".PadLeft((int)ex.Position + 1, '-'));
            Console.WriteLine(ex);
        }
    }
}
/// <summary>
/// Validates an uploaded Word document and returns a JSON result containing
/// the validation output followed by basic statistics.
/// </summary>
/// <param name="file">The uploaded file; must be non-empty and have a .doc or .docx extension.</param>
/// <returns>
/// JSON with <c>status</c> "error" and a message when the upload is missing,
/// empty, or has another extension; otherwise <c>status</c> "success" with an
/// HTML message.
/// </returns>
public ActionResult ValidateDocument(HttpPostedFileBase file)
{
    if (file == null || file.ContentLength == 0)
    {
        return Json(new { status = "error", message = "A file is required." });
    }

    // Compare case-insensitively so uploads named e.g. "REPORT.DOCX" are accepted.
    string extension = Path.GetExtension(file.FileName);
    bool isWordDocument =
        string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase);
    if (!isWordDocument)
    {
        return Json(new { status = "error", message = "Only Microsoft Office Word documents are supported." });
    }

    using (MemoryStream ms = new MemoryStream())
    {
        file.InputStream.CopyTo(ms);

        // CopyTo leaves the stream positioned at its end; rewind so the
        // processor sees the document from the first byte.
        ms.Position = 0;
        string results = DocumentProcessor.ValidateWordDocument(ms);
        results = results.Replace(Environment.NewLine, "<br />");

        // Rewind again for the second pass. CanSeek is false once a stream has
        // been closed, so this does not throw if the validator closed it.
        if (ms.CanSeek)
        {
            ms.Position = 0;
        }

        string stats = DocumentProcessor.BasicStats(ms);
        stats = stats.Replace("\n", "<br />");

        // NOTE(review): results and stats are concatenated into HTML without
        // encoding — confirm they cannot carry markup taken from the upload.
        return Json(new { status = "success", message = "<strong>Validated!</strong> " + results + "<br />" + stats });
    }
}
/// <summary>
/// Entry point: wires up the file system, configuration, document processor
/// and the "value-type" and "api" converters, then executes the program with
/// the command-line arguments.
/// </summary>
public static void Main(string[] args)
{
    var fileSystem = new Tools.XmlToMdConverter.FileSystem.FileSystem();
    var configuration = new Configuration();
    var documentProcessor = new DocumentProcessor(configuration);

    // Converters keyed by name; each entry carries its own description text.
    var converters = new Dictionary<string, ConverterInformation>
    {
        {
            "value-type",
            new ConverterInformation(
                new ValueTypeXml2MdConverter(documentProcessor),
                "Value Type'ların xml dökümanlarını Markdown formatına çevirip belirtilen klasör altına export eder.")
        },
        {
            "api",
            new ConverterInformation(
                new ApiXml2MdConverter(documentProcessor),
                "API'ların xml dökümanlarını Markdown formatına çevirip belirtilen klasör altına export eder.")
        }
    };

    var program = new Program(fileSystem, converters, documentProcessor, configuration);
    program.Execute(args);
}
/// <summary>
/// Creates the manager and stores the given document processor.
/// </summary>
/// <param name="documentProcessor">Processor kept in <c>m_documentProcessor</c>.</param>
public XmlMetadataProcessingManager(DocumentProcessor documentProcessor)
{
    this.m_documentProcessor = documentProcessor;
}
/// <summary>
/// Test fixture setup: creates a fresh linked-list service and document processor.
/// </summary>
public AddTwoListsTests()
{
    this._linkedListService = new LinkedListService();
    this._documentProcessor = new DocumentProcessor();
}
/// <summary>
/// Creates the form, initializes its components, and stores the document
/// processor it was given.
/// </summary>
/// <param name="documentProcessor">Processor kept in the <c>documentProcessor</c> field.</param>
public SearchAndReplaceForm(DocumentProcessor documentProcessor)
{
    // Components are initialized before the field is assigned, as in the original order.
    InitializeComponent();

    this.documentProcessor = documentProcessor;
}
/// <summary>
/// Entry point: constructs a <c>DocumentProcessor</c> and does nothing else with it.
/// </summary>
private static void Main(string[] args)
{
    // The instance is not used afterwards; only the construction itself runs.
    DocumentProcessor processor = new DocumentProcessor();
}
/// <summary>
/// Downloads a blob, extracts its text (PDF text plus OCR of embedded images,
/// or Word/Excel text), writes the text into a new document, and sends that
/// document on for indexing. Blobs with any other extension, and blobs whose
/// processing raises an <c>ApplicationException</c>, are sent on unchanged.
/// </summary>
/// <param name="blob">The input blob to process.</param>
/// <param name="entityId">Passed through to <c>MarkAndSendDocumentAsync</c>.</param>
/// <param name="indexingContainer">Passed through to <c>MarkAndSendDocumentAsync</c>.</param>
/// <param name="log">Trace writer used for progress, warnings and errors.</param>
/// <returns>
/// A <c>ProcessingResult</c> with status Success, Warning (processing failed and
/// the original blob was forwarded) or Failure (any other exception; the
/// document location is null and the message carries the error and stack trace).
/// </returns>
private async static Task<ProcessingResult> ProcessInputBlobForScanningAsync(
    CloudBlockBlob blob,
    string entityId,
    string indexingContainer,
    TraceWriter log)
{
    ProcessingResult result = new ProcessingResult();
    string processedBlobName = Path.GetFileNameWithoutExtension(blob.Name);

    try
    {
        // Await the attribute fetch instead of blocking a thread inside an async method.
        await blob.FetchAttributesAsync();

        // Invariant lower-casing keeps the extension switch independent of the current culture.
        var extension = Path.GetExtension(blob.Name).ToLowerInvariant();
        var builder = new StringBuilder();

        using (var stream = new MemoryStream())
        {
            await blob.DownloadToStreamAsync(stream);

            try
            {
                // Currently only certain document types are processed; anything
                // else is passed through to the indexing location untouched.
                switch (extension)
                {
                    case ".pdf":
                        var bytes = stream.ToArray();

                        // If there is any text within the document, it replaces the builder.
                        DocumentProcessor.GetTextFromPdf(bytes, out builder);

                        // Extract all images within the document that are greater than 50x50 pixels.
                        // NOTE(review): the returned streams are not disposed here — confirm who owns them.
                        List<Stream> images = DocumentProcessor.ExtractImagesFromPDF(bytes, log);
                        int imageCounter = 0;
                        foreach (Stream img in images)
                        {
                            imageCounter++;
                            try
                            {
                                builder.Append(" " + DocumentProcessor.ScanImageToString(img));
                                log.Info($"OCR completed successfully for pdf image #{imageCounter}");

                                // Azure Vision service has a cap on images processed per second;
                                // slow down between images.
                                await Task.Delay(1000);
                            }
                            catch (ArgumentException aex)
                            {
                                // The stream isn't a valid image; skip it.
                                log.Warning($"Failed to open image #{imageCounter} of {images.Count} for {blob.Name}. Error:{aex.Message}");
                            }
                            catch (Exception ex)
                            {
                                log.Warning($"Failed to OCR scan pfd image #{imageCounter} of {images.Count} for {blob.Name}. Error:{ex.Message}");

                                // The Vision API can throw ClientException; the inner exception has the details.
                                var clientException = ex.InnerException as ClientException;
                                if (clientException != null)
                                {
                                    log.Warning($"InnerException Details: Message={clientException.Error.Message}");
                                }
                            }
                        }

                        break;

                    case ".docx":
                        builder.Append(OfficeHelper.GetAllTextFromWordDoc(stream, log));
                        break;

                    case ".xlsx":
                        builder.Append(OfficeHelper.GetAllTextFromExcelDoc(stream, log));
                        break;

                    default:
                        // Not a processable document type: just send it through for indexing.
                        result.Status = ProcessingStatus.Success;
                        result.DocumentLocation = await MarkAndSendDocumentAsync(
                            entityId, blob, indexingContainer, processedBlobName, log);
                        return result;
                }

                if (builder.Length == 0)
                {
                    // Handled by the ApplicationException catch below, which forwards the original blob.
                    throw new ApplicationException("Text could not be extracted from Document. Can't create empty document");
                }

                // Always create a new document for indexing with all existing text merged with image text.
                using (var textStream = await DocumentProcessor.CreateTextDocumentAsync(builder.ToString()))
                {
                    log.Info("Indexable document created successfully!");
                    result.Status = ProcessingStatus.Success;
                    result.DocumentLocation = await MarkAndSendDocumentAsync(
                        entityId, textStream, indexingContainer, processedBlobName, log);
                    return result;
                }
            }
            catch (ApplicationException aex)
            {
                var errorMsg = "Document failed to get processed. Passing document along to indexing location";
                log.Warning(errorMsg);

                // Something went wrong while processing; send the original blob through to get indexed.
                result.Status = ProcessingStatus.Warning;
                result.Message = $"{errorMsg}. Error:{aex.Message}";
                result.DocumentLocation = await MarkAndSendDocumentAsync(
                    entityId, blob, indexingContainer, processedBlobName, log);
                return result;
            }
        }
    }
    catch (Exception ex)
    {
        result.Status = ProcessingStatus.Failure;
        result.DocumentLocation = null;
        result.Message = $"Failed to process document {blob.Name} due to the following error: {ex.Message}{Environment.NewLine}{ex.StackTrace}";
        log.Error(result.Message);
        return result;
    }
}
/// <summary>
/// Console entry point. First asks whether to read all emails or only new
/// ones and processes the mail folder accordingly; then asks which report type
/// to parse (only DAIR is available). Any exception raised during mail
/// handling or parsing has its message appended to a timestamped log file.
/// </summary>
static void Main(string[] args)
{
    EmailHandler emailHandler = new EmailHandler();
    DocumentProcessor d = new DocumentProcessor();

    Console.WriteLine("Starting ... \n");
    Console.WriteLine("If this is your first time running this program, you must process all emails (option 1).");
    Console.WriteLine("\nPlease type in an option and press ENTER to proceed.");
    Console.WriteLine("0: Exit.");
    Console.WriteLine("1: Read all emails and download and sort all attachments.");
    Console.WriteLine("2: Read new emails only, and download and sort all attachments.");

    bool readAll = true;
    int option;
    while (true)
    {
        if (int.TryParse(Console.ReadLine(), out option))
        {
            if (option == 0)
            {
                Environment.Exit(0);
            }
            else if (option == 1)
            {
                break;
            }
            else if (option == 2)
            {
                Console.WriteLine("Digging through your mailbox ...");
                readAll = false;
                break;
            }
            else
            {
                // This menu only offers options 0 to 2.
                Console.WriteLine("Invalid number. Please pick a number from 0 to 2.");
            }
        }
        else
        {
            Console.WriteLine("Please enter a number.");
        }
    }

    // Take one timestamp so the directory and the file name agree, and format it
    // explicitly: the culture-default format can contain '/' which is a path separator.
    DateTime startedAt = DateTime.Now;
    string currentPath = Directory.GetCurrentDirectory();
    string logDirectory = Path.Combine(
        currentPath, "Logs", startedAt.Year.ToString(), startedAt.Month.ToString());
    string logFileName = startedAt.ToString(
        "yyyy-MM-dd HH mm ss", System.Globalization.CultureInfo.InvariantCulture) + ".txt";
    string logPath = Path.Combine(logDirectory, logFileName);
    Directory.CreateDirectory(logDirectory);

    try
    {
        MAPIFolder reportsFolder = emailHandler.findMailFolder();
        if (reportsFolder == null)
        {
            MessageBox.Show("Could not find transport email directory.");
        }
        else
        {
            Console.WriteLine("Checking for unprocessed emails ..\n");
            emailHandler.EnumerateFolders(reportsFolder, readAll);
        }

        Console.WriteLine("\nPlease enter the type of file to parse and store in the database. Ensure that all reports are closed.");
        Console.WriteLine("NOTE: Only DAIR's (1) are available for parsing. Storing in the database is under development.");
        Console.WriteLine("\nPlease type in an option and press ENTER to proceed.");
        Console.WriteLine("0: Exit");
        Console.WriteLine("1: DAIR");
        Console.WriteLine("2: Journal");
        Console.WriteLine("3: Image");
        Console.WriteLine("4: DATMR/MATMR");
        Console.WriteLine("5: Daily Report");
        Console.WriteLine("6: SNOWIZ");
        Console.WriteLine("7: Timesheet");
        Console.WriteLine("8: Vehicle Insepection");

        int x;
        while (true)
        {
            if (int.TryParse(Console.ReadLine(), out x))
            {
                if (x == 0)
                {
                    Environment.Exit(0);
                }
                else if (x == 1)
                {
                    string DAIRPath = emailHandler.DAIRPath;
                    Console.WriteLine("Parsing DAIR's ...\n");
                    d.DAIRparser(DAIRPath);
                    break;
                }
                else if (x < 0 || x > 8)
                {
                    // Negative numbers are outside the menu too, not merely "unavailable".
                    Console.WriteLine("Invalid number. Please pick a number from 0 to 8.");
                }
                else
                {
                    Console.WriteLine("Currently unavailable.");
                }
            }
            else
            {
                Console.WriteLine("Please enter a number.");
            }
        }
    }
    catch (System.Exception e)
    {
        // Only the message is recorded in the log file.
        Logger logger = new Logger(logPath);
        logger.Append(e.Message);
    }
}
/// <summary>
/// Passes test002.docx through the processor, then opens the copy as an OPC
/// package and asserts that no content-type name starts with "file://" or
/// "http://".
/// </summary>
public void TestNoContentTypeForExternalRels()
{
    string sCopyFile = TESTFILE_DIR + "copy.docx";

    // Remove any leftover copy; FileMode.CreateNew below fails if the file exists.
    if (File.Exists(sCopyFile))
    {
        File.Delete(sCopyFile);
    }

    try
    {
        using (Stream input = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
        using (DocumentProcessor processor = new DocumentProcessor(input))
        using (processor.Output = File.Open(sCopyFile, FileMode.CreateNew))
        {
            processor.Process(DocumentProcessingActions.PassThrough);
        }

        using (OPCPackage package = new OPCPackage(File.Open(sCopyFile, FileMode.Open)))
        {
            List<ContentTypeInfo> contentTypes = package.GetContentTypes();
            foreach (ContentTypeInfo contentType in contentTypes)
            {
                Assert.IsFalse(contentType.Name.StartsWith("file://"));
                Assert.IsFalse(contentType.Name.StartsWith("http://"));
            }
        }
    }
    finally
    {
        File.Delete(sCopyFile);
    }
}