/// <summary>
/// Entry point: creates a <c>DocumentProcessor</c>, merges documents with it,
/// and disposes the processor when the merge call returns or throws.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
 {
     using (DocumentProcessor processor = new DocumentProcessor())
     {
         processor.MergeDocuments();
     }
 }
        /// <summary>
        /// Runs test002.docx through the PassThrough action into copy.docx and asserts that
        /// the copy equals the input according to <c>CommonTestUtilities.AreZipFilesEqual</c>.
        /// The copy is deleted afterwards whether or not the assertion holds.
        /// </summary>
        public void TestPassThrough()
        {
            string sCopyFile = TESTFILE_DIR + "copy.docx";
            if (File.Exists(sCopyFile))
            {
                File.Delete(sCopyFile);
            }

            try
            {
                using (Stream str = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
                {
                    // The processor is disposable; release it once the output has been written.
                    using (DocumentProcessor dp = new DocumentProcessor(str))
                    {
                        using (dp.Output = File.Open(sCopyFile, FileMode.CreateNew))
                        {
                            dp.Process(DocumentProcessingActions.PassThrough);
                        }
                    }
                }

                // Both streams are closed at this point, so the files can be reopened for comparison.
                Assert.IsTrue(CommonTestUtilities.AreZipFilesEqual(TESTFILE_DIR + "test002.docx", sCopyFile));
            }
            finally
            {
                File.Delete(sCopyFile);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates a <c>DocumentProcessor</c> and registers its processing steps in a fixed order,
        /// ending with the trademark highlighter.
        /// </summary>
        /// <returns>The processor with all steps registered.</returns>
        static DocumentProcessor Configure()
        {
            DocumentProcessor processor = new DocumentProcessor();

            processor.AddProcess(DocumentProcesses.Spellcheck);
            processor.AddProcess(DocumentProcesses.Repaginate);

            // Registered together with a predicate that is true when the document text contains no "?".
            processor.AddProcess(
                DocumentProcesses.TransaleIntoFrench,
                (Document doc) => !doc.Text.Contains("?"));

            processor.AddProcess(DocumentProcesses.DoingSomethingElseCool);

            // Trademark highlighting is supplied by a filter instance rather than a static process.
            TrademarkFilter highlighter = new TrademarkFilter();
            highlighter.TradeMarks.Add("O'Reilly");
            highlighter.TradeMarks.Add("millennium");
            processor.AddProcess(highlighter.HighlightTrademarks);

            return processor;
        }
        /// <summary>
        /// Checks that unrecognised content types are given a <c>DefaultPartFilter</c>, and that this
        /// filter hands back the input stream for Clean, PassThrough and DiscoverAndClean but null for Discover.
        /// </summary>
        public void TestCreatePartFilterForContentType()
        {
            using (Stream str = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
            using (DocumentProcessor dp = new DocumentProcessor(str))
            {
                // IsInstanceOf reports the actual type on failure, unlike IsTrue(x is T).
                Assert.IsInstanceOf(typeof(DefaultPartFilter), dp.GetPartFilter("cheese/sandwiches", DocumentProcessingActions.Discover));

                IPartFilter pf = dp.GetPartFilter("toast/xml", DocumentProcessingActions.Discover);
                Assert.IsInstanceOf(typeof(DefaultPartFilter), pf);
                Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.Clean), "expected the default part filter pass back the input stream when cleaning");
                Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.PassThrough), "expected the default part filter pass back the input stream when passing through");
                Assert.AreEqual(str, pf.ProcessPart(str, null, null, DocumentProcessingActions.DiscoverAndClean), "expected the default part filter pass back the input stream when cleaning and discovering");
                Assert.AreEqual(null, pf.ProcessPart(str, null, null, DocumentProcessingActions.Discover), "expected the default part filter pass back null when discovering");
            }
        }
            /// <summary>
            /// Builds an <c>IDF</c> table from a set of files. Each file is read, turned into a
            /// <c>Document</c> by a <c>DocumentProcessor</c> and added to the statistics; afterwards
            /// log(docCount / referenceCount) is stored for every word in the statistics.
            /// </summary>
            /// <param name="files">Paths of the files to read (decoded with <c>Encoding.Default</c>).</param>
            /// <returns>The populated <c>IDF</c> instance.</returns>
            public static IDF fromFiles(string[] files)
            {
                DocsStatistics    docStats     = new DocsStatistics();
                DocumentProcessor docProcessor = new DocumentProcessor();

                foreach (string file in files)
                {
                    string   fileContent = File.ReadAllText(file, Encoding.Default);
                    Document doc         = docProcessor.process(fileContent);
                    docStats.addDocument(doc);
                }

                IDF idf = new IDF();

                foreach (string word in docStats.wordsCount.Keys)
                {
                    // Look the reference count up once; a null entry counts as zero references.
                    var    refEntry     = docStats.wordRefsCount[word];
                    double wordRefCount = refEntry == null ? 0 : ((int)refEntry);

                    // NOTE(review): a zero reference count divides by zero here, which presumably
                    // yields an infinite (or NaN) IDF rather than an exception - confirm this is intended.
                    double wordIdf = Math.Log(docStats.docCount / (wordRefCount));

                    idf.idf[word] = wordIdf;
                }

                return(idf);
            }
Beispiel #6
0
        /// <summary>
        /// Builds bigram statistics from the given document files and writes them to a file.
        /// A <c>NullStopWordsHandler</c> is installed first, so stop words are not removed.
        /// </summary>
        /// <param name="documentFiles">Paths of the documents to read (decoded with <c>Encoding.Default</c>).</param>
        /// <param name="bigramFilePath">Path passed to <c>BigramStatisticsModel.toFile</c>.</param>
        public void preprocessLanguageModel(string[] documentFiles, string bigramFilePath)
        {
            // No need for Stop Words Removal.
            StopWordsHandler.setInstance(new NullStopWordsHandler());

            DocumentProcessor     docProcessor = new DocumentProcessor();
            BigramStatisticsModel bigramStats  = new BigramStatisticsModel();

            foreach (string filename in documentFiles)
            {
                string   fileContent = File.ReadAllText(filename, Encoding.Default);
                Document doc         = docProcessor.process(fileContent);

                bigramStats.addDocument(doc);
            }

            bigramStats.toFile(bigramFilePath);
        }
Beispiel #7
0
        /// <summary>
        /// Runs a <c>DocumentProcessor</c> whose single document action is a fake's <c>ThrowError</c>
        /// method and asserts that the document-deleter callback was invoked.
        /// </summary>
        public async Task Deletes_document_after_error()
        {
            var wasDeleted = false;

            // Deleter stub: only records that it has been called.
            Task RecordDeletion(WaivesDocument document)
            {
                wasDeleted = true;
                return Task.CompletedTask;
            }

            // The explicit type arguments are needed to convert the ThrowError method group.
            var throwingActions = FakeDocumentAction
                .AListOfDocumentActions(1)
                .Select<FakeDocumentAction, Func<WaivesDocument, CancellationToken, Task<WaivesDocument>>>(action => action.ThrowError);

            var processor = new DocumentProcessor(
                _documentCreator,
                throwingActions,
                RecordDeletion,
                _onDocumentException);

            await processor.RunAsync(_testDocument);

            Assert.True(wasDeleted);
        }
Beispiel #8
0
        /// <summary>
        /// Entry point: enables verbose tracing to the console, binds config.json onto a
        /// <c>RootConfig</c>, runs a <c>DocumentProcessor</c> with it and then waits for a line of input.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Wyam.Common.Tracing.Trace.Level = SourceLevels.Verbose;
            Wyam.Common.Tracing.Trace.AddListener(new TextWriterTraceListener(Console.Out));

            var builder = new ConfigurationBuilder();
            builder.AddJsonFile("config.json");

            var settings      = new RootConfig();
            var configuration = builder.Build();
            configuration.Bind(settings);

            var documentProcessor = new DocumentProcessor(settings);
            documentProcessor.Process();

            // Block until a line is entered before the process exits.
            Console.ReadLine();
        }
        /// <summary>
        /// Demo entry point: processes one text document with a non-generic processor, then with a
        /// generic processor, and finally calls a generic translator directly with "EN".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            TextDocument sample = new TextDocument("Test", "blaaaaaaa");

            // Without generics
            ITextDocumentTranslator plainTranslator = new TextDocumentTranslatorwithoutGenerics();
            TextDocumentProcessor   plainProcessor  = new TextDocumentProcessor(plainTranslator);
            plainProcessor.Process(sample);

            // With generics
            DocumentProcessor <TextDocument> genericProcessor =
                new DocumentProcessor <TextDocument>(new TextDocumentTranslator());
            genericProcessor.Process(sample);

            // With more generics
            GenericDocumentTranslator <TextDocument> genericTranslator =
                new GenericDocumentTranslator <TextDocument>(new TextContentExtractor());
            genericTranslator.Translate(sample, "EN");
        }
Beispiel #10
0
        /// <summary>
        /// Interactive loop: reads one line at a time from the console, feeds it to the interpreter
        /// and prints the resulting namespaces. A <c>SyntaxException</c> is shown in red with a marker
        /// under the reported position. The loop ends on "exit" (any casing) or when input is closed.
        /// </summary>
        static void InteractiveConsoleInput()
        {
            Console.WriteLine("Type 'exit' to exit.");
            Console.WriteLine("Type any code and press ENTER to parse it.");
            Console.WriteLine();

            var charReader  = new VarStringCharReader();
            var tokenizer   = new Tokenizer(charReader);
            var interpreter = new DocumentProcessor(tokenizer, LoadBuiltinTypes(), new InstructionInterpreterFactory());

            while (true)
            {
                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.White;

                string line = Console.ReadLine();

                // ReadLine returns null when no more input is available (closed or redirected
                // stdin); treat that like "exit" instead of failing on a null reference.
                // The ordinal comparison keeps "EXIT" working regardless of the current culture.
                if (line == null || string.Equals(line, "exit", StringComparison.OrdinalIgnoreCase))
                {
                    return;
                }
                charReader.SetString(line);

                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.Gray;

                try
                {
                    var namespaces = interpreter.Read();
                    OutputNamespaces(namespaces);
                    Console.WriteLine();
                }
                catch (SyntaxException ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;

                    // Draw a dashed line ending in '^' under the column the exception reports.
                    Console.WriteLine("^".PadLeft((int)ex.Position + 1, '-'));
                    Console.WriteLine(ex);
                }
            }
        }
Beispiel #11
0
        /// <summary>
        /// Validates an uploaded Word document and returns a JSON result containing either an error
        /// message or the validation output followed by basic statistics, with line breaks as &lt;br /&gt;.
        /// </summary>
        /// <param name="file">The uploaded file; must be non-empty and have a .doc or .docx extension.</param>
        /// <returns>JSON with <c>status</c> ("error" or "success") and <c>message</c>.</returns>
        public ActionResult ValidateDocument(HttpPostedFileBase file)
        {
            if (file == null || file.ContentLength == 0)
            {
                return(Json(new { status = "error", message = "A file is required." }));
            }

            // Compare the extension case-insensitively so that e.g. "REPORT.DOCX" is accepted.
            string extension      = Path.GetExtension(file.FileName);
            bool   isWordDocument = string.Equals(extension, ".doc", StringComparison.OrdinalIgnoreCase) ||
                                    string.Equals(extension, ".docx", StringComparison.OrdinalIgnoreCase);

            if (!isWordDocument)
            {
                return(Json(new { status = "error", message = "Only Microsoft Office Word documents are supported." }));
            }

            using (MemoryStream ms = new MemoryStream())
            {
                file.InputStream.CopyTo(ms);

                // CopyTo leaves the stream positioned at its end; rewind before each reader so
                // neither helper depends on where the previous step left the position.
                if (ms.CanSeek)
                {
                    ms.Position = 0;
                }
                string results = DocumentProcessor.ValidateWordDocument(ms);
                results = results.Replace(Environment.NewLine, "<br />");

                // CanSeek is false on a closed stream, so this is skipped if the helper closed it.
                if (ms.CanSeek)
                {
                    ms.Position = 0;
                }
                string stats = DocumentProcessor.BasicStats(ms);
                stats = stats.Replace("\n", "<br />");

                return(Json(new { status = "success", message = "<strong>Validated!</strong> " + results + "<br />" + stats }));
            }
        }
Beispiel #12
0
        /// <summary>
        /// Entry point: registers the "value-type" and "api" converters, which share one
        /// <c>DocumentProcessor</c>, and executes the program with the command-line arguments.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to <c>Program.Execute</c>.</param>
        public static void Main(string[] args)
        {
            var fileSystem        = new Tools.XmlToMdConverter.FileSystem.FileSystem();
            var configuration     = new Configuration();
            var documentProcessor = new DocumentProcessor(configuration);

            // Converters keyed by the name they are registered under.
            var converters = new Dictionary <string, ConverterInformation>
            {
                {
                    "value-type",
                    new ConverterInformation(
                        new ValueTypeXml2MdConverter(documentProcessor),
                        "Value Type'ların xml dökümanlarını Markdown formatına çevirip belirtilen klasör altına export eder.")
                },
                {
                    "api",
                    new ConverterInformation(
                        new ApiXml2MdConverter(documentProcessor),
                        "API'ların xml dökümanlarını Markdown formatına çevirip belirtilen klasör altına export eder.")
                }
            };

            var program = new Program(fileSystem, converters, documentProcessor, configuration);
            program.Execute(args);
        }
Beispiel #13
0
 /// <summary>
 /// Creates the manager and stores the document processor it was given.
 /// </summary>
 /// <param name="documentProcessor">Processor assigned to <c>m_documentProcessor</c>.</param>
 public XmlMetadataProcessingManager(DocumentProcessor documentProcessor)
 {
     this.m_documentProcessor = documentProcessor;
 }
 /// <summary>
 /// Creates the fixture with a new <c>LinkedListService</c> and a new <c>DocumentProcessor</c>.
 /// </summary>
 public AddTwoListsTests()
 {
     this._linkedListService = new LinkedListService();
     this._documentProcessor = new DocumentProcessor();
 }
 /// <summary>
 /// Creates the form: runs <c>InitializeComponent</c> and then stores the given document processor.
 /// </summary>
 /// <param name="documentProcessor">Processor assigned to the <c>documentProcessor</c> field.</param>
 public SearchAndReplaceForm(DocumentProcessor documentProcessor)
 {
     InitializeComponent();
     this.documentProcessor = documentProcessor;
 }
Beispiel #16
0
 /// <summary>
 /// Entry point: constructs a <c>DocumentProcessor</c> and does nothing further with it.
 /// </summary>
 /// <param name="args">Command-line arguments (not used).</param>
 private static void Main(string[] args)
 {
     DocumentProcessor processor = new DocumentProcessor();
 }
Beispiel #17
0
        /// <summary>
        /// Downloads a blob and prepares it for indexing. For .pdf the embedded text is combined with
        /// OCR text of the extracted images; for .docx and .xlsx the text comes from <c>OfficeHelper</c>.
        /// The collected text is written into a new document that is sent to the indexing container.
        /// Blobs with any other extension, and blobs whose processing raises an
        /// <c>ApplicationException</c>, are sent on as they are.
        /// </summary>
        /// <param name="blob">Blob to download and process.</param>
        /// <param name="entityId">Forwarded to <c>MarkAndSendDocumentAsync</c>.</param>
        /// <param name="indexingContainer">Forwarded to <c>MarkAndSendDocumentAsync</c>.</param>
        /// <param name="log">Trace writer receiving progress, warnings and errors.</param>
        /// <returns>
        /// A result whose status is Success, Warning (sent on unprocessed after an
        /// <c>ApplicationException</c>) or Failure (any other exception raised inside the processing).
        /// </returns>
        private static async Task <ProcessingResult> ProcessInputBlobForScanningAsync(
            CloudBlockBlob blob,
            string entityId,
            string indexingContainer,
            TraceWriter log)
        {
            ProcessingResult result            = new ProcessingResult();
            string           processedBlobName = Path.GetFileNameWithoutExtension(blob.Name);

            try
            {
                // Fetch the attributes asynchronously rather than blocking inside an async method.
                await blob.FetchAttributesAsync();

                // NOTE(review): storageHelper is never used below; kept in case its constructor matters - confirm.
                var storageHelper = new StorageHelper();
                var extension     = Path.GetExtension(blob.Name).ToLowerInvariant();
                var builder       = new StringBuilder();

                using (var stream = new MemoryStream())
                {
                    await blob.DownloadToStreamAsync(stream);

                    try
                    {
                        // currently only process certain types of documents
                        // if not a processable type of document, we just pass through to indexing location
                        switch (extension)
                        {
                        case ".pdf":

                            var bytes = stream.ToArray();

                            // if there is any text within document add to builder
                            DocumentProcessor.GetTextFromPdf(bytes, out builder);

                            // extract all images within document that are greater than 50x50 pixels
                            List <Stream> images = DocumentProcessor.ExtractImagesFromPDF(bytes, log);

                            if (images.Count > 0)
                            {
                                try
                                {
                                    int imageCounter = 0;

                                    foreach (Stream img in images)
                                    {
                                        imageCounter++;

                                        try
                                        {
                                            builder.Append(" " + DocumentProcessor.ScanImageToString(img));
                                            log.Info($"OCR completed successfully for pdf image #{imageCounter}");

                                            // Azure Vision service has a cap on images processed per second
                                            // let's slow it down
                                            await Task.Delay(1000);
                                        }
                                        catch (ArgumentException aex)
                                        {
                                            // stream isn't a valid image; move on to the next one
                                            log.Warning($"Failed to open image #{imageCounter} of {images.Count} for {blob.Name}. Error:{aex.Message}");
                                        }
                                        catch (Exception ex)
                                        {
                                            log.Warning($"Failed to OCR scan pfd image #{imageCounter} of {images.Count} for {blob.Name}. Error:{ex.Message}");

                                            // Vision API can throw ClientException, grab inner exception for details
                                            var clientException = ex.InnerException as ClientException;
                                            if (clientException != null)
                                            {
                                                log.Warning($"InnerException Details: Message={clientException.Error.Message}");
                                            }
                                        }
                                    }
                                }
                                finally
                                {
                                    // The extracted image streams are not needed after scanning; release them
                                    // even when the loop is left early by an exception.
                                    foreach (Stream extracted in images)
                                    {
                                        extracted?.Dispose();
                                    }
                                }
                            }

                            break;

                        case ".docx":

                            builder.Append(OfficeHelper.GetAllTextFromWordDoc(stream, log));
                            break;

                        case ".xlsx":

                            builder.Append(OfficeHelper.GetAllTextFromExcelDoc(stream, log));
                            break;

                        default:

                            // document is not a proccessable document type.  just send through for indexing
                            result.Status           = ProcessingStatus.Success;
                            result.DocumentLocation = await MarkAndSendDocumentAsync(
                                entityId,
                                blob,
                                indexingContainer,
                                processedBlobName,
                                log);

                            return(result);
                        }

                        // An empty result is reported through the ApplicationException handler below,
                        // which sends the original blob on unchanged.
                        if (builder.Length == 0)
                        {
                            throw new ApplicationException("Text could not be extracted from Document.  Can't create empty document");
                        }

                        // we always create a new pdf doc for indexing with all existing text merged with image text
                        using (var textStream = await DocumentProcessor.CreateTextDocumentAsync(builder.ToString()))
                        {
                            log.Info($"Indexable document created successfully!");

                            result.Status           = ProcessingStatus.Success;
                            result.DocumentLocation = await MarkAndSendDocumentAsync(
                                entityId,
                                textStream,
                                indexingContainer,
                                processedBlobName,
                                log);

                            return(result);
                        }
                    }
                    catch (ApplicationException aex)
                    {
                        var errorMsg = $"Document failed to get processed.  Passing document along to indexing location";
                        log.Warning(errorMsg);

                        // something went wrong processing document, just send through to get indexed
                        result.Status           = ProcessingStatus.Warning;
                        result.Message          = $"{errorMsg}. Error:{aex.Message}";
                        result.DocumentLocation = await MarkAndSendDocumentAsync(
                            entityId,
                            blob,
                            indexingContainer,
                            processedBlobName,
                            log);

                        return(result);
                    }
                }
            }
            catch (Exception ex)
            {
                result.Status           = ProcessingStatus.Failure;
                result.DocumentLocation = null;
                result.Message          = $"Failed to process document {blob.Name} due to the following error: {ex.Message}{Environment.NewLine}{ex.StackTrace}";
                log.Error(result.Message);
                return(result);
            }
        }
Beispiel #18
0
        /// <summary>
        /// Console entry point. First asks whether to read all emails or only new ones, then
        /// enumerates the mail folder found by the email handler, then asks which report type to
        /// parse (only DAIR, option 1, is handled). An exception raised during the mail and parsing
        /// steps has its message appended to a log file under Logs\year\month in the current directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            EmailHandler      emailHandler = new EmailHandler();
            DocumentProcessor d            = new DocumentProcessor();

            Console.WriteLine("Starting ... \n");

            Console.WriteLine("If this is your first time running this program, you must process all emails (option 1).");
            Console.WriteLine("\nPlease type in an option and press ENTER to proceed.");
            Console.WriteLine("0: Exit.");
            Console.WriteLine("1: Read all emails and download and sort all attachments.");
            Console.WriteLine("2: Read new emails only, and download and sort all attachments.");
            bool readAll = true;
            int  option;

            while (true)
            {
                string input = Console.ReadLine();
                if (input == null)
                {
                    // Input was closed; without this check the prompt would repeat forever.
                    return;
                }

                if (int.TryParse(input, out option))
                {
                    if (option == 0)
                    {
                        Environment.Exit(0);
                    }
                    else if (option == 1)
                    {
                        break;
                    }
                    else if (option == 2)
                    {
                        Console.WriteLine("Digging through your mailbox ...");
                        readAll = false;
                        break;
                    }
                    else
                    {
                        // This menu only offers options 0 to 2.
                        Console.WriteLine("Invalid number. Please pick a number from 0 to 2.");
                    }
                }
                else
                {
                    Console.WriteLine("Please enter a number.");
                }
            }

            // Take the clock once so year, month and timestamp always agree.
            DateTime now         = DateTime.Now;
            string   currentPath = Directory.GetCurrentDirectory();

            // A fixed, culture-independent timestamp keeps ':' and '/' out of the file name;
            // the default DateTime.ToString() may contain both, depending on the current culture.
            string timestamp    = now.ToString("yyyy-MM-dd HH mm ss", System.Globalization.CultureInfo.InvariantCulture);
            string logDirectory = Path.Combine(
                currentPath,
                "Logs",
                now.Year.ToString(System.Globalization.CultureInfo.InvariantCulture),
                now.Month.ToString(System.Globalization.CultureInfo.InvariantCulture));
            string logPath      = Path.Combine(logDirectory, timestamp + ".txt");

            Directory.CreateDirectory(logDirectory);

            try
            {
                MAPIFolder reportsFolder = emailHandler.findMailFolder();

                if (reportsFolder == null)
                {
                    MessageBox.Show("Could not find transport email directory.");
                }
                else
                {
                    Console.WriteLine("Checking for unprocessed emails ..\n");
                    emailHandler.EnumerateFolders(reportsFolder, readAll);
                }

                Console.WriteLine("\nPlease enter the type of file to parse and store in the database. Ensure that all reports are closed.");
                Console.WriteLine("NOTE: Only DAIR's (1) are available for parsing. Storing in the database is under development.");
                Console.WriteLine("\nPlease type in an option and press ENTER to proceed.");
                Console.WriteLine("0: Exit");
                Console.WriteLine("1: DAIR");
                Console.WriteLine("2: Journal");
                Console.WriteLine("3: Image");
                Console.WriteLine("4: DATMR/MATMR");
                Console.WriteLine("5: Daily Report");
                Console.WriteLine("6: SNOWIZ");
                Console.WriteLine("7: Timesheet");
                Console.WriteLine("8: Vehicle Insepection");
                int x;
                while (true)
                {
                    string input = Console.ReadLine();
                    if (input == null)
                    {
                        // Input was closed; stop prompting.
                        return;
                    }

                    if (int.TryParse(input, out x))
                    {
                        if (x == 0)
                        {
                            Environment.Exit(0);
                        }
                        else if (x == 1)
                        {
                            string DAIRPath = emailHandler.DAIRPath;
                            Console.WriteLine("Parsing DAIR's ...\n");
                            d.DAIRparser(DAIRPath);
                            break;
                        }
                        else if (x < 0 || x > 8)
                        {
                            // Negative numbers are outside the menu too, not merely "unavailable".
                            Console.WriteLine("Invalid number. Please pick a number from 0 to 8.");
                        }
                        else
                        {
                            Console.WriteLine("Currently unavailable.");
                        }
                    }
                    else
                    {
                        Console.WriteLine("Please enter a number.");
                    }
                }
            }
            catch (System.Exception e)
            {
                // Only the exception message is recorded.
                Logger logger = new Logger(logPath);
                logger.Append(e.Message);
            }
        }
        /// <summary>
        /// Passes test002.docx through the processor into copy.docx, reopens the copy as an
        /// <c>OPCPackage</c> and asserts that no content type name starts with "file://" or "http://".
        /// The copy is deleted at the end in every case.
        /// </summary>
        public void TestNoContentTypeForExternalRels()
        {
            string sCopyFile = TESTFILE_DIR + "copy.docx";
            if (File.Exists(sCopyFile))
            {
                File.Delete(sCopyFile);
            }

            try
            {
                // Stacked usings: output, processor and input are released in that order.
                using (Stream input = File.Open(TESTFILE_DIR + "test002.docx", FileMode.Open))
                using (DocumentProcessor processor = new DocumentProcessor(input))
                using (processor.Output = File.Open(sCopyFile, FileMode.CreateNew))
                {
                    processor.Process(DocumentProcessingActions.PassThrough);
                }

                using (OPCPackage copyPackage = new OPCPackage(File.Open(sCopyFile, FileMode.Open)))
                {
                    List<ContentTypeInfo> contentTypes = copyPackage.GetContentTypes();
                    foreach (ContentTypeInfo contentType in contentTypes)
                    {
                        Assert.IsFalse(contentType.Name.StartsWith("file://"));
                        Assert.IsFalse(contentType.Name.StartsWith("http://"));
                    }
                }
            }
            finally
            {
                File.Delete(sCopyFile);
            }
        }