/*
         * do all the work
         */

        /// <summary>
        /// Runs the complete conversion cycle for a single S3 object: downloads the PDF, converts it
        /// to DOCX, filters the DOCX, extracts its text, loads that text into
        /// <paramref name="holder"/>, saves the holder as a JSON file and uploads that JSON file.
        /// Local intermediate files are deleted on success and on every failure that happens
        /// after the download step has succeeded.
        /// </summary>
        /// <param name="s3Client">Client used to download the source PDF.</param>
        /// <param name="pdfBucketName">Bucket the source PDF is downloaded from.</param>
        /// <param name="s3DocxClient">Client used to upload the resulting JSON file.</param>
        /// <param name="docxBucketName">Bucket the resulting JSON file is uploaded to.</param>
        /// <param name="licensePath">Passed through to <c>ConvertPDFDocument</c>.</param>
        /// <param name="outputDir">Local directory that receives the downloaded PDF and the converted DOCX.</param>
        /// <param name="extractWorkDir">Passed to the <c>DocxTagFilter</c> constructor.</param>
        /// <param name="tempDocDir">Local directory that receives the filtered DOCX, the text file and the JSON file.</param>
        /// <param name="objectKey">S3 key of the PDF; its segments are separated by '/'.</param>
        /// <param name="holder">Metadata holder that loads the extracted text and is saved as JSON.</param>
        /// <param name="textFileS3Path">
        /// Receives the object name of the uploaded JSON file on success; an empty string otherwise.
        /// </param>
        /// <returns><c>true</c> when every step succeeded; <c>false</c> as soon as one step fails.</returns>
        public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName,
                                            IAmazonS3 s3DocxClient, string docxBucketName,
                                            string licensePath,
                                            string outputDir, string extractWorkDir, string tempDocDir,
                                            string objectKey, MetaDataHolder holder,
                                            out string textFileS3Path)
        {
            textFileS3Path = String.Empty;  // this is output file location

            if (String.IsNullOrEmpty(objectKey))
            {
                logger.Error("Error while downloading: object key is empty");
                return(false);
            }

            const char delimiter = '/';

            // Split the key into "prefix/" (possibly empty) and the trailing file name.
            // For a key without any delimiter LastIndexOf returns -1, so the prefix is empty
            // and the uploaded object name does not get a spurious leading '/'.
            int    lastDelimiter = objectKey.LastIndexOf(delimiter);
            string keyPrefix     = objectKey.Substring(0, lastDelimiter + 1);
            string fileName      = objectKey.Substring(lastDelimiter + 1);

            if (fileName.Length == 0)
            {
                logger.Error("Error while downloading: object key has no file name: " + objectKey);
                return(false);
            }

            // Path.Combine inserts the separator only when it is missing
            string pdfFilePath = Path.Combine(outputDir, fileName);

            logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);

            Stopwatch swAll = Stopwatch.StartNew();

            bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);

            if (!ok)
            {
                logger.Error("Error while downloading");
                return(false);
            }
            // get file length
            long length = new System.IO.FileInfo(pdfFilePath).Length;

            // construct the name of the doc file
            string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");

            // time the conversion alone
            Stopwatch sw = Stopwatch.StartNew();

            ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
            sw.Stop();
            if (!ok)
            {
                logger.Error("Error while converting");
                DeleteFile(pdfFilePath);
                return(false);
            }
            double conversionRate = (double)length / sw.Elapsed.TotalSeconds;

            logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds + " sec, rate:" + conversionRate + " b/s");

            // Local files produced so far. The list grows after each successful step so that
            // every exit path removes exactly the files that step sequence has created.
            List <string> localFiles = new List <string> {
                docxPath, pdfFilePath
            };

            // now filter parts of the doc that are not needed
            // this is not the best place to have it
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            // set default tags
            filter.SetupDefaultTags();

            // construct the location of final output file
            logger.Debug("Starting Filtering");

            string newDocxFile = Path.ChangeExtension(Path.Combine(tempDocDir, fileName), ".docx");

            ok = filter.ApplyFilter(docxPath, newDocxFile, false);
            if (!ok)
            {
                DeleteListOfFiles(localFiles);
                logger.Error("Error while filtering docx");
                return(false);
            }
            localFiles.Add(newDocxFile);

            // one more step convert docx to txt
            logger.Debug("Starting extraction of the text");
            string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
            DocxToText docxToText   = new DocxToText();

            ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
            if (!ok)
            {
                DeleteListOfFiles(localFiles);
                logger.Error("Error while Extracting text");
                return(false);
            }
            localFiles.Add(textFileName);

            /* now we have text file and we will need json file and we need to
             * collect data from database
             */

            ok = holder.LoadContentFromFile(textFileName);
            if (!ok)
            {
                DeleteListOfFiles(localFiles);
                logger.Error("Error while loading content from text file");
                return(false);
            }

            // now save json file
            string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

            holder.SaveAsJSON(jsonFileName);
            localFiles.Add(jsonFileName);

            // construct output object name
            // we are now uploading json file not docx !!!
            string jsonObjectName = keyPrefix + Path.GetFileName(jsonFileName);

            logger.Debug("uploading " + jsonFileName + " --> " + jsonObjectName);

            ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
            if (!ok)
            {
                logger.Error("Error while uploading");
                // Do not leave intermediate files behind; only files that really exist are
                // handed over because the JSON file may be the reason the upload failed.
                DeleteListOfFiles(localFiles.FindAll(File.Exists));
                return(false);
            }
            textFileS3Path = jsonObjectName;
            swAll.Stop();

            logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");

            // all good, delete files
            DeleteListOfFiles(localFiles);

            return(true);
        }
        /// <summary>
        /// Worker entry point. Reads its settings from <c>Config</c>, then loops: obtains a batch of
        /// (id, file url) tuples through <c>WaitForInputMessage</c>, runs
        /// <c>DoFileConversion</c> for each tuple, records successful conversions through
        /// <c>BankDataProcessingDynamoDbDAO.Insert</c> and sends a confirmation message to the
        /// confirmation queue. The loop ends when a tuple with id 0 is received or when no
        /// confirmation has been sent for more than <c>Config.maxIdleTime</c> minutes.
        /// </summary>
        /// <param name="args">Command line arguments; not used.</param>
        static void Main(string[] args)
        {
            // input parameters, read from config file App.config

            string pdfBucketName         = Config.pdfBucketName;
            string docxBucketName        = Config.docxBucketName;
            string outputDir             = Config.outputDir;
            string licensePath           = Config.licensePath;
            string listenerQueueName     = Config.listenerQueueName;
            string confirmationQueueName = Config.confirmationQueueName;
            string extractWorkDir        = Config.extractWorkDir;
            string tempDocDir            = Config.tempDocDir;
            int    sleepTimeMillis       = Config.sleepTimeMillis;

            int maxIdleTime = Config.maxIdleTime;

            Config.printAllParams();

            // start logging
            logger.Info("Starting program");

            // try to get instance id
            string instanceId = "NOT_ON_AMAZON";

            try
            {
                instanceId = Amazon.Util.EC2InstanceMetadata.InstanceId.ToString();
                logger.Info("Instance id:" + instanceId);
            }
            catch (Exception)
            {
                // best effort: the default instance id is kept
                logger.Info("Not on EC2 instance");
            }

            //  clear working directory

            RemoveFilesAndSubDirectories(tempDocDir);

            // initialize various objects that we need all the time
            RegionEndpoint bucketRegion = RegionEndpoint.USWest1;
            IAmazonS3      s3Client     = new AmazonS3Client(bucketRegion);

            RegionEndpoint docxBucketRegion = RegionEndpoint.USWest2;
            IAmazonS3      s3DocxClient     = new AmazonS3Client(docxBucketRegion);

            MetaDataHolderFactory.connectionString = Config.DbConnectionString;
            MetaDataHolderFactory.loadMaps(Config.languageMapFile, Config.threeMapFile, Config.twoMapFile, Config.nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // text is needed like us-west-2
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

            char delimiter = '|';

            // open queues
            // connect to sending queue
            AmazonSQSConfig sqsConfig = new AmazonSQSConfig();

            // this is needed as well
            sqsConfig.RegionEndpoint = Amazon.RegionEndpoint.USWest2;
            AmazonSQSClient sqsClient = new AmazonSQSClient(sqsConfig);

            ReceiveMessageRequest recRequest = new ReceiveMessageRequest();

            recRequest.QueueUrl            = listenerQueueName;
            recRequest.MaxNumberOfMessages = 1;

            // running number of documents handed to the conversion, used as log prefix
            int counter = 0;
            // doWork stays true until a tuple with id = 0 is detected
            bool doWork = true;

            // time of the last confirmation message sent;
            // if nothing happens for some time program will exit.
            // UTC is used so that the idle interval is not distorted by DST or local clock shifts.
            DateTime lastMessageTime = DateTime.UtcNow;

            while (doWork)
            {
                TimeSpan idleTime = DateTime.UtcNow.Subtract(lastMessageTime);
                if (idleTime.TotalMinutes > maxIdleTime)
                {
                    logger.Info("Exiting, no message within last " + maxIdleTime.ToString() + " minutes");
                    break;
                }
                List <Tuple <int, string> > tuples;

                ConversionResponseMessage conversionResponseMessage;
                string requestMessageId;

                WaitForInputMessage(listenerQueueName, delimiter, sqsClient, recRequest, out requestMessageId, out tuples);
                // now process documents one after another
                int processedCounter = 0, totalCounter = 0;

                conversionResponseMessage            = new ConversionResponseMessage(requestMessageId);
                conversionResponseMessage.InstanceId = instanceId;
                if (tuples == null)
                {
                    logger.Debug("Sleeping");
                    System.Threading.Thread.Sleep(sleepTimeMillis);
                    continue;
                }

                MetaDataHolderFactory.GetConnection();
                bool ok = bankDataProcessing.Connect();
                if (!ok)
                {
                    logger.Error("Error in connecting to dynamo db: ");
                    System.Environment.Exit(1);
                }

                foreach (Tuple <int, string> tup in tuples)
                {
                    int    id      = tup.Item1;
                    string fileUrl = tup.Item2;
                    if (id == 0)
                    {
                        // stop marker, not a document: it must not be counted as a failure
                        doWork = false;
                        break;
                    }
                    totalCounter++;
                    logger.Info(counter + " processing id: " + id + " " + fileUrl);
                    counter++;

                    // now do the processing of database data for id
                    List <MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });

                    if (mhlist == null || mhlist.Count == 0)
                    {
                        // no metadata for this id: report it instead of crashing the whole worker
                        logger.Error("No metadata found for id: " + id.ToString());
                        conversionResponseMessage.AddIdAndFileUrlThatIsNotProcessed(id, fileUrl);
                        continue;
                    }

                    // it is always just one meta data holder for now so we can easily extract it
                    // and pass it to the processing routine
                    MetaDataHolder holder = mhlist[0];
                    string         textFileS3Path;
                    ok = DoFileConversion(s3Client, pdfBucketName, s3DocxClient, docxBucketName,
                                          licensePath, outputDir, extractWorkDir, tempDocDir, fileUrl, holder, out textFileS3Path);
                    if (!ok)
                    {
                        logger.Error("Error in processing id: " + id.ToString());
                        conversionResponseMessage.AddIdAndFileUrlThatIsNotProcessed(id, fileUrl);
                        continue;
                    }

                    bankDataProcessing.Insert(id, holder.Bank, holder.Language, fileUrl, textFileS3Path);
                    processedCounter++;
                }

                MetaDataHolderFactory.CloseConnection();
                bankDataProcessing.Disconnect();

                // processing done see how successful and report to the controller
                int badFiles = totalCounter - processedCounter;
                if (badFiles > 0)
                {
                    logger.Info("Not all files are processed succesfully, failures:" + badFiles.ToString());
                }

                SendMessageRequest request = new SendMessageRequest();
                request.MessageBody = conversionResponseMessage.GetMessageBody();
                request.QueueUrl    = confirmationQueueName;
                SendMessageResponse confirmationResponse = sqsClient.SendMessage(request);
                if (confirmationResponse.HttpStatusCode == System.Net.HttpStatusCode.OK)
                {
                    logger.Debug("Confirmation message sent ");
                    // remember when last message with results is sent
                    lastMessageTime = DateTime.UtcNow;
                }
                else
                {
                    logger.Error("Problem sending confirmation message");
                }
            }

            System.Environment.Exit(0);
        }
        // Example #3
        // 0
        /// <summary>
        /// Batch routine with hard-coded settings. After <c>ListingObjectsAsync</c> has completed it
        /// walks <c>FilesToProcessInS3</c> (at most 3000 entries) and, for each file that is not on
        /// the skip list and has no JSON file yet: downloads the DOCX when it is not present
        /// locally, filters it, extracts its text, looks up the document id, loads the text into
        /// the metadata holder, saves the holder as JSON and logs whether the id is present in
        /// DynamoDB. A failed download, filter or text extraction stops the whole run; a missing
        /// id, missing metadata or a failed content load only skips the current file.
        /// </summary>
        private static void ProcessFiles()
        {
            RegionEndpoint bucketRegion = RegionEndpoint.USWest2;

            string bucketName           = "sumup-test-mm";
            string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
            string extractWorkDir       = @"C:\radnidio\japan-ocr-files\work";
            string tempDocDir           = @"C:\radnidio\japan-ocr-files\tempdoc";

            // NOTE(review): this literal embeds a database user name and password in source code.
            // It should be moved to configuration or a secret store and the credential rotated.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
            string languageMapFile    = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
            string threeMapFile       = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
            string twoMapFile         = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
            string nonMapFile         = @"C:\temp\non-mapped-document-categories.txt";
            string pdfBucketName      = "sua-liveboard";

            string docxBucketName = "sumup-docx-outbound";
            // setup various objects needed

            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            // set default tags
            filter.SetupDefaultTags();

            FileToIdMapCollector collector = new FileToIdMapCollector();

            collector.connectionString = dbConnectionString;
            bool isOk;

            isOk = collector.LoadLists();
            if (!isOk)
            {
                logger.Error("Can not collect file id maps");
                // failure: exit with a non-zero code, like the DynamoDB connection failure below
                System.Environment.Exit(1);
            }


            MetaDataHolderFactory.connectionString = dbConnectionString;
            MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // text is needed like us-west-2
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            MetaDataHolderFactory.GetConnection();

            IAmazonS3 client = new AmazonS3Client(bucketRegion);

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

            isOk = bankDataProcessing.Connect();
            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }

            // Both connections are open from here on; the finally block releases them even
            // when listing or processing throws.
            try
            {
                // skip list
                List <string> skipList = new List <string>();

                skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");
                logger.Info("Started working ");

                ListingObjectsAsync(bucketName, client).Wait();

                int ic      = 0;
                int maxFile = 3000;

                foreach (string s3file in FilesToProcessInS3)
                {
                    ic++;
                    if (ic > maxFile)
                    {
                        break;
                    }
                    Console.WriteLine("Processing: {0}", s3file);
                    if (skipList.Contains(s3file))
                    {
                        logger.Warn("file is skip list, skipping");
                        continue;
                    }

                    string docxPath     = Path.Combine(localDirForDocxFiles, s3file);
                    string newDocxFile  = Path.Combine(tempDocDir, s3file);
                    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

                    logger.Info("Local file: {0}", docxPath);
                    // check do we have json file already, if so skip
                    if (File.Exists(jsonFileName))
                    {
                        logger.Info("Json file already exist, skipping");
                        continue;
                    }

                    // first download s3 file to local dir
                    // do not load if file already exist ( better to put this in the method for some other time )
                    if (!File.Exists(docxPath))
                    {
                        isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
                        if (!isOk)
                        {
                            logger.Error("file not downloaded {0}", s3file);
                            break;
                        }
                    }
                    else
                    {
                        logger.Info("file aready downloaded");
                    }

                    // now filter out what is not needed in docx
                    isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
                    if (!isOk)
                    {
                        logger.Error("Error while filtering docx");
                        break;
                    }
                    // convert docx to txt
                    logger.Debug("Starting extraction of the text");
                    string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
                    DocxToText docxToText   = new DocxToText();
                    isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while Extracting text");
                        break;
                    }

                    // now collect metadata

                    int id = collector.GetId(s3file);
                    if (id == FileToIdMapCollector.MISSING_ID)
                    {
                        logger.Warn("id not found: {0}", s3file);
                        continue;
                    }
                    logger.Info("ID: {0}", id);

                    List <MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
                    if (mhlist == null || mhlist.Count == 0)
                    {
                        // nothing to index into: skip this file instead of aborting the run
                        logger.Error("No metadata found for id {0}", id);
                        continue;
                    }

                    MetaDataHolder holder = mhlist[0];
                    isOk = holder.LoadContentFromFile(textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while loading content from text file {0}", textFileName);
                        continue;
                    }
                    // now save json file

                    holder.SaveAsJSON(jsonFileName);

                    isOk = bankDataProcessing.IsIdPresent(id);
                    if (isOk)
                    {
                        logger.Info("id in dynamo db");
                    }
                    else
                    {
                        logger.Info("id NOT in dynamo db");
                    }
                }
            }
            finally
            {
                MetaDataHolderFactory.CloseConnection();
                bankDataProcessing.Disconnect();
            }
            logger.Info("Done");
        }