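/*
         * Run the whole conversion pipeline for one S3 object:
         * download the PDF, convert it to docx, filter the docx, extract the text,
         * save the metadata as JSON and upload the JSON file.
         */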

        /// <summary>
        /// Runs the conversion pipeline for a single S3 object: downloads the PDF,
        /// converts it to docx, filters the docx, extracts the text, loads the text
        /// into <paramref name="holder"/>, saves the holder as JSON and uploads the
        /// JSON file to the docx bucket under the same key prefix as the source object.
        /// </summary>
        /// <param name="s3Client">Client used to download the source PDF.</param>
        /// <param name="pdfBucketName">Bucket that holds the source PDF.</param>
        /// <param name="s3DocxClient">Client used to upload the resulting JSON file.</param>
        /// <param name="docxBucketName">Bucket that receives the JSON file.</param>
        /// <param name="licensePath">Passed through to <c>ConvertPDFDocument</c>.</param>
        /// <param name="outputDir">Local directory for the downloaded PDF and the converted docx.</param>
        /// <param name="extractWorkDir">Work directory handed to <c>DocxTagFilter</c>.</param>
        /// <param name="tempDocDir">Local directory for the filtered docx, the text file and the JSON file.</param>
        /// <param name="objectKey">S3 key of the PDF; '/' separates the prefix from the file name.</param>
        /// <param name="holder">Metadata holder that receives the extracted text and is saved as JSON.</param>
        /// <param name="textFileS3Path">
        /// Set to the key of the uploaded JSON object on success, otherwise <see cref="String.Empty"/>.
        /// </param>
        /// <returns><c>true</c> when every step succeeded; <c>false</c> as soon as one step fails.</returns>
        public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName,
                                            IAmazonS3 s3DocxClient, string docxBucketName,
                                            string licensePath,
                                            string outputDir, string extractWorkDir, string tempDocDir,
                                            string objectKey, MetaDataHolder holder,
                                            out string textFileS3Path)
        {
            textFileS3Path = String.Empty;  // stays empty unless the upload succeeds

            const char delimiter = '/';

            if (String.IsNullOrEmpty(objectKey))
            {
                logger.Error("Error while downloading: object key is empty");
                return false;
            }

            // Split the key into "prefix/" (may be empty) and the trailing file name.
            int    lastDelimiter = objectKey.LastIndexOf(delimiter);
            string keyPrefix     = objectKey.Substring(0, lastDelimiter + 1);
            string fileName      = objectKey.Substring(lastDelimiter + 1);

            if (fileName.Length == 0)
            {
                // A key ending in '/' has no file name, so there is nothing to download into.
                logger.Error("Error while downloading: object key has no file name: " + objectKey);
                return false;
            }

            // Path.Combine works whether or not outputDir carries a trailing separator.
            string pdfFilePath = Path.Combine(outputDir, fileName);

            logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);

            // Measures the whole cycle (download through upload).
            Stopwatch swAll = Stopwatch.StartNew();

            bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);

            if (!ok)
            {
                logger.Error("Error while downloading");
                return false;
            }

            // Size of the downloaded PDF, used only for the throughput log line below.
            long length = new FileInfo(pdfFilePath).Length;

            // The converted docx sits next to the PDF, same name with a .docx extension.
            string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");

            Stopwatch sw = Stopwatch.StartNew();
            ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
            sw.Stop();
            if (!ok)
            {
                logger.Error("Error while converting");
                DeleteFile(pdfFilePath);
                return false;
            }

            // Floating-point division: a zero elapsed time yields Infinity, not an exception.
            double conversionRate = (double)length / sw.Elapsed.TotalSeconds;

            logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds + " sec, rate:" + conversionRate + " b/s");

            // Filter out the parts of the docx that are not needed.
            // (Original author's note: this is not the best place to have it.)
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
            filter.SetupDefaultTags();

            logger.Debug("Starting Filtering");

            string newDocxFile = Path.ChangeExtension(Path.Combine(tempDocDir, fileName), ".docx");

            ok = filter.ApplyFilter(docxPath, newDocxFile, false);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> { docxPath, pdfFilePath });
                logger.Error("Error while filtering docx");
                return false;
            }

            // Convert the filtered docx to plain text.
            logger.Debug("Starting extraction of the text");
            string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
            DocxToText docxToText   = new DocxToText();

            ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile });
                logger.Error("Error while Extracting text");
                return false;
            }

            // Load the extracted text into the metadata holder so it ends up in the JSON file.
            ok = holder.LoadContentFromFile(textFileName);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, textFileName });
                logger.Error("Error while loading content from text file");
                return false;
            }

            string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

            // NOTE(review): any result of SaveAsJSON is ignored here, as in the original code;
            // confirm whether it can report failure.
            holder.SaveAsJSON(jsonFileName);

            // The JSON file (not the docx) is uploaded, under the same prefix as the source key.
            // Reusing keyPrefix avoids a spurious leading '/' when the key has no prefix.
            string jsonObjectName = keyPrefix + Path.GetFileName(jsonFileName);

            logger.Debug("uploading " + jsonFileName + " --> " + jsonObjectName);

            ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
            if (!ok)
            {
                // Clean up like every other failure branch so failed uploads do not leak temp files.
                DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });
                logger.Error("Error while uploading");
                return false;
            }

            textFileS3Path = jsonObjectName;
            swAll.Stop();

            logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");

            // All good: remove every local intermediate file.
            DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });

            return true;
        }
        // Example #2
        /// <summary>
        /// Batch job: lists the docx files in the S3 bucket and, for each file (up to a fixed
        /// maximum), downloads it, filters it, extracts its text, looks up its database id,
        /// builds the metadata holder and saves it as a JSON file next to the filtered docx.
        /// Files that already have a JSON file locally, or that are on the skip list, are skipped.
        /// </summary>
        /// <remarks>
        /// Terminates the process with exit code 1 when the file-id maps cannot be loaded or
        /// the DynamoDB connection cannot be established.
        /// </remarks>
        private static void ProcessFiles()
        {
            RegionEndpoint bucketRegion = RegionEndpoint.USWest2;

            string bucketName           = "sumup-test-mm";
            string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
            string extractWorkDir       = @"C:\radnidio\japan-ocr-files\work";
            string tempDocDir           = @"C:\radnidio\japan-ocr-files\tempdoc";

            // NOTE(review): database credentials are hard-coded in source; move them to
            // configuration or a secret store and rotate the password.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
            string languageMapFile    = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
            string threeMapFile       = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
            string twoMapFile         = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
            string nonMapFile         = @"C:\temp\non-mapped-document-categories.txt";
            string pdfBucketName      = "sua-liveboard";

            string docxBucketName = "sumup-docx-outbound";

            // Set up the various objects needed.
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            filter.SetupDefaultTags();

            FileToIdMapCollector collector = new FileToIdMapCollector();

            collector.connectionString = dbConnectionString;
            bool isOk;

            isOk = collector.LoadLists();
            if (!isOk)
            {
                logger.Error("Can not collect file id maps");
                // Non-zero exit code: this is a failure, consistent with the DynamoDB check below.
                System.Environment.Exit(1);
            }

            MetaDataHolderFactory.connectionString = dbConnectionString;
            MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // The region is needed as text, e.g. "us-west-2".
            // NOTE(review): USWest1 is used here while the S3 client and DynamoDB use USWest2 — confirm.
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            MetaDataHolderFactory.GetConnection();

            IAmazonS3 client = new AmazonS3Client(bucketRegion);

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

            isOk = bankDataProcessing.Connect();
            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }

            try
            {
                // Files that must never be processed.
                HashSet<string> skipList = new HashSet<string>
                {
                    "1eb5f50c344634929709f81ac09593b365f0120e.docx"
                };

                logger.Info("Started working ");

                // Fills FilesToProcessInS3; blocks until the listing is complete.
                ListingObjectsAsync(bucketName, client).Wait();

                int ic      = 0;
                int maxFile = 3000;

                foreach (string s3file in FilesToProcessInS3)
                {
                    // Skipped files count towards the limit too.
                    ic++;
                    if (ic > maxFile)
                    {
                        break;
                    }
                    Console.WriteLine("Processing: {0}", s3file);
                    if (skipList.Contains(s3file))
                    {
                        logger.Warn("file is skip list, skipping");
                        continue;
                    }

                    string docxPath     = Path.Combine(localDirForDocxFiles, s3file);
                    string newDocxFile  = Path.Combine(tempDocDir, s3file);
                    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

                    logger.Info("Local file: {0}", docxPath);
                    // An existing JSON file means this file was already processed.
                    if (File.Exists(jsonFileName))
                    {
                        logger.Info("Json file already exist, skipping");
                        continue;
                    }

                    // Download the S3 file unless a local copy already exists.
                    if (!File.Exists(docxPath))
                    {
                        isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
                        if (!isOk)
                        {
                            // A download failure aborts the whole run, not just this file.
                            logger.Error("file not downloaded {0}", s3file);
                            break;
                        }
                    }
                    else
                    {
                        logger.Info("file aready downloaded");
                    }

                    // Filter out what is not needed in the docx.
                    isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
                    if (!isOk)
                    {
                        logger.Error("Error while filtering docx");
                        break;
                    }

                    // Convert the filtered docx to plain text.
                    logger.Debug("Starting extraction of the text");
                    string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
                    DocxToText docxToText   = new DocxToText();
                    isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while Extracting text");
                        break;
                    }

                    // Collect the metadata for this file.
                    int id = collector.GetId(s3file);
                    if (id == FileToIdMapCollector.MISSING_ID)
                    {
                        logger.Warn("id not found: {0}", s3file);
                        continue;
                    }
                    logger.Info("ID: {0}", id);

                    List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
                    if (mhlist == null || mhlist.Count == 0)
                    {
                        // Guard the [0] access: one id without metadata must not crash the batch.
                        logger.Error("No metadata found in database for id {0}", id);
                        continue;
                    }

                    MetaDataHolder holder = mhlist[0];
                    isOk = holder.LoadContentFromFile(textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while loading content from text file {0}", textFileName);
                        continue;
                    }

                    holder.SaveAsJSON(jsonFileName);

                    // Informational only: report whether DynamoDB already knows this id.
                    isOk = bankDataProcessing.IsIdPresent(id);
                    if (isOk)
                    {
                        logger.Info("id in dynamo db");
                    }
                    else
                    {
                        logger.Info("id NOT in dynamo db");
                    }
                }
            }
            finally
            {
                // Release both connections even when processing throws.
                MetaDataHolderFactory.CloseConnection();
                bankDataProcessing.Disconnect();
            }

            logger.Info("Done");
        }