/// <summary>
/// Batch job: pulls .docx files from S3, filters out unwanted tags, extracts the
/// plain text, attaches database metadata to each document and saves it as a
/// JSON file, then checks whether the record id already exists in DynamoDB.
/// Exits the process if the file-id map or DynamoDB connection cannot be set up.
/// Stops the whole run on download/filter/extract failures; skips a single file
/// on missing id or unreadable content.
/// </summary>
private static void ProcessFiles()
{
    RegionEndpoint bucketRegion = RegionEndpoint.USWest2;
    string bucketName = "sumup-test-mm";
    string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
    string extractWorkDir = @"C:\radnidio\japan-ocr-files\work";
    string tempDocDir = @"C:\radnidio\japan-ocr-files\tempdoc";
    // SECURITY: database credentials are hard-coded in source; move them to a
    // configuration file / secrets store and rotate the exposed password.
    string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
    string languageMapFile = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
    string threeMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
    string twoMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
    string nonMapFile = @"C:\temp\non-mapped-document-categories.txt";
    string pdfBucketName = "sua-liveboard";
    string docxBucketName = "sumup-docx-outbound";

    // setup various objects neesed
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    // set default tags
    filter.SetupDefaultTags();

    FileToIdMapCollector collector = new FileToIdMapCollector();
    collector.connectionString = dbConnectionString;
    bool isOk = collector.LoadLists();
    if (!isOk)
    {
        logger.Error("Can not collect file id maps");
        // BUGFIX: was Exit(0), which reports success to the caller on a fatal
        // setup error; use a non-zero code like the DynamoDB failure path below.
        System.Environment.Exit(1);
    }

    MetaDataHolderFactory.connectionString = dbConnectionString;
    MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
    MetaDataHolderFactory.S3bucket = pdfBucketName;
    // text is needed like us-west-2
    // NOTE(review): USWest1 yields the string "us-west-1", while the comment
    // above and every other region reference in this method use us-west-2 —
    // confirm which region the "sua-liveboard" bucket actually lives in.
    MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;
    MetaDataHolderFactory.GetConnection();

    IAmazonS3 client = new AmazonS3Client(bucketRegion);

    BankDataProcessingDynamoDbDAO bankDataProcessing =
        new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);
    isOk = bankDataProcessing.Connect();
    if (!isOk)
    {
        logger.Error("Error in connecting to dynamo db: ");
        System.Environment.Exit(1);
    }

    // skip list: files known to be problematic, passed over without processing
    List<string> skipList = new List<string>();
    skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");

    logger.Info("Started working ");
    // GetAwaiter().GetResult() blocks like .Wait() but surfaces the original
    // exception instead of wrapping it in an AggregateException.
    ListingObjectsAsync(bucketName, client).GetAwaiter().GetResult();

    int ic = 0;
    int maxFile = 3000; // safety cap on how many files one run may touch

    foreach (string s3file in FilesToProcessInS3)
    {
        ic++;
        if (ic > maxFile)
        {
            break;
        }

        Console.WriteLine("Processing: {0}", s3file);
        if (skipList.Contains(s3file))
        {
            logger.Warn("file is skip list, skipping");
            continue;
        }

        string docxPath = Path.Combine(localDirForDocxFiles, s3file);
        string newDocxFile = Path.Combine(tempDocDir, s3file);
        string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
        logger.Info("Local file: {0}", docxPath);

        // check do we have json file anready, if so skip
        if (File.Exists(jsonFileName))
        {
            logger.Info("Json file already exist, skipping");
            continue;
        }

        // first download s3 file to local dir
        // do not load if file already exist ( better to put this in the method for some other time )
        if (!File.Exists(docxPath))
        {
            isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
            if (!isOk)
            {
                logger.Error("file not downloaded {0}", s3file);
                break;
            }
        }
        else
        {
            logger.Info("file aready downloaded");
        }

        // now filter out what is not needed in docx
        isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
        if (!isOk)
        {
            logger.Error("Error while filtering docx");
            break;
        }

        // convert docx to txt
        logger.Debug("Starting extraction of the text");
        string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
        DocxToText docxToText = new DocxToText();
        isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
        if (!isOk)
        {
            logger.Error("Error while Extracting text");
            break;
        }

        // now collect metadata
        int id = collector.GetId(s3file);
        if (id == FileToIdMapCollector.MISSING_ID)
        {
            logger.Warn("id not found: {0}", s3file);
            continue;
        }
        logger.Info("ID: {0}", id);

        List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
        // ROBUSTNESS: original indexed mhlist[0] unconditionally; an empty
        // result would have thrown and aborted the whole batch.
        if (mhlist == null || mhlist.Count == 0)
        {
            logger.Warn("no metadata returned for id: {0}", id);
            continue;
        }
        MetaDataHolder holder = mhlist[0];

        isOk = holder.LoadContentFromFile(textFileName);
        if (!isOk)
        {
            logger.Error("Error while loading content from text file {0}", textFileName);
            continue;
        }

        // now save json file
        holder.SaveAsJSON(jsonFileName);

        isOk = bankDataProcessing.IsIdPresent(id);
        if (isOk)
        {
            logger.Info("id in dynamo db");
        }
        else
        {
            logger.Info("id NOT in dynamo db");
        }
    }

    MetaDataHolderFactory.CloseConnection();
    bankDataProcessing.Disconnect();
    logger.Info("Done");
}