/// <summary>
/// Runs the full conversion pipeline for one S3 object:
/// download PDF -> convert to DOCX -> filter tags -> extract text ->
/// populate metadata holder -> save JSON -> upload JSON to the output bucket.
/// All intermediate local files are deleted on success and on failure paths.
/// </summary>
/// <param name="s3Client">Client for the source (PDF) bucket.</param>
/// <param name="pdfBucketName">Bucket holding the source PDF.</param>
/// <param name="s3DocxClient">Client for the destination bucket.</param>
/// <param name="docxBucketName">Bucket receiving the JSON output.</param>
/// <param name="licensePath">License file passed to the PDF->DOCX converter.</param>
/// <param name="outputDir">Local dir for the downloaded PDF / converted DOCX.</param>
/// <param name="extractWorkDir">Work dir handed to the DOCX tag filter.</param>
/// <param name="tempDocDir">Local dir for the filtered DOCX / txt / json files.</param>
/// <param name="objectKey">S3 key of the source PDF ('/'-delimited).</param>
/// <param name="holder">Metadata holder to populate with the extracted text.</param>
/// <param name="textFileS3Path">On success, the S3 key of the uploaded JSON; empty otherwise.</param>
/// <returns>true when every step succeeded; false on the first failing step.</returns>
public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName, IAmazonS3 s3DocxClient,
    string docxBucketName, string licensePath, string outputDir, string extractWorkDir,
    string tempDocDir, string objectKey, MetaDataHolder holder, out string textFileS3Path)
{
    textFileS3Path = String.Empty; // output location, set only on success

    // The last element of the '/'-split key is the bare file name.
    char delimiter = '/';
    String[] keyElements = objectKey.Split(delimiter);
    string fileName = keyElements[keyElements.Length - 1];

    // FIX: use Path.Combine instead of raw concatenation so a missing
    // trailing separator on outputDir cannot produce a broken path.
    string pdfFilePath = Path.Combine(outputDir, fileName);
    logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);

    Stopwatch swAll = Stopwatch.StartNew();

    bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);
    if (!ok)
    {
        logger.Error("Error while downloading");
        return ok;
    }

    // File length is only used to report the conversion rate below.
    long length = new System.IO.FileInfo(pdfFilePath).Length;

    // Convert PDF -> DOCX next to the downloaded PDF, timing the step.
    string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");
    Stopwatch sw = Stopwatch.StartNew();
    ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
    sw.Stop();
    if (!ok)
    {
        logger.Error("Error while converting");
        DeleteFile(pdfFilePath);
        return ok;
    }
    double conversionRate = (double)length / sw.Elapsed.TotalSeconds;
    logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds +
        " sec, rate:" + conversionRate + " b/s");

    // Filter out parts of the DOCX that are not needed.
    // NOTE(review): constructing the filter per call mirrors the original;
    // hoisting it to the caller would avoid repeated tag setup.
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags();

    logger.Debug("Starting Filtering");
    // FIX: Path.Combine instead of tempDocDir + @"\" + fileName — portable
    // and safe when tempDocDir already ends with a separator.
    string newDocxFile = Path.ChangeExtension(Path.Combine(tempDocDir, fileName), ".docx");
    ok = filter.ApplyFilter(docxPath, newDocxFile, false);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath });
        logger.Error("Error while filtering docx");
        return ok;
    }

    // Extract plain text from the filtered DOCX.
    logger.Debug("Starting extraction of the text");
    string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
    DocxToText docxToText = new DocxToText();
    ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile });
        logger.Error("Error while Extracting text");
        return ok;
    }

    // Load the extracted text into the metadata holder (the DB-sourced
    // fields were collected by the caller) and persist it as JSON.
    ok = holder.LoadContentFromFile(textFileName);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, textFileName });
        logger.Error("Error while loading content from text file");
        return ok;
    }
    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
    holder.SaveAsJSON(jsonFileName);

    // Upload the JSON (not the docx!) under the same S3 "directory"
    // as the source PDF: drop the file-name element, rejoin, append json name.
    string jsonS3FileName = Path.GetFileName(jsonFileName);
    Array.Resize(ref keyElements, keyElements.Length - 1);
    string jsonObjectName = string.Join(delimiter.ToString(), keyElements) +
        delimiter.ToString() + jsonS3FileName;
    // FIX: log the file actually uploaded (jsonFileName), not newDocxFile.
    logger.Debug("uploading " + jsonFileName + " --> " + jsonObjectName);
    ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
    if (!ok)
    {
        logger.Error("Error while uploading");
        // FIX: the original leaked all five temp files on upload failure.
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });
        return ok;
    }
    textFileS3Path = jsonObjectName;

    swAll.Stop();
    logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");

    // All good — remove every intermediate file.
    DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });
    return true;
}
/// <summary>
/// Batch driver: lists DOCX objects in an S3 bucket, then for each one
/// (up to maxFile) downloads it, filters its tags, extracts plain text,
/// builds metadata from the database and saves it as a local JSON file.
/// Also logs whether each id is already present in DynamoDB.
/// </summary>
private static void ProcessFiles()
{
    RegionEndpoint bucketRegion = RegionEndpoint.USWest2;
    string bucketName = "sumup-test-mm";
    string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
    string extractWorkDir = @"C:\radnidio\japan-ocr-files\work";
    string tempDocDir = @"C:\radnidio\japan-ocr-files\tempdoc";
    // SECURITY(review): database credentials are hard-coded in source.
    // Move this connection string to configuration or a secrets store.
    string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
    string languageMapFile = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
    string threeMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
    string twoMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
    string nonMapFile = @"C:\temp\non-mapped-document-categories.txt";
    string pdfBucketName = "sua-liveboard";
    string docxBucketName = "sumup-docx-outbound";

    // Tag filter applied to every downloaded DOCX.
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags();

    // Maps S3 file names to database ids.
    FileToIdMapCollector collector = new FileToIdMapCollector();
    collector.connectionString = dbConnectionString;
    bool isOk = collector.LoadLists();
    if (!isOk)
    {
        logger.Error("Can not collect file id maps");
        // FIX: exit with a non-zero code on fatal failure (was Exit(0),
        // which signals success to the OS); matches the Exit(1) below.
        System.Environment.Exit(1);
    }

    MetaDataHolderFactory.connectionString = dbConnectionString;
    MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
    MetaDataHolderFactory.S3bucket = pdfBucketName;
    // Text form is needed, e.g. "us-west-2".
    // NOTE(review): the original comment says us-west-2 but USWest1 is used
    // here — confirm which region the metadata should reference.
    MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;
    MetaDataHolderFactory.GetConnection();

    IAmazonS3 client = new AmazonS3Client(bucketRegion);
    BankDataProcessingDynamoDbDAO bankDataProcessing = new BankDataProcessingDynamoDbDAO(
        Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);
    isOk = bankDataProcessing.Connect();
    if (!isOk)
    {
        logger.Error("Error in connecting to dynamo db: ");
        System.Environment.Exit(1);
    }

    // Files known to be problematic — never process these.
    List<string> skipList = new List<string>();
    skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");

    logger.Info("Started working ");
    ListingObjectsAsync(bucketName, client).Wait();

    int ic = 0;
    int maxFile = 3000; // hard cap on objects handled per run
    foreach (string s3file in FilesToProcessInS3)
    {
        ic++;
        if (ic > maxFile)
        {
            break;
        }
        Console.WriteLine("Processing: {0}", s3file);
        if (skipList.Contains(s3file))
        {
            logger.Warn("file is skip list, skipping");
            continue;
        }
        string docxPath = Path.Combine(localDirForDocxFiles, s3file);
        string newDocxFile = Path.Combine(tempDocDir, s3file);
        string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
        logger.Info("Local file: {0}", docxPath);

        // Skip files whose JSON output already exists from a previous run.
        if (File.Exists(jsonFileName))
        {
            logger.Info("Json file already exist, skipping");
            continue;
        }

        // Download only when the file is not already on disk.
        // (Better to move this check into the download method some other time.)
        if (!File.Exists(docxPath))
        {
            isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
            if (!isOk)
            {
                logger.Error("file not downloaded {0}", s3file);
                break;
            }
        }
        else
        {
            logger.Info("file aready downloaded");
        }

        // Filter out what is not needed in the DOCX.
        isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
        if (!isOk)
        {
            logger.Error("Error while filtering docx");
            break;
        }

        // Convert the filtered DOCX to plain text.
        logger.Debug("Starting extraction of the text");
        string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
        DocxToText docxToText = new DocxToText();
        isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
        if (!isOk)
        {
            logger.Error("Error while Extracting text");
            break;
        }

        // Resolve the database id for this file; skip unknown files.
        int id = collector.GetId(s3file);
        if (id == FileToIdMapCollector.MISSING_ID)
        {
            logger.Warn("id not found: {0}", s3file);
            continue;
        }
        logger.Info("ID: {0}", id);

        // Build metadata from the DB, attach the extracted text, save JSON.
        List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
        MetaDataHolder holder = mhlist[0];
        isOk = holder.LoadContentFromFile(textFileName);
        if (!isOk)
        {
            logger.Error("Error while loading content from text file {0}", textFileName);
            continue;
        }
        holder.SaveAsJSON(jsonFileName);

        // Informational only: report whether the id is already in DynamoDB.
        isOk = bankDataProcessing.IsIdPresent(id);
        if (isOk)
        {
            logger.Info("id in dynamo db");
        }
        else
        {
            logger.Info("id NOT in dynamo db");
        }
    }

    MetaDataHolderFactory.CloseConnection();
    bankDataProcessing.Disconnect();
    logger.Info("Done");
}