/*
 * Do all the work: download the PDF from S3, convert it to docx, filter the
 * docx, extract plain text, merge in database metadata, save the result as
 * JSON and upload it back to S3.
 */
public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName,
                                    IAmazonS3 s3DocxClient, string docxBucketName,
                                    string licensePath, string outputDir,
                                    string extractWorkDir, string tempDocDir,
                                    string objectKey, MetaDataHolder holder,
                                    out string textFileS3Path)
{
    textFileS3Path = String.Empty; // this is the output file location

    // split the object key so we can reuse its path parts later
    char delimiter = '/';
    String[] keyElements = objectKey.Split(delimiter);
    string fileName = keyElements[keyElements.Length - 1];
    string pdfFilePath = Path.Combine(outputDir, fileName);

    logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);

    Stopwatch swAll = new Stopwatch();
    swAll.Start();

    bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);
    if (!ok)
    {
        logger.Error("Error while downloading");
        return ok;
    }

    // get the file length (used below to compute the conversion rate)
    long length = new System.IO.FileInfo(pdfFilePath).Length;

    // construct the name of the docx file
    string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");

    // time the conversion step
    Stopwatch sw = new Stopwatch();
    sw.Start();
    ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
    sw.Stop();
    if (!ok)
    {
        logger.Error("Error while converting");
        DeleteFile(pdfFilePath);
        return ok;
    }

    double conversionRate = (double)length / sw.Elapsed.TotalSeconds;
    logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds
                + " sec, rate:" + conversionRate + " b/s");

    // now filter out the parts of the doc that are not needed
    // (this is not the best place to have it)
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags(); // set default tags

    // construct the location of the final output file
    logger.Debug("Starting filtering");
    string newDocxFile = Path.Combine(tempDocDir, fileName);
    newDocxFile = Path.ChangeExtension(newDocxFile, ".docx");
    ok = filter.ApplyFilter(docxPath, newDocxFile, false);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath });
        logger.Error("Error while filtering docx");
        return ok;
    }

    // one more step: convert docx to txt
    logger.Debug("Starting extraction of the text");
    string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
    DocxToText docxToText = new DocxToText();
    ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile });
        logger.Error("Error while extracting text");
        return ok;
    }

    /*
     * Now we have the text file; next we need the JSON file, and for that we
     * need to collect data from the database.
     */
    ok = holder.LoadContentFromFile(textFileName);
    if (!ok)
    {
        DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, textFileName });
        logger.Error("Error while loading content from text file");
        return ok;
    }

    // now save the JSON file
    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
    holder.SaveAsJSON(jsonFileName);

    // construct the output object name
    // note: we are now uploading the JSON file, not the docx!
    string jsonS3FileName = Path.GetFileName(jsonFileName);
    Array.Resize(ref keyElements, keyElements.Length - 1); // drop the file name, keep the key prefix
    string jsonObjectName = string.Join(delimiter.ToString(), keyElements)
                            + delimiter.ToString() + jsonS3FileName;

    logger.Debug("uploading " + jsonFileName + " --> " + jsonObjectName);
    ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
    if (!ok)
    {
        logger.Error("Error while uploading");
        return ok;
    }
    textFileS3Path = jsonObjectName;

    swAll.Stop();
    logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");

    // all good, delete the local files
    DeleteListOfFiles(new List<string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });
    return true;
}
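
/*
 * The helpers below are a minimal sketch, not part of the original source:
 * DownloadFileFromS3 and DeleteListOfFiles are defined elsewhere in this
 * project, and their real implementations may differ. The "Sketch" variants
 * only document the contract DoFileConversion relies on. The download sketch
 * uses TransferUtility from the AWS SDK (Amazon.S3.Transfer).
 */
private static bool DownloadFileFromS3Sketch(IAmazonS3 client, string bucketName,
                                             string objectKey, string localPath)
{
    try
    {
        // TransferUtility handles multipart downloads transparently
        TransferUtility transferUtility = new TransferUtility(client);
        transferUtility.Download(localPath, bucketName, objectKey);
        return true;
    }
    catch (AmazonS3Exception e)
    {
        logger.Error("S3 download failed: " + e.Message);
        return false;
    }
}

private static void DeleteListOfFilesSketch(List<string> paths)
{
    // best-effort cleanup: a file that cannot be deleted is logged, not fatal
    foreach (string path in paths)
    {
        try
        {
            File.Delete(path); // no-op if the file does not exist
        }
        catch (Exception e)
        {
            logger.Warn("could not delete " + path + ": " + e.Message);
        }
    }
}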
static void Main(string[] args)
{
    // input parameters, read from the App.config config file
    string toDebug = Config.toDebug;
    string pdfBucketName = Config.pdfBucketName;
    string docxBucketName = Config.docxBucketName;
    string outputDir = Config.outputDir;
    string licensePath = Config.licensePath;
    string listenerQueueName = Config.listenerQueueName;
    string confirmationQueueName = Config.confirmationQueueName;
    string extractWorkDir = Config.extractWorkDir;
    string tempDocDir = Config.tempDocDir;
    int sleepTimeMillis = Config.sleepTimeMillis;
    int maxIdleTime = Config.maxIdleTime;
    Config.printAllParams();

    // start logging
    logger.Info("Starting program");

    // try to get the EC2 instance id
    string instanceId = "NOT_ON_AMAZON";
    try
    {
        instanceId = Amazon.Util.EC2InstanceMetadata.InstanceId.ToString();
        logger.Info("Instance id:" + instanceId);
    }
    catch (Exception)
    {
        logger.Info("Not on EC2 instance");
    }

    // clear the working directory
    RemoveFilesAndSubDirectories(tempDocDir);

    // initialize the objects that we need throughout
    RegionEndpoint bucketRegion = RegionEndpoint.USWest1;
    IAmazonS3 s3Client = new AmazonS3Client(bucketRegion);
    RegionEndpoint docxBucketRegion = RegionEndpoint.USWest2;
    IAmazonS3 s3DocxClient = new AmazonS3Client(docxBucketRegion);

    MetaDataHolderFactory.connectionString = Config.DbConnectionString;
    MetaDataHolderFactory.loadMaps(Config.languageMapFile, Config.threeMapFile,
                                   Config.twoMapFile, Config.nonMapFile);
    MetaDataHolderFactory.S3bucket = pdfBucketName;
    // SystemName yields the region as text, e.g. "us-west-1"
    MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

    BankDataProcessingDynamoDbDAO bankDataProcessing =
        new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName,
                                          pdfBucketName, docxBucketName);

    char delimiter = '|';

    // open the queues; connect to the sending queue
    AmazonSQSConfig sqsConfig = new AmazonSQSConfig();
    // the region is needed here as well
    sqsConfig.RegionEndpoint = Amazon.RegionEndpoint.USWest2;
    AmazonSQSClient sqsClient = new AmazonSQSClient(sqsConfig);
    ReceiveMessageRequest recRequest = new ReceiveMessageRequest();
    recRequest.QueueUrl = listenerQueueName;
    recRequest.MaxNumberOfMessages = 1;

    // loop and read messages from the controller
    int counter = 0;

    // doWork will be true until a message with id = 0 is detected
    // (note: the sending side of this shutdown signal is not implemented yet!)
    bool doWork = true;

    // remember the time of the last message sent;
    // if nothing happens for some time the program will exit
    DateTime lastMessageTime = DateTime.Now;

    while (doWork)
    {
        TimeSpan idleTime = DateTime.Now.Subtract(lastMessageTime);
        if (idleTime.TotalMinutes > maxIdleTime)
        {
            logger.Info("Exiting, no message within last " + maxIdleTime.ToString() + " minutes");
            break;
        }

        List<Tuple<int, string>> tuples;
        ConversionResponseMessage conversionResponseMessage;
        string requestMessageId;
        WaitForInputMessage(listenerQueueName, delimiter, sqsClient, recRequest,
                            out requestMessageId, out tuples);

        // now process the documents one after another
        int processedCounter = 0, totalCounter = 0;
        conversionResponseMessage = new ConversionResponseMessage(requestMessageId);
        conversionResponseMessage.InstanceId = instanceId;
        if (tuples == null)
        {
            logger.Debug("Sleeping");
            System.Threading.Thread.Sleep(sleepTimeMillis);
            continue;
        }

        MetaDataHolderFactory.GetConnection();
        bool ok = bankDataProcessing.Connect();
        if (!ok)
        {
            logger.Error("Error connecting to DynamoDB");
            System.Environment.Exit(1);
        }

        foreach (Tuple<int, string> tup in tuples)
        {
            totalCounter++;
            int id = tup.Item1;
            string fileUrl = tup.Item2;
            if (id == 0)
            {
                // id 0 is the shutdown signal from the controller
                doWork = false;
                break;
            }
            counter++;
            logger.Info(counter + " processing id: " + id + " " + fileUrl);

            // fetch the database metadata for this id
            List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
            // it is always just one metadata holder for now, so we can simply
            // extract it and pass it to the processing routine
            MetaDataHolder holder = mhlist[0];

            string textFileS3Path = "";
            ok = DoFileConversion(s3Client, pdfBucketName, s3DocxClient, docxBucketName,
                                  licensePath, outputDir, extractWorkDir, tempDocDir,
                                  fileUrl, holder, out textFileS3Path);
            if (!ok)
            {
                logger.Error("Error in processing id: " + id.ToString());
                conversionResponseMessage.AddIdAndFileUrlThatIsNotProcessed(id, fileUrl);
                continue;
            }
            else
            {
                bankDataProcessing.Insert(id, holder.Bank, holder.Language, fileUrl, textFileS3Path);
                processedCounter++;
            }
        }
        MetaDataHolderFactory.CloseConnection();
        bankDataProcessing.Disconnect();

        // processing done; see how successful it was and report to the controller
        int badFiles = totalCounter - processedCounter;
        if (badFiles > 0)
        {
            logger.Info("Not all files were processed successfully, failures:" + badFiles.ToString());
        }
        SendMessageRequest request = new SendMessageRequest();
        request.MessageBody = conversionResponseMessage.GetMessageBody();
        request.QueueUrl = confirmationQueueName;
        SendMessageResponse confirmationResponse = sqsClient.SendMessage(request);
        if (confirmationResponse.HttpStatusCode == System.Net.HttpStatusCode.OK)
        {
            logger.Debug("Confirmation message sent");
            // remember when the last message with results was sent
            lastMessageTime = DateTime.Now;
        }
        else
        {
            logger.Error("Problem sending confirmation message");
        }
    }
    System.Environment.Exit(0);
}
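
/*
 * Illustrative only: a hedged sketch of how WaitForInputMessage (defined
 * elsewhere) is assumed to turn an SQS message body into (id, fileUrl)
 * tuples, based on the '|' delimiter Main passes to it. The real parsing may
 * differ; the example body format ("17|bank-a/reports/q1.pdf", one pair per
 * line) is an assumption, not taken from the original code.
 */
private static List<Tuple<int, string>> ParseTuplesSketch(string messageBody, char delimiter)
{
    List<Tuple<int, string>> tuples = new List<Tuple<int, string>>();
    foreach (string line in messageBody.Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] parts = line.Split(delimiter);
        if (parts.Length < 2)
        {
            continue; // malformed line, skip it
        }
        int id;
        if (!int.TryParse(parts[0], out id))
        {
            continue; // the first field must be the numeric document id
        }
        tuples.Add(Tuple.Create(id, parts[1].Trim()));
    }
    return tuples;
}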
private static void ProcessFiles()
{
    RegionEndpoint bucketRegion = RegionEndpoint.USWest2;
    string bucketName = "sumup-test-mm";
    string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
    string extractWorkDir = @"C:\radnidio\japan-ocr-files\work";
    string tempDocDir = @"C:\radnidio\japan-ocr-files\tempdoc";
    // NOTE: credentials are hardcoded in this one-off tool; they should move to config
    string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
    string languageMapFile = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
    string threeMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
    string twoMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
    string nonMapFile = @"C:\temp\non-mapped-document-categories.txt";
    string pdfBucketName = "sua-liveboard";
    string docxBucketName = "sumup-docx-outbound";

    // set up the various objects needed
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags(); // set default tags

    FileToIdMapCollector collector = new FileToIdMapCollector();
    collector.connectionString = dbConnectionString;
    bool isOk;
    isOk = collector.LoadLists();
    if (!isOk)
    {
        logger.Error("Cannot collect file id maps");
        System.Environment.Exit(1);
    }

    MetaDataHolderFactory.connectionString = dbConnectionString;
    MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
    MetaDataHolderFactory.S3bucket = pdfBucketName;
    // SystemName yields the region as text, e.g. "us-west-1"
    MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;
    MetaDataHolderFactory.GetConnection();

    IAmazonS3 client = new AmazonS3Client(bucketRegion);
    BankDataProcessingDynamoDbDAO bankDataProcessing =
        new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName,
                                          pdfBucketName, docxBucketName);
    isOk = bankDataProcessing.Connect();
    if (!isOk)
    {
        logger.Error("Error connecting to DynamoDB");
        System.Environment.Exit(1);
    }

    // skip list
    List<string> skipList = new List<string>();
    skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");

    logger.Info("Started working");
    ListingObjectsAsync(bucketName, client).Wait();

    int ic = 0;
    int maxFile = 3000;
    foreach (string s3file in FilesToProcessInS3)
    {
        ic++;
        if (ic > maxFile)
        {
            break;
        }
        Console.WriteLine("Processing: {0}", s3file);
        if (skipList.Contains(s3file))
        {
            logger.Warn("file is in the skip list, skipping");
            continue;
        }
        string docxPath = Path.Combine(localDirForDocxFiles, s3file);
        string newDocxFile = Path.Combine(tempDocDir, s3file);
        string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
        logger.Info("Local file: {0}", docxPath);

        // check whether we already have the JSON file; if so, skip
        if (File.Exists(jsonFileName))
        {
            logger.Info("JSON file already exists, skipping");
            continue;
        }

        // first download the S3 file to the local dir,
        // but do not download if the file already exists
        // (better to move this check into the method some other time)
        if (!File.Exists(docxPath))
        {
            isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
            if (!isOk)
            {
                logger.Error("file not downloaded {0}", s3file);
                break;
            }
        }
        else
        {
            logger.Info("file already downloaded");
        }

        // now filter out what is not needed in the docx
        isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
        if (!isOk)
        {
            logger.Error("Error while filtering docx");
            break;
        }

        // convert docx to txt
        logger.Debug("Starting extraction of the text");
        string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
        DocxToText docxToText = new DocxToText();
        isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
        if (!isOk)
        {
            logger.Error("Error while extracting text");
            break;
        }

        // now collect the metadata
        int id = collector.GetId(s3file);
        if (id == FileToIdMapCollector.MISSING_ID)
        {
            logger.Warn("id not found: {0}", s3file);
            continue;
        }
        logger.Info("ID: {0}", id);
        List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
        MetaDataHolder holder = mhlist[0];
        isOk = holder.LoadContentFromFile(textFileName);
        if (!isOk)
        {
            logger.Error("Error while loading content from text file {0}", textFileName);
            continue;
        }

        // now save the JSON file
        holder.SaveAsJSON(jsonFileName);

        isOk = bankDataProcessing.IsIdPresent(id);
        if (isOk)
        {
            logger.Info("id in DynamoDB");
        }
        else
        {
            logger.Info("id NOT in DynamoDB");
        }
    }
    MetaDataHolderFactory.CloseConnection();
    bankDataProcessing.Disconnect();
    logger.Info("Done");
}
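
/*
 * A minimal sketch of the ListingObjectsAsync / FilesToProcessInS3 pair used
 * by ProcessFiles, under the assumption that FilesToProcessInS3 is a static
 * list of object keys. The real method is defined elsewhere in this project;
 * this version just shows the standard ListObjectsV2 pagination pattern from
 * the AWS SDK (requires Amazon.S3.Model and System.Threading.Tasks).
 */
private static readonly List<string> FilesToProcessInS3Sketch = new List<string>();

private static async Task ListingObjectsAsyncSketch(string bucketName, IAmazonS3 client)
{
    ListObjectsV2Request request = new ListObjectsV2Request { BucketName = bucketName };
    ListObjectsV2Response response;
    do
    {
        // each call returns up to 1000 keys; ContinuationToken pages through the rest
        response = await client.ListObjectsV2Async(request);
        foreach (S3Object entry in response.S3Objects)
        {
            FilesToProcessInS3Sketch.Add(entry.Key);
        }
        request.ContinuationToken = response.NextContinuationToken;
    } while (response.IsTruncated);
}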