Exemple #1
0
        /*
         * Updates the DynamoDB status of the files listed in the input file.
         * For these files a special value is put into the field
         * is_converted_to_text: the value "S", which allows special handling
         * of these files when they are loaded into nucleus.
         */

        public static void UpdateDynamoDb(string saveToFile)
        {
            // Reads saveToFile line by line. Each line is split on '|' and is expected
            // to hold three fields: docx file name | numeric id | full json file name.
            // Ids already present in dynamo db are updated for reprocessing, the others
            // are inserted; in both cases is_converted_to_text is set to "S".
            // Any failure is logged and terminates the process with exit code 1.
            string pdfBucketName  = "sua-liveboard";
            string docxBucketName = "sumup-docx-outbound";

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName,
                                                  pdfBucketName, docxBucketName);
            bool isOk = bankDataProcessing.Connect();

            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }
            // special status is to set is converted to text to S
            string isConvertedToText = "S";
            string source            = "bank_of_japan";
            string language          = "japanese";

            try
            {
                using (StreamReader sr = new StreamReader(saveToFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // tolerate blank lines (e.g. a trailing newline at the end of the file)
                        if (string.IsNullOrWhiteSpace(line))
                        {
                            continue;
                        }

                        string[] items = line.Split('|');
                        if (items.Length < 3)
                        {
                            logger.Error("Malformed line, expected 3 fields separated by '|': {0}", line);
                            System.Environment.Exit(1);
                        }

                        int id;
                        if (!Int32.TryParse(items[1], out id))
                        {
                            logger.Error("Malformed line, id is not a number: {0}", line);
                            System.Environment.Exit(1);
                        }

                        string fullJsonFileName = items[2];
                        // we need this to insert new record
                        string pdfFileName = Path.ChangeExtension(fullJsonFileName, ".pdf");

                        if (bankDataProcessing.IsIdPresent(id))
                        {
                            logger.Debug("id in dynamo updating: {0}", id);
                            isOk = bankDataProcessing.UpdateForReprocessing(id, isConvertedToText);
                            if (!isOk)
                            {
                                logger.Error("Update not successful for id {0}", id);
                                System.Environment.Exit(1);
                            }
                        }
                        else
                        {
                            logger.Debug("Not id in dynamo inserting: {0}", id);
                            isOk = bankDataProcessing.InsertSpecial(id, source, language, pdfFileName,
                                                                    fullJsonFileName, isConvertedToText);
                            if (!isOk)
                            {
                                logger.Error("Inseert not successful for id {0}", id);
                                System.Environment.Exit(1);
                            }
                        }
                    }
                }
            }
            finally
            {
                // also release the connection when an exception escapes the loop
                bankDataProcessing.Disconnect();
            }
        }
Exemple #2
0
        /*
         * Reports the current DynamoDB status before an update.
         * Run this to verify what has been sent, what has not been sent,
         * and what exists in DynamoDB.
         */
        public static void ReportDynamoDbStatus(string saveToFile, string sentAlreadyFile)
        {
            // Reads saveToFile line by line; each line is split on '|' with the docx
            // file name in field 0 and the numeric id in field 1. For every id that is
            // reported as already sent to nucleus, a line
            // "docxFileName|id|Sent to nucleus already" is written to sentAlreadyFile.
            string pdfBucketName  = "sua-liveboard";
            string docxBucketName = "sumup-docx-outbound";

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName,
                                                  pdfBucketName, docxBucketName);
            bool isOk = bankDataProcessing.Connect();

            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }

            try
            {
                // both streams are disposed (and the writer flushed) even if a line fails to parse
                using (StreamWriter sw = new StreamWriter(sentAlreadyFile))
                using (StreamReader sr = new StreamReader(saveToFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // tolerate blank lines (e.g. a trailing newline at the end of the file)
                        if (string.IsNullOrWhiteSpace(line))
                        {
                            continue;
                        }

                        string[] items        = line.Split('|');
                        int      id           = Int32.Parse(items[1]);
                        string   docxFileName = items[0];

                        if (bankDataProcessing.IsIdPresent(id))
                        {
                            logger.Debug("id in dynamo {0}", id);
                        }

                        if (bankDataProcessing.IsIdSentToNucleus(id))
                        {
                            logger.Debug("It is sent to nucleus {0}", id);
                            sw.WriteLine("{0}|{1}|{2}", docxFileName, id, "Sent to nucleus already");
                        }
                    }
                }
            }
            finally
            {
                // also release the connection when an exception escapes the loop
                bankDataProcessing.Disconnect();
            }
        }
        /*
         * Prepares a message object that keeps its content in a specific format.
         * The message body will be sent over SQS.
         */
        public static ConversionRequestMessage ConstructMessage(int lastId, int rowcount, string pdfSource,
                                                                char delimiter, MySqlConnection conn, BankDataProcessingDynamoDbDAO bankDataProcessing, bool skipProcessedId)
        {
            // Selects up to rowcount rows of bank_data with id > lastId, the given source
            // and a file_url ending in ".pdf", and adds each (id, file_url) pair to a new
            // ConversionRequestMessage. When skipProcessedId is true, ids that are already
            // present in dynamo db are left out. The delimiter parameter is not used here.
            string query = "select id, file_url from bank_data " +
                           "where id > @p1 " +
                           "and source = @p2 " +
                           "and file_url like '%.pdf' " +
                           "limit @p3; ";

            ConversionRequestMessage msg = new ConversionRequestMessage();

            // command and reader are disposed even when reading fails, so an
            // open reader is never left behind on the caller's connection
            using (MySqlCommand cmd = new MySqlCommand(query, conn))
            {
                cmd.Parameters.AddWithValue("@p1", lastId);
                cmd.Parameters.AddWithValue("@p2", pdfSource);
                cmd.Parameters.AddWithValue("@p3", rowcount);

                logger.Debug(cmd.CommandText);

                using (MySqlDataReader reader = cmd.ExecuteReader())
                {
                    while (reader.Read())
                    {
                        int id = reader.GetInt32(0);
                        // this part will skip ids that are already in dynamo db table
                        // they will not be re-processed !!!!
                        if (skipProcessedId && bankDataProcessing.IsIdPresent(id))
                        {
                            continue;
                        }
                        string fileUrl = reader.GetString(1);
                        msg.AddIdAndFile(id, fileUrl);
                    }
                }
            }

            return msg;
        }
Exemple #4
0
        private static void ProcessFiles()
        {
            // For each file listed in FilesToProcessInS3 (at most maxFile of them):
            // download the docx from S3 if it is not already local, filter it, extract
            // its text, look up its id and metadata, and save the result as a json file.
            // Files that already have a json file, or that are in the skip list, are skipped.
            RegionEndpoint bucketRegion = RegionEndpoint.USWest2;

            string bucketName           = "sumup-test-mm";
            string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
            string extractWorkDir       = @"C:\radnidio\japan-ocr-files\work";
            string tempDocDir           = @"C:\radnidio\japan-ocr-files\tempdoc";

            // NOTE(review): database credentials are hard-coded in source; they should be
            // moved to configuration / a secret store and rotated.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
            string languageMapFile    = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
            string threeMapFile       = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
            string twoMapFile         = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
            string nonMapFile         = @"C:\temp\non-mapped-document-categories.txt";
            string pdfBucketName      = "sua-liveboard";

            string docxBucketName = "sumup-docx-outbound";
            // set up the various objects needed

            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            // set default tags
            filter.SetupDefaultTags();

            FileToIdMapCollector collector = new FileToIdMapCollector();

            collector.connectionString = dbConnectionString;
            bool isOk;

            isOk = collector.LoadLists();
            if (!isOk)
            {
                logger.Error("Can not collect file id maps");
                // failure must be reported with a non-zero exit code, like the other failure paths
                System.Environment.Exit(1);
            }


            MetaDataHolderFactory.connectionString = dbConnectionString;
            MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // the region is needed as text, like us-west-2
            // NOTE(review): USWest1 is used here while the S3 client and dynamo db use USWest2 — confirm this is intended
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            MetaDataHolderFactory.GetConnection();

            IAmazonS3 client = new AmazonS3Client(bucketRegion);

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

            isOk = bankDataProcessing.Connect();
            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }

            // skip list
            List <string> skipList = new List <string>();

            skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");
            logger.Info("Started working ");



            ListingObjectsAsync(bucketName, client).Wait();

            int ic      = 0;
            int maxFile = 3000;

            foreach (string s3file in FilesToProcessInS3)
            {
                // the counter includes skipped files, so at most maxFile entries are looked at
                ic++;
                if (ic > maxFile)
                {
                    break;
                }
                Console.WriteLine("Processing: {0}", s3file);
                if (skipList.Contains(s3file))
                {
                    logger.Warn("file is skip list, skipping");
                    continue;
                }

                string docxPath     = Path.Combine(localDirForDocxFiles, s3file);
                string newDocxFile  = Path.Combine(tempDocDir, s3file);
                string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

                logger.Info("Local file: {0}", docxPath);
                // check whether we have the json file already, if so skip
                if (File.Exists(jsonFileName))
                {
                    logger.Info("Json file already exist, skipping");
                    continue;
                }

                // first download s3 file to local dir
                // do not load if file already exist ( better to put this in the method for some other time )
                if (!File.Exists(docxPath))
                {
                    isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
                    if (!isOk)
                    {
                        logger.Error("file not downloaded {0}", s3file);
                        break;
                    }
                }
                else
                {
                    logger.Info("file aready downloaded");
                }

                // now filter out what is not needed in docx
                isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
                if (!isOk)
                {
                    logger.Error("Error while filtering docx");
                    break;
                }
                // convert docx to txt
                logger.Debug("Starting extraction of the text");
                string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
                DocxToText docxToText   = new DocxToText();
                isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
                if (!isOk)
                {
                    logger.Error("Error while Extracting text");
                    break;
                }

                // now collect metadata

                int id = collector.GetId(s3file);
                if (id == FileToIdMapCollector.MISSING_ID)
                {
                    logger.Warn("id not found: {0}", s3file);
                    continue;
                }
                logger.Info("ID: {0}", id);

                List <MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
                // guard against an empty result instead of failing on mhlist[0]
                if (mhlist == null || mhlist.Count == 0)
                {
                    logger.Warn("no metadata found for id: {0}", id);
                    continue;
                }
                MetaDataHolder holder = mhlist[0];
                isOk = holder.LoadContentFromFile(textFileName);
                if (!isOk)
                {
                    logger.Error("Error while loading content from text file {0}", textFileName);
                    continue;
                }
                // now save json file

                holder.SaveAsJSON(jsonFileName);

                // informational only: report whether the id is already known to dynamo db
                isOk = bankDataProcessing.IsIdPresent(id);
                if (isOk)
                {
                    logger.Info("id in dynamo db");
                }
                else
                {
                    logger.Info("id NOT in dynamo db");
                }
            }
            MetaDataHolderFactory.CloseConnection();
            bankDataProcessing.Disconnect();
            logger.Info("Done");
        }