Ejemplo n.º 1
0
        /// <summary>
        /// Generates the list of document ids that have to be processed and saves it
        /// to a file for future consumption.
        /// </summary>
        /// <remarks>
        /// Each <c>*.json</c> file directly inside <paramref name="dataDir"/> is mapped to a
        /// <c>.docx</c> name and looked up in the collector's file-to-id map.
        /// <list type="bullet">
        /// <item>Id found and companion <c>.txt</c> is non-empty: <c>name|id|fullFileName</c> is written to <paramref name="saveToFile"/>.</item>
        /// <item>Id found but companion <c>.txt</c> is empty or missing: the same line is written to <paramref name="emptyFilesList"/>.</item>
        /// <item>Id not found: <c>name||file does not have id</c> is written to <paramref name="emptyFilesList"/>.</item>
        /// </list>
        /// If the id maps cannot be loaded the process is terminated with exit code 1.
        /// </remarks>
        /// <param name="dataDir">Directory that is scanned for <c>*.json</c> files.</param>
        /// <param name="saveToFile">Output file receiving the entries that are ready for processing.</param>
        /// <param name="emptyFilesList">Output file receiving entries with no id or with an empty/missing text file.</param>
        public static void GenerateListOfIdsForFiles(string dataDir, string saveToFile, string emptyFilesList)
        {
            // NOTE(review): database credentials are hard-coded in source. They should be moved
            // to configuration / a secret store and rotated; kept here so behavior is unchanged.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";

            FileToIdMapCollector collector = new FileToIdMapCollector();
            collector.connectionString = dbConnectionString;

            logger.Info("Collecting list");
            if (!collector.LoadLists())
            {
                logger.Error("Can not collect file id maps");
                // Exit with a non-zero code so callers can tell the run failed.
                System.Environment.Exit(1);
            }

            // Both writers live in using blocks so they are flushed and closed even when
            // an exception interrupts the loop.
            using (StreamWriter emptyWriter = new StreamWriter(emptyFilesList))
            {
                // list local files
                string[] fileArray = Directory.GetFiles(dataDir, "*.json");

                using (StreamWriter idWriter = new StreamWriter(saveToFile))
                {
                    foreach (string ffile in fileArray)
                    {
                        logger.Info(ffile);

                        // Test whether the companion text file is empty. A missing text file is
                        // treated like an empty one instead of aborting with FileNotFoundException.
                        string   fpath    = Path.ChangeExtension(ffile, ".txt");
                        FileInfo textInfo = new FileInfo(fpath);
                        long     length   = 0;
                        if (textInfo.Exists)
                        {
                            length = textInfo.Length;
                            if (length == 0)
                            {
                                logger.Info("file len is 0");
                            }
                        }
                        else
                        {
                            logger.Warn("text file is missing: {0}", fpath);
                        }

                        string name = Path.GetFileNameWithoutExtension(ffile) + ".docx";
                        int    id   = collector.GetId(name);

                        if (id == FileToIdMapCollector.MISSING_ID)
                        {
                            logger.Warn("Document without id: {0}", name);
                            emptyWriter.WriteLine(name + "||file does not have id");
                            continue;
                        }

                        string fullFileName = collector.GetFullFileName(name);
                        fullFileName = Path.ChangeExtension(fullFileName, ".json");
                        logger.Info("{0} --> {1}", name, id);

                        // Same record format for both lists; only the destination differs.
                        string       record = name + "|" + id.ToString() + "|" + fullFileName;
                        StreamWriter target = (length == 0) ? emptyWriter : idWriter;
                        target.WriteLine(record);
                    }
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks the docx files listed for the S3 bucket and, for each one, downloads it
        /// (unless already present locally), filters it, extracts its text, loads the
        /// document's metadata from the database and saves the result as a JSON file.
        /// </summary>
        /// <remarks>
        /// <list type="bullet">
        /// <item>At most 3000 entries of <c>FilesToProcessInS3</c> are visited; skipped entries count towards that limit.</item>
        /// <item>Files on the skip list, and files whose JSON output already exists, are skipped.</item>
        /// <item>A failed download, filter or text extraction stops the whole loop.</item>
        /// <item>A missing id, missing metadata, or a failed content load only skips the current file.</item>
        /// <item>If the id maps cannot be loaded or DynamoDB cannot be reached, the process exits with code 1.</item>
        /// </list>
        /// </remarks>
        private static void ProcessFiles()
        {
            const int maxFile = 3000;

            RegionEndpoint bucketRegion = RegionEndpoint.USWest2;

            string bucketName           = "sumup-test-mm";
            string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
            string extractWorkDir       = @"C:\radnidio\japan-ocr-files\work";
            string tempDocDir           = @"C:\radnidio\japan-ocr-files\tempdoc";

            // NOTE(review): database credentials are hard-coded in source. They should be moved
            // to configuration / a secret store and rotated; kept here so behavior is unchanged.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
            string languageMapFile    = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
            string threeMapFile       = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
            string twoMapFile         = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
            string nonMapFile         = @"C:\temp\non-mapped-document-categories.txt";
            string pdfBucketName      = "sua-liveboard";
            string docxBucketName     = "sumup-docx-outbound";

            // set up the various objects needed
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            // set default tags
            filter.SetupDefaultTags();

            FileToIdMapCollector collector = new FileToIdMapCollector();
            collector.connectionString = dbConnectionString;

            if (!collector.LoadLists())
            {
                logger.Error("Can not collect file id maps");
                // Non-zero exit code, consistent with the DynamoDB failure path below.
                System.Environment.Exit(1);
            }

            MetaDataHolderFactory.connectionString = dbConnectionString;
            MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // The region is passed as text (its system name, e.g. "us-west-2").
            // NOTE(review): this uses USWest1 while the S3 client and the DynamoDB DAO
            // below use USWest2 - confirm which region pdfBucketName actually lives in.
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            MetaDataHolderFactory.GetConnection();

            // The S3 client is disposable; release it when the run is over.
            using (IAmazonS3 client = new AmazonS3Client(bucketRegion))
            {
                BankDataProcessingDynamoDbDAO bankDataProcessing =
                    new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

                if (!bankDataProcessing.Connect())
                {
                    logger.Error("Error in connecting to dynamo db: ");
                    System.Environment.Exit(1);
                }

                // skip list
                List <string> skipList = new List <string> { "1eb5f50c344634929709f81ac09593b365f0120e.docx" };

                logger.Info("Started working ");

                try
                {
                    // Synchronous entry point: block until the bucket listing has finished.
                    ListingObjectsAsync(bucketName, client).Wait();

                    int ic = 0;

                    foreach (string s3file in FilesToProcessInS3)
                    {
                        ic++;
                        if (ic > maxFile)
                        {
                            break;
                        }
                        Console.WriteLine("Processing: {0}", s3file);
                        if (skipList.Contains(s3file))
                        {
                            logger.Warn("file is skip list, skipping");
                            continue;
                        }

                        string docxPath     = Path.Combine(localDirForDocxFiles, s3file);
                        string newDocxFile  = Path.Combine(tempDocDir, s3file);
                        string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

                        logger.Info("Local file: {0}", docxPath);
                        // check whether we already have the json file; if so, skip
                        if (File.Exists(jsonFileName))
                        {
                            logger.Info("Json file already exist, skipping");
                            continue;
                        }

                        // first download the s3 file to the local dir,
                        // but do not download again if the file already exists
                        if (!File.Exists(docxPath))
                        {
                            if (!DownloadFileFromS3(client, bucketName, s3file, docxPath))
                            {
                                logger.Error("file not downloaded {0}", s3file);
                                break;
                            }
                        }
                        else
                        {
                            logger.Info("file aready downloaded");
                        }

                        // now filter out what is not needed in docx
                        if (!filter.ApplyFilter(docxPath, newDocxFile, false))
                        {
                            logger.Error("Error while filtering docx");
                            break;
                        }

                        // convert docx to txt
                        logger.Debug("Starting extraction of the text");
                        string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
                        DocxToText docxToText   = new DocxToText();
                        if (!docxToText.ExtractTextAndSave(newDocxFile, textFileName))
                        {
                            logger.Error("Error while Extracting text");
                            break;
                        }

                        // now collect metadata
                        int id = collector.GetId(s3file);
                        if (id == FileToIdMapCollector.MISSING_ID)
                        {
                            logger.Warn("id not found: {0}", s3file);
                            continue;
                        }
                        logger.Info("ID: {0}", id);

                        List <MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
                        // Guard against an empty result instead of failing on mhlist[0].
                        if (mhlist == null || mhlist.Count == 0)
                        {
                            logger.Error("No metadata found in db for id {0}", id);
                            continue;
                        }

                        MetaDataHolder holder = mhlist[0];
                        if (!holder.LoadContentFromFile(textFileName))
                        {
                            logger.Error("Error while loading content from text file {0}", textFileName);
                            continue;
                        }

                        // now save json file
                        holder.SaveAsJSON(jsonFileName);

                        // The DynamoDB lookup is informational only; its result is just logged.
                        if (bankDataProcessing.IsIdPresent(id))
                        {
                            logger.Info("id in dynamo db");
                        }
                        else
                        {
                            logger.Info("id NOT in dynamo db");
                        }
                    }
                }
                finally
                {
                    // Release the connections even when processing throws.
                    MetaDataHolderFactory.CloseConnection();
                    bankDataProcessing.Disconnect();
                }
            }

            logger.Info("Done");
        }