/// <summary>
/// Opens the .docx file behind <paramref name="pFileEntry"/> and, when
/// <c>DocxToText.IsFileContainString</c> reports a match for <c>mTextToSearch</c>,
/// passes the entry to <c>PutFileEntryToSearchResult</c>.
/// </summary>
/// <param name="pFileEntry">Entry whose <c>FilePath</c> points at the .docx file to inspect.</param>
private void HandleDOCXFileEntry(FileEntry pFileEntry)
        {
            bool containsSearchText = new DocxToText(pFileEntry.FilePath).IsFileContainString(mTextToSearch);

            // Nothing to record when the document does not match.
            if (!containsSearchText)
            {
                return;
            }

            PutFileEntryToSearchResult(pFileEntry);
        }
Exemple #2
0
        /// <summary>
        /// Extracts the text of an uploaded .pdf, .txt or .docx file and speaks it into a
        /// .wav file named after the upload, written under <paramref name="path"/>.
        /// Does nothing when a file with the upload's name already exists at that location.
        /// </summary>
        /// <param name="file">The uploaded file; its extension selects the text extractor.</param>
        /// <param name="path">
        /// Target location. It is concatenated directly with the file name, so it must
        /// already end with a directory separator.
        /// </param>
        /// <returns>A task that completes once the speech synthesis has finished.</returns>
        public static async Task FileToWav(HttpPostedFileBase file, string path)
        {
            var uploadedPath = path + file.FileName;
            if (File.Exists(uploadedPath))
            {
                return;
            }

            var extension = Path.GetExtension(file.FileName);
            var text      = "";

            // Extensions are compared case-insensitively so that e.g. "REPORT.PDF" is handled too.
            if (string.Equals(extension, ".pdf", System.StringComparison.OrdinalIgnoreCase))
            {
                // Accumulate page text in a builder instead of repeated string concatenation.
                var pages = new System.Text.StringBuilder();
                using (PdfReader pdfReader = new PdfReader(file.InputStream))
                {
                    for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                    {
                        pages.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page));
                    }
                }
                text = pages.ToString();
            }
            else if (string.Equals(extension, ".txt", System.StringComparison.OrdinalIgnoreCase))
            {
                using (var reader = new StreamReader(file.InputStream))
                {
                    text = reader.ReadToEnd();
                }
            }
            else if (string.Equals(extension, ".docx", System.StringComparison.OrdinalIgnoreCase))
            {
                // DocxToText is given a file path here, so the upload is staged on disk first.
                file.SaveAs(uploadedPath);
                try
                {
                    DocxToText dtt = new DocxToText(uploadedPath);
                    text = dtt.ExtractText();
                }
                finally
                {
                    // Remove the staged copy even when extraction fails.
                    File.Delete(uploadedPath);
                }
            }

            // Any other extension falls through with empty text.
            await Task.Run(() =>
            {
                using (var synthesizer = new SpeechSynthesizer())
                {
                    var wavFileName = path + Path.GetFileNameWithoutExtension(file.FileName) + ".wav";
                    synthesizer.SelectVoiceByHints(VoiceGender.Female, VoiceAge.Senior);
                    synthesizer.SetOutputToWaveFile(wavFileName);
                    var builder = new PromptBuilder();
                    builder.AppendText(text);
                    synthesizer.Speak(builder);
                }
            });
        }
        /// <summary>
        /// Extracts text from a .docx stream with <c>DocxToText</c>.
        /// </summary>
        /// <param name="ms">Stream handed to the <c>DocxToText</c> constructor.</param>
        /// <returns>
        /// The extracted text, or <c>null</c> when the text contains a NUL character
        /// or when any exception is thrown (the exception is logged as a warning).
        /// </returns>
        private string GetTextFromDocxUsingCustomLibrary(Stream ms)
        {
            try
            {
                var extracted = new DocxToText(ms).ExtractText();

                // Text containing '\0' is rejected rather than returned.
                bool hasNulChar = extracted.Contains('\0');
                return hasNulChar ? null : extracted;
            }
            catch (Exception ex)
            {
                _logger.LogWarning($"GetTextFromDocxUsingCustomLibrary failed: {ex}");
                return null;
            }
        }
        /// <summary>
        /// Copies a .docx stream to a uniquely named temporary file under the
        /// "tempPath" app setting, extracts its text with <c>DocxToText</c>, and returns
        /// that text with tabs, line feeds and carriage returns replaced by spaces.
        /// </summary>
        /// <param name="fileStream">Readable stream positioned at the start of the .docx content.</param>
        /// <returns>The extracted text flattened onto a single line.</returns>
        public static string parseDocx(Stream fileStream)
        {
            // The setting is concatenated directly with the file name, so it must end with a separator.
            string path = ConfigurationManager.AppSettings.Get("tempPath");
            string filename = path + Guid.NewGuid().ToString() + ".docx";

            try
            {
                // The using block releases the file handle even if the copy fails part-way.
                using (FileStream writer = new FileStream(filename, FileMode.Create, FileAccess.Write))
                {
                    fileStream.CopyTo(writer);
                }

                DocxToText docx = new DocxToText(filename);
                string text = docx.ExtractText();

                return text.Replace("\t", " ").Replace("\n", " ").Replace("\r", " ");
            }
            finally
            {
                // Always remove the temporary copy, including when extraction throws.
                // The existence check avoids masking the original error when the file was never created.
                if (File.Exists(filename))
                {
                    File.Delete(filename);
                }
            }
        }
        /*
         * do all the work
         */

        /// <summary>
        /// Runs the conversion pipeline for one S3 object: downloads the PDF, converts it to
        /// DOCX, filters the DOCX, extracts its text, loads that text into
        /// <paramref name="holder"/>, saves the holder as JSON and uploads the JSON file.
        /// Local intermediate files are deleted on success and on every failure after the download.
        /// </summary>
        /// <param name="s3Client">Client used to download the PDF.</param>
        /// <param name="pdfBucketName">Bucket the PDF is downloaded from.</param>
        /// <param name="s3DocxClient">Client used to upload the JSON file.</param>
        /// <param name="docxBucketName">Bucket the JSON file is uploaded to.</param>
        /// <param name="licensePath">Passed through to <c>ConvertPDFDocument</c>.</param>
        /// <param name="outputDir">
        /// Location of the downloaded PDF and converted DOCX. It is concatenated directly
        /// with the file name, so it must already end with a directory separator.
        /// </param>
        /// <param name="extractWorkDir">Working directory handed to <c>DocxTagFilter</c>.</param>
        /// <param name="tempDocDir">Directory for the filtered DOCX, the text file and the JSON file.</param>
        /// <param name="objectKey">'/'-delimited S3 key of the PDF; the last element is used as the file name.</param>
        /// <param name="holder">Metadata holder that receives the text and is serialized to JSON.</param>
        /// <param name="textFileS3Path">
        /// Set to the uploaded JSON object's key on success; <c>String.Empty</c> otherwise.
        /// </param>
        /// <returns><c>true</c> when every step succeeded; <c>false</c> as soon as one step fails.</returns>
        public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName,
                                            IAmazonS3 s3DocxClient, string docxBucketName,
                                            string licensePath,
                                            string outputDir, string extractWorkDir, string tempDocDir,
                                            string objectKey, MetaDataHolder holder,
                                            out string textFileS3Path)
        {
            textFileS3Path = String.Empty;  // output location, filled in only on success

            char delimiter = '/';

            // The key elements are reused later to rebuild the output object's prefix.
            String[] keyElements = objectKey.Split(delimiter);
            string   fileName    = keyElements[keyElements.Length - 1];

            string pdfFilePath = outputDir + fileName;

            logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);

            Stopwatch swAll = new Stopwatch();

            swAll.Start();

            bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);

            if (!ok)
            {
                logger.Error("Error while downloading");
                return(ok);
            }

            // File size is only used for the throughput log line below.
            long length = new System.IO.FileInfo(pdfFilePath).Length;

            string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");

            // Time the PDF -> DOCX conversion on its own.
            Stopwatch sw = new Stopwatch();

            sw.Start();
            ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
            sw.Stop();
            if (!ok)
            {
                logger.Error("Error while converting");
                DeleteFile(pdfFilePath);
                return(ok);
            }
            double conversionRate = (double)length / sw.Elapsed.TotalSeconds;

            logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds + " sec, rate:" + conversionRate + " b/s");

            // Filter out the parts of the document that are not needed.
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            filter.SetupDefaultTags();

            logger.Debug("Starting Filtering");

            string newDocxFile = tempDocDir + @"\" + fileName;

            newDocxFile = Path.ChangeExtension(newDocxFile, ".docx");

            ok = filter.ApplyFilter(docxPath, newDocxFile, false);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> {
                    docxPath, pdfFilePath
                });
                logger.Error("Error while filtering docx");
                return(ok);
            }

            // Convert the filtered DOCX to plain text.
            logger.Debug("Starting extraction of the text");
            string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
            DocxToText docxToText   = new DocxToText();

            ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> {
                    docxPath, pdfFilePath, newDocxFile
                });
                logger.Error("Error while Extracting text");
                return(ok);
            }

            // Load the extracted text into the metadata holder before serializing it.
            ok = holder.LoadContentFromFile(textFileName);
            if (!ok)
            {
                DeleteListOfFiles(new List<string> {
                    docxPath, pdfFilePath, newDocxFile, textFileName
                });
                logger.Error("Error while loading content from text file");
                return(ok);
            }

            string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

            holder.SaveAsJSON(jsonFileName);

            // The uploaded artifact is the JSON file, not the DOCX.
            string jsonS3FileName = Path.GetFileName(jsonFileName);

            // Drop the file name element and rebuild the key prefix.
            // NOTE(review): a key without any '/' yields an object name with a leading '/' - confirm this is intended.
            Array.Resize(ref keyElements, keyElements.Length - 1);
            string jsonObjectName = string.Join(delimiter.ToString(), keyElements) + delimiter.ToString() + jsonS3FileName;

            logger.Debug("uploading " + jsonFileName + " --> " + jsonObjectName);

            ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
            if (!ok)
            {
                // Clean up like every other failure path so a failed upload does not leave files behind.
                DeleteListOfFiles(new List<string> {
                    docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName
                });
                logger.Error("Error while uploading");
                return(ok);
            }
            textFileS3Path = jsonObjectName;
            swAll.Stop();

            logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");

            // All good: remove every local intermediate file.
            DeleteListOfFiles(new List<string> {
                docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName
            });

            return(true);
        }
        /// <summary>
        /// For every changed "Attachment" field on <paramref name="record"/>, extracts the raw
        /// text of the attached .doc/.docx/.pdf/.rtf file and stores it in a companion
        /// "&lt;field&gt;RawText" column, adding that column to the table when it does not exist.
        /// When the field changed to something that is not one of those document types, the
        /// companion column (if present) is set to null.
        /// </summary>
        /// <param name="record">Record whose attachment fields are inspected and whose table row is updated.</param>
        public static void CheckAttachmentsForDocOrPDFText(ActiveRecord record)
        {
            // Walk the field list for this record looking for attachment fields,
            // skipping the companion "...RawText" fields themselves.
            foreach (var fieldName in record.GetFieldNames())
            {
                if (fieldName.Contains("Attachment") && fieldName.DoesntContain("RawText"))
                {
                    // Only act when the field value differs from its original value.
                    if (ActiveFieldBase.IsDirtyObj(record[fieldName].ValueObject, record[fieldName].OriginalValueObject))
                    {
                        // ".doc" is matched with Contains, so it also covers ".docx".
                        // These outer checks are case-sensitive, unlike the ToLower() checks further down.
                        if (record[fieldName].ToString().Contains(".doc") || record[fieldName].ToString().EndsWith(".pdf") || record[fieldName].ToString().EndsWith(".rtf"))
                        {
                            // Create the companion column on first use.
                            if (!record.FieldExists(fieldName + "RawText"))
                            {
                                (new Sql("ALTER TABLE ", record.GetTableName().SqlizeName(), " ADD [" + fieldName + "RawText] nvarchar (MAX);")).Execute();
                            }
                            string output = "";
                            if (record[fieldName].ToString().ToLower().EndsWith(".doc"))
                            {
                                OfficeFileReader.OfficeFileReader objOFR = new OfficeFileReader.OfficeFileReader();
                                if (objOFR.GetText(Web.MapPath(Web.Attachments) + record[fieldName].ToString(), ref output) > 0)
                                {
                                    // Nothing to do: the result is passed back through the ref parameter "output".
                                }
                            }
                            else if (record[fieldName].ToString().ToLower().EndsWith(".docx"))
                            {
                                BewebCore.ThirdParty.ReadWordDocText.DocxToText objOFR = new DocxToText(Web.MapPath(Web.Attachments) + record[fieldName].ToString());
                                if ((output = objOFR.ExtractText()).Length > 0)
                                {
                                    // Nothing to do: "output" was assigned inside the condition.
                                }
                            }
                            else if (record[fieldName].ToString().Contains(".pdf"))
                            {
                                PdfToText.PDFParser pdf = new PDFParser();
                                if (pdf.ExtractText(Web.MapPath(Web.Attachments) + record[fieldName].ToString(), ref output))
                                {
                                    // Nothing to do: the result is passed back through the ref parameter "output".
                                }
                            }
                            else if (record[fieldName].ToString().Contains(".rtf"))
                            {
#if RTFProcessingAvailable
                                // Create the RTF tree object
                                RtfTree tree = new RtfTree();

                                // Load and parse the RTF document
                                tree.LoadRtfFile(Web.MapPath(Web.Attachments) + record[fieldName].ToString());
                                output = tree.Text;
#else
                                throw new Exception("rtf library not included");
#endif
                            }
                            // Store the text only when something non-blank was extracted.
                            // NOTE(review): "set " has no leading space - presumably the Sql builder separates fragments; confirm.
                            if (output.Trim() != "")
                            {
                                (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=", output.SqlizeText(), " where ",
                                         record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
                            }
                        }
                        else
                        {
                            // The field no longer holds a supported document: clear any previously stored text.
                            if (record.FieldExists(fieldName + "RawText"))
                            {
                                (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=null where ",
                                         record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
                            }
                        }
                    }
                }
            }
        }
Exemple #7
0
        /// <summary>
        /// Batch job: lists the DOCX objects in the input bucket and, for each one (up to a
        /// fixed maximum), downloads it if needed, filters it, extracts its text, looks up its
        /// database id, builds a metadata holder from it and saves the holder as a local JSON
        /// file. Files that already have a JSON file, or that are on the skip list, are skipped.
        /// Terminates the process with exit code 1 when the id maps cannot be loaded or the
        /// DynamoDB connection fails.
        /// </summary>
        private static void ProcessFiles()
        {
            RegionEndpoint bucketRegion = RegionEndpoint.USWest2;

            string bucketName           = "sumup-test-mm";
            string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
            string extractWorkDir       = @"C:\radnidio\japan-ocr-files\work";
            string tempDocDir           = @"C:\radnidio\japan-ocr-files\tempdoc";

            // NOTE(review): credentials are hard-coded in source - move them to configuration / a secret store.
            string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
            string languageMapFile    = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
            string threeMapFile       = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
            string twoMapFile         = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
            string nonMapFile         = @"C:\temp\non-mapped-document-categories.txt";
            string pdfBucketName      = "sua-liveboard";

            string docxBucketName = "sumup-docx-outbound";

            // Set up the helper objects needed by the loop below.
            DocxTagFilter filter = new DocxTagFilter(extractWorkDir);

            filter.SetupDefaultTags();

            FileToIdMapCollector collector = new FileToIdMapCollector();

            collector.connectionString = dbConnectionString;
            bool isOk;

            isOk = collector.LoadLists();
            if (!isOk)
            {
                logger.Error("Can not collect file id maps");
                // Exit with a non-zero code so callers can tell the run failed.
                System.Environment.Exit(1);
            }

            MetaDataHolderFactory.connectionString = dbConnectionString;
            MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
            MetaDataHolderFactory.S3bucket = pdfBucketName;
            // The region is supplied as text (e.g. "us-west-2").
            // NOTE(review): this uses USWest1 while the S3 client and DynamoDB DAO use USWest2 - confirm intended.
            MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;

            MetaDataHolderFactory.GetConnection();

            IAmazonS3 client = new AmazonS3Client(bucketRegion);

            BankDataProcessingDynamoDbDAO bankDataProcessing =
                new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);

            isOk = bankDataProcessing.Connect();
            if (!isOk)
            {
                logger.Error("Error in connecting to dynamo db: ");
                System.Environment.Exit(1);
            }

            // Files that must never be processed.
            List<string> skipList = new List<string>
            {
                "1eb5f50c344634929709f81ac09593b365f0120e.docx"
            };

            logger.Info("Started working ");

            try
            {
                // Fills FilesToProcessInS3; this method is synchronous, so it blocks on the listing.
                ListingObjectsAsync(bucketName, client).Wait();

                int ic      = 0;
                int maxFile = 3000;

                foreach (string s3file in FilesToProcessInS3)
                {
                    ic++;
                    if (ic > maxFile)
                    {
                        break;
                    }
                    Console.WriteLine("Processing: {0}", s3file);
                    if (skipList.Contains(s3file))
                    {
                        logger.Warn("file is skip list, skipping");
                        continue;
                    }

                    string docxPath     = Path.Combine(localDirForDocxFiles, s3file);
                    string newDocxFile  = Path.Combine(tempDocDir, s3file);
                    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");

                    logger.Info("Local file: {0}", docxPath);

                    // An existing JSON file means this document was handled in an earlier run.
                    if (File.Exists(jsonFileName))
                    {
                        logger.Info("Json file already exist, skipping");
                        continue;
                    }

                    // Download the S3 file unless a local copy already exists.
                    if (!File.Exists(docxPath))
                    {
                        isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
                        if (!isOk)
                        {
                            logger.Error("file not downloaded {0}", s3file);
                            break;
                        }
                    }
                    else
                    {
                        logger.Info("file aready downloaded");
                    }

                    // Filter out what is not needed in the DOCX.
                    isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
                    if (!isOk)
                    {
                        logger.Error("Error while filtering docx");
                        break;
                    }

                    // Convert the filtered DOCX to plain text.
                    logger.Debug("Starting extraction of the text");
                    string     textFileName = Path.ChangeExtension(newDocxFile, ".txt");
                    DocxToText docxToText   = new DocxToText();
                    isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while Extracting text");
                        break;
                    }

                    // Collect the metadata for this file.
                    int id = collector.GetId(s3file);
                    if (id == FileToIdMapCollector.MISSING_ID)
                    {
                        logger.Warn("id not found: {0}", s3file);
                        continue;
                    }
                    logger.Info("ID: {0}", id);

                    List<MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });

                    // Guard against an empty result instead of failing on mhlist[0].
                    if (mhlist == null || mhlist.Count == 0)
                    {
                        logger.Warn("no metadata found for id: {0}", id);
                        continue;
                    }

                    MetaDataHolder holder = mhlist[0];
                    isOk = holder.LoadContentFromFile(textFileName);
                    if (!isOk)
                    {
                        logger.Error("Error while loading content from text file {0}", textFileName);
                        continue;
                    }

                    holder.SaveAsJSON(jsonFileName);

                    // Informational only: report whether DynamoDB already knows this id.
                    isOk = bankDataProcessing.IsIdPresent(id);
                    if (isOk)
                    {
                        logger.Info("id in dynamo db");
                    }
                    else
                    {
                        logger.Info("id NOT in dynamo db");
                    }
                }
            }
            finally
            {
                // Release the DB and DynamoDB connections even when the loop throws.
                MetaDataHolderFactory.CloseConnection();
                bankDataProcessing.Disconnect();
            }

            logger.Info("Done");
        }