// Scans one .docx file entry for the active search text and, on a match,
// records the entry in the search results.
private void HandleDOCXFileEntry(FileEntry pFileEntry)
{
    var extractor = new DocxToText(pFileEntry.FilePath);
    if (!extractor.IsFileContainString(mTextToSearch))
    {
        return; // no hit — nothing to record
    }

    PutFileEntryToSearchResult(pFileEntry);
}
/// <summary>
/// Converts an uploaded .pdf, .txt or .docx document to a spoken .wav file in
/// <paramref name="path"/> using the system speech synthesizer. Does nothing
/// when a file with the upload's name already exists in the directory.
/// </summary>
/// <param name="file">Uploaded document whose text is read aloud.</param>
/// <param name="path">Target directory for the generated .wav file.</param>
public static async Task FileToWav(HttpPostedFileBase file, string path)
{
    // Strip any client-supplied directory components so a crafted filename
    // cannot escape the target directory; Path.Combine also fixes the original
    // "path + name" concatenation, which broke when path had no trailing slash.
    string safeName = Path.GetFileName(file.FileName);
    string sourcePath = Path.Combine(path, safeName);
    if (File.Exists(sourcePath))
    {
        return; // already processed — keep the original "skip if present" behavior
    }

    string extension = Path.GetExtension(safeName);
    var text = "";
    if (extension == ".pdf")
    {
        // Accumulate page text in a StringBuilder instead of O(n^2) string +=.
        var sb = new System.Text.StringBuilder();
        using (PdfReader pdfReader = new PdfReader(file.InputStream))
        {
            for (var i = 1; i <= pdfReader.NumberOfPages; i++)
            {
                sb.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i));
            }
        }
        text = sb.ToString();
    }
    else if (extension == ".txt")
    {
        using (var reader = new StreamReader(file.InputStream))
        {
            text = reader.ReadToEnd();
        }
    }
    else if (extension == ".docx")
    {
        // DocxToText needs a file on disk: save temporarily and always clean up,
        // even when extraction throws (the original leaked the temp file on error).
        file.SaveAs(sourcePath);
        try
        {
            DocxToText dtt = new DocxToText(sourcePath);
            text = dtt.ExtractText();
        }
        finally
        {
            File.Delete(sourcePath);
        }
    }

    // Speech synthesis is synchronous and CPU-bound; run it off the caller's thread.
    await Task.Run(() =>
    {
        using (var reader = new SpeechSynthesizer())
        {
            var fileName = Path.Combine(path, Path.GetFileNameWithoutExtension(safeName) + ".wav");
            reader.SelectVoiceByHints(VoiceGender.Female, VoiceAge.Senior);
            reader.SetOutputToWaveFile(fileName);
            var builder = new PromptBuilder();
            builder.AppendText(text);
            reader.Speak(builder);
        }
    });
}
// Extracts plain text from a .docx stream via the custom DocxToText library.
// Returns null when extraction fails or the result contains NUL characters
// (taken as a sign the content is not readable text).
private string GetTextFromDocxUsingCustomLibrary(Stream ms)
{
    try
    {
        var extracted = new DocxToText(ms).ExtractText();
        return extracted.Contains('\0') ? null : extracted;
    }
    catch (Exception ex)
    {
        _logger.LogWarning($"GetTextFromDocxUsingCustomLibrary failed: {ex}");
        return null;
    }
}
/// <summary>
/// Persists the given stream to a uniquely-named temporary .docx file in the
/// configured temp directory, extracts its plain text via DocxToText, deletes
/// the temp file, and returns the text with tabs/newlines flattened to spaces.
/// </summary>
/// <param name="fileStream">Stream positioned at the start of a .docx document.</param>
/// <returns>The extracted text as a single line (whitespace controls replaced by spaces).</returns>
public static string parseDocx(Stream fileStream)
{
    string path = ConfigurationManager.AppSettings.Get("tempPath");
    // Path.Combine works whether or not tempPath ends with a separator
    // (the original raw concatenation did not).
    string filename = Path.Combine(path, Guid.NewGuid().ToString() + ".docx");

    // 'using' guarantees the handle is released even if the copy fails
    // (the original leaked the FileStream on exception), and CopyTo replaces
    // the hand-rolled 10 KB read/write loop.
    using (var writer = new FileStream(filename, FileMode.Create, FileAccess.Write))
    {
        fileStream.CopyTo(writer);
    }

    try
    {
        DocxToText docx = new DocxToText(filename);
        string text = docx.ExtractText();
        // Flatten whitespace control characters so callers get one-line text.
        return text.Replace("\t", " ").Replace("\n", " ").Replace("\r", " ");
    }
    finally
    {
        // Always remove the temp file, even when extraction throws
        // (the original left it behind on error).
        File.Delete(filename);
    }
}
/// <summary>
/// Runs the full PDF-to-JSON pipeline for one S3 object: download the PDF,
/// convert it to .docx, filter unwanted tags, extract plain text, merge the
/// text into the metadata holder, save as JSON and upload the JSON to S3.
/// Intermediate files are deleted on success and on the failure paths that
/// have already produced them.
/// </summary>
/// <param name="s3Client">Client for the source (PDF) bucket.</param>
/// <param name="pdfBucketName">Bucket the PDF is downloaded from.</param>
/// <param name="s3DocxClient">Client for the destination (JSON) bucket.</param>
/// <param name="docxBucketName">Bucket the JSON is uploaded to.</param>
/// <param name="licensePath">License file for the PDF conversion library.</param>
/// <param name="outputDir">Local dir for the downloaded PDF and converted docx.</param>
/// <param name="extractWorkDir">Work dir for the tag filter.</param>
/// <param name="tempDocDir">Local dir for the filtered docx/txt/json files.</param>
/// <param name="objectKey">S3 key of the source PDF ('/'-delimited).</param>
/// <param name="holder">Receives the extracted text; serialized to JSON.</param>
/// <param name="textFileS3Path">Out: S3 key of the uploaded JSON; empty on failure.</param>
/// <returns>true on success; false when any step fails.</returns>
public static bool DoFileConversion(IAmazonS3 s3Client, string pdfBucketName, IAmazonS3 s3DocxClient, string docxBucketName, string licensePath, string outputDir, string extractWorkDir, string tempDocDir, string objectKey, MetaDataHolder holder, out string textFileS3Path)
{
    textFileS3Path = String.Empty; // output file location; only set once the upload succeeds

    // Split the key so the bare file name (last element) can be reused and the
    // prefix elements rebuilt later for the upload key.
    char delimiter = '/';
    String[] keyElements = objectKey.Split(delimiter);
    string fileName = keyElements[keyElements.Length - 1];
    string pdfFilePath = outputDir + fileName; // NOTE(review): assumes outputDir ends with a separator — confirm
    logger.Debug("downloading " + objectKey + " --> " + pdfFilePath);
    Stopwatch swAll = new Stopwatch(); // times the whole pipeline
    swAll.Start();
    bool ok = DownloadFileFromS3(s3Client, pdfBucketName, objectKey, pdfFilePath);
    if (!ok) { logger.Error("Error while downloading"); return(ok); }

    // File length is used only to report the conversion rate below.
    long length = new System.IO.FileInfo(pdfFilePath).Length;
    // The converted document sits next to the PDF: same name, .docx extension.
    string docxPath = Path.ChangeExtension(pdfFilePath, ".docx");
    Stopwatch sw = new Stopwatch(); // times just the PDF->docx conversion
    sw.Start();
    ok = ConvertPDFDocument(pdfFilePath, docxPath, licensePath);
    sw.Stop();
    if (!ok) { logger.Error("Error while converting"); DeleteFile(pdfFilePath); return(ok); }
    double conversionRate = (double)length / sw.Elapsed.TotalSeconds;
    logger.Info("Done conversion, size: " + length + " time:" + sw.Elapsed.TotalSeconds + " sec, rate:" + conversionRate + " b/s");

    // Filter out parts of the doc that are not needed downstream.
    // NOTE(review): original comment flags that this is "not the best place to have it".
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags(); // apply the default tag set
    // Construct the location of the filtered output file.
    logger.Debug("Starting Filtering");
    string newDocxFile = tempDocDir + @"\" + fileName;
    newDocxFile = Path.ChangeExtension(newDocxFile, ".docx");
    ok = filter.ApplyFilter(docxPath, newDocxFile, false);
    if (!ok) { DeleteListOfFiles(new List <string> { docxPath, pdfFilePath }); logger.Error("Error while filtering docx"); return(ok); }

    // One more step: convert the filtered docx to plain text.
    logger.Debug("Starting extraction of the text");
    string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
    DocxToText docxToText = new DocxToText();
    ok = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
    if (!ok) { DeleteListOfFiles(new List <string> { docxPath, pdfFilePath, newDocxFile }); logger.Error("Error while Extracting text"); return(ok); }

    /* now we have the text file; we still need the json file, and we need to
     * collect data from the database via the holder */
    ok = holder.LoadContentFromFile(textFileName);
    if (!ok) { DeleteListOfFiles(new List <string> { docxPath, pdfFilePath, newDocxFile, textFileName }); logger.Error("Error while loading content from text file"); return(ok); }
    // Save the combined result as JSON.
    string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
    holder.SaveAsJSON(jsonFileName);

    // Build the S3 key for the JSON: same prefix as the source object, with the
    // file name swapped for the json name (we upload the json, not the docx!).
    string jsonS3FileName = Path.GetFileName(jsonFileName);
    Array.Resize(ref keyElements, keyElements.Length - 1); // drop the old file name element
    string jsonObjectName = string.Join(delimiter.ToString(), keyElements) + delimiter.ToString() + jsonS3FileName;
    logger.Debug("uploading " + newDocxFile + " --> " + jsonObjectName);
    ok = UploadFileToS3(s3DocxClient, docxBucketName, jsonObjectName, jsonFileName);
    // NOTE(review): on upload failure the intermediate files are left on disk — confirm intended.
    if (!ok) { logger.Error("Error while uploading"); return(ok); }

    textFileS3Path = jsonObjectName;
    swAll.Stop();
    logger.Info("Time for the cycle:" + swAll.Elapsed.TotalSeconds + " sec");
    // All good — remove every intermediate file.
    DeleteListOfFiles(new List <string> { docxPath, pdfFilePath, newDocxFile, jsonFileName, textFileName });
    return(true);
}
/// <summary>
/// Walks every field on the record; for each dirty "*Attachment*" field that
/// references a .doc/.docx/.pdf/.rtf file, extracts the document's plain text
/// and stores it in a sibling "&lt;field&gt;RawText" column (created on demand).
/// Clears the RawText column when the attachment no longer references a document.
/// </summary>
/// <param name="record">The record whose attachment fields are inspected.</param>
public static void CheckAttachmentsForDocOrPDFText(ActiveRecord record)
{
    //walk the field list for this record looking for attachments
    foreach (var fieldName in record.GetFieldNames())
    {
        // Only the attachment fields themselves — skip the generated *RawText columns.
        if (fieldName.Contains("Attachment") && fieldName.DoesntContain("RawText"))
        {
            //if (record.Fields.Attachment.IsDirty) {
            // Only re-extract when the attachment value actually changed.
            if (ActiveFieldBase.IsDirtyObj(record[fieldName].ValueObject, record[fieldName].OriginalValueObject))
            {
                // NOTE(review): ".doc" uses Contains (also matches .docx anywhere in the
                // value) while .pdf/.rtf use EndsWith — confirm this asymmetry is intended.
                if (record[fieldName].ToString().Contains(".doc") || record[fieldName].ToString().EndsWith(".pdf") || record[fieldName].ToString().EndsWith(".rtf"))
                {
                    // Lazily add the RawText column to the table on first use.
                    if (!record.FieldExists(fieldName + "RawText"))
                    {
                        (new Sql("ALTER TABLE ", record.GetTableName().SqlizeName(), " ADD [" + fieldName + "RawText] nvarchar (MAX);")).Execute();
                    }
                    string output = "";
                    if (record[fieldName].ToString().ToLower().EndsWith(".doc"))
                    {
                        // Legacy binary .doc — extracted via OfficeFileReader.
                        OfficeFileReader.OfficeFileReader objOFR = new OfficeFileReader.OfficeFileReader();
                        if (objOFR.GetText(Web.MapPath(Web.Attachments) + record[fieldName].ToString(), ref output) > 0)
                        {
                            //ok
                        }
                    }
                    else if (record[fieldName].ToString().ToLower().EndsWith(".docx"))
                    {
                        // OOXML .docx — extracted via the custom DocxToText reader.
                        BewebCore.ThirdParty.ReadWordDocText.DocxToText objOFR = new DocxToText(Web.MapPath(Web.Attachments) + record[fieldName].ToString());
                        if ((output = objOFR.ExtractText()).Length > 0)
                        {
                            //ok
                        }
                    }
                    else if (record[fieldName].ToString().Contains(".pdf"))
                    {
                        PdfToText.PDFParser pdf = new PDFParser();
                        if (pdf.ExtractText(Web.MapPath(Web.Attachments) + record[fieldName].ToString(), ref output))
                        {
                            //ok
                        }
                    }
                    else if (record[fieldName].ToString().Contains(".rtf"))
                    {
#if RTFProcessingAvailable
                        //Create the RTF tree object
                        RtfTree tree = new RtfTree();
                        //Load and parse the RTF document
                        tree.LoadRtfFile(Web.MapPath(Web.Attachments) + record[fieldName].ToString());
                        output = tree.Text;
#else
                        // RTF support is compiled out in this build configuration.
                        throw new Exception("rtf library not included");
#endif
                    }
                    if (output.Trim() != "")
                    {
                        // Persist the extracted text; Sqlize* helpers handle escaping.
                        (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=", output.SqlizeText(), " where ", record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
                    }
                }
                else
                {
                    //no doc any more — clear any previously extracted text
                    if (record.FieldExists(fieldName + "RawText"))
                    {
                        (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=null where ", record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
                    }
                }
            }
        }
    }
}
/// <summary>
/// Batch driver: lists docx files in an S3 bucket, downloads each one locally,
/// filters unwanted tags, extracts plain text, merges database metadata and
/// writes a .json file next to the temp doc. Processes at most 3000 files per
/// run, skipping files already converted (json exists) or on the skip list.
/// </summary>
private static void ProcessFiles()
{
    RegionEndpoint bucketRegion = RegionEndpoint.USWest2;
    string bucketName = "sumup-test-mm";
    // Local working directories for the download / filter / temp-doc stages.
    string localDirForDocxFiles = @"C:\radnidio\japan-ocr-files\input";
    string extractWorkDir = @"C:\radnidio\japan-ocr-files\work";
    string tempDocDir = @"C:\radnidio\japan-ocr-files\tempdoc";
    // NOTE(review): database credentials are hard-coded in source — move to
    // configuration / a secret store and rotate this password.
    string dbConnectionString = "server=liveboard0913.cjvgiw4swlyc.us-west-1.rds.amazonaws.com;database=sum_up;uid=yerem;pwd=sua.liveboard.2018;";
    // Mapping files used to translate document structure/language codes.
    string languageMapFile = @"C:\transfer\solid-conversions\mappings\language-codes.csv";
    string threeMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-cleaned-win.csv";
    string twoMapFile = @"C:\transfer\solid-conversions\mappings\mapping-from-structure-and-data-one-level.csv";
    string nonMapFile = @"C:\temp\non-mapped-document-categories.txt";
    string pdfBucketName = "sua-liveboard";
    string docxBucketName = "sumup-docx-outbound";

    // setup the various objects needed
    DocxTagFilter filter = new DocxTagFilter(extractWorkDir);
    filter.SetupDefaultTags(); // default tag set to strip from the docx

    // Maps S3 file names to database ids.
    FileToIdMapCollector collector = new FileToIdMapCollector();
    collector.connectionString = dbConnectionString;
    bool isOk;
    isOk = collector.LoadLists();
    if (!isOk) { logger.Error("Can not collect file id maps"); System.Environment.Exit(0); }

    // Static configuration of the metadata factory (db connection + S3 source info).
    MetaDataHolderFactory.connectionString = dbConnectionString;
    MetaDataHolderFactory.loadMaps(languageMapFile, threeMapFile, twoMapFile, nonMapFile);
    MetaDataHolderFactory.S3bucket = pdfBucketName;
    // text is needed like us-west-2
    // NOTE(review): region here is USWest1 while the bucket client uses USWest2 — confirm intended.
    MetaDataHolderFactory.S3region = Amazon.RegionEndpoint.USWest1.SystemName;
    MetaDataHolderFactory.GetConnection();

    IAmazonS3 client = new AmazonS3Client(bucketRegion);
    BankDataProcessingDynamoDbDAO bankDataProcessing = new BankDataProcessingDynamoDbDAO(Amazon.RegionEndpoint.USWest2.SystemName, pdfBucketName, docxBucketName);
    isOk = bankDataProcessing.Connect();
    if (!isOk) { logger.Error("Error in connecting to dynamo db: "); System.Environment.Exit(1); }

    // Skip list: files known to be problematic, excluded from processing below.
    List <string> skipList = new List <string>();
    skipList.Add("1eb5f50c344634929709f81ac09593b365f0120e.docx");

    logger.Info("Started working ");
    // Populates FilesToProcessInS3 (blocking on the async listing).
    ListingObjectsAsync(bucketName, client).Wait();
    int ic = 0;
    int maxFile = 3000; // safety cap per run
    foreach (string s3file in FilesToProcessInS3)
    {
        ic++;
        if (ic > maxFile) { break; }
        Console.WriteLine("Processing: {0}", s3file);
        if (skipList.Contains(s3file)) { logger.Warn("file is skip list, skipping"); continue; }
        string docxPath = Path.Combine(localDirForDocxFiles, s3file);
        string newDocxFile = Path.Combine(tempDocDir, s3file);
        string jsonFileName = Path.ChangeExtension(newDocxFile, ".json");
        logger.Info("Local file: {0}", docxPath);
        // check: do we already have the json file? if so, the work is done — skip
        if (File.Exists(jsonFileName)) { logger.Info("Json file already exist, skipping"); continue; }
        // first download the s3 file to the local dir;
        // do not download if the file already exists (better to move this check into the method some other time)
        if (!File.Exists(docxPath))
        {
            isOk = DownloadFileFromS3(client, bucketName, s3file, docxPath);
            if (!isOk) { logger.Error("file not downloaded {0}", s3file); break; }
        }
        else { logger.Info("file aready downloaded"); }
        // now filter out what is not needed in the docx
        isOk = filter.ApplyFilter(docxPath, newDocxFile, false);
        if (!isOk) { logger.Error("Error while filtering docx"); break; }
        // convert docx to txt
        logger.Debug("Starting extraction of the text");
        string textFileName = Path.ChangeExtension(newDocxFile, ".txt");
        DocxToText docxToText = new DocxToText();
        isOk = docxToText.ExtractTextAndSave(newDocxFile, textFileName);
        if (!isOk) { logger.Error("Error while Extracting text"); break; }
        // now collect metadata: resolve the file name to a database id
        int id = collector.GetId(s3file);
        if (id == FileToIdMapCollector.MISSING_ID) { logger.Warn("id not found: {0}", s3file); continue; }
        logger.Info("ID: {0}", id);
        List <MetaDataHolder> mhlist = MetaDataHolderFactory.PopulateMetaDataHoldersFromDb(new int[] { id });
        MetaDataHolder holder = mhlist[0];
        isOk = holder.LoadContentFromFile(textFileName);
        if (!isOk) { logger.Error("Error while loading content from text file {0}", textFileName); continue; }
        // now save the json file (its presence marks this file as done on re-runs)
        holder.SaveAsJSON(jsonFileName);
        // informational only: report whether this id already exists in dynamo db
        isOk = bankDataProcessing.IsIdPresent(id);
        if (isOk) { logger.Info("id in dynamo db"); }
        else { logger.Info("id NOT in dynamo db"); }
    }
    MetaDataHolderFactory.CloseConnection();
    bankDataProcessing.Disconnect();
    logger.Info("Done");
}