/// <summary>
/// Asks the user for a PDF file and prints, page by page, every text string
/// found in the extractor's default area together with its rounded bounding box.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string pdfPath = PromptFileChoice("Please select a PDF file");
    using(File pdfFile = new File(pdfPath))
    {
        Document document = pdfFile.Document;

        // 2. Text extraction from the document pages.
        TextExtractor textExtractor = new TextExtractor();
        foreach(Page page in document.Pages)
        {
            // Stop iterating as soon as PromptNextPage reports false.
            if(!PromptNextPage(page, false))
            {
                Quit();
                break;
            }

            IList<ITextString> pageStrings = textExtractor.Extract(page)[TextExtractor.DefaultArea];
            foreach(ITextString pageString in pageStrings)
            {
                // Box is nullable; .Value throws if a string carries no box.
                RectangleF box = pageString.Box.Value;
                string location = string.Format(
                    "x:{0},y:{1},w:{2},h:{3}",
                    Math.Round(box.X),
                    Math.Round(box.Y),
                    Math.Round(box.Width),
                    Math.Round(box.Height));

                Console.WriteLine("Text [" + location + "]: " + pageString.Text);
            }
        }
    }
}
/// <summary>
/// Asks for a PDF file and a regular expression, then for each page hands the
/// case-insensitive matches to a TextHighlighter through the extractor's
/// filter, and finally serializes the file.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string pdfPath = PromptFileChoice("Please select a PDF file");
    using(File pdfFile = new File(pdfPath))
    {
        // Define the text pattern to look for!
        Regex searchPattern = new Regex(
            PromptChoice("Please enter the pattern to look for: "),
            RegexOptions.IgnoreCase);

        // 2. Iterating through the document pages...
        TextExtractor extractor = new TextExtractor(true, true);
        foreach(Page page in pdfFile.Document.Pages)
        {
            var pageNumber = page.Index + 1;
            Console.WriteLine("\nScanning page " + pageNumber + "...\n");

            // 2.1. Extract the page text!
            IDictionary<RectangleF?,IList<ITextString>> pageText = extractor.Extract(page);

            // 2.2. Find the text pattern matches!
            string flattenedText = TextExtractor.ToString(pageText);
            MatchCollection matches = searchPattern.Matches(flattenedText);

            // 2.3. Highlight the text pattern matches!
            extractor.Filter(pageText, new TextHighlighter(page, matches));
        }

        // 3. Highlighted file serialization.
        Serialize(pdfFile);
    }
}
/// <summary>
/// Maps every row of the first table of the data set to a <see cref="CardCriterion"/>.
/// </summary>
/// <returns>
/// One criterion per row, in row order; an empty list when the data set
/// contains no table.
/// </returns>
public List<CardCriterion> GetAll()
{
    // Placeholder data source: nothing in this method fills it, so it is
    // expected to be replaced by the real query result.
    using (var dataSet = new DataSet())
    {
        // An empty data set has no Tables[0]; return an empty result instead
        // of failing with an IndexOutOfRangeException.
        if (dataSet.Tables.Count == 0)
        {
            return new List<CardCriterion>();
        }

        DataRowCollection rows = dataSet.Tables[0].Rows;

        // Map all database rows to data models.
        var cardCriteria = new List<CardCriterion>(rows.Count);
        foreach (DataRow row in rows)
        {
            object[] values = row.ItemArray;

            // Column 7 is cast to byte[] and run through the text extractor;
            // its text and metadata feed the criterion below.
            var textExtractionResult = new TextExtractor().Extract((byte[]) values[7]);

            var criterion = new CardCriterion();
            criterion.EditionId = Convert.ToUInt32(values[0]);
            criterion.Territory = Convert.ToString(values[1]);
            criterion.CardKind = Convert.ToString(values[2]);
            criterion.CardAdoptionKindSubject = Convert.ToString(values[3]);
            criterion.CardAdoptionSubject = Convert.ToString(values[4]);
            criterion.CardName = Convert.ToString(values[5]);
            criterion.CardAdoptionDate = Convert.ToDateTime(values[6]);
            criterion.CardEdition = textExtractionResult.Text;
            criterion.ContentMetadataDict = textExtractionResult.Metadata;
            criterion.CardAdoptionNumber = Convert.ToString(values[8]);
            cardCriteria.Add(criterion);

            // NOTE(review): presumably reports progress (one step out of rows.Count) - confirm.
            Function(1, rows.Count); //http://stackoverflow.com/questions/6471378/implementing-a-progress-bar-to-show-work-being-done
        }

        return cardCriteria;
    }
    //http://codereview.stackexchange.com/questions/30714/faster-way-to-convert-datatable-to-list-of-class
}
/// <summary>
/// Counts how often each word longer than five characters occurs in an
/// uploaded document and returns the ten most frequent ones as JSON.
/// </summary>
/// <param name="fileName">Name of a file inside the uploads folder.</param>
/// <returns>
/// A JSON list of strings: a "Top words:" header followed by "word : count"
/// entries, or a single message when the format is unsupported or an
/// exception occurred while processing.
/// </returns>
public ActionResult CountStatistics([FromBody] string fileName)
{
    // Only words strictly longer than this are counted. The original derived
    // the value by trying to parse the mapped file path as an integer, which
    // never succeeds, so it was always 5.
    const int maxWordLength = 5;
    const int topWordCount = 10;

    List<string> extractedText = new List<string>();

    // fileName comes from the request body: keep only its file-name part so
    // that "..\" segments cannot escape the uploads folder.
    string filePath = Server.MapPath("../App_Data//Uploads//" + System.IO.Path.GetFileName(fileName));
    try
    {
        ExtractorFactory factory = new ExtractorFactory();
        TextExtractor extractor = factory.CreateTextExtractor(filePath);
        if (extractor == null)
        {
            // Return right away: continuing would dereference the null extractor.
            extractedText.Add("The document's format is not supported");
            return Json(extractedText, JsonRequestBehavior.AllowGet);
        }

        Dictionary<string, int> statistic = new Dictionary<string, int>();
        try
        {
            string line;
            while ((line = extractor.ExtractLine()) != null)
            {
                foreach (string rawWord in line.Split(' ', ',', ';', '.'))
                {
                    string word = rawWord.Trim().ToLower();
                    if (word.Length <= maxWordLength)
                    {
                        continue;
                    }

                    int occurrences;
                    statistic.TryGetValue(word, out occurrences);
                    statistic[word] = occurrences + 1;
                }
            }
        }
        finally
        {
            extractor.Dispose();
        }

        extractedText.Add("Top words:");
        for (int rank = 0; rank < topWordCount; rank++)
        {
            // Pick the most frequent remaining word; the first one wins on ties.
            int count = -1;
            string maxKey = null;
            foreach (KeyValuePair<string, int> entry in statistic)
            {
                if (entry.Value > count)
                {
                    count = entry.Value;
                    maxKey = entry.Key;
                }
            }

            if (maxKey == null)
            {
                break;
            }

            extractedText.Add(maxKey + " : " + count);
            statistic.Remove(maxKey);
        }
    }
    catch (Exception ex)
    {
        extractedText.Add(ex.Message);
    }

    return Json(extractedText, JsonRequestBehavior.AllowGet);
}
/// <summary>
/// Extracts the text of every page of <paramref name="document"/>.
/// </summary>
/// <param name="document">Document whose pages are read.</param>
/// <returns>
/// One string per page, in page order. If extraction fails part-way, the
/// failure is logged and the pages collected so far are returned.
/// </returns>
private List<string> ExtractPageList(Document document)
{
    List<string> pageList = new List<string>();
    TextExtractor textExtractor = new TextExtractor();
    try
    {
        foreach (var page in document.Pages)
        {
            var textStrings = textExtractor.Extract(page);
            pageList.Add(TextExtractor.ToString(textStrings));
        }
    }
    catch (Exception e)
    {
        // Still best-effort, but keep the exception details in the log so the
        // failure can be diagnosed instead of leaving only a bare marker.
        Utility.Log("Blad: " + e);
    }

    return pageList;
}
/// <summary>
/// Builds a <see cref="CardCriterion"/> from one data record.
/// </summary>
/// <param name="record">
/// Record read by position: columns 0-4 and 6-10 are converted to scalar
/// values, column 5 is cast to byte[] and passed through the text extractor.
/// </param>
/// <returns>The populated criterion.</returns>
private static CardCriterion _extractRowToCriterion(IDataRecord record)
{
    // Extraction runs first, before any of the scalar conversions.
    var extracted = new TextExtractor().Extract((byte[]) record[5]);

    return new CardCriterion
    {
        EditionId = Convert.ToUInt32(record[0]),
        CardId = Convert.ToUInt32(record[1]),
        Territory = Convert.ToString(record[2]),
        CardKind = Convert.ToString(record[3]),
        CardName = Convert.ToString(record[4]),
        CardEdition = extracted.Text,
        ContentMetadataDict = extracted.Metadata,
        FileExt = Convert.ToString(record[6]),
        CardAdoptionKindSubject = Convert.ToString(record[7]),
        CardAdoptionSubject = Convert.ToString(record[8]),
        CardAdoptionDate = Convert.ToDateTime(record[9]),
        CardAdoptionNumber = Convert.ToString(record[10])
    };
}
/// <summary>
/// Extracts the text of "sample_ocr.pdf" with OCR in automatic mode, saves it
/// to "output.txt" and opens the result in the associated application.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.TextExtractor instance.
    // The using block releases the extractor even when loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Add Contrast
        //extractor.OCRImagePreprocessingFilters.AddContrast(20);

        // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
        // filters for your specific document.
        // See "OCR Analyser" example.

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extract must throw NotSupportedException for the file Documents\Invalid.docx.
/// </summary>
public void TestExtractMethodWithInvalidFile()
{
    const string invalidDocumentPath = @"Documents\Invalid.docx";

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(invalidDocumentPath));
}
/// <summary>
/// IsValidFileType must return false for an empty memory stream.
/// </summary>
public void TestIsValidFileTypeMethodWithMissingStream()
{
    MemoryStream emptyStream = new MemoryStream();

    bool isValid = TextExtractor.IsValidFileType(emptyStream);

    Assert.IsFalse(isValid);
}
/// <summary>
/// IsValidFileType must throw ArgumentNullException for a null byte array.
/// </summary>
public void TestIsValidFileTypeMethodWithUndefinedBytes()
{
    // Typed local so the byte[] overload is selected, as with the original cast.
    byte[] undefinedBytes = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.IsValidFileType(undefinedBytes));
}
/// <summary>
/// Extract must throw ArgumentNullException for a null file path.
/// </summary>
public void TestExtractMethodWithUndefinedFile()
{
    // Typed local so the string overload is selected, as with the original cast.
    string undefinedPath = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(undefinedPath));
}
/// <summary>
/// Creates the <c>Extractor</c> instance and returns the result of
/// <c>SetInitializedStatusAndReturnSucces()</c>.
/// </summary>
/// <returns>The result produced by SetInitializedStatusAndReturnSucces.</returns>
public override ApiResult Init()
{
    Extractor = new TextExtractor();

    ApiResult initializationResult = SetInitializedStatusAndReturnSucces();
    return initializationResult;
}
/// <summary>
/// Reads the document line by line, counts every word longer than
/// <paramref name="maxWordLength"/> characters and prints up to ten of the
/// most frequent words to the console.
/// </summary>
/// <param name="fileName">Path of the document to analyse.</param>
/// <param name="maxWordLength">
/// Length threshold: only words strictly longer than this value are counted.
/// </param>
public WordStatistic(string fileName, int maxWordLength)
{
    //ExStart:WordStatistic
    ExtractorFactory extractorFactory = new ExtractorFactory();
    Dictionary<string, int> frequencies = new Dictionary<string, int>();

    TextExtractor textExtractor = extractorFactory.CreateTextExtractor(fileName);
    if (textExtractor == null)
    {
        Console.WriteLine("The document's format is not supported");
        return;
    }

    try
    {
        // ExtractLine yields null once there is nothing left to read.
        string currentLine;
        while ((currentLine = textExtractor.ExtractLine()) != null)
        {
            foreach (string token in currentLine.Split(' ', ',', ';', '.'))
            {
                string normalized = token.Trim().ToLower();
                if (normalized.Length <= maxWordLength)
                {
                    continue;
                }

                int seen;
                frequencies.TryGetValue(normalized, out seen);
                frequencies[normalized] = seen + 1;
            }
        }
    }
    finally
    {
        textExtractor.Dispose();
    }

    Console.WriteLine("Top words:");
    for (int rank = 0; rank < 10; rank++)
    {
        // Select the most frequent remaining word; the first one wins on ties.
        int bestCount = -1;
        string bestWord = null;
        foreach (KeyValuePair<string, int> entry in frequencies)
        {
            if (entry.Value > bestCount)
            {
                bestCount = entry.Value;
                bestWord = entry.Key;
            }
        }

        if (bestWord == null)
        {
            break;
        }

        Console.WriteLine("{0}: {1}", bestWord, bestCount);
        frequencies.Remove(bestWord);
    }
    //ExEnd:WordStatistic
}
/// <summary>
/// Extracts the text lines of an uploaded document and returns them as JSON.
/// </summary>
/// <param name="fileName">Name of a file inside the uploads folder.</param>
/// <param name="password">
/// Optional password. When given, ".one" files are opened with
/// NoteTextExtractor and every other file with WordsTextExtractor.
/// </param>
/// <returns>
/// A JSON list with the extracted lines. On failure the list ends with the
/// exception message, "Invalid password" (unprotected path, ".one" file) or
/// "The document's format is not supported".
/// </returns>
public ActionResult ExtractText([FromBody] string fileName, string password = null)
{
    //ExStart:ExtractText
    ExtractorFactory factory = new ExtractorFactory();

    // fileName comes from the request body: keep only its file-name part so
    // that "..\" segments cannot escape the uploads folder.
    string path = Server.MapPath("../App_Data//Uploads//" + Path.GetFileName(fileName));
    string ext = Path.GetExtension(path);

    // Extensions are compared case-insensitively so ".ONE" / ".ZIP" behave like ".one" / ".zip".
    bool isOneNote = string.Equals(ext, ".one", StringComparison.OrdinalIgnoreCase);
    bool isZip = string.Equals(ext, ".zip", StringComparison.OrdinalIgnoreCase);

    List<string> extractedText = new List<string>();
    try
    {
        if (!string.IsNullOrWhiteSpace(password))
        {
            // Password-protected file.
            LoadOptions loadOptions = new LoadOptions();
            loadOptions.Password = password;

            if (isOneNote)
            {
                using (var extractor = new NoteTextExtractor(path, loadOptions))
                {
                    AppendExtractedLines(() => extractor.ExtractLine(), extractedText);
                }
            }
            else
            {
                // Disposed here; the original never released this extractor.
                using (var protectedDocument = new WordsTextExtractor(path, loadOptions))
                {
                    AppendExtractedLines(() => protectedDocument.ExtractLine(), extractedText);
                }
            }
        }
        else if (isZip)
        {
            // Archive: extract every entry in turn.
            using (var container = new ZipContainer(path))
            {
                for (int i = 0; i < container.Entities.Count; i++)
                {
                    using (TextExtractor extractor = factory.CreateTextExtractor(container.Entities[i].OpenStream()))
                    {
                        // No extractor for this entry: skip it rather than abort the whole archive.
                        if (extractor == null)
                        {
                            continue;
                        }

                        AppendExtractedLines(() => extractor.ExtractLine(), extractedText);
                    }
                }
            }
        }
        else
        {
            using (TextExtractor extractor = factory.CreateTextExtractor(path))
            {
                if (extractor == null)
                {
                    extractedText.Add("The document's format is not supported");
                }
                else
                {
                    try
                    {
                        AppendExtractedLines(() => extractor.ExtractLine(), extractedText);
                    }
                    catch (Exception)
                    {
                        // For ".one" files a failure here is reported as a wrong password.
                        // Anything else is rethrown so the outer handler reports it; swallowing
                        // it would keep re-adding the previous line and might never terminate.
                        if (!isOneNote)
                        {
                            throw;
                        }

                        extractedText.Add("Invalid password");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        extractedText.Add(ex.Message);
    }

    return Json(extractedText, JsonRequestBehavior.AllowGet);
}

/// <summary>
/// Calls <paramref name="readLine"/> until it returns null and appends every
/// non-null line to <paramref name="sink"/>.
/// </summary>
private static void AppendExtractedLines(Func<string> readLine, List<string> sink)
{
    string line;
    while ((line = readLine()) != null)
    {
        sink.Add(line);
    }
}
/// <summary>
/// Extract must throw NotSupportedException for an empty byte array.
/// </summary>
public void TestExtractMethodWithMissingBytes()
{
    byte[] emptyBytes = new byte[0];

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(emptyBytes));
}
/// <summary>
/// IsValidFileType must return false for an empty byte array.
/// </summary>
public void TestIsValidFileTypeMethodWithMissingBytes()
{
    byte[] emptyBytes = new byte[0];

    bool isValid = TextExtractor.IsValidFileType(emptyBytes);

    Assert.IsFalse(isValid);
}
/// <summary>
/// Extract must throw FileNotFoundException for an empty file path.
/// </summary>
public void TestExtractMethodWithMissingFile()
{
    string missingPath = string.Empty;

    Assert.ThrowsException<FileNotFoundException>(() => TextExtractor.Extract(missingPath));
}
/// <summary>
/// IsValidFileType must throw FileNotFoundException for an empty file path.
/// </summary>
public void TestIsValidFileTypeMethodWithMissingFile()
{
    string missingPath = string.Empty;

    Assert.ThrowsException<FileNotFoundException>(() => TextExtractor.IsValidFileType(missingPath));
}
/// <summary>
/// Extract must return exactly "This is a Word document." for
/// Documents\ValidWithWhitespacePreserve.docx.
/// </summary>
public void TestExtractMethodWithValidFileWithWhitespacePreserveFile()
{
    var text = TextExtractor.Extract(@"Documents\ValidWithWhitespacePreserve.docx");

    // AreEqual reports the expected and actual strings when the test fails;
    // IsTrue(text == ...) would only report "false".
    Assert.AreEqual("This is a Word document.", text);
}
/// <summary>
/// Extract must throw ArgumentNullException for a null stream.
/// </summary>
public void TestExtractMethodWithUndefinedStream()
{
    // Typed local so the Stream overload is selected, as with the original cast.
    Stream undefinedStream = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(undefinedStream));
}
/// <summary>
/// Reads "Invoice.pdf" and prints the invoice number, invoice date, total and
/// the table data (as XML) located by searching for fixed keywords on each page.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Results
    string invoiceNo = string.Empty;
    string invoiceDate = string.Empty;
    string total = string.Empty;
    string tableData = string.Empty;

    // Both extractors are released by the using blocks, also when a call throws.
    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
    using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
    {
        // Set exact search (default is SmartSearch that works like in Adobe Reader)
        textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;

        // Load document
        textExtractor.LoadDocumentFromFile("Invoice.pdf");
        xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

        // Iterate pages
        for (int i = 0; i < textExtractor.GetPageCount(); i++)
        {
            RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
            RectangleF tableRect = new RectangleF(0, 0, pageRectangle.Width, 0);

            // Search for "Invoice No."
            if (textExtractor.Find(i, "Invoice No.", false))
            {
                // Get the found text rectangle
                RectangleF textRect = textExtractor.FoundText.Bounds;

                // Assume the text at right is the invoice number.
                // Shift the rectangle to the right:
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;

                // Set the extraction region and extract the text
                textExtractor.SetExtractionArea(textRect);
                invoiceNo = textExtractor.GetTextFromPage(i).Trim();
            }

            // Search for "Invoice Date" and extract text at right
            if (textExtractor.Find(i, "Invoice Date", false))
            {
                RectangleF textRect = textExtractor.FoundText.Bounds;
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;
                textExtractor.SetExtractionArea(textRect);
                invoiceDate = textExtractor.GetTextFromPage(i).Trim();
            }

            // Search for "Quantity" keyword to detect the top of the tabular data rectangle
            if (textExtractor.Find(i, "Quantity", false))
            {
                // Keep the top table coordinate
                // (use the found text's Bottom instead if you want to skip column headers)
                tableRect.Y = textExtractor.FoundText.Bounds.Top;
            }

            // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
            if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
            {
                RectangleF textRect = textExtractor.FoundText.Bounds;
                textRect.X = textRect.Right;
                textRect.Width = pageRectangle.Right - textRect.Left;
                textExtractor.SetExtractionArea(textRect);
                total = textExtractor.GetTextFromPage(i).Trim();

                // Calculate the table height
                tableRect.Height = textRect.Top - tableRect.Top;
            }

            // Extract tabular data using XMLExtractor
            if (tableRect.Height > 0)
            {
                xmlExtractor.SetExtractionArea(tableRect);
                tableData = xmlExtractor.GetXMLFromPage(i);
            }
        }
    }

    // Display extracted data
    Console.WriteLine("Invoice No.: " + invoiceNo);
    Console.WriteLine("Invoice Date: " + invoiceDate);
    Console.WriteLine("TOTAL: " + total);
    Console.WriteLine("Table Data: ");
    Console.WriteLine(tableData);

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
/// <summary>
/// IsValidFileType must return false for the file Documents\Invalid.docx.
/// </summary>
public void TestIsValidFileTypeMethodWithInvalidFile()
{
    const string invalidDocumentPath = @"Documents\Invalid.docx";

    bool isValid = TextExtractor.IsValidFileType(invalidDocumentPath);

    Assert.IsFalse(isValid);
}
/// <summary>
/// Asks for a PDF file and, page by page, prints every link annotation: the
/// page text lying under the link box, the box position and the link target.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string pdfPath = PromptFileChoice("Please select a PDF file");
    using(files::File pdfFile = new files::File(pdfPath))
    {
        Document document = pdfFile.Document;

        // 2. Link extraction from the document pages.
        TextExtractor textExtractor = new TextExtractor();
        textExtractor.AreaTolerance = 2; // 2 pt tolerance on area boundary detection.

        bool linkFound = false;
        foreach(Page page in document.Pages)
        {
            if(!PromptNextPage(page, !linkFound))
            {
                Quit();
                break;
            }

            // Page text is extracted lazily, only once the first link shows up.
            IDictionary<RectangleF?,IList<ITextString>> pageText = null;
            linkFound = false;

            // Get the page annotations!
            PageAnnotations annotations = page.Annotations;
            if(!annotations.Exists())
            {
                Console.WriteLine("No annotations here.");
                continue;
            }

            // Iterating through the page annotations looking for links...
            foreach(Annotation annotation in annotations)
            {
                Link link = annotation as Link;
                if(link == null)
                {
                    continue;
                }

                linkFound = true;
                if(pageText == null)
                {
                    pageText = textExtractor.Extract(page);
                }

                RectangleF linkBox = link.Box;

                // Text.
                /*
                  Extracting text superimposed by the link...
                  NOTE: As links have no strong relation to page text but a weak location
                  correspondence, we have to filter extracted text by link area.
                */
                StringBuilder linkText = new StringBuilder();
                foreach(ITextString coveredString in textExtractor.Filter(pageText, linkBox))
                {
                    linkText.Append(coveredString.Text);
                }
                Console.WriteLine("Link '" + linkText + "' ");

                // Position.
                string position = string.Format(
                    "x:{0},y:{1},w:{2},h:{3}",
                    Math.Round(linkBox.X),
                    Math.Round(linkBox.Y),
                    Math.Round(linkBox.Width),
                    Math.Round(linkBox.Height));
                Console.WriteLine(" Position: " + position);

                // Target.
                Console.Write(" Target: ");
                PdfObjectWrapper target = link.Target;
                if(target == null)
                {
                    Console.WriteLine("[not available]");
                }
                else if(target is Destination)
                {
                    PrintDestination((Destination)target);
                }
                else if(target is actions::Action)
                {
                    PrintAction((actions::Action)target);
                }
                else
                {
                    Console.WriteLine("[unknown type: " + target.GetType().Name + "]");
                }
            }

            if(!linkFound)
            {
                Console.WriteLine("No links here.");
            }
        }
    }
}
/// <summary>
/// IsValidFileType must return true for the file Documents\Valid.docx.
/// </summary>
public void TestIsValidFileTypeMethodWithValidFile()
{
    const string validDocumentPath = @"Documents\Valid.docx";

    bool isValid = TextExtractor.IsValidFileType(validDocumentPath);

    Assert.IsTrue(isValid);
}
/// <summary>
/// Handles the input button: loads chapters either from a single file
/// (IFO, MPLS or text) or from a DVD / Blu-ray folder chosen by the user,
/// refreshes the chapter view and selects the first chapter.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btInput_Click(object sender, EventArgs e)
{
    if (rbFromFile.Checked)
    {
        openFileDialog.Filter = "IFO Files (*.ifo)|*.ifo|MPLS Files (*.mpls)|*.mpls|Text Files (*.txt)|*.txt|All Files supported (*.ifo,*.mpls,*.txt)|*.ifo;*.mpls;*.txt";
        openFileDialog.FilterIndex = 4;
        if (this.openFileDialog.ShowDialog() == DialogResult.OK)
        {
            input.Text = openFileDialog.FileName;
            string fileName = input.Text;

            // Ordinal, case-insensitive suffix test: ToLower() is culture-sensitive
            // and fails for "IFO" under e.g. a Turkish UI culture.
            ChapterExtractor extractor;
            if (fileName.EndsWith("ifo", StringComparison.OrdinalIgnoreCase))
            {
                extractor = new IfoExtractor();
            }
            else if (fileName.EndsWith("mpls", StringComparison.OrdinalIgnoreCase))
            {
                extractor = new MplsExtractor();
            }
            else
            {
                extractor = new TextExtractor();
            }

            try
            {
                // The first stream found in the file becomes the current program chain.
                pgc = extractor.GetStreams(fileName)[0];
                FreshChapterView();
                updateTimeLine();
            }
            catch (Exception error)
            {
                // Report unreadable or empty files the same way the folder branch does.
                MessageBox.Show(error.Message);
            }
        }
    }
    else
    {
        using (FolderBrowserDialog d = new FolderBrowserDialog())
        {
            d.ShowNewFolderButton = false;
            d.Description = "Select DVD, BluRay disc, or folder.";
            if (d.ShowDialog() == DialogResult.OK)
            {
                input.Text = d.SelectedPath;
                try
                {
                    // Detect the disc type from its well-known folder layout.
                    ChapterExtractor extractor = null;
                    if (Directory.Exists(Path.Combine(input.Text, "VIDEO_TS")))
                    {
                        extractor = new DvdExtractor();
                    }
                    else if (Directory.Exists(Path.Combine(Path.Combine(input.Text, "BDMV"), "PLAYLIST")))
                    {
                        extractor = new BlurayExtractor();
                    }

                    if (extractor == null)
                    {
                        MessageBox.Show("The location was not detected as DVD, or Blu-Ray.");
                    }
                    else
                    {
                        using (frmStreamSelect frm = new frmStreamSelect(extractor))
                        {
                            frm.Text = extractor is DvdExtractor
                                ? "Select your PGC"
                                : "Select your Playlist";

                            // Return value intentionally unused here, as in the original code.
                            extractor.GetStreams(input.Text);
                            if (frm.ShowDialog(this) == DialogResult.OK)
                            {
                                pgc = frm.ProgramChain;
                                if (pgc.FramesPerSecond == 0)
                                {
                                    pgc.FramesPerSecond = 25.0;
                                }
                                if (pgc.LangCode == null)
                                {
                                    pgc.LangCode = "und";
                                }
                            }
                        }

                        FreshChapterView();
                        updateTimeLine();
                    }
                }
                catch (Exception error)
                {
                    MessageBox.Show(error.Message);
                }
            }
        }
    }

    if (chapterListView.Items.Count != 0)
    {
        chapterListView.Items[0].Selected = true;
    }
}
/// <summary>
/// Extract must throw ArgumentNullException for a null byte array.
/// </summary>
public void TestExtractMethodWithUndefinedBytes()
{
    // Typed local so the byte[] overload is selected, as with the original cast.
    byte[] undefinedBytes = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(undefinedBytes));
}
/// <summary>
/// Assigns a fresh <c>TextExtractor</c> to the <c>_cut</c> field.
/// </summary>
public virtual void SetUp()
{
    TextExtractor freshExtractor = new TextExtractor();
    _cut = freshExtractor;
}
/// <summary>
/// Extract must throw NotSupportedException for an empty memory stream.
/// </summary>
public void TestExtractMethodWithMissingStream()
{
    MemoryStream emptyStream = new MemoryStream();

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(emptyStream));
}
/// <summary>
/// Lets the user pick a text or Word file, loads its content into
/// <c>MyText</c>, normalises tabs and line breaks, and splits the text into
/// <c>words</c> using the current separator character <c>mychar</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click(object sender, EventArgs e)
{
    timer1.Stop();
    button5.Text = "P L A Y";

    // The dialog is released by the using block once the handler finishes.
    using (OpenFileDialog dlg = new OpenFileDialog())
    {
        dlg.Filter = "Text files(*.txt)|*.txt|Doc files(*.doc)|*.doc|Docx files(*.docx)|*.docx|All files(*.*)|*.*";
        if (dlg.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            string ext = System.IO.Path.GetExtension(dlg.FileName);

            // Case-insensitive so that ".DOC" / ".DOCX" also go through the Word reader
            // instead of being read as plain text.
            bool isWordDocument =
                string.Equals(ext, ".doc", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".docx", StringComparison.OrdinalIgnoreCase);

            if (isWordDocument)
            {
                Code7248.word_reader.TextExtractor extractor = new TextExtractor(dlg.FileName);
                MyText.Text = extractor.ExtractText();
            }
            else
            {
                MyText.Text = File.ReadAllText(dlg.FileName);
            }

            // Normalise whitespace: tabs become spaces, line breaks become "\n".
            MyText.Text = MyText.Text.Replace("\t", " ");
            MyText.Text = MyText.Text.Replace(Environment.NewLine, "\n");
            if (mychar == '\n')
            {
                MyText.Text = MyText.Text.Replace("\n", " ");
            }

            string nolstrTextine = MyText.Text.Replace("\n", System.Convert.ToString(mychar));
            i = 0;
            if (mychar == '\n')
            {
                // In line mode every sentence end starts a new chunk.
                nolstrTextine = nolstrTextine.Replace(".", ". " + mychar);
            }

            words = nolstrTextine.Split(new char[] { mychar }, StringSplitOptions.RemoveEmptyEntries);
            progressBar1.Maximum = words.Length;
            timer1.Stop();
            button5.Text = "P L A Y";
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message);
        }
    }
}
/// <summary>
/// Rebuilds "hindi_text_with_image.pdf" as an image page with the original
/// text drawn invisibly on top, makes the result searchable through OCR with
/// the Hindi language data, and opens the searchable PDF.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        // Files
        string fileName = "hindi_text_with_image.pdf";
        string destFileName = "output_hindi_text_with_image.pdf";
        string searchableDestFileName = "output_hindi_text_with_image_searchable.pdf";

        // Read all text from pdf file
        string allTextExtracted = "";
        using (TextExtractor extractor = new TextExtractor())
        {
            // Load PDF document
            extractor.LoadDocumentFromFile(fileName);

            // Read all text directly
            allTextExtracted = extractor.GetText();
        }

        // Get image from pdf file. The stream stays open for as long as the
        // System.Drawing.Image created from it is in use, then is disposed.
        using (MemoryStream memoryStream = new MemoryStream())
        {
            using (ImageExtractor extractor = new ImageExtractor())
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(fileName);

                if (!extractor.GetFirstImage())
                {
                    // Fail with a clear message instead of letting Image.FromStream
                    // reject an empty stream further down.
                    throw new InvalidOperationException("No image found in " + fileName);
                }

                extractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);
            }

            // Rewind so the image is decoded from the start of the written data.
            memoryStream.Position = 0;

            // Load image to System.Drawing.Image object (we need it to get the image resolution)
            using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
            {
                // Compute image size in PDF units (Points)
                float widthInPoints = sysImage.Width / sysImage.HorizontalResolution * 72f;
                float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                // Create new PDF document
                using (Document outPdfDocument = new Document())
                {
                    outPdfDocument.RegistrationName = "demo";
                    outPdfDocument.RegistrationKey = "demo";

                    // Create page of computed size
                    Page page = new Page(widthInPoints, heightInPoints);

                    // Add page to the document
                    outPdfDocument.Pages.Add(page);

                    Canvas canvas = page.Canvas;

                    // Create Bytescout.PDF.Image object from loaded image
                    Image pdfImage = new Image(sysImage);

                    // Draw the image
                    canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                    // Create brush
                    SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                    // ... and make it transparent
                    transparentBrush.Opacity = 0;

                    // Draw text with transparent brush
                    // Need to set Font which supports hindi characters.
                    Font font16 = new Font("Arial Unicode MS", 16);
                    canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                    // Save document to file
                    outPdfDocument.Save(destFileName);
                }
            }
        }

        // Make PDF file with hindi text searchable to OCR.
        using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
        {
            // Load PDF document
            searchablePDFMaker.LoadDocumentFromFile(destFileName);

            // Set the location of "tessdata" folder containing language data files
            /*
             * The following files are used for hindi language support and need to be
             * placed into the "tessdata" folder. They are available at:
             * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
             *   hin.traineddata
             *   hin.cube.bigrams
             *   hin.cube.lm
             *   hin.cube.nn
             *   hin.cube.params
             *   hin.cube.word-freq
             *   hin.tesseract_cube.nn
             */
            searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

            // Set OCR language
            searchablePDFMaker.OCRLanguage = "hin";

            // Need to set Font which supports hindi characters
            searchablePDFMaker.LabelingFont = "Arial Unicode MS";

            // Set PDF document rendering resolution
            searchablePDFMaker.OCRResolution = 300;

            searchablePDFMaker.MakePDFSearchable(searchableDestFileName);
        }

        // Open document in default PDF viewer app
        Process.Start(searchableDestFileName);
    }
    catch (Exception ex)
    {
        Console.WriteLine("ERROR:" + ex.Message);
    }

    Console.ReadLine();
}