TextExtractor C# (CSharp) Code Examples

Example #1

1

Show file

File: AdvancedTextExtractionSample.cs Project: n9/pdfclown

        /// <summary>
        /// Lets the user choose a PDF file and prints, page by page, every text string
        /// stored under the default extraction area together with its rounded box
        /// (x, y, width, height). The page loop ends as soon as
        /// <c>PromptNextPage</c> returns false.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Opening the PDF file...
            string pdfPath = PromptFileChoice("Please select a PDF file");
            using(File pdfFile = new File(pdfPath))
            {
                Document document = pdfFile.Document;

                // 2. Text extraction from the document pages.
                TextExtractor textExtractor = new TextExtractor();
                foreach(Page currentPage in document.Pages)
                {
                    // PromptNextPage returned false: quit and leave the page loop.
                    if(!PromptNextPage(currentPage, false))
                    {
                        Quit();
                        break;
                    }

                    // Only the strings filed under the default area key are listed.
                    foreach(ITextString extracted in textExtractor.Extract(currentPage)[TextExtractor.DefaultArea])
                    {
                        RectangleF box = extracted.Box.Value;
                        Console.WriteLine(
                            "Text [x:{0},y:{1},w:{2},h:{3}]: {4}",
                            Math.Round(box.X),
                            Math.Round(box.Y),
                            Math.Round(box.Width),
                            Math.Round(box.Height),
                            extracted.Text
                            );
                    }
                }
            }
        }

Example #2

1

Show file

File: TextHighlightSample.cs Project: n9/pdfclown

        /// <summary>
        /// Lets the user choose a PDF file and a regular expression (matched ignoring case),
        /// matches the expression against the extracted text of every page, passes the
        /// matches to a <c>TextHighlighter</c> through <c>TextExtractor.Filter</c>, and
        /// finally serializes the file.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Opening the PDF file...
            string pdfPath = PromptFileChoice("Please select a PDF file");
            using(File pdfFile = new File(pdfPath))
            {
                // Define the text pattern to look for!
                string patternText = PromptChoice("Please enter the pattern to look for: ");
                Regex searchPattern = new Regex(patternText, RegexOptions.IgnoreCase);

                // 2. Iterating through the document pages...
                TextExtractor extractor = new TextExtractor(true, true);
                foreach(Page currentPage in pdfFile.Document.Pages)
                {
                    Console.WriteLine("\nScanning page {0}...\n", currentPage.Index + 1);

                    // 2.1. Extract the page text!
                    IDictionary<RectangleF?,IList<ITextString>> pageText = extractor.Extract(currentPage);

                    // 2.2. Find the text pattern matches!
                    string flattenedText = TextExtractor.ToString(pageText);
                    MatchCollection found = searchPattern.Matches(flattenedText);

                    // 2.3. Highlight the text pattern matches!
                    TextHighlighter highlighter = new TextHighlighter(currentPage, found);
                    extractor.Filter(pageText, highlighter);
                }

                // 3. Highlighted file serialization.
                Serialize(pdfFile);
            }
        }

Example #3

0

Show file

File: CardCriteriaRepository.cs Project: antgerasim/ArmProtoClean

        /// <summary>
        /// Maps every row of the first table of the data set to a <see cref="CardCriterion"/>.
        /// </summary>
        /// <returns>
        /// One criterion per row, in row order; an empty list when the data set has no table.
        /// </returns>
        public List<CardCriterion> GetAll()
        {
            var Ds = new DataSet(); //dummy var to not blow the method

            // A DataSet without tables cannot be indexed with Tables[0] (it throws
            // IndexOutOfRangeException), so treat that case as "nothing to map".
            if (Ds.Tables.Count == 0)
            {
                return new List<CardCriterion>();
            }

            DataRowCollection rows = Ds.Tables[0].Rows;

            //Map all database rows to data models and collect them in a list.
            List<CardCriterion> cardCriteria = new List<CardCriterion>(rows.Count);

            foreach (DataRow row in rows)
            {
                var criterion = new CardCriterion();
                object[] values = row.ItemArray;

                // Column 7 is cast to byte[] and handed to the text extractor.
                var textExtractionResult = new TextExtractor().Extract((byte[]) values[7]);

                criterion.EditionId = Convert.ToUInt32(values[0]);
                criterion.Territory = Convert.ToString(values[1]);
                criterion.CardKind = Convert.ToString(values[2]);
                criterion.CardAdoptionKindSubject = Convert.ToString(values[3]);
                criterion.CardAdoptionSubject = Convert.ToString(values[4]);
                criterion.CardName = Convert.ToString(values[5]);
                criterion.CardAdoptionDate = Convert.ToDateTime(values[6]);
                criterion.CardEdition = textExtractionResult.Text;
                criterion.ContentMetadataDict = textExtractionResult.Metadata;
                criterion.CardAdoptionNumber = Convert.ToString(values[8]);

                cardCriteria.Add(criterion);

                // NOTE(review): presumably progress reporting (see link) - confirm against Function.
                Function(1, rows.Count);
                //http://stackoverflow.com/questions/6471378/implementing-a-progress-bar-to-show-work-being-done

            }
            return cardCriteria;
            //http://codereview.stackexchange.com/questions/30714/faster-way-to-convert-datatable-to-list-of-class
        }

Example #4

0

Show file

        // Counts word occurrences in an uploaded document and returns, as JSON, a
        // "Top words:" header followed by up to ten "word : count" lines for the most
        // frequent words longer than five characters. When the format is not supported
        // only that message is returned; when an exception occurs its message is appended.
        //
        // fileName: name of the file, appended to the "../App_Data//Uploads//" path.
        public ActionResult CountStatistics([FromBody] string fileName)
        {
            // Only words strictly longer than this many characters are counted.
            const int maxWordLength = 5;

            List <string> extractedText = new List <string>();

            // NOTE(review): fileName comes from the request body and is concatenated into
            // the path unchecked - confirm that callers cannot pass "..\" segments.
            string filePath = Server.MapPath("../App_Data//Uploads//" + fileName);

            try
            {
                ExtractorFactory         factory   = new ExtractorFactory();
                Dictionary <string, int> statistic = new Dictionary <string, int>();

                // using releases the extractor on every path and tolerates a null extractor.
                using (TextExtractor extractor = factory.CreateTextExtractor(filePath))
                {
                    if (extractor == null)
                    {
                        // Stop here: there is nothing to read, and continuing would only
                        // append a NullReferenceException message to the result.
                        extractedText.Add("The document's format is not supported");
                        return(Json(extractedText, JsonRequestBehavior.AllowGet));
                    }

                    string line;
                    while ((line = extractor.ExtractLine()) != null)
                    {
                        foreach (string w in line.Split(' ', ',', ';', '.'))
                        {
                            string word = w.Trim().ToLower();
                            if (word.Length > maxWordLength)
                            {
                                // TryGetValue leaves 0 in seen for a word not counted yet.
                                int seen;
                                statistic.TryGetValue(word, out seen);
                                statistic[word] = seen + 1;
                            }
                        }
                    }
                }

                extractedText.Add("Top words:");

                // Repeatedly pick the most frequent remaining word and remove it.
                for (int i = 0; i < 10; i++)
                {
                    int    count  = -1;
                    string maxKey = null;
                    foreach (KeyValuePair <string, int> entry in statistic)
                    {
                        if (entry.Value > count)
                        {
                            count  = entry.Value;
                            maxKey = entry.Key;
                        }
                    }

                    // Fewer than ten distinct words were found.
                    if (maxKey == null)
                    {
                        break;
                    }

                    extractedText.Add(maxKey + " : " + count);
                    statistic.Remove(maxKey);
                }
            }
            catch (Exception ex)
            {
                extractedText.Add(ex.Message);
            }
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }

Example #5

0

Show file

File: Classifier.cs Project: nieznanysprawiciel/PDFAnal

        /// <summary>
        /// Extracts the text of every page of the document.
        /// </summary>
        /// <param name="document">Document whose pages are read.</param>
        /// <returns>
        /// One string per page, in page order. When extraction fails, the pages read
        /// before the failure are still returned and the failure is logged.
        /// </returns>
        private List<string> ExtractPageList(Document document)
        {
            //  extract page list
            List<string> pageList = new List<string>();
            TextExtractor textExtractor = new TextExtractor();
            try {
                foreach (var page in document.Pages)
                {
                    var textStrings = textExtractor.Extract(page);
                    string pageContent = TextExtractor.ToString(textStrings);
                    pageList.Add(pageContent);
                }
            }
            catch (Exception e)
            {
                // Best effort: keep what was extracted so far, but record why extraction
                // stopped instead of logging a bare marker.
                Utility.Log("Blad: " + e.Message);
            }
            return pageList;
        }

Example #6

0

Show file

File: SqlManager.cs Project: antgerasim/ArmProtoClean

        /// <summary>
        /// Builds a <see cref="CardCriterion"/> from one data record. Column 5 is cast to
        /// <c>byte[]</c> and passed through <c>TextExtractor</c>; its text and metadata fill
        /// <c>CardEdition</c> and <c>ContentMetadataDict</c>. The other columns are converted
        /// with <see cref="Convert"/>.
        /// </summary>
        /// <param name="record">Record providing columns 0 to 10.</param>
        /// <returns>The populated criterion.</returns>
        private static CardCriterion _extractRowToCriterion(IDataRecord record)
        {
            // Column positions within the record.
            const int editionIdColumn = 0;
            const int cardIdColumn = 1;
            const int territoryColumn = 2;
            const int cardKindColumn = 3;
            const int cardNameColumn = 4;
            const int contentColumn = 5;
            const int fileExtColumn = 6;
            const int adoptionKindSubjectColumn = 7;
            const int adoptionSubjectColumn = 8;
            const int adoptionDateColumn = 9;
            const int adoptionNumberColumn = 10;

            var mapped = new CardCriterion();
            var extracted = new TextExtractor().Extract((byte[]) record[contentColumn]);

            mapped.EditionId = Convert.ToUInt32(record[editionIdColumn]);
            mapped.CardId = Convert.ToUInt32(record[cardIdColumn]);
            mapped.Territory = Convert.ToString(record[territoryColumn]);
            mapped.CardKind = Convert.ToString(record[cardKindColumn]);
            mapped.CardName = Convert.ToString(record[cardNameColumn]);
            mapped.CardEdition = extracted.Text;
            mapped.ContentMetadataDict = extracted.Metadata;
            mapped.FileExt = Convert.ToString(record[fileExtColumn]);
            mapped.CardAdoptionKindSubject = Convert.ToString(record[adoptionKindSubjectColumn]);
            mapped.CardAdoptionSubject = Convert.ToString(record[adoptionSubjectColumn]);
            mapped.CardAdoptionDate = Convert.ToDateTime(record[adoptionDateColumn]);
            mapped.CardAdoptionNumber = Convert.ToString(record[adoptionNumberColumn]);

            return mapped;
        }

Example #7

0

Show file

File: Program.cs Project: bytescout/data-extraction-suite-samples-c-sharp

        /// <summary>
        /// Loads "sample_ocr.pdf", extracts its text with OCR in automatic mode, saves the
        /// text to "output.txt" and opens that file with the shell.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even when loading, OCR or saving throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Add Contrast
                //extractor.OCRImagePreprocessingFilters.AddContrast(20);


                // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                // filters for your specific document.
                // See "OCR Analyser" example.


                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

Example #8

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // Extract on Documents\Invalid.docx is expected to throw NotSupportedException.
 public void TestExtractMethodWithInvalidFile()
 {
     const string invalidDocument = @"Documents\Invalid.docx";

     Assert.ThrowsException <NotSupportedException>(() => TextExtractor.Extract(invalidDocument));
 }

Example #9

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to return false for an empty stream.
 public void TestIsValidFileTypeMethodWithMissingStream()
 {
     // Dispose the stream deterministically instead of leaving it to the garbage collector.
     using (var emptyStream = new MemoryStream())
     {
         Assert.IsFalse(TextExtractor.IsValidFileType(emptyStream));
     }
 }

Example #10

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to throw ArgumentNullException for a null byte array.
 public void TestIsValidFileTypeMethodWithUndefinedBytes()
 {
     // The typed local selects the byte[] overload.
     byte[] undefinedBytes = null;

     Assert.ThrowsException <ArgumentNullException>(() => TextExtractor.IsValidFileType(undefinedBytes));
 }

Example #11

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // Extract is expected to throw ArgumentNullException for a null file path.
 public void TestExtractMethodWithUndefinedFile()
 {
     // The typed local selects the string overload.
     string undefinedPath = null;

     Assert.ThrowsException <ArgumentNullException>(() => TextExtractor.Extract(undefinedPath));
 }

Example #12

0

Show file

File: Documents.cs Project: sycomix/OLAF

 /// <summary>
 /// Assigns a new <c>TextExtractor</c> to <c>Extractor</c> and returns the result of
 /// <c>SetInitializedStatusAndReturnSucces</c>.
 /// </summary>
 /// <returns>The value produced by <c>SetInitializedStatusAndReturnSucces</c>.</returns>
 public override ApiResult Init()
 {
     Extractor = new TextExtractor();

     var initializationResult = SetInitializedStatusAndReturnSucces();
     return initializationResult;
 }

Example #13

0

Show file

File: WordStatistic.cs Project: usman-aziz/GroupDocs.Parser-for-.NET

        /// <summary>
        /// Reads the document line by line, counts every lower-cased word that is longer
        /// than <paramref name="maxWordLength"/>, and writes up to ten of the most frequent
        /// words to the console. Writes a message and returns when no extractor could be
        /// created for the file.
        /// </summary>
        /// <param name="fileName">Path of the document to analyse.</param>
        /// <param name="maxWordLength">Words of this length or shorter are ignored.</param>
        public WordStatistic(string fileName, int maxWordLength)
        {
            //ExStart:WordStatistic
            ExtractorFactory         extractorFactory = new ExtractorFactory();
            Dictionary <string, int> occurrences      = new Dictionary <string, int>();

            // using disposes the extractor when the block is left and skips a null one.
            using (TextExtractor documentExtractor = extractorFactory.CreateTextExtractor(fileName))
            {
                if (documentExtractor == null)
                {
                    Console.WriteLine("The document's format is not supported");
                    return;
                }

                string currentLine;
                while ((currentLine = documentExtractor.ExtractLine()) != null)
                {
                    foreach (string token in currentLine.Split(' ', ',', ';', '.'))
                    {
                        string normalized = token.Trim().ToLower();
                        if (normalized.Length <= maxWordLength)
                        {
                            continue;
                        }

                        // TryGetValue leaves 0 in seen for a word not counted yet.
                        int seen;
                        occurrences.TryGetValue(normalized, out seen);
                        occurrences[normalized] = seen + 1;
                    }
                }
            }

            Console.WriteLine("Top words:");

            // Each round prints the most frequent remaining word and removes it.
            for (int rank = 0; rank < 10; rank++)
            {
                string topWord  = null;
                int    topCount = -1;
                foreach (KeyValuePair <string, int> entry in occurrences)
                {
                    if (entry.Value > topCount)
                    {
                        topCount = entry.Value;
                        topWord  = entry.Key;
                    }
                }

                // No words are left.
                if (topWord == null)
                {
                    break;
                }

                Console.WriteLine("{0}: {1}", topWord, topCount);
                occurrences.Remove(topWord);
            }
            //ExEnd:WordStatistic
        }

Example #14

0

Show file

        // Extracts the text lines of an uploaded document and returns them as JSON.
        //
        // fileName: name of the file, appended to the "../App_Data//Uploads//" path.
        // password: optional password; when given, ".one" files are opened with
        //           NoteTextExtractor and all other files with WordsTextExtractor.
        //
        // Without a password, ".zip" files are read entry by entry and every other file
        // through the extractor created by ExtractorFactory. The message of any exception
        // that escapes extraction is appended to the returned list.
        public ActionResult ExtractText([FromBody] string fileName, string password = null)
        {
            //ExStart:ExtractText
            ExtractorFactory factory       = new ExtractorFactory();
            // NOTE(review): fileName comes from the request body and is concatenated into
            // the path unchecked - confirm that callers cannot pass "..\" segments.
            string           path          = Server.MapPath("../App_Data//Uploads//" + fileName);
            string           ext           = Path.GetExtension(path);
            List <string>    extractedText = new List <string>();

            try
            {
                string line;
                //If file is password protected
                if (!string.IsNullOrWhiteSpace(password))
                {
                    LoadOptions loadOptions = new LoadOptions();
                    loadOptions.Password = password;

                    if (ext == ".one")
                    {
                        using (var extractor = new NoteTextExtractor(path, loadOptions))
                        {
                            while ((line = extractor.ExtractLine()) != null)
                            {
                                extractedText.Add(line);
                            }
                        }
                    }
                    else
                    {
                        // Disposed like the other extractors; it was previously leaked.
                        using (var protectedDocument = new WordsTextExtractor(path, loadOptions))
                        {
                            while ((line = protectedDocument.ExtractLine()) != null)
                            {
                                extractedText.Add(line);
                            }
                        }
                    }
                }
                //if file type is zip
                else if (ext == ".zip")
                {
                    using (var container = new ZipContainer(path))
                    {
                        for (int i = 0; i < container.Entities.Count; i++)
                        {
                            using (TextExtractor extractor = factory.CreateTextExtractor(container.Entities[i].OpenStream()))
                            {
                                while ((line = extractor.ExtractLine()) != null)
                                {
                                    extractedText.Add(line);
                                }
                            }
                        }
                    }
                }
                else
                {
                    // using releases the extractor on every path and tolerates a null extractor.
                    using (TextExtractor extractor = factory.CreateTextExtractor(path))
                    {
                        if (extractor == null)
                        {
                            extractedText.Add("The document's format is not supported");
                        }
                        else
                        {
                            try
                            {
                                while ((line = extractor.ExtractLine()) != null)
                                {
                                    extractedText.Add(line);
                                }
                            }
                            catch (Exception)
                            {
                                // A failed read must end the loop: retrying with a stale
                                // line would repeat the previous line without end.
                                if (ext != ".one")
                                {
                                    throw;
                                }

                                extractedText.Add("Invalid password");
                            }
                        }
                    }
                }

                //extractedText.Add(extractor.ExtractAll());
            }
            catch (Exception ex)
            {
                extractedText.Add(ex.Message);
            }
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }

Example #15

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // Extract is expected to throw NotSupportedException for an empty byte array.
 public void TestExtractMethodWithMissingBytes()
 {
     byte[] noBytes = new byte[0];

     Assert.ThrowsException <NotSupportedException>(() => TextExtractor.Extract(noBytes));
 }

Example #16

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to return false for an empty byte array.
 public void TestIsValidFileTypeMethodWithMissingBytes()
 {
     byte[] noBytes = new byte[0];

     bool recognized = TextExtractor.IsValidFileType(noBytes);

     Assert.IsFalse(recognized);
 }

Example #17

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // Extract is expected to throw FileNotFoundException for an empty file path.
 public void TestExtractMethodWithMissingFile()
 {
     string missingPath = string.Empty;

     Assert.ThrowsException <FileNotFoundException>(() => TextExtractor.Extract(missingPath));
 }

Example #18

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to throw FileNotFoundException for an empty file path.
 public void TestIsValidFileTypeMethodWithMissingFile()
 {
     string missingPath = string.Empty;

     Assert.ThrowsException <FileNotFoundException>(() => TextExtractor.IsValidFileType(missingPath));
 }

Example #19

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

        // The text extracted from Documents\ValidWithWhitespacePreserve.docx must equal
        // "This is a Word document.".
        public void TestExtractMethodWithValidFileWithWhitespacePreserveFile()
        {
            var text = TextExtractor.Extract(@"Documents\ValidWithWhitespacePreserve.docx");

            // AreEqual reports the expected and the actual text when the test fails,
            // which IsTrue(text == ...) cannot do.
            Assert.AreEqual("This is a Word document.", text);
        }

Example #20

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // Extract is expected to throw ArgumentNullException for a null stream.
 public void TestExtractMethodWithUndefinedStream()
 {
     // The typed local selects the Stream overload.
     Stream undefinedStream = null;

     Assert.ThrowsException <ArgumentNullException>(() => TextExtractor.Extract(undefinedStream));
 }

Example #21

0

Show file

        /// <summary>
        /// Reads "Invoice.pdf" and prints the invoice number, invoice date, total and the
        /// table data (as XML) found on its pages, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Results
            string invoiceNo   = string.Empty;
            string invoiceDate = string.Empty;
            string total       = string.Empty;
            string tableData   = string.Empty;

            // Create TextExtractor and XMLExtractor instances.
            // The using blocks release both extractors even when extraction throws.
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            using (XMLExtractor xmlExtractor = new XMLExtractor("demo", "demo"))
            {
                textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch;             // Set exact search (default is SmartSearch that works like in Adobe Reader)

                // Load document
                textExtractor.LoadDocumentFromFile("Invoice.pdf");
                xmlExtractor.LoadDocumentFromFile("Invoice.pdf");

                // Iterate pages
                for (int i = 0; i < textExtractor.GetPageCount(); i++)
                {
                    RectangleF pageRectangle = textExtractor.GetPageRectangle(i);
                    RectangleF tableRect     = new RectangleF(0, 0, pageRectangle.Width, 0);

                    // Search for "Invoice No."
                    if (textExtractor.Find(i, "Invoice No.", false))
                    {
                        // Get the found text rectangle
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        // Assume the text at right is the invoice number.
                        // Shift the rectangle to the right:
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        // Set the extraction region and extract the text
                        textExtractor.SetExtractionArea(textRect);
                        invoiceNo = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Invoice Date" and extract text at right
                    if (textExtractor.Find(i, "Invoice Date", false))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        invoiceDate = textExtractor.GetTextFromPage(i).Trim();
                    }

                    // Search for "Quantity" keyword to detect the top of the tabular data rectangle
                    if (textExtractor.Find(i, "Quantity", false))
                    {
                        // Keep the top table coordinate
                        tableRect.Y = textExtractor.FoundText.Bounds.Top;                     // use textRect.Bottom if you want to skip column headers
                    }

                    // Search for "TOTAL" (it will be also the bottom of tabular data rectangle)
                    if (textExtractor.Find(i, "TOTAL", true /* case sensitive! */))
                    {
                        RectangleF textRect = textExtractor.FoundText.Bounds;
                        textRect.X     = textRect.Right;
                        textRect.Width = pageRectangle.Right - textRect.Left;
                        textExtractor.SetExtractionArea(textRect);
                        total = textExtractor.GetTextFromPage(i).Trim();

                        // Calculate the table height
                        tableRect.Height = textRect.Top - tableRect.Top;
                    }

                    // Extract tabular data using XMLExtractor
                    if (tableRect.Height > 0)
                    {
                        xmlExtractor.SetExtractionArea(tableRect);
                        tableData = xmlExtractor.GetXMLFromPage(i);
                    }
                }
            }

            // Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo);
            Console.WriteLine("Invoice Date: " + invoiceDate);
            Console.WriteLine("TOTAL: " + total);
            Console.WriteLine("Table Data: ");
            Console.WriteLine(tableData);

            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }

Example #22

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to return false for Documents\Invalid.docx.
 public void TestIsValidFileTypeMethodWithInvalidFile()
 {
     const string invalidDocument = @"Documents\Invalid.docx";

     bool recognized = TextExtractor.IsValidFileType(invalidDocument);

     Assert.IsFalse(recognized);
 }

Example #23

0

Show file

File: LinkParsingSample.cs Project: n9/pdfclown

        /// <summary>
        /// Lets the user choose a PDF file and, page by page, prints every link annotation:
        /// the page text lying within the link box, the rounded link box, and the link
        /// target (destination, action, not available, or unknown type). Pages without
        /// annotations or without links are reported as such. The page loop ends as soon
        /// as <c>PromptNextPage</c> returns false.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Opening the PDF file...
            string pdfPath = PromptFileChoice("Please select a PDF file");
            using(files::File pdfFile = new files::File(pdfPath))
            {
                Document document = pdfFile.Document;

                // 2. Link extraction from the document pages.
                TextExtractor textExtractor = new TextExtractor();
                textExtractor.AreaTolerance = 2; // 2 pt tolerance on area boundary detection.
                bool linkFound = false;
                foreach(Page currentPage in document.Pages)
                {
                    // linkFound still refers to the previous page at this point.
                    if(!PromptNextPage(currentPage, !linkFound))
                    {
                        Quit();
                        break;
                    }

                    // Page text is extracted lazily, only once a link shows up.
                    IDictionary<RectangleF?,IList<ITextString>> pageText = null;
                    linkFound = false;

                    // Get the page annotations!
                    PageAnnotations annotations = currentPage.Annotations;
                    if(!annotations.Exists())
                    {
                        Console.WriteLine("No annotations here.");
                        continue;
                    }

                    // Iterating through the page annotations looking for links...
                    foreach(Annotation annotation in annotations)
                    {
                        if(!(annotation is Link))
                        {continue;}

                        linkFound = true;

                        if(pageText == null)
                        {pageText = textExtractor.Extract(currentPage);}

                        Link link = (Link)annotation;
                        RectangleF linkBox = link.Box;

                        // Text.
                        /*
                          Extracting text superimposed by the link...
                          NOTE: As links have no strong relation to page text but a weak location correspondence,
                          we have to filter extracted text by link area.
                        */
                        StringBuilder linkText = new StringBuilder();
                        foreach(ITextString fragment in textExtractor.Filter(pageText,linkBox))
                        {linkText.Append(fragment.Text);}
                        Console.WriteLine("Link '" + linkText + "' ");

                        // Position.
                        Console.WriteLine(
                            "    Position: x:{0},y:{1},w:{2},h:{3}",
                            Math.Round(linkBox.X),
                            Math.Round(linkBox.Y),
                            Math.Round(linkBox.Width),
                            Math.Round(linkBox.Height)
                            );

                        // Target.
                        Console.Write("    Target: ");
                        PdfObjectWrapper target = link.Target;
                        if(target is Destination)
                        {
                            PrintDestination((Destination)target);
                        }
                        else if(target is actions::Action)
                        {
                            PrintAction((actions::Action)target);
                        }
                        else if(target == null)
                        {
                            Console.WriteLine("[not available]");
                        }
                        else
                        {
                            Console.WriteLine("[unknown type: " + target.GetType().Name + "]");
                        }
                    }

                    if(!linkFound)
                    {
                        Console.WriteLine("No links here.");
                    }
                }
            }
        }

Example #24

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 // IsValidFileType is expected to return true for Documents\Valid.docx.
 public void TestIsValidFileTypeMethodWithValidFile()
 {
     const string validDocument = @"Documents\Valid.docx";

     bool recognized = TextExtractor.IsValidFileType(validDocument);

     Assert.IsTrue(recognized);
 }

Example #25

0

Show file

File: ChapterCreator.cs Project: RoDaniel/featurehouse

        /// <summary>
        /// Handles a click on the input button. Depending on <c>rbFromFile</c>, the user
        /// picks either a chapter file (IFO, MPLS or text) or a disc folder; the resulting
        /// program chain is stored in <c>pgc</c>, then the chapter view and time line are
        /// refreshed and the first chapter row (if any) is selected.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void btInput_Click(object sender, EventArgs e)
        {
            if (rbFromFile.Checked)
            {
                openFileDialog.Filter = "IFO Files (*.ifo)|*.ifo|MPLS Files (*.mpls)|*.mpls|Text Files (*.txt)|*.txt|All Files supported (*.ifo,*.mpls,*.txt)|*.ifo;*.mpls;*.txt";
                openFileDialog.FilterIndex = 4;

                if (openFileDialog.ShowDialog() == DialogResult.OK)
                {
                    input.Text = openFileDialog.FileName;

                    // File names are not linguistic text: compare the suffix ordinally and
                    // case-insensitively. ToLower() + EndsWith() depends on the current
                    // culture (e.g. "IFO" lower-cases to a dotless i under Turkish culture).
                    ChapterExtractor fileExtractor;
                    if (input.Text.EndsWith("ifo", StringComparison.OrdinalIgnoreCase))
                    {
                        fileExtractor = new IfoExtractor();
                    }
                    else if (input.Text.EndsWith("mpls", StringComparison.OrdinalIgnoreCase))
                    {
                        fileExtractor = new MplsExtractor();
                    }
                    else
                    {
                        // Anything else is handed to the text-file extractor.
                        fileExtractor = new TextExtractor();
                    }

                    // Only the first stream returned by the extractor is used.
                    pgc = fileExtractor.GetStreams(input.Text)[0];
                    FreshChapterView();
                    updateTimeLine();
                }
            }
            else
            {
                using (FolderBrowserDialog d = new FolderBrowserDialog())
                {
                    d.ShowNewFolderButton = false;
                    d.Description = "Select DVD, BluRay disc, or folder.";
                    if (d.ShowDialog() == DialogResult.OK)
                    {
                        input.Text = d.SelectedPath;
                        try
                        {
                            // Pick the extractor from the folder layout:
                            // VIDEO_TS => DVD, BDMV\PLAYLIST => Blu-ray.
                            ChapterExtractor discExtractor = null;
                            if (Directory.Exists(Path.Combine(input.Text, "VIDEO_TS")))
                            {
                                discExtractor = new DvdExtractor();
                            }
                            else if (Directory.Exists(Path.Combine(Path.Combine(input.Text, "BDMV"), "PLAYLIST")))
                            {
                                discExtractor = new BlurayExtractor();
                            }

                            if (discExtractor == null)
                            {
                                // Caught by the handler below, which shows the message to the user.
                                throw new InvalidOperationException("The location was not detected as DVD, or Blu-Ray.");
                            }

                            using (frmStreamSelect frm = new frmStreamSelect(discExtractor))
                            {
                                frm.Text = discExtractor is DvdExtractor
                                    ? "Select your PGC"
                                    : "Select your Playlist";

                                // Streams are read after the selection form has been created
                                // with the extractor; the form supplies the chosen program chain.
                                discExtractor.GetStreams(input.Text);
                                if (frm.ShowDialog(this) == DialogResult.OK)
                                {
                                    pgc = frm.ProgramChain;

                                    // Fill in defaults for values the source did not provide.
                                    if (pgc.FramesPerSecond == 0)
                                    {
                                        pgc.FramesPerSecond = 25.0;
                                    }
                                    if (pgc.LangCode == null)
                                    {
                                        pgc.LangCode = "und";
                                    }
                                }
                            }

                            // Refreshed even when the selection dialog was cancelled
                            // (pgc is then left unchanged).
                            FreshChapterView();
                            updateTimeLine();
                        }
                        catch (Exception ex)
                        {
                            MessageBox.Show(ex.Message);
                        }
                    }
                }
            }

            if (chapterListView.Items.Count != 0)
            {
                chapterListView.Items[0].Selected = true;
            }
        }

Example #26

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 /// <summary>
 /// Asserts that <c>TextExtractor.Extract</c> throws
 /// <see cref="ArgumentNullException"/> when given a null byte array.
 /// </summary>
 public void TestExtractMethodWithUndefinedBytes()
 {
     byte[] undefinedBytes = null;

     Assert.ThrowsException<ArgumentNullException>(
         () => TextExtractor.Extract(undefinedBytes));
 }

Example #27

0

Show file

File: text_extraction.cs Project: vorou/tikaondotnet

 /// <summary>
 /// Creates a new <c>TextExtractor</c> and stores it in <c>_cut</c>.
 /// </summary>
 public virtual void SetUp()
 {
     var freshExtractor = new TextExtractor();
     _cut = freshExtractor;
 }

Example #28

0

Show file

File: TextExtractorUnitTest.cs Project: MintyPeterson/text-extractor

 /// <summary>
 /// Asserts that <c>TextExtractor.Extract</c> throws
 /// <see cref="NotSupportedException"/> when given a newly created, empty
 /// <see cref="MemoryStream"/>.
 /// </summary>
 public void TestExtractMethodWithMissingStream()
 {
     var emptyStream = new MemoryStream();

     Assert.ThrowsException<NotSupportedException>(
         () => TextExtractor.Extract(emptyStream));
 }

Example #29

0

Show file

File: Form1.cs Project: zewreader/ZewReader

        /// <summary>
        /// Handles a click on button3: stops playback, lets the user pick a file, loads
        /// its text into <c>MyText</c> (Word documents go through the word-reader
        /// extractor, everything else is read as plain text), then splits the text on
        /// <c>mychar</c> into <c>words</c> and resets the progress state.
        /// Any exception raised while loading is reported in a message box.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void button3_Click(object sender, EventArgs e)
        {
            timer1.Stop();
            button5.Text = "P L A Y";

            // The dialog is IDisposable; release it deterministically.
            using (OpenFileDialog dlg = new OpenFileDialog())
            {
                dlg.Filter = "Text files(*.txt)|*.txt|Doc files(*.doc)|*.doc|Docx files(*.docx)|*.docx|All files(*.*)|*.*";

                if (dlg.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                try
                {
                    string ext = System.IO.Path.GetExtension(dlg.FileName);

                    // Extensions are compared case-insensitively so that "REPORT.DOC" or
                    // "Notes.Docx" are not mistakenly read as raw text.
                    bool isWordDocument =
                        string.Equals(ext, ".doc", StringComparison.OrdinalIgnoreCase) ||
                        string.Equals(ext, ".docx", StringComparison.OrdinalIgnoreCase);

                    if (isWordDocument)
                    {
                        Code7248.word_reader.TextExtractor extractor = new TextExtractor(dlg.FileName);
                        MyText.Text = extractor.ExtractText();
                    }
                    else
                    {
                        MyText.Text = File.ReadAllText(dlg.FileName);
                    }

                    // Normalise whitespace in the displayed text. Each step reads the text
                    // back from the control, exactly as the original sequence did.
                    MyText.Text = MyText.Text.Replace("\t", " ");
                    MyText.Text = MyText.Text.Replace(Environment.NewLine, "\n");
                    if (mychar == '\n')
                    {
                        MyText.Text = MyText.Text.Replace("\n", " ");
                    }

                    // Build the string that is split into words: line breaks become the
                    // delimiter character.
                    string delimitedText = MyText.Text.Replace("\n", System.Convert.ToString(mychar));
                    i = 0;
                    if (mychar == '\n')
                    {
                        // With a newline delimiter, also break after every full stop.
                        delimitedText = delimitedText.Replace(".", ". " + mychar);
                    }

                    words = delimitedText.Split(new char[] { mychar }, StringSplitOptions.RemoveEmptyEntries);

                    progressBar1.Maximum = words.Length;

                    timer1.Stop();
                    button5.Text = "P L A Y";
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }
        }

Example #30

0

Show file

        /// <summary>
        /// Sample entry point: extracts the text and the first image from a PDF, builds a
        /// new PDF containing that image with the extracted text drawn over it using a
        /// fully transparent brush, runs OCR (Hindi) to produce a searchable PDF, and opens
        /// the result. Any exception is reported on the console; the program then waits
        /// for Enter before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                // Files
                string fileName               = "hindi_text_with_image.pdf";
                string destFileName           = "output_hindi_text_with_image.pdf";
                string destFileNameSearchable = "output_hindi_text_with_image_searchable.pdf";

                // Read all text from pdf file
                string allTextExtracted;
                using (TextExtractor extractor = new TextExtractor())
                {
                    // Load PDF document
                    extractor.LoadDocumentFromFile(fileName);

                    // Read all text directly
                    allTextExtracted = extractor.GetText();
                }

                // The stream backs the System.Drawing.Image created below, so it must stay
                // open until that image has been disposed; the using block guarantees both
                // the ordering and the final cleanup.
                using (MemoryStream memoryStream = new MemoryStream())
                {
                    // Get image from pdf file
                    using (ImageExtractor extractor = new ImageExtractor())
                    {
                        // Load PDF document
                        extractor.LoadDocumentFromFile(fileName);

                        if (extractor.GetFirstImage())
                        {
                            extractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);
                        }
                    }

                    // Without an extracted image there is nothing to build the page from;
                    // fail with a clear message instead of an opaque image-decoding error.
                    if (memoryStream.Length == 0)
                    {
                        throw new InvalidOperationException("No image could be extracted from '" + fileName + "'.");
                    }

                    // Load image to System.Drawing.Image object (we need it to get the image resolution)
                    using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
                    {
                        // Compute image size in PDF units (Points)
                        float widthInPoints  = sysImage.Width / sysImage.HorizontalResolution * 72f;
                        float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                        // Create new PDF document
                        using (Document outPdfDocument = new Document())
                        {
                            outPdfDocument.RegistrationName = "demo";
                            outPdfDocument.RegistrationKey  = "demo";

                            // Create page of computed size
                            Page page = new Page(widthInPoints, heightInPoints);

                            // Add page to the document
                            outPdfDocument.Pages.Add(page);

                            Canvas canvas = page.Canvas;

                            // Create Bytescout.PDF.Image object from loaded image
                            Image pdfImage = new Image(sysImage);

                            // Draw the image
                            canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                            // sysImage is released by its enclosing using block; no explicit
                            // Dispose call is needed here.

                            // Create brush
                            SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                            // ... and make it transparent
                            transparentBrush.Opacity = 0;

                            // Draw text with transparent brush
                            // Need to set Font which supports hindi characters.
                            Font font16 = new Font("Arial Unicode MS", 16);
                            canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                            // Save document to file
                            outPdfDocument.Save(destFileName);
                        }
                    }
                }

                // Make the PDF file with hindi text searchable using OCR.
                using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
                {
                    // Load PDF document
                    searchablePDFMaker.LoadDocumentFromFile(destFileName);

                    // Set the location of "tessdata" folder containing language data files

                    /*
                     * The following files are used for hindi language support and need to be
                     * placed in the "tessdata" folder. They are available at:
                     * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
                     * hin.traineddata
                     * hin.cube.bigrams
                     * hin.cube.lm
                     * hin.cube.nn
                     * hin.cube.params
                     * hin.cube.word-freq
                     * hin.tesseract_cube.nn
                     */
                    searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                    // Set OCR language
                    searchablePDFMaker.OCRLanguage = "hin";

                    // Need to set Font which supports hindi characters
                    searchablePDFMaker.LabelingFont = "Arial Unicode MS";

                    // Set PDF document rendering resolution
                    searchablePDFMaker.OCRResolution = 300;

                    searchablePDFMaker.MakePDFSearchable(destFileNameSearchable);
                }

                // Open document in default PDF viewer app
                Process.Start(destFileNameSearchable);
            }
            catch (Exception ex)
            {
                Console.WriteLine("ERROR:" + ex.Message);
            }

            Console.ReadLine();
        }

C# (CSharp) TextExtractor Examples