/// <summary>
/// Prints the name, path, media type, e-mail fields and text content of every entity in an OST container.
/// </summary>
/// <param name="fileName">Name of the OST file; it is resolved to a path through <c>Common.getFilePath</c>.</param>
public static void ExtractFromOstContainer(string fileName)
{
    //ExStart:ExtractFromOstContainer
    ExtractorFactory extractorFactory = new ExtractorFactory();
    // Resolve the OST file's path
    string ostPath = Common.getFilePath(fileName);

    using (var storage = new PersonalStorageContainer(ostPath))
    {
        int index = 0;
        while (index < storage.Entities.Count)
        {
            var entity = storage.Entities[index];

            // General information about the entity
            Console.WriteLine(entity.Name);
            Console.WriteLine(entity.Path.ToString());
            Console.WriteLine(entity.MediaType);

            // E-mail specific fields exposed through the entity's indexer
            Console.WriteLine(entity[PersonalStorageContainer.EmailSubject]);
            Console.WriteLine(entity[PersonalStorageContainer.EmailSender]);
            Console.WriteLine(entity[PersonalStorageContainer.EmailReceiver]);

            using (TextExtractor textExtractor = extractorFactory.CreateTextExtractor(entity.OpenStream()))
            {
                Console.WriteLine("Content:");
                if (textExtractor != null)
                {
                    Console.WriteLine(textExtractor.ExtractAll());
                }
                else
                {
                    // The factory produced no extractor for this entity
                    Console.WriteLine("The document format is not supported");
                }
            }

            index++;
        }
    }
    //ExEnd:ExtractFromOstContainer
}
/// <summary>
/// Shows the text of a document in the console one page at a time.
/// A page holds as many lines as the console window is high; Esc stops the paging.
/// </summary>
/// <param name="fileName">Path of the document to read.</param>
/// <param name="formatted">True to use the formatted text extractor, false to use the plain one.</param>
public ExtractText(string fileName, bool formatted)
{
    //ExStart:ExtractText
    int pageHeight = Console.WindowHeight;
    ExtractorFactory extractorFactory = new ExtractorFactory();

    TextExtractor reader;
    if (formatted)
    {
        reader = extractorFactory.CreateFormattedTextExtractor(fileName);
    }
    else
    {
        reader = extractorFactory.CreateTextExtractor(fileName);
    }

    if (reader == null)
    {
        Console.WriteLine("The document's format is not supported");
        return;
    }

    // Disposes the extractor when paging ends, normally or through an exception
    using (reader)
    {
        string current = null;
        do
        {
            Console.Clear();
            Console.WriteLine("{0}", fileName);

            int rowsRead = 0;
            while (true)
            {
                current = reader.ExtractLine();
                rowsRead++;

                // A null line marks the end of the document
                if (current == null)
                {
                    break;
                }

                Console.WriteLine(current);

                // The page is full
                if (rowsRead >= pageHeight)
                {
                    break;
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press Esc to exit or any other key to move to the next page");
        }
        while (current != null && Console.ReadKey().Key != ConsoleKey.Escape); // no key is read once the document is exhausted
    }
    //ExEnd:ExtractText
}
/// <summary>
/// Walks all entities of a group of containers with <c>ContainerEnumerator</c> and prints the text of each one.
/// </summary>
public static void EnumeratingAllEntities()
{
    //ExStart:EnumeratingAllEntities
    // The three enumerator inputs are left as null placeholders in this sample
    IContainerFactory factoryOfContainers = null;
    MediaTypeDetector mediaTypeDetector = null;
    Container rootContainer = null;

    ExtractorFactory extractorFactory = new ExtractorFactory();
    var entities = new ContainerEnumerator(factoryOfContainers, mediaTypeDetector, rootContainer);

    while (entities.MoveNext())
    {
        using (var entityStream = entities.Current.OpenStream())
        using (var textExtractor = extractorFactory.CreateTextExtractor(entityStream))
        {
            if (textExtractor == null)
            {
                Console.WriteLine("document isn't supported");
            }
            else
            {
                Console.WriteLine(textExtractor.ExtractAll());
            }
        }
    }
    //ExEnd:EnumeratingAllEntities
}
/// <summary>
/// Reads every file stored in a ZIP folder and prints its extracted text.
/// Entities for which no text extractor can be created are reported instead of being read.
/// </summary>
/// <param name="folderName">Name of the zipped folder</param>
public static void ReadConcreteFile(string folderName)
{
    //ExStart:ReadConcreteFile
    //get ZIP folder's path
    string folderPath = Common.getFilePath(folderName);
    ExtractorFactory extractorFactory = new ExtractorFactory();

    //initialize ZIP container
    using (var container = new ZipContainer(folderPath))
    {
        //loop through all the entities in the folder
        for (int i = 0; i < container.Entities.Count; i++)
        {
            //open the entity's stream and release it once the entity has been processed
            using (var stream = container.Entities[i].OpenStream())
            //create a text extractor for the entity using the factory's CreateTextExtractor function
            using (TextExtractor extractor = extractorFactory.CreateTextExtractor(stream))
            {
                //the factory yields null when it has no extractor for the entity, so guard before extracting
                Console.WriteLine(extractor != null ? extractor.ExtractAll() : "The document format is not supported");
            }
        }
    }
    //ExEnd:ReadConcreteFile
}
/// <summary>
/// Prints the text of the "META-INF\container.xml" entity stored in a ZIP container.
/// </summary>
/// <param name="folderName">Name of the zipped folder</param>
/// <exception cref="GroupDocsTextException">The container has no "META-INF\container.xml" entity.</exception>
public static void RetrieveEntity(string folderName)
{
    //ExStart:RetrieveEntity_17.12
    // Resolve the ZIP folder's path
    string zipPath = Common.GetFilePath(folderName);
    ExtractorFactory factory = new ExtractorFactory();

    // Open the ZIP container
    using (var zip = new ZipContainer(zipPath))
    {
        Container.Entity entry = zip.GetEntity("META-INF\\container.xml");

        // Stop when the entity is missing
        if (entry == null)
        {
            throw new GroupDocsTextException("File not found");
        }

        // The using statement tolerates a null extractor and disposes a non-null one
        using (TextExtractor textExtractor = factory.CreateTextExtractor(entry.OpenStream()))
        {
            if (textExtractor == null)
            {
                Console.WriteLine("Document type isn't supported");
            }
            else
            {
                // The document type is supported: print its text
                Console.WriteLine(textExtractor.ExtractAll());
            }
        }
    }
    //ExEnd:RetrieveEntity_17.12
}
/// <summary>
/// Produces a ".txt" report for a file: detected encoding, metadata (when a metadata
/// extractor is available) and the parsed text content. The work is handed to
/// <c>ProcessTask</c> as a callback that receives the input path, the output path and the
/// ZIP output folder.
/// </summary>
/// <param name="fileName">Name of the file to parse; its extension selects the extraction strategy.</param>
/// <param name="folderName">Folder name forwarded to <c>ProcessTask</c>.</param>
/// <returns>
/// The response produced by <c>ProcessTask</c>, or a response with status code 500 that
/// carries the exception details when an exception reaches this method.
/// </returns>
private async Task<Response> ParseFileText(string fileName, string folderName)
{
    try
    {
        return await ProcessTask(fileName, folderName, ".txt", false, "", delegate(string inFilePath, string outPath, string zipOutFolder)
        {
            EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

            if (!Directory.Exists(zipOutFolder))
            {
                Directory.CreateDirectory(zipOutFolder);
            }

            // The input is only read, so do not request write access to it
            using (Stream stream = new FileStream(inFilePath, FileMode.Open, FileAccess.Read))
            {
                // WriteAllText (re)creates the report; everything below appends to it
                System.IO.File.WriteAllText(outPath, "Encoding: " + detector.Detect(stream, true) + Environment.NewLine);
            }

            ExtractorFactory factory = new ExtractorFactory();

            MetadataExtractor metadataExtractor = factory.CreateMetadataExtractor(inFilePath);
            if (metadataExtractor != null)
            {
                MetadataCollection metadataCollection = metadataExtractor.ExtractMetadata(inFilePath);
                System.IO.File.AppendAllText(outPath, Environment.NewLine + "Metadata:" + Environment.NewLine);
                foreach (string key in metadataCollection.Keys)
                {
                    System.IO.File.AppendAllText(outPath, string.Format("{0} = {1}", key, metadataCollection[key]) + Environment.NewLine);
                }
            }

            System.IO.File.AppendAllText(outPath, Environment.NewLine + "Parsed content:" + Environment.NewLine);

            // TrimStart tolerates a file name without an extension (Substring(1) would throw on "")
            string fileExt = (Path.GetExtension(fileName) ?? string.Empty).TrimStart('.').ToLowerInvariant();

            if (GetFormatType(fileExt) == FormatType.Excel)
            {
                // Spreadsheets are extracted sheet by sheet
                using (CellsTextExtractor extractor = new CellsTextExtractor(inFilePath))
                {
                    extractor.ExtractMode = ExtractMode.Standard;
                    for (int sheetIndex = 0; sheetIndex < extractor.SheetCount; sheetIndex++)
                    {
                        // Label each sheet with its own 1-based number, not with the total sheet count
                        System.IO.File.AppendAllText(outPath, Environment.NewLine + "Sheet # " + (sheetIndex + 1) + Environment.NewLine);
                        System.IO.File.AppendAllText(outPath, extractor.ExtractSheet(sheetIndex));
                    }
                }
            }
            else
            {
                // Prefer formatted text and fall back to plain text
                TextExtractor textExtractor = factory.CreateFormattedTextExtractor(inFilePath);
                if (textExtractor == null)
                {
                    textExtractor = factory.CreateTextExtractor(inFilePath);
                }

                // Neither extractor exists for this format: fail with a clear message
                // instead of a NullReferenceException
                if (textExtractor == null)
                {
                    throw new NotSupportedException("The document format is not supported");
                }

                using (textExtractor)
                {
                    System.IO.File.AppendAllText(outPath, textExtractor.ExtractAll());
                }
            }
        });
    }
    catch (Exception exc)
    {
        return new Response
        {
            FileName = fileName,
            FolderName = folderName,
            OutputType = "txt",
            Status = exc.Message,
            StatusCode = 500,
            Text = exc.ToString()
        };
    }
}
/// <summary>
/// Counts how often each word occurs in a document and prints up to ten of the most frequent ones.
/// Words are split on space, comma, semicolon and period, trimmed and lower-cased.
/// </summary>
/// <param name="fileName">Path of the document to analyse.</param>
/// <param name="maxWordLength">Only words strictly longer than this many characters are counted.</param>
public WordStatistic(string fileName, int maxWordLength)
{
    //ExStart:WordStatistic
    ExtractorFactory extractorFactory = new ExtractorFactory();
    Dictionary<string, int> occurrences = new Dictionary<string, int>();

    TextExtractor reader = extractorFactory.CreateTextExtractor(fileName);
    if (reader == null)
    {
        Console.WriteLine("The document's format is not supported");
        return;
    }

    // Disposes the extractor once all lines are read, or if reading fails
    using (reader)
    {
        // A null line marks the end of the document
        for (string text = reader.ExtractLine(); text != null; text = reader.ExtractLine())
        {
            foreach (string token in text.Split(' ', ',', ';', '.'))
            {
                string normalized = token.Trim().ToLower();
                if (normalized.Length <= maxWordLength)
                {
                    continue;
                }

                // A word seen for the first time starts from zero
                int seen;
                occurrences.TryGetValue(normalized, out seen);
                occurrences[normalized] = seen + 1;
            }
        }
    }

    Console.WriteLine("Top words:");

    // Repeatedly take the most frequent remaining word, ten times at most
    int reported = 0;
    while (reported < 10 && occurrences.Count > 0)
    {
        string topWord = null;
        int topCount = -1;
        foreach (KeyValuePair<string, int> entry in occurrences)
        {
            if (entry.Value > topCount)
            {
                topCount = entry.Value;
                topWord = entry.Key;
            }
        }

        Console.WriteLine("{0}: {1}", topWord, topCount);
        occurrences.Remove(topWord);
        reported++;
    }
    //ExEnd:WordStatistic
}
/// <summary>
/// Extracts the text of an uploaded file line by line and returns the lines as JSON.
/// With a password, ".one" files are opened with a OneNote extractor and every other file
/// with a Words extractor. Without a password, ".zip" files are read entity by entity and
/// every other file through the extractor chosen by the factory.
/// </summary>
/// <param name="fileName">Name of a file in the Uploads folder; any directory part is ignored.</param>
/// <param name="password">Optional password for protected documents.</param>
/// <returns>
/// A JSON list holding the extracted lines. Failures are reported as entries of that list
/// (an exception message, "Invalid password" or an unsupported-format note).
/// </returns>
public ActionResult ExtractText([FromBody] string fileName, string password = null)
{
    //ExStart:ExtractText
    ExtractorFactory factory = new ExtractorFactory();

    // fileName comes from the request: keep only its file-name part so that sequences
    // such as "..\" cannot point outside the Uploads folder
    string path = Server.MapPath("../App_Data//Uploads//" + Path.GetFileName(fileName));
    string ext = Path.GetExtension(path);
    bool isOneNote = string.Equals(ext, ".one", StringComparison.OrdinalIgnoreCase);
    bool isZip = string.Equals(ext, ".zip", StringComparison.OrdinalIgnoreCase);

    List<string> extractedText = new List<string>();
    try
    {
        if (!string.IsNullOrWhiteSpace(password))
        {
            // Password-protected document
            LoadOptions loadOptions = new LoadOptions();
            loadOptions.Password = password;

            if (isOneNote)
            {
                using (var extractor = new NoteTextExtractor(path, loadOptions))
                {
                    AppendExtractedLines(extractor, extractedText);
                }
            }
            else
            {
                using (var protectedDocument = new WordsTextExtractor(path, loadOptions))
                {
                    AppendExtractedLines(protectedDocument, extractedText);
                }
            }
        }
        else if (isZip)
        {
            // ZIP archive: extract every entity in turn
            using (var container = new ZipContainer(path))
            {
                for (int i = 0; i < container.Entities.Count; i++)
                {
                    using (var stream = container.Entities[i].OpenStream())
                    using (TextExtractor extractor = factory.CreateTextExtractor(stream))
                    {
                        // No extractor for this entity: report it and carry on with the next one
                        if (extractor == null)
                        {
                            extractedText.Add("The document's format is not supported");
                            continue;
                        }

                        AppendExtractedLines(extractor, extractedText);
                    }
                }
            }
        }
        else
        {
            using (TextExtractor extractor = factory.CreateTextExtractor(path))
            {
                if (extractor == null)
                {
                    extractedText.Add("The document's format is not supported");
                }
                else
                {
                    try
                    {
                        AppendExtractedLines(extractor, extractedText);
                    }
                    catch (Exception)
                    {
                        // Any other failure is reported by the outer handler
                        if (!isOneNote)
                        {
                            throw;
                        }

                        // Reading a OneNote file without a password failed: report it once and stop
                        extractedText.Add("Invalid password");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        extractedText.Add(ex.Message);
    }

    return Json(extractedText, JsonRequestBehavior.AllowGet);
}

/// <summary>
/// Reads lines from the extractor until it returns null and appends each of them to the list.
/// </summary>
private static void AppendExtractedLines(TextExtractor extractor, List<string> lines)
{
    string line;
    while ((line = extractor.ExtractLine()) != null)
    {
        lines.Add(line);
    }
}
/// <summary>
/// Counts word occurrences in an uploaded file and returns up to ten of the most frequent
/// words as JSON. Words are split on space, comma, semicolon and period, trimmed and
/// lower-cased; only words longer than five characters are counted.
/// </summary>
/// <param name="fileName">Name of a file in the Uploads folder; any directory part is ignored.</param>
/// <returns>
/// A JSON list starting with "Top words:" followed by "word : count" entries, or a list
/// holding an error or unsupported-format message.
/// </returns>
public ActionResult CountStatistics([FromBody] string fileName)
{
    List<string> extractedText = new List<string>();

    // fileName comes from the request: keep only its file-name part so that sequences
    // such as "..\" cannot point outside the Uploads folder
    string filePath = Server.MapPath("../App_Data//Uploads//" + System.IO.Path.GetFileName(fileName));

    try
    {
        // Words must be longer than this to be counted. The earlier code derived the value by
        // trying to parse the file path as a number, which always fell back to 5.
        const int minWordLength = 5;

        ExtractorFactory factory = new ExtractorFactory();
        Dictionary<string, int> statistic = new Dictionary<string, int>();

        using (TextExtractor extractor = factory.CreateTextExtractor(filePath))
        {
            // Stop here for unsupported formats instead of dereferencing a null extractor
            if (extractor == null)
            {
                extractedText.Add("The document's format is not supported");
                return Json(extractedText, JsonRequestBehavior.AllowGet);
            }

            string line;
            while ((line = extractor.ExtractLine()) != null)
            {
                string[] words = line.Split(' ', ',', ';', '.');
                foreach (string w in words)
                {
                    string word = w.Trim().ToLower();
                    if (word.Length > minWordLength)
                    {
                        int seen;
                        statistic.TryGetValue(word, out seen);
                        statistic[word] = seen + 1;
                    }
                }
            }
        }

        extractedText.Add("Top words:");

        // Repeatedly pick the most frequent remaining word, ten times at most
        for (int i = 0; i < 10; i++)
        {
            int count = -1;
            string maxKey = null;
            foreach (KeyValuePair<string, int> entry in statistic)
            {
                if (entry.Value > count)
                {
                    count = entry.Value;
                    maxKey = entry.Key;
                }
            }

            // No words left
            if (maxKey == null)
            {
                break;
            }

            extractedText.Add(maxKey + " : " + count);
            statistic.Remove(maxKey);
        }
    }
    catch (Exception ex)
    {
        extractedText.Add(ex.Message);
    }

    return Json(extractedText, JsonRequestBehavior.AllowGet);
}