/// <summary>
/// Identifies the file at <c>Path</c>, extracts its text and metadata with the content
/// extractor that matches the identified format, and writes the resulting
/// <c>DocumentContent</c> to the pipeline.
/// </summary>
/// <remarks>
/// If no content extractor can be obtained, the string "Error: " followed by the factory's
/// error is written to the pipeline instead and processing stops.
/// A wrong or missing password is not handled specially here: the content object is written
/// as returned, so callers can inspect its <c>Result</c> (e.g. <c>ContentResult.WrongPassword</c>).
/// If the extractor type matches none of the handled cases, <c>null</c> is written.
/// </remarks>
protected override void ProcessRecord()
{
    DocumentContent content = null;

    using (var stream = File.OpenRead(Path))
    {
        IdResult idResult = DocumentIdentifier.Identify(stream, Path);

        //
        // Extract text and metadata content from the document:
        //
        var settings = new ContentExtractionSettings();
        settings.ExtractionType = ExtractionType.TextAndMetadata;
        settings.EmbeddedObjectExtraction = EmbeddedExtractionType.EmbeddedDocumentsAndMedia;
        settings.SensitiveItemCheck.Check = true; // Enable sensitive item checks
        settings.Hashing.HashingType = HashingType.BinaryAndContentHash;

        // Define a 'large' file as >= 100MB. This determines when the
        // ContentExtractorType.LargeUnsupported and ContentExtractorType.LargeEncodedText
        // extractor interfaces are returned.
        settings.LargeDocumentCritera = 100 * 1024 * 1024;

        //
        // Get the content extractor for the identified file format type:
        //
        var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, idResult, Path, settings);
        if (contentExtractorResult.HasError)
        {
            WriteObject("Error: " + contentExtractorResult.Error);
            return;
        }

        switch (contentExtractorResult.ContentExtractor.ContentExtractorType)
        {
            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = (IArchiveExtractor)contentExtractorResult.ContentExtractor;
                    if (archiveExtractor.IsSplit)
                    {
                        // The selected file is the main segment of a split archive. Use the extractor's
                        // helper 'GetSplitSegmentStreamsInOrder' to get the other segments (in proper
                        // order) from the same directory.
                        Stream[] splitSegmentStreamsInOrder = null;
                        string[] splitSegmentNameInOrder = null;
                        try
                        {
                            archiveExtractor.GetSplitSegmentStreamsInOrder(Path, out splitSegmentStreamsInOrder, out splitSegmentNameInOrder);

                            // 'Password' is an archive level password (versus item level passwords).
                            content = archiveExtractor.ExtractContent(splitSegmentStreamsInOrder, splitSegmentNameInOrder, Password);
                        }
                        finally
                        {
                            // The segment streams are handed to this method through an out parameter and
                            // nothing else visible here closes them, so release them once extraction is
                            // done. Disposing a stream that is already closed is harmless.
                            if (splitSegmentStreamsInOrder != null)
                            {
                                foreach (var segmentStream in splitSegmentStreamsInOrder)
                                {
                                    if (segmentStream != null)
                                    {
                                        segmentStream.Dispose();
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        // 'Password' is an archive level password (versus item level passwords).
                        content = archiveExtractor.ExtractContent(Password);
                    }
                }
                break;

            case ContentExtractorType.Document:
            case ContentExtractorType.DocumentStore:
                {
                    // Both extractor types are driven through IDocumentContentExtractor.
                    var docExtractor = (IDocumentContentExtractor)contentExtractorResult.ContentExtractor;
                    content = docExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = (IMailStoreExtractor)contentExtractorResult.ContentExtractor;
                    content = mailStoreExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.Database:
                {
                    // We will only get table/column info (individual table extracted text can be quite large).
                    var databaseExtractor = (IDatabaseExtractor)contentExtractorResult.ContentExtractor;
                    content = databaseExtractor.ExtractContent(Path);
                }
                break;

            case ContentExtractorType.Unsupported:
                {
                    // Binary-to-text extraction. Note: if ContentExtractionSettings.BinaryToTextOnUnsupportedTypes
                    // is false, IUnsupportedExtractor.ExtractContent only calculates binary hashes without
                    // performing binary-to-text. Binary-to-text is not useful for formats that have no textual
                    // content (e.g., compressed archives or encrypted files); it is up to the user to filter
                    // those out using either the file format Id or the file format classification.
                    var unsupportedExtractor = (IUnsupportedExtractor)contentExtractorResult.ContentExtractor;
                    content = unsupportedExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.LargeUnsupported:
            case ContentExtractorType.LargeEncodedText:
                {
                    // Ignored for this example: 'large' content would have to be written to a file
                    // stream, so report it as unsupported instead of extracting it.
                    content = new DocumentContent(idResult);
                    content.Result = ContentResult.UnsupportedError;
                    content.ErrorMessage = "Not supported for this example. Users should write output to a file stream when implemented";
                }
                break;
        }
    }

    WriteObject(content);
}
/// <summary>
/// Identifies the file at <c>Path</c>, extracts its text and metadata, and writes a
/// human-readable text report (file format, metadata, attributes, database tables,
/// hyperlinks, sensitive items, languages and optionally the first 1000 characters of
/// extracted text) to the pipeline as a single string.
/// </summary>
/// <remarks>
/// If no content extractor can be obtained, the string "Error: " followed by the factory's
/// error is written instead and processing stops. For the 'large' extractor types no content
/// is extracted, so only the "File Format" section is reported.
/// </remarks>
protected override void ProcessRecord()
{
    IdResult idResult = null;
    DocumentContent content = null;
    var report = new StringBuilder();

    using (var stream = File.OpenRead(Path))
    {
        idResult = DocumentIdentifier.Identify(stream, Path);

        //
        // Content extraction settings:
        //
        var settings = new ContentExtractionSettings();
        settings.ExtractionType = ExtractionType.TextAndMetadata;
        settings.Hashing.HashingType = HashingType.BinaryAndContentHash;
        settings.SensitiveItemCheck.Check = true;

        //
        // Get the content extractor for the identified file format type:
        //
        var contentExtractorResult = ContentExtractorFactory.GetContentExtractor(stream, idResult, Path, settings);
        if (contentExtractorResult.HasError)
        {
            WriteObject("Error: " + contentExtractorResult.Error);
            return;
        }

        switch (contentExtractorResult.ContentExtractor.ContentExtractorType)
        {
            case ContentExtractorType.Archive:
                {
                    var archiveExtractor = (IArchiveExtractor)contentExtractorResult.ContentExtractor;
                    if (archiveExtractor.IsSplit)
                    {
                        // The selected file is the main segment of a split archive. Use the extractor's
                        // helper 'GetSplitSegmentStreamsInOrder' to get the other segments (in proper
                        // order) from the same directory.
                        Stream[] splitSegmentStreamsInOrder = null;
                        string[] splitSegmentNameInOrder = null;
                        try
                        {
                            archiveExtractor.GetSplitSegmentStreamsInOrder(Path, out splitSegmentStreamsInOrder, out splitSegmentNameInOrder);
                            content = archiveExtractor.ExtractContent(splitSegmentStreamsInOrder, splitSegmentNameInOrder, Password);
                        }
                        finally
                        {
                            // The segment streams are handed to this method through an out parameter and
                            // nothing else visible here closes them, so release them once extraction is
                            // done. Disposing a stream that is already closed is harmless.
                            if (splitSegmentStreamsInOrder != null)
                            {
                                foreach (var segmentStream in splitSegmentStreamsInOrder)
                                {
                                    if (segmentStream != null)
                                    {
                                        segmentStream.Dispose();
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        content = archiveExtractor.ExtractContent(Password);
                    }

                    // 'Password' is an archive level password (versus item level passwords).
                    if (content.Result == ContentResult.WrongPassword)
                    {
                        AppendPasswordError(report, "Archive");
                    }
                }
                break;

            case ContentExtractorType.Document:
                {
                    var docExtractor = (IDocumentContentExtractor)contentExtractorResult.ContentExtractor;
                    content = docExtractor.ExtractContent();

                    // The document is encrypted: report whether a (different) password is needed or
                    // whether decryption is not supported for this format at all.
                    // NOTE(review): ExtractContent() is called without 'Password' here, yet the message
                    // depends on whether 'Password' was supplied - confirm this is intended.
                    if (content.Result == ContentResult.WrongPassword && content.IsEncrypted)
                    {
                        if (docExtractor.SupportsDecryption)
                        {
                            AppendPasswordError(report, "Document");
                        }
                        else
                        {
                            report.AppendLine("ERROR: Document is encrypted with a password but format is not supported for decryption.");
                        }
                    }
                }
                break;

            case ContentExtractorType.MailStore:
                {
                    var mailStoreExtractor = (IMailStoreExtractor)contentExtractorResult.ContentExtractor;
                    content = mailStoreExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.Database:
                {
                    // We will only get table/column info (individual table extracted text can be quite large).
                    var databaseExtractor = (IDatabaseExtractor)contentExtractorResult.ContentExtractor;
                    content = databaseExtractor.ExtractContent(Path);
                }
                break;

            case ContentExtractorType.DocumentStore:
                {
                    var docStoreExtractor = (IDocumentContentExtractor)contentExtractorResult.ContentExtractor;
                    content = docStoreExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.Unsupported:
                {
                    // Binary-to-text extraction. Note: if ContentExtractionSettings.BinaryToTextOnUnsupportedTypes
                    // is false, IUnsupportedExtractor.ExtractContent only calculates binary hashes without
                    // performing binary-to-text. Binary-to-text is not useful for formats that have no textual
                    // content (e.g., compressed archives or encrypted files); it is up to the user to filter
                    // those out using either the file format Id or the file format classification.
                    var unsupportedExtractor = (IUnsupportedExtractor)contentExtractorResult.ContentExtractor;
                    content = unsupportedExtractor.ExtractContent();
                }
                break;

            case ContentExtractorType.LargeUnsupported:
            case ContentExtractorType.LargeEncodedText:
                // Ignored for this example: 'content' stays null and only the file format is reported.
                break;
        }
    }

    AppendFileFormat(report, idResult);
    if (content != null)
    {
        AppendContentDetails(report, content);
    }

    WriteObject(report.ToString());
}

/// <summary>
/// Appends the error line for a <c>ContentResult.WrongPassword</c> result: "Wrong Password" when
/// a password was supplied, otherwise "<paramref name="subject"/> requires a password".
/// </summary>
/// <param name="report">Report being built.</param>
/// <param name="subject">What needs the password, e.g. "Archive" or "Document".</param>
private void AppendPasswordError(StringBuilder report, string subject)
{
    if (!string.IsNullOrWhiteSpace(Password))
    {
        report.AppendLine("ERROR: Wrong Password");
    }
    else
    {
        report.AppendLine("ERROR: " + subject + " requires a password");
    }
}

/// <summary>
/// Appends the "File Format" section describing the identification result.
/// </summary>
/// <param name="report">Report being built.</param>
/// <param name="idResult">Identification result of the file.</param>
private static void AppendFileFormat(StringBuilder report, IdResult idResult)
{
    // Values are passed to string.Format directly (it performs the ToString() call itself) so that
    // a null reference-typed property prints as empty text instead of throwing.
    report.AppendLine("File Format:");
    report.AppendLine("------------");
    report.AppendLine(string.Format(" ID: {0}", idResult.ID));
    report.AppendLine(string.Format(" Classification: {0}", idResult.Classification));
    report.AppendLine(string.Format(" MatchType: {0}", idResult.MatchType));
    report.AppendLine(string.Format(" Text Encoding ID: {0}", idResult.EncodingID));
    report.AppendLine(string.Format(" IsEncrypted: {0}", idResult.IsEncrypted));
    report.AppendLine(string.Format(" MediaType: {0}", idResult.MediaType));
    report.AppendLine(string.Format(" Description: {0}", idResult.Description));
}

/// <summary>
/// Appends every content-derived section of the report: metadata, custom metadata, attributes,
/// database tables (for <c>DatabaseContent</c> only), hyperlinks, sensitive items, detected
/// languages and, when <c>ShowText</c> is set, up to the first 1000 characters of extracted text.
/// </summary>
/// <param name="report">Report being built.</param>
/// <param name="content">Extracted content; must not be null.</param>
private void AppendContentDetails(StringBuilder report, DocumentContent content)
{
    report.AppendLine();
    report.AppendLine("File Metadata:");
    report.AppendLine("---------------");
    foreach (var meta in content.Metadata)
    {
        report.AppendLine(string.Format(" {0,-35} {1}", meta.Key, FormatPropertyValue(meta.Value.PropertyType, meta.Value)));
    }
    report.AppendLine();

    report.AppendLine("Custom Metadata:");
    report.AppendLine("-----------------");
    foreach (var meta in content.CustomMetadata)
    {
        report.AppendLine(string.Format(" {0,-35} {1}", meta.Key, FormatPropertyValue(meta.Value.PropertyType, meta.Value)));
    }
    report.AppendLine();

    report.AppendLine("File Attributes:");
    report.AppendLine("----------------");
    // Null-guarded like the other optional collections below.
    if (content.Attributes != null)
    {
        foreach (var attr in content.Attributes)
        {
            report.AppendLine(string.Format(" {0}", attr.ToString()));
        }
    }
    report.AppendLine();

    var dbContent = content as DatabaseContent;
    if (dbContent != null)
    {
        report.AppendLine("Database Tables:");
        report.AppendLine("----------------");
        if (dbContent.Tables != null && dbContent.Tables.Count > 0)
        {
            report.AppendLine(" [Name] [Row Count] [Num Columns] [Is User Table] ");
            foreach (var table in dbContent.Tables)
            {
                report.AppendLine(string.Format(" {0,-30} {1,-20} {2,-15} {3}",
                                                table.Name,
                                                table.RowCount,
                                                table.Columns != null ? table.Columns.Count.ToString() : "0",
                                                table.IsUserTable.ToString()));
            }
        }
        report.AppendLine();
    }

    report.AppendLine("File Hyperlinks:");
    report.AppendLine("----------------");
    if (content.HyperLinks != null)
    {
        foreach (var link in content.HyperLinks)
        {
            report.AppendLine(string.Format(" {0}", link.Url));
        }
    }
    report.AppendLine();
    report.AppendLine();

    report.AppendLine("Detected Sensitive Items:");
    report.AppendLine("-------------------------");
    if (content.SensitiveItemResult != null && content.SensitiveItemResult.Items.Count > 0)
    {
        foreach (var item in content.SensitiveItemResult.Items)
        {
            report.AppendLine(string.Format(" {0,-30} {1,-20} {2,-15} {3}",
                                            item.ItemType.ToString(),
                                            item.MatchType.ToString(),
                                            item.LocationType.ToString(),
                                            item.Text));
        }
    }
    report.AppendLine();
    report.AppendLine();

    report.AppendLine("Detected Languages:");
    report.AppendLine("-------------------");
    if (content.LanguageIdResults != null)
    {
        foreach (var langIdResult in content.LanguageIdResults)
        {
            report.AppendLine(string.Format(" {0,-30} {1,-20} {2,-15}",
                                            langIdResult.Language,
                                            langIdResult.LangIso639,
                                            langIdResult.PercentOfFullText));
        }
    }
    report.AppendLine();

    if (ShowText)
    {
        report.AppendLine();
        if (content.ExtractedText != null)
        {
            // Only a preview of the text is shown: at most the first 1000 characters.
            var charsToDisplay = Math.Min(1000, content.ExtractedText.Length);
            report.AppendLine(string.Format("Extracted Text: Total Chars = {0}, Displayed Chars = {1}", content.ExtractedText.Length, charsToDisplay));
            report.AppendLine("-------------------------------------------------------------------");
            report.AppendLine(content.ExtractedText.Substring(0, charsToDisplay));
            report.AppendLine();
        }
        else
        {
            report.AppendLine(string.Format("Extracted Text: Total Chars = {0}, Displayed Chars = {1}", 0, 0));
            report.AppendLine("-------------------------------------------------------------------");
            report.AppendLine();
        }
    }
}

/// <summary>
/// Converts a metadata property to display text: scalar properties via their value, list
/// properties as their items joined with "; ".
/// </summary>
/// <param name="propertyType">The property's <c>PropertyType</c>, which selects the concrete property class.</param>
/// <param name="property">The property object; typed as object so the concrete cast can be applied per type.</param>
/// <returns>The display text, or an empty string for a property type that is not handled.</returns>
private static string FormatPropertyValue(PropertyType propertyType, object property)
{
    switch (propertyType)
    {
        case PropertyType.Boolean:
            return ((BooleanProperty)property).Value.ToString();
        case PropertyType.DateTime:
            return ((DateTimeProperty)property).Value.ToString();
        case PropertyType.Double:
            return ((DoubleProperty)property).Value.ToString();
        case PropertyType.Int32:
            return ((Int32Property)property).Value.ToString();
        case PropertyType.Int64:
            return ((Int64Property)property).Value.ToString();
        case PropertyType.String:
            return ((StringProperty)property).Value;
        case PropertyType.BooleanList:
            return string.Join("; ", ((BooleanListProperty)property).Value);
        case PropertyType.DateTimeList:
            return string.Join("; ", ((DateTimeListProperty)property).Value);
        case PropertyType.DoubleList:
            return string.Join("; ", ((DoubleListProperty)property).Value);
        case PropertyType.Int32List:
            return string.Join("; ", ((Int32ListProperty)property).Value);
        case PropertyType.Int64List:
            return string.Join("; ", ((Int64ListProperty)property).Value);
        case PropertyType.StringList:
            return string.Join("; ", ((StringListProperty)property).Value);
        default:
            return "";
    }
}