/// <summary>
        /// Assembles the enriched-document payload (an anonymous object) that the storage writers
        /// serialize to JSON. The document id is <paramref name="category"/> concatenated with
        /// <paramref name="documentName"/>; the same two values are used as PartitionKey and RowKey.
        /// </summary>
        /// <param name="category">Document category; used as the partition key.</param>
        /// <param name="documentName">Document name; used as the row key.</param>
        /// <param name="textOcrResult">Full OCR text; its length is reported as TextSize.</param>
        /// <param name="uri">Not used by this method; kept so existing callers keep compiling.</param>
        /// <param name="bingEntityDataResult">
        /// Bing entity data. May be null, in which case the taxonomy list is emitted empty.
        /// </param>
        /// <returns>An anonymous object exposed as <c>dynamic</c>.</returns>
        public static dynamic GetDocumentObject(string category, string documentName, string textOcrResult,
                                                List <string> keyPhrasesV2, List <string> keyPhrasesV3,
                                                List <string> entitiesV2, List <CognitiveServiceClasses.Entities.Entity> entitiesV3,
                                                int pages, string uri, string documentType, long documentSizeInBytes,
                                                PIIResult piiResult, List <BingEntityData> bingEntityDataResult,
                                                SentimentV3Response sentimentV3Prediction,
                                                CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            var docID = category + documentName;

            // The per-page blob lists are only generated when blob storage is enabled.
            List <string> azureBlobOcrPagesList = Config.USE_AZURE_BLOB_STORAGE ?
                                                  Util.GenerateDocumentPagesList("png", pages, category, documentName) : new List <string>();

            // WriteToBlobStorageTable treats a null Bing result as "no data"; do the same here
            // instead of throwing a NullReferenceException while projecting the taxonomies.
            var bingEntitiesOrEmpty = bingEntityDataResult ?? new List <BingEntityData>();

            dynamic documentToProcess = new
            {
                id                  = docID,
                PartitionKey        = category,
                RowKey              = documentName,
                DocumentType        = documentType,
                Pages               = pages,
                DocumentSizeInBytes = documentSizeInBytes,
                TextSize            = textOcrResult.Length,
                CognitiveServicesApiCallsApiCallCount   = cognitiveServicesApiCalls.ApiCallCount,
                CognitiveServicesApiCallsApiCallV2Count = cognitiveServicesApiCalls.ApiCallV2Count,
                CognitiveServicesApiCallsApiCallV3Count = cognitiveServicesApiCalls.ApiCallV3Count,
                CognitiveServicesApiCallsTotalCount     = cognitiveServicesApiCalls.TotalCount,
                TextAnalyticsV2EntitiesCount            = entitiesV2.Count,
                TextAnalyticsV2Entities                  = entitiesV2,
                TextAnalyticsV2EntitiesDistinct          = entitiesV2.Distinct().ToList(),
                TextAnalyticsV2KeyPhrasesCount           = keyPhrasesV2.Count,
                TextAnalyticsV2KeyPhrases                = keyPhrasesV2,
                // Distinct key phrases are compared (and emitted) in upper case.
                TextAnalyticsV2KeyPhrasesDistinct        = keyPhrasesV2.Select(a => a.ToUpper()).Distinct().ToList(),
                TextAnalyticsV2EntitiesBingTaxonomies    = bingEntitiesOrEmpty.Select(a => a.Taxony).ToList(),
                TextAnalyticsV3EntitiesCount             = entitiesV3.Count,
                TextAnalyticsV3Entities                  = entitiesV3,
                TextAnalyticsV3KeyPhrasesCount           = keyPhrasesV3.Count,
                TextAnalyticsV3KeyPhrases                = keyPhrasesV3,
                TextAnalyticsV3KeyPhrasesDistinct        = keyPhrasesV3.Select(a => a.ToUpper()).Distinct().ToList(),
                TextAnalyticsV3SentimentAnalysis         = sentimentV3Prediction,
                // NOTE(review): this carries the same full response object as the property above,
                // not a "positive" score. Left as-is to keep the persisted schema stable.
                TextAnalyticsV3SentimentAnalysisPositive = sentimentV3Prediction,
                TextOcrResult          = textOcrResult,
                AzureBlobJsonPagesList = Config.USE_AZURE_BLOB_STORAGE ?
                                         Util.GenerateDocumentPagesList("json", pages, category, documentName) : new List <string>(),
                AzureBlobOcrPagesList = azureBlobOcrPagesList,
                PIIResult             = piiResult,
                // The original (possibly null) reference is stored unchanged.
                BingEntitityDataFull  = bingEntityDataResult
            };

            return(documentToProcess);
        }
        /// <summary>
        /// Serializes the enriched document to JSON and writes it to
        /// <c>LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS\&lt;category&gt;\&lt;documentName&gt;\fullEnrichedDocument.json</c>,
        /// with the category and document name lower-cased.
        /// </summary>
        /// <param name="piiResultV2">PII result stored in the document.</param>
        /// <param name="piiResultV3">Not used by this method; kept for signature compatibility.</param>
        public static void WriteToLocalStorage(string category, string documentName, string textOcrResult,
                                               List <string> keyPhrasesV2, List <string> keyPhrasesV3,
                                               List <string> entitiesV2, List <CognitiveServiceClasses.Entities.Entity> entitiesV3,
                                               int pages, string uri, string documentType, long documentSizeInBytes,
                                               PIIResult piiResultV2, List <CognitiveServiceClasses.PII.Entity> piiResultV3,
                                               List <BingEntityData> bingEntityDataResult,
                                               SentimentV3Response sentimentV3Prediction,
                                               CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            var documentToProcess = Util.GetDocumentObject(category, documentName, textOcrResult,
                                                           keyPhrasesV2, keyPhrasesV3,
                                                           entitiesV2, entitiesV3,
                                                           pages, uri, documentType,
                                                           documentSizeInBytes, piiResultV2, bingEntityDataResult, sentimentV3Prediction,
                                                           cognitiveServicesApiCalls);

            var jsonString = JsonConvert.SerializeObject(documentToProcess);
            var fullEnrichedDocumentPath = category.ToLower() + @"\" + documentName.ToLower() + @"\fullEnrichedDocument.json";
            var outputFilePath           = Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS + @"\" + fullEnrichedDocumentPath;

            // File.WriteAllText does not create missing folders, so make sure the
            // <category>\<documentName> folder exists (CreateDirectory is a no-op when it does).
            var outputDirectory = System.IO.Path.GetDirectoryName(outputFilePath);
            if (!string.IsNullOrEmpty(outputDirectory))
            {
                System.IO.Directory.CreateDirectory(outputDirectory);
            }

            // Write JSON to Local Disk
            System.IO.File.WriteAllText(outputFilePath, jsonString);
        }
        /// <summary>
        /// Builds the enriched document and stores it in Cosmos DB via <c>CreateNewDoc</c>.
        /// When <c>Config.USE_AZURE_BLOB_STORAGE</c> is set, the JSON form of the document is
        /// first uploaded to the enrichment blob container as
        /// <c>&lt;category&gt;\&lt;documentName&gt;\fullEnrichedDocument.json</c> (lower-cased).
        /// </summary>
        /// <param name="documentDbClient">Client passed through to <c>CreateNewDoc</c>.</param>
        /// <param name="piiResultV3">Not used by this method.</param>
        public static void WriteToCosmosDbStorageSQLApi(DocumentClient documentDbClient, string category, string documentName,
                                                        string textOcrResult,
                                                        List <string> keyPhrasesV2, List <string> keyPhrasesV3,
                                                        List <string> entitiesV2, List <CognitiveServiceClasses.Entities.Entity> entitiesV3,
                                                        int pages, string uri, string documentType, long documentSizeInBytes,
                                                        PIIResult piiResultV2, List <CognitiveServiceClasses.PII.Entity> piiResultV3,
                                                        List <BingEntityData> bingEntityDataResult,
                                                        SentimentV3Response sentimentV3Prediction,
                                                        CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            // Assemble the payload once; it is used for both the blob copy and the Cosmos DB record.
            var enrichedDocument = Util.GetDocumentObject(
                category, documentName, textOcrResult,
                keyPhrasesV2, keyPhrasesV3,
                entitiesV2, entitiesV3,
                pages, uri, documentType, documentSizeInBytes,
                piiResultV2, bingEntityDataResult, sentimentV3Prediction,
                cognitiveServicesApiCalls);

            var serializedDocument = JsonConvert.SerializeObject(enrichedDocument);
            var blobName = $@"{category.ToLower()}\{documentName.ToLower()}\fullEnrichedDocument.json";

            if (Config.USE_AZURE_BLOB_STORAGE)
            {
                // Resolve the target blob inside the configured (lower-cased) container.
                var containerName = Config.STORAGE_TABLE_AND_CONTAINER_NAMES.ToLower();
                var blobClient    = AzureStorage.BlobStorageAccount.CreateCloudBlobClient();
                var container     = blobClient.GetContainerReference(containerName);
                var targetBlob    = container.GetBlockBlobReference(blobName);

                // Upload the JSON as UTF-8 bytes.
                byte[] payload = Encoding.UTF8.GetBytes(serializedDocument);
                using (MemoryStream payloadStream = new MemoryStream(payload))
                {
                    targetBlob.UploadFromStream(payloadStream);
                }
            }

            // Persist the document object itself (not the JSON string) to Cosmos DB.
            CreateNewDoc(documentDbClient, Config.COSMOSDB_DOCUMENTS_SELFLINK, enrichedDocument);
        }
        /// <summary>
        /// Writes a summary row for the document to Azure Table Storage
        /// (insert-or-replace, keyed by <paramref name="category"/> and <paramref name="documentName"/>).
        /// </summary>
        /// <remarks>
        /// Note: Azure Table Storage will not write records that are longer than 64Kb, so the
        /// large text fields are truncated before the entity is written. Each field is truncated
        /// based on its own size; <c>TextSize</c> still reports the size of the trimmed OCR text
        /// before truncation.
        /// </remarks>
        /// <param name="piiResultV2">PII result; only the item counts are stored. May be null.</param>
        /// <param name="piiResultV3">Not used by this method.</param>
        /// <param name="bingEntityDataResult">Bing entity data; may be null (no taxonomies stored).</param>
        /// <param name="sentimentV3Prediction">Sentiment response; only the first document's scores are stored.</param>
        public static void WriteToBlobStorageTable(WindowsAzureTable.CloudTable cloudTable, string category, string documentName, string ocrResult,
                                                   string keyPhraseResult, string distinctKeyPhraseString,
                                                   string entities, string distinctEntitiesString,
                                                   int pages, string uri, string documentType, long documentSizeInBytes,
                                                   PIIResult piiResultV2, List <CognitiveServiceClasses.PII.Entity> piiResultV3,
                                                   List <BingEntityData> bingEntityDataResult,
                                                   SentimentV3Response sentimentV3Prediction,
                                                   CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            // Size limits, in bytes of UTF-16 text (sizeof(char) bytes per character).
            const int ocrThresholdBytes         = 63999;
            const int fieldThresholdBytes       = 31999;
            const int truncatedSizeBytes        = 32000;
            const int distinctEntitiesMaxLength = 31999; // characters, as in the original Take(31999)

            // Returns value unchanged when it fits within thresholdBytes; otherwise cuts it
            // down to truncatedSizeBytes worth of characters.
            string TruncateToByteBudget(string value, int thresholdBytes)
            {
                if (value.Length * sizeof(char) <= thresholdBytes)
                {
                    return value;
                }

                var maxChars = truncatedSizeBytes / sizeof(char);
                return value.Substring(0, Math.Min(value.Length, maxChars));
            }

            // Treat missing text as empty so the size maths below cannot throw.
            ocrResult               = (ocrResult ?? string.Empty).Trim();
            keyPhraseResult         = (keyPhraseResult ?? string.Empty).Trim();
            distinctKeyPhraseString = distinctKeyPhraseString ?? string.Empty;
            entities                = entities ?? string.Empty;
            distinctEntitiesString  = distinctEntitiesString ?? string.Empty;

            var entityTaxonomies =
                (bingEntityDataResult is null) ? string.Empty :
                string.Join(" ;;;; ", bingEntityDataResult.Select(a => a.Taxony).ToArray());

            // Captured before truncation: this is the value reported as TextSize.
            var size = ocrResult.Length * sizeof(char);

            // Only for Table Storage API (CosmosDB can handle large values)
            ocrResult               = TruncateToByteBudget(ocrResult, ocrThresholdBytes);
            keyPhraseResult         = TruncateToByteBudget(keyPhraseResult, fieldThresholdBytes);
            distinctKeyPhraseString = TruncateToByteBudget(distinctKeyPhraseString, fieldThresholdBytes);
            entities                = TruncateToByteBudget(entities, fieldThresholdBytes);
            entityTaxonomies        = TruncateToByteBudget(entityTaxonomies, fieldThresholdBytes);

            // Create a new document entity.
            var document = new DocumentEntity(category, documentName);

            document.CognitiveServicesApiCallsApiCallCount   = cognitiveServicesApiCalls.ApiCallCount;
            document.CognitiveServicesApiCallsApiCallV2Count = cognitiveServicesApiCalls.ApiCallV2Count;
            document.CognitiveServicesApiCallsApiCallV3Count = cognitiveServicesApiCalls.ApiCallV3Count;
            document.CognitiveServicesApiCallsTotalCount     = cognitiveServicesApiCalls.TotalCount;
            // Trim again: truncation may have left trailing whitespace.
            document.OcrResult = ocrResult.Trim();
            document.TextAnalyticsKeyPhraseResult          = keyPhraseResult;
            document.TextAnalyticsDistinctKeyPhraseResult  = distinctKeyPhraseString;
            document.TextAnalyticsEntitiesResult           = entities;
            document.TextAnalyticsDistinctEntititesResult  =
                distinctEntitiesString.Substring(0, Math.Min(distinctEntitiesString.Length, distinctEntitiesMaxLength));
            document.TextAnalyticsEntitiesTaxonomiesResult = entityTaxonomies;
            document.TextSize            = size;
            document.Pages               = pages;
            document.Uri                 = uri;
            document.DocumentType        = documentType;
            document.DocumentSizeInBytes = documentSizeInBytes;

            // Counts are only recorded when the PII result is populated (Addresses acts as the
            // marker, as before); each individual list is still null-checked.
            if (piiResultV2?.Addresses != null)
            {
                document.PIIEmailsCount       = piiResultV2.Emails?.Count ?? 0;
                document.PIIAddressesCount    = piiResultV2.Addresses.Count;
                document.PIIPhoneNumbersCount = piiResultV2.PhoneNumbers?.Count ?? 0;
                document.PIISSNSCount         = piiResultV2.SSNs?.Count ?? 0;
            }

            // Only the first document's scores are stored; skip when there is no response
            // or the response contains no documents.
            if (sentimentV3Prediction?.Documents != null && sentimentV3Prediction.Documents.Any())
            {
                var scores = sentimentV3Prediction.Documents[0].DocumentScores;

                document.SentimentAnalysis =
                    "Positive: " + scores.Positive +
                    ", Neutral: " + scores.Neutral +
                    ", Negative: " + scores.Negative;
            }

            // Create the TableOperation object that inserts (or replaces) the document entity.
            var insertOperation = WindowsAzureTable.TableOperation.InsertOrReplace(document);

            // Execute the insert operation.
            cloudTable.Execute(insertOperation);
        }
        static void Main(string[] args)
        {
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine("Checking Configuration Values...");
            Console.ResetColor();

            Console.WriteLine("--------------------------------");

            // Check for valid directories
            if (string.IsNullOrEmpty(Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Local directory for source documents is blank.");
                Console.ReadLine();
                Environment.Exit(0);
            }

            if (!Directory.Exists(Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("AppConfig - Local directory for SOURCE documents: {0} does not exist.", Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS);
                Console.ReadLine();
                Environment.Exit(0);
            }

            if (!Directory.Exists(Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("AppConfig - Local directory for PROCESSED documents: {0} does not exist.", Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS);
                Console.ReadLine();
                Environment.Exit(0);
            }

            if (!Config.USE_COGNITIVE_SERVICES_V2 && !Config.USE_COGNITIVE_SERVICES_V3)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("AppConfig - You must have either Cognitive Services V2 or V3 enabled in the Config file.");
                Console.ReadLine();
                Environment.Exit(0);
            }

            if (string.IsNullOrEmpty(Config.COGNITIVE_SERVICES_KEY) || string.IsNullOrEmpty(Config.COGNITIVE_SERVICES_REGION_URI))
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("AppConfig - COGNITIVE_SERVICES_KEY or COGNITIVE_SERVICES_REGION is empty.");
                Console.ReadLine();
                Environment.Exit(0);
            }

            Console.WriteLine("Use Cognitive Services Bing Entity Search: " + Config.USE_COGNITIVE_SERVICES_BING_ENTITY_SEARCH);
            Console.WriteLine("Use Azure Blob Storage: " + Config.USE_AZURE_BLOB_STORAGE);
            Console.WriteLine("Use Azure Table Storage: " + Config.USE_AZURE_TABLE_STORAGE);
            Console.WriteLine("Use CosmosDB Storage: " + Config.USE_COSMOSDB_STORAGE);
            Console.WriteLine("--------------------------------");
            Console.WriteLine(string.Empty);

            Dictionary <string, Exception> errors        = new Dictionary <string, Exception>();
            Dictionary <string, int>       longDocuments = new Dictionary <string, int>();
            Dictionary <string, Tuple <string, string, string> > processedTrainingFiles = new Dictionary <string, Tuple <string, string, string> >(1700);

            var scoringTableEntities = new List <Microsoft.Azure.CosmosDB.Table.DynamicTableEntity>();
            var topThreeClassificationNamesDictionary         = new Dictionary <string, List <string> >();
            var topThreeClassificationProbabilitiesDictionary = new Dictionary <string, List <double> >();

            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine("Extracting content from documents...");
            Console.ResetColor();

            // List of types of extensions
            var fileTypes             = new List <Tuple <string, string> >();
            var currentFilesDirectory = string.Empty;
            var filePath = string.Empty;

            currentFilesDirectory = Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS;

            var docExt = new List <string> {
                ".DOC", ".DOCX", ".DOTX", ".DOT"
            };
            var files = Util.DirectoryTraverseForFiles(currentFilesDirectory).ToList();

            try
            {
                Console.WriteLine("--------------------------------");
                Console.WriteLine("Processing Files...");

                // 2) Process Files
                for (int fileNum = 0; fileNum != files.Count; fileNum++)
                {
                    // Cognitive Services API Calls
                    var cognitiveServicesApiCalls = new CognitiveServicesApiCalls();

                    // Retrieve the file path
                    filePath = files[fileNum];

                    // Retrieve the directory, file name & extension
                    var categoryAndFileNames = filePath.Replace(currentFilesDirectory, string.Empty)
                                               .Split(new string[] { "\\" }, StringSplitOptions.None);
                    var originalDocumentExtension = Path.GetExtension(filePath).ToUpper();

                    // Clean up file names
                    var category            = categoryAndFileNames[0];
                    var cleanCategory       = category.Replace(" ", "_").Replace(".", "_");
                    var fileName            = Path.GetFileName(filePath);
                    var cleanFileName       = fileName.Replace(" ", "_").Replace(".", "_");
                    var fileTotalOcr        = string.Empty;
                    var keyPhraseString     = string.Empty;
                    var entitiesString      = string.Empty;
                    var pages               = 0;
                    var uri                 = string.Empty;
                    var documentType        = "Unknown"; //type of document (i.e. PDF, Word, Excel etc.)
                    var documentSizeInBytes = 0L;

                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine("Processing file {0} : ID={1}  [{2} of {3}]", fileName, cleanFileName,
                                      files.IndexOf(filePath) + 1,
                                      files.Count);
                    Console.ResetColor();

                    // Group file types based on the required processing
                    fileTypes.Add(new Tuple <string, string>(fileName, originalDocumentExtension));
                    var wordDocuments = new List <string> {
                        ".DOC", ".DOCX", ".DOTX", ".DOT"
                    };
                    var excelDocuments = new List <string> {
                        ".XLS", ".XLSX", ".XLT", ".XLTX"
                    };
                    var htmlDocuments = new List <string> {
                        ".HTM", ".HTML", ".SHTML"
                    };

                    // Hold the values for the image pages
                    List <MemoryStream> imageStreams = new List <MemoryStream>();
                    // Hold the values for the OCR from pages
                    List <string> imagePagesOcr = new List <string>();

                    // Process/read document
                    MemoryStream documentStream = new MemoryStream();
                    using (var file = File.OpenRead(filePath))
                    {
                        documentSizeInBytes = new System.IO.FileInfo(filePath).Length;
                        var pdfDocumentPartFileNames = new List <string>();

                        // PDF Files
                        if (originalDocumentExtension == ".PDF")
                        {
                            documentType = "PDF";
                            // Setup PDFReader for unethical reading
                            PdfReader.unethicalreading = true;

                            // Setup PDF part cache location
                            var directoryCategoryPdfCache = Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS + @"\PDFCache\" + cleanCategory + @"\";
                            var directoryFilePdfCache     = directoryCategoryPdfCache + cleanFileName;
                            System.IO.Directory.CreateDirectory(directoryCategoryPdfCache.ToLower());
                            System.IO.Directory.CreateDirectory(directoryFilePdfCache.ToLower());

                            Console.WriteLine("\tCracking Document into Pages...");
                            using (PdfReader pdfReader = new PdfReader(filePath))
                            {
                                for (int pagenumber = 1; pagenumber <= pdfReader.NumberOfPages; pagenumber++)
                                {
                                    iTextSharp.text.Document iTextDocument = new iTextSharp.text.Document();

                                    var fullFilePdfPartName = directoryFilePdfCache + @"\" + cleanFileName + (pagenumber) + ".pdf";
                                    pdfDocumentPartFileNames.Add(fullFilePdfPartName);

                                    PdfCopy copy = new PdfCopy(iTextDocument, new FileStream(fullFilePdfPartName, FileMode.Create));

                                    iTextDocument.Open();
                                    copy.AddPage(copy.GetImportedPage(pdfReader, pagenumber));
                                    iTextDocument.Close();
                                }
                            }

                            Aspose.Pdf.Document document = new Aspose.Pdf.Document(file);
                            document.Save(documentStream);
                            pages = document.Pages.Count;
                            Console.WriteLine(string.Format("\tPages: {0}", pages));

                            //imageStreams.Add(document.ConvertPageToPNGMemoryStream(document.Pages[1]));
                        }
                        // Excel Files
                        else if (excelDocuments.Contains(originalDocumentExtension))
                        {
                            documentType = "Excel";
                            Aspose.Cells.Workbook workBook = new Aspose.Cells.Workbook(file);
                            pages = workBook.Worksheets.Count;
                            workBook.Save(documentStream, Aspose.Cells.SaveFormat.Pdf);
                        }
                        // HTML & Word Documents
                        else if (htmlDocuments.Contains(originalDocumentExtension) ||
                                 wordDocuments.Contains(originalDocumentExtension))
                        {
                            documentType = "Html";
                            Aspose.Words.Document document = new Aspose.Words.Document(file);
                            pages = document.PageCount;
                            document.Save(documentStream, Aspose.Words.SaveFormat.Pdf);
                        }

                        // Convert any documents into images
                        if (originalDocumentExtension == ".PDF")
                        {
                            foreach (var pdfDcoumentFileName in pdfDocumentPartFileNames)
                            {
                                var pdfDocument     = new Aspose.Pdf.Document(pdfDcoumentFileName);
                                var pdfDocumentPage = pdfDocument.Pages[1];

                                try
                                {
                                    imageStreams.Add(pdfDocument.ConvertPageToPNGMemoryStream(pdfDocumentPage));
                                }
                                catch (Exception e)
                                {
                                    Console.WriteLine("!!! ERROR !!!: Converting PDF To Image - " + pdfDcoumentFileName + " ||| " + e.ToString());
                                }
                            }
                        }
                        // Word Documents
                        else if (wordDocuments.Contains(originalDocumentExtension.ToUpper()))
                        {
                            documentType = "Word";
                            Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
                            var    wordDocument = new Microsoft.Office.Interop.Word.Document();
                            object missing      = System.Type.Missing;
                            wordDocument = wordApp.Documents.Open(filePath);

                            foreach (Microsoft.Office.Interop.Word.Window window in wordDocument.Windows)
                            {
                                foreach (Microsoft.Office.Interop.Word.Pane pane in window.Panes)
                                {
                                    // set the pages
                                    pages = pane.Pages.Count;

                                    for (var i = 1; i <= pane.Pages.Count; i++)
                                    {
                                        var bits = pane.Pages[i].EnhMetaFileBits;
                                        try
                                        {
                                            using (var ms = new MemoryStream((byte[])(bits)))
                                            {
                                                var imageStream = new MemoryStream();
                                                var image       = System.Drawing.Image.FromStream(ms);
                                                image.Save(imageStream, ImageFormat.Png);
                                                //ms.Position = 0;

                                                imageStreams.Add(imageStream);
                                            }
                                        }
                                        catch (System.Exception ex)
                                        {
                                            var error = ex.Message;
                                            throw (ex);
                                        }
                                    }
                                }
                            }
                            wordDocument.Close(Type.Missing, Type.Missing, Type.Missing);
                            wordApp.Quit(Type.Missing, Type.Missing, Type.Missing);
                        }
                        else if (documentStream != null && imageStreams.Count == 0)
                        {
                            var pdfDocument = new Aspose.Pdf.Document(documentStream);

                            // Done here as Aspose workaround for free version
                            for (int pageNum = 0; pageNum != pdfDocument.Pages.Count; pageNum++)
                            {
                                if (pageNum == 4)
                                {
                                    // Track documents over 4 pages
                                    longDocuments.Add(fileName, pdfDocument.Pages.Count);
                                    break;
                                }

                                // PageCollection starts at 1, not 0 index
                                imageStreams.Add(pdfDocument.ConvertPageToPNGMemoryStream(pdfDocument.Pages[pageNum + 1]));
                            }
                        }
                    }  // EOF reading file

                    Console.WriteLine("\tConverting Pages to Images...");
                    for (int i = 0; i != imageStreams.Count; i++)
                    {
                        //Console.WriteLine(string.Format("\tProcessing Image {0} of {1}", (i + 1), imageStreams.Count));

                        var imageStream = imageStreams[i];

                        // Cognitive Services Requirement: Convert to MB, Computer Vision images max out at 4 MB
                        // Fine for individual page documents
                        var megaBytes = Util.ConvertBytesToMegabytes(imageStream.Length);

                        do
                        {
                            var reducedImage = Util.ReduceImageQuality(imageStream);
                            imageStream = reducedImage;
                        }while(
                            Util.ConvertBytesToMegabytes(imageStream.Length) > 3.99
                            );


                        // Setup Image store location
                        var directoryCategory = Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS + @"\" + cleanCategory + @"\";
                        var directoryFile     = directoryCategory + cleanFileName;
                        var fullFileImageName = directoryFile + @"\" + cleanFileName + (i + 1) + ".png";
                        System.IO.Directory.CreateDirectory(directoryCategory.ToLower());
                        System.IO.Directory.CreateDirectory(directoryFile.ToLower());

                        // Save the image to local image folder (cache)
                        using (Bitmap image = (Bitmap)System.Drawing.Image.FromStream(imageStream))
                        {
                            var resizedImage = Util.ResizeImageForCognitiveOCR(image);
                            resizedImage.Save(fullFileImageName, ImageFormat.Png);
                        }

                        var basePath = cleanCategory + @"\" + cleanFileName + @"\" + cleanFileName + (i + 1);

                        // Set up cloud path for JSON content
                        var cloudImagePath = (basePath + ".png").ToLower();
                        var cloudOcrPath   = (basePath + ".json").ToLower();

                        // Use API that passes image binary directly
                        var ocrResult = CognitiveServices.VisionOCRResultBatchReadFromImageAsync(fullFileImageName, "v2.1").Result;
                        //var ocrResult = CognitiveServices.OCRResultBatchRead(imageUrl, "v2.1").Result;
                        cognitiveServicesApiCalls.ApiCallCount++;

                        if (Config.USE_AZURE_BLOB_STORAGE)
                        {
                            // Azure Storage Objects Init
                            var cloudStorageBloblClient = AzureStorage.BlobStorageAccount.CreateCloudBlobClient();
                            var enrichmentContainer     = cloudStorageBloblClient.GetContainerReference(Config.STORAGE_TABLE_AND_CONTAINER_NAMES.ToLower());
                            uri = enrichmentContainer.StorageUri.PrimaryUri + @"/" + cleanCategory + @"/" + cleanFileName + @"/";
                            var trainingImage    = enrichmentContainer.GetBlockBlobReference(cloudImagePath);
                            var trainingImageOcr = enrichmentContainer.GetBlockBlobReference(cloudOcrPath);

                            // Send PNG image to Blob Storage
                            using (var fs = new FileStream(fullFileImageName, FileMode.Open))
                            {
                                trainingImage.UploadFromStream(fs);
                            }

                            // Retrieve OCR keyPhraseResult and upload JSON to cloud
                            // Uses API based on cloud storage
                            var imageUrl = trainingImage.Uri.AbsoluteUri + Config.STORAGE_ACCOUNT_TEMP_SAS_KEY;

                            // Upload the JSON response to the blob containers
                            trainingImageOcr.UploadText(ocrResult.Item1);
                        }

                        // Write JSON to local disk
                        var jsonFileName = basePath + ".json";
                        System.IO.File.WriteAllText(Config.LOCAL_LOCATION_FILES_PROCESSED_OUTPUTS + @"\" + jsonFileName, ocrResult.Item1);


                        var ocrString = ocrResult.Item2.ToString();
                        imagePagesOcr.Add(ocrString);

                        // Console.WriteLine("Number of OCR Regions Found - " + ocrResult.Item2.regions.Count);
                    }

                    var ocrPhrases = new List <KeyValuePair <string, string> >();

                    foreach (var ocrItem in imagePagesOcr)
                    {
                        // remove the trial items from Aspose
                        var tempOcrItem = ocrItem.
                                          Replace("Evaluation Only. Created with Aspose.PDF. Copyright 2002-2019 Aspose Pty Ltd.", string.Empty).
                                          Replace("Evaluation Only. Created with Aspose.PDF. Copyright 2002-2018 Aspose Pty Ltd.", string.Empty).
                                          Replace("Evaluation Only. Created with Aspose.PDF", string.Empty).
                                          Replace("Evaluation Only. Created with Aspose.Words", string.Empty).
                                          Replace("Evaluation Only. Created with Aspose.Cells", string.Empty).
                                          Replace("Created with Aspose.Cells for .NET.Copyright 2003 - 2018", string.Empty).
                                          Replace("Copyright 2002-2019 Aspose Pty Ltd.", string.Empty).
                                          Replace("Copyright 2002-2018 Aspose Pty Ltd.", string.Empty).
                                          Replace("Aspose Pty Ltd.", string.Empty).
                                          Replace("Created with Aspose.", string.Empty).
                                          Replace("Copyright 2002-2018.", string.Empty).
                                          Replace("Copyright 2003-2018.", string.Empty).
                                          Replace("Copyright 2002-2018", string.Empty).
                                          Replace("Copyright 2003-2018", string.Empty).
                                          Replace("spose Pty Ltd.", string.Empty).
                                          Replace("Pty Ltd.", string.Empty).
                                          Replace("Evaluation With A", string.Empty).
                                          Replace("Evaluation With %002-2018 A", string.Empty).
                                          Replace("Evaluation Only.", string.Empty).
                                          Replace(".  Aspose Pty ", string.Empty).
                                          Replace("Evaluatqoh With %002-2018 A", string.Empty).
                                          Replace("Created with Aspose.Cells for .NET.Copyright 2003 - 2018 A", string.Empty).
                                          Replace("Evaluation Only• created 18 Aspose Pty", string.Empty).
                                          Replace("Evaluation Only. Created with Aspose.PDF", string.Empty).
                                          Replace("with Aspose. PDF", string.Empty).
                                          Replace("Aspose.PDF", string.Empty);
                        // add to main OCR file
                        fileTotalOcr += tempOcrItem + System.Environment.NewLine;

                        // Add to phrases to process (english)
                        if (tempOcrItem.Length > 5000)
                        {
                            var tempOcrItemOne = tempOcrItem.Substring(0, 5000);
                            var tempOcrItemTwo = tempOcrItem.Substring(5000, (tempOcrItem.Length - 5000));

                            // Add both
                            ocrPhrases.Add(new KeyValuePair <string, string>("en", tempOcrItemOne));
                            ocrPhrases.Add(new KeyValuePair <string, string>("en", tempOcrItemTwo));
                        }
                        else
                        {
                            ocrPhrases.Add(new KeyValuePair <string, string>("en", tempOcrItem));
                        }
                    }

                    List <string> keyPhrasesV2            = new List <string>();
                    List <string> entitiesV2              = new List <string>();
                    var           piiResultV2             = new PIIResult();
                    string        distinctKeyPhraseString = string.Empty;
                    string        distinctEntitiesString  = string.Empty;

                    if (Config.USE_COGNITIVE_SERVICES_V2)
                    {
                        // Key Phrases - V2
                        Console.WriteLine("\tKey Phrases V2...");
                        var keyPhraseResult = CognitiveServices.TextAnalyticsKeyPhrasesAndEntities(ocrPhrases, ref cognitiveServicesApiCalls);
                        keyPhrasesV2            = keyPhraseResult.Item1.Documents.SelectMany(i => i.KeyPhrases).Where(a => Helpers.IsEntity(a)).ToList();
                        distinctKeyPhraseString = string.Join(" ;;;; ", keyPhrasesV2.Distinct().ToArray());
                        keyPhraseString         = string.Join(" ;;;; ", keyPhrasesV2.ToArray());

                        // Entities - V2
                        Console.WriteLine("\tEntities V2...");
                        var entitiesRecords = keyPhraseResult.Item2.Documents.SelectMany(i => i.Entities).Where(a => Helpers.IsEntity(a.Name)).ToList();
                        entitiesV2             = entitiesRecords.Select(i => i.Name.Replace(System.Environment.NewLine, string.Empty).Trim()).ToList();
                        distinctEntitiesString = string.Join(" ;;;; ", entitiesV2.Distinct().ToArray());
                        entitiesString         = string.Join(" ;;;; ", entitiesV2.ToArray());

                        // PII Result - V2
                        Console.WriteLine("\tPII Information V2...");
                        piiResultV2 = CognitiveServices.TextAnalyticsPIIResultV2(ocrPhrases, ref cognitiveServicesApiCalls);
                    }

                    List <string> keyPhrasesV3 = new List <string>();
                    List <CognitiveServiceClasses.Entities.Entity> entitiesV3  = new List <CognitiveServiceClasses.Entities.Entity>();
                    List <CognitiveServiceClasses.PII.Entity>      piiResultV3 = new List <CognitiveServiceClasses.PII.Entity>();
                    SentimentV3Response sentimentV3Prediction = new SentimentV3Response();

                    if (Config.USE_COGNITIVE_SERVICES_V3)
                    {
                        // Key Phrases - V3
                        Console.WriteLine("\tKey Phrases V3...");
                        var textAnalyticsV3KeyPhrasesPrediction = CognitiveServices.TextAnalyticsKeyPhrasesV3PreviewAsync(ocrPhrases).Result;
                        keyPhrasesV3 = textAnalyticsV3KeyPhrasesPrediction.documents.SelectMany(a => a.keyPhrases).ToList();
                        cognitiveServicesApiCalls.ApiCallV3Count++;

                        // Entities - V3
                        Console.WriteLine("\tEntities V3...");
                        var textAnalyticsV3EntitiesPrediction = CognitiveServices.TextAnalyticsEntitiesV3PreviewAsync(ocrPhrases).Result;
                        entitiesV3 = textAnalyticsV3EntitiesPrediction.documents.SelectMany(a => a.entities).ToList();
                        cognitiveServicesApiCalls.ApiCallV3Count++;

                        // PIIs - V3
                        Console.WriteLine("\tPIIs V3...");
                        var textAnalyticsV3PIIPrediction = CognitiveServices.TextAnalyticsPIIV3PreviewAsync(ocrPhrases).Result;
                        piiResultV3 = textAnalyticsV3PIIPrediction.documents.SelectMany(a => a.entities).ToList();
                        cognitiveServicesApiCalls.ApiCallV3Count++;

                        // Sentiment Analysis - V3
                        Console.WriteLine("\tSentiment Analysis V3...");
                        var textAnalyticsInput = new TextAnalyticsInput()
                        {
                            Id   = "1",
                            Text = fileTotalOcr.Length > 5100 ? fileTotalOcr.Substring(0, 5100) : fileTotalOcr
                        };
                        var textAnalyticsInputs = new List <TextAnalyticsInput> {
                            textAnalyticsInput
                        };
                        sentimentV3Prediction = CognitiveServices.TextAnalyticsSentimentAnalysisV3PreviewAsync(textAnalyticsInputs).Result;
                        cognitiveServicesApiCalls.ApiCallV3Count++;
                    }


                    List <BingEntityData> entityTaxonyResult = new List <BingEntityData>();

                    if (Config.USE_COGNITIVE_SERVICES_BING_ENTITY_SEARCH)
                    {
                        Console.WriteLine("\tRetrieving Bing Entitites...");
                        entityTaxonyResult = CognitiveServices.BingEntities(entitiesV2);
                        Console.WriteLine("\tFinished Retrieving Bing Entitites.");
                    }


                    // Add up OCR pages & Process Key Entities
                    processedTrainingFiles.Add(cleanFileName, new Tuple <string, string, string>(cleanCategory, fileTotalOcr, keyPhraseString));

                    // Azure Table Storage
                    // Note: most attributes are truncated due to limitations for Azure Table Storage
                    // For large documents use: CosmosDb or blob storage
                    if (Config.USE_AZURE_TABLE_STORAGE)
                    {
                        var tableStorageClient = AzureStorage.BlobStorageAccount.CreateCloudTableClient();
                        Microsoft.WindowsAzure.Storage.Table.CloudTable table = tableStorageClient.GetTableReference(Config.STORAGE_TABLE_AND_CONTAINER_NAMES);
                        table.CreateIfNotExists();

                        Util.WriteToBlobStorageTable(table, cleanCategory, cleanFileName, fileTotalOcr,
                                                     keyPhraseString, distinctKeyPhraseString,
                                                     entitiesString, distinctEntitiesString,
                                                     pages, uri, documentType, documentSizeInBytes,
                                                     piiResultV2, piiResultV3,
                                                     entityTaxonyResult, sentimentV3Prediction,
                                                     cognitiveServicesApiCalls);
                        Console.WriteLine("\tPersisted: Azure Table Storage");
                    }

                    if (Config.USE_COSMOSDB_STORAGE)
                    {
                        // Cosmos DB - Documents Client
                        DocumentClient documentClient = new DocumentClient(Config.COSMOSDB_DOCUMENTS_URI, Config.COSMOSDB_DOCUMENTS_KEY);

                        //// CosmosDB SQL API
                        Util.WriteToCosmosDbStorageSQLApi(documentClient, cleanCategory, cleanFileName, fileTotalOcr,
                                                          keyPhrasesV2, keyPhrasesV3,
                                                          entitiesV2, entitiesV3,
                                                          pages, uri, documentType, documentSizeInBytes,
                                                          piiResultV2, piiResultV3,
                                                          entityTaxonyResult, sentimentV3Prediction,
                                                          cognitiveServicesApiCalls);
                        Console.WriteLine("\tPersisted: CosmosDB - SQL API");
                    }

                    //// CosmosDB SQL API
                    Util.WriteToLocalStorage(cleanCategory, cleanFileName, fileTotalOcr,
                                             keyPhrasesV2, keyPhrasesV3,
                                             entitiesV2, entitiesV3,
                                             pages, uri, documentType, documentSizeInBytes,
                                             piiResultV2, piiResultV3,
                                             entityTaxonyResult, sentimentV3Prediction,
                                             cognitiveServicesApiCalls);
                    Console.WriteLine("\tPersisted: Local Storage");
                }
                ;  // EOF for loop
            }
            catch (Exception e)
            {
                errors.Add(filePath, e);
                Console.WriteLine("!!! ERROR !!!: " + e.ToString());
            }

            // Write File
            // Util.WriteCsvFile(processedTrainingFiles);

            // Distribution of various file types (by extension)
            var fileTypeCounts = fileTypes.GroupBy(a => a.Item2).
                                 Select(group => new
            {
                Extension = group.Key.ToUpper(),
                Count     = group.Count()
            }).OrderByDescending(o => o.Count);


            // Print out errors file
            File.Delete(Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS + "Errors.txt");
            using (TextWriter tw = new StreamWriter(Config.LOCAL_LOCATION_FILES_SOURCE_DOCUMENTS + "Errors.txt"))
            {
                foreach (var error in errors)
                {
                    tw.WriteLine(error.Key);
                }
            }

            Console.WriteLine("Number of errors: " + errors.Count);
            Console.ReadLine();
        }
Beispiel #6
0
        /// <summary>
        /// Text Analytics - V2 - PII Result.
        /// Screens the text of every phrase with the Content Moderator text-moderation API
        /// and collects the e-mails, phone numbers, addresses and SSNs reported back.
        /// </summary>
        /// <param name="ocrPhrasesSamples">Key/value pairs whose <c>Value</c> is the text to screen; the key is not sent to the service.</param>
        /// <param name="cognitiveServicesApiCalls">Call counter; <c>ApiCallV2Count</c> is incremented once per ScreenText request.</param>
        /// <returns>A <see cref="PIIResult"/> with all detected e-mails, addresses, phone numbers and SSNs (duplicates are kept).</returns>
        public static PIIResult TextAnalyticsPIIResultV2(List <KeyValuePair <string, string> > ocrPhrasesSamples, ref CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            var emails       = new List <string>();
            var phoneNumbers = new List <string>();
            var addresses    = new List <string>();
            var ssnNumbers   = new List <string>();

            // Dispose the client once all requests have been sent.
            using (var contentModeratorClient = new ContentModeratorClient(new ApiKeyServiceClientCredentialsContentModerator())
            {
                Endpoint = Config.COGNITIVE_SERVICES_REGION_URI
            })
            {
                foreach (var phrase in ocrPhrasesSamples)
                {
                    // Send only the phrase text: KeyValuePair.ToString() would produce "[key, value]"
                    // and leak the key and the brackets into the screened text.
                    var textInput = phrase.Value;

                    if (textInput is null)
                    {
                        continue;
                    }

                    // Content Moderator accepts at most 1024 characters per request
                    var textsToModerate = Helpers.SplitAndPadFourtyChars(textInput, 980);

                    foreach (var truncatedTextForModerator in textsToModerate)
                    {
                        if (string.IsNullOrWhiteSpace(truncatedTextForModerator))
                        {
                            continue;
                        }

                        using (var textStream = new MemoryStream(Encoding.UTF8.GetBytes(truncatedTextForModerator)))
                        {
                            var result = contentModeratorClient.TextModeration.ScreenText("text/plain",
                                                                                          textStream,
                                                                                          string.Empty, true, true, null, true);
                            cognitiveServicesApiCalls.ApiCallV2Count++;

                            // No PII section in the response: nothing to collect for this chunk
                            var pii = result?.PII;

                            if (pii is null)
                            {
                                continue;
                            }

                            // Each category list is checked on its own so a missing one does not abort the rest
                            if (!(pii.Email is null))
                            {
                                emails.AddRange(pii.Email.Select(a => a.Text));
                            }

                            if (!(pii.Phone is null))
                            {
                                phoneNumbers.AddRange(pii.Phone.Select(a => a.Text));
                            }

                            if (!(pii.Address is null))
                            {
                                addresses.AddRange(pii.Address.Select(a => a.Text));
                            }

                            if (!(pii.SSN is null))
                            {
                                ssnNumbers.AddRange(pii.SSN.Select(a => a.Text));
                            }
                        }
                    }
                }
            }

            var piiResult = new PIIResult
            {
                Emails       = emails,
                Addresses    = addresses,
                PhoneNumbers = phoneNumbers,
                SSNs         = ssnNumbers
            };

            return(piiResult);
        }
Beispiel #7
0
        /// <summary>
        /// Text Analytics - V2 - Key Phrases &amp; Entities.
        /// Sends the samples to the key-phrase and entity endpoints in batches of 100 inputs
        /// and merges the per-batch responses into one result of each kind.
        /// </summary>
        /// <param name="keyPhrasesSamples">Key/value pairs; the key is passed as the input's language, the value as its text. Each input's id is its index in the list.</param>
        /// <param name="cognitiveServicesApiCalls">Call counter; <c>ApiCallV2Count</c> is incremented once per key-phrase request and once per entity request.</param>
        /// <returns>A tuple of the merged key-phrase result (Item1) and the merged entities result (Item2).</returns>
        public static Tuple <KeyPhraseBatchResult, EntitiesBatchResult> TextAnalyticsKeyPhrasesAndEntities(List <KeyValuePair <string, string> > keyPhrasesSamples, ref CognitiveServicesApiCalls cognitiveServicesApiCalls)
        {
            // Send batches of 100 inputs
            const int batchSize = 100;

            var keyPhraseBatchResults = new List <KeyPhraseBatchResult>();
            var entityBatchResults    = new List <EntitiesBatchResult>();

            // Build client API call; dispose the client once all batches have been sent
            using (ITextAnalyticsClient client = new TextAnalyticsClient(new ApiKeyServiceClientCredentials())
            {
                Endpoint = Config.COGNITIVE_SERVICES_REGION_URI
            })
            {
                var lengthofText = keyPhrasesSamples.Sum(sample => sample.Value.Length);
                Console.WriteLine(string.Format("\tCharacters: {0}", lengthofText));

                var multiLanguageInputs = keyPhrasesSamples.Select((v, i) => new MultiLanguageInput(v.Key, i.ToString(), v.Value)).ToList();

                // Ceiling division: an exact multiple of the batch size must not produce a trailing empty batch
                int batches = (multiLanguageInputs.Count + batchSize - 1) / batchSize;

                for (int i = 0; i < batches; i++)
                {
                    // set up the batch; the same input object is reused for both requests
                    var multiLanguageInputsToProcess = multiLanguageInputs.Skip(i * batchSize).Take(batchSize).ToList();
                    var multiLanguageBatch           = new MultiLanguageBatchInput(multiLanguageInputsToProcess);

                    Console.WriteLine(string.Format("\tProcessing Batch {0} of {1}", (i + 1), batches));

                    // key phrases result
                    var keyPhraseMiniBatchResult = client.KeyPhrasesAsync(true, multiLanguageBatch).Result;
                    keyPhraseBatchResults.Add(keyPhraseMiniBatchResult);
                    cognitiveServicesApiCalls.ApiCallV2Count++;

                    // entities result
                    var entitiesMiniBatchResult = client.EntitiesAsync(true, multiLanguageBatch).Result;
                    entityBatchResults.Add(entitiesMiniBatchResult);
                    cognitiveServicesApiCalls.ApiCallV2Count++;
                }
            }

            // Flatten the per-batch documents and errors into a single result of each kind
            var keyPhraseDocuments   = keyPhraseBatchResults.SelectMany(i => i.Documents).ToList();
            var keyPhraseErrors      = keyPhraseBatchResults.SelectMany(i => i.Errors).ToList();
            var keyPhraseBatchResult = new KeyPhraseBatchResult(keyPhraseDocuments, keyPhraseErrors);

            var entitiesDocuments   = entityBatchResults.SelectMany(i => i.Documents).ToList();
            var entitiesErrors      = entityBatchResults.SelectMany(i => i.Errors).ToList();
            var entitiesBatchResult = new EntitiesBatchResult(entitiesDocuments, entitiesErrors);

            return(new Tuple <KeyPhraseBatchResult, EntitiesBatchResult>(keyPhraseBatchResult, entitiesBatchResult));
        }