Ejemplo n.º 1
0
        /// <summary>
        /// Draws a half-transparent yellow rectangle over every line of the processed document
        /// and writes the marked-up picture to <paramref name="markedUpImage"/> as JPEG.
        /// </summary>
        /// <param name="procesedDocument">Document whose <c>Lines</c> carry the bounding boxes to highlight.</param>
        /// <param name="originalImage">Stream containing the source image.</param>
        /// <param name="markedUpImage">Destination blob; its content type is set to <c>image/jpeg</c>.</param>
        /// <returns>A task that completes once the image has been written to the blob stream.</returns>
        public static async Task HighlightHits(ProcessedDocument procesedDocument, Stream originalImage, CloudBlockBlob markedUpImage)
        {
            using (Image <Rgba32> image = Image.Load <Rgba32>(originalImage))
            {
                foreach (var line in procesedDocument.Lines)
                {
                    // The box is described by four corner points; reduce it to its axis-aligned extent.
                    int maxX = max(line.BoundingBox.x1, line.BoundingBox.x2, line.BoundingBox.x3, line.BoundingBox.x4);
                    int maxY = max(line.BoundingBox.y1, line.BoundingBox.y2, line.BoundingBox.y3, line.BoundingBox.y4);
                    int minX = min(line.BoundingBox.x1, line.BoundingBox.x2, line.BoundingBox.x3, line.BoundingBox.x4);
                    int minY = min(line.BoundingBox.y1, line.BoundingBox.y2, line.BoundingBox.y3, line.BoundingBox.y4);

                    int highlightWidth  = maxX - minX;
                    int highlightHeight = maxY - minY;

                    // A degenerate box has nothing to highlight, and an image cannot be created
                    // with a non-positive dimension, so skip it instead of failing the whole markup.
                    if (highlightWidth <= 0 || highlightHeight <= 0)
                    {
                        continue;
                    }

                    using (Image <Rgba32> highlight = new Image <Rgba32>(new Configuration(), highlightWidth, highlightHeight, Rgba32.Yellow))
                    {
                        image.Mutate(x =>
                        {
                            // .5f = 50 % opacity so the underlying text stays readable.
                            x.DrawImage(highlight, .5f, new SixLabors.Primitives.Point(minX, minY));
                        });
                    }
                }

                markedUpImage.Properties.ContentType = "image/jpeg";
                using (var outboundStream = await markedUpImage.OpenWriteAsync())
                {
                    image.Save(outboundStream, ImageFormats.Jpeg);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Pre-processes <paramref name="document"/> and asks the learning service to classify it.
        /// </summary>
        /// <param name="document">The document to classify.</param>
        /// <returns>The topic returned by the learning service.</returns>
        /// <remarks>
        /// The body contains no <c>await</c>, so all work runs synchronously before the task completes.
        /// </remarks>
        public async Task <string> TestDocument(Document document)
        {
            ProcessedDocument preProcessed = _iPreProcessTextService.PreProcessDocument(document);

            return _iLearningService.Classify(preProcessed);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Pre-processes <paramref name="document"/>, feeds the result to the learning service
        /// and then stores the original document in the repository.
        /// </summary>
        /// <param name="document">The document to train on and persist.</param>
        /// <returns>The value produced by the repository's <c>AddAsync</c> call.</returns>
        public async Task <Document> TrainDocument(Document document)
        {
            ProcessedDocument trainingInput = _iPreProcessTextService.PreProcessDocument(document);
            _iLearningService.Train(trainingInput);

            // Persist only after training has run, in the same order as before.
            Document stored = await _documentRepository.AddAsync(document);
            return stored;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Locates a data field via <c>FindDataLine</c> and, when its value parses as a number
        /// after <c>CLEANUPMONEY</c> has been applied, rewrites the value in currency format.
        /// </summary>
        /// <param name="document">Processed document to search.</param>
        /// <param name="name">Field name to store on the resulting data field.</param>
        /// <param name="title">Field title to store on the resulting data field.</param>
        /// <param name="mortgageApplication">Application document the field is attached to.</param>
        /// <param name="titlesToFind">Label texts to look for, in priority order.</param>
        /// <returns>The data field, or <c>null</c> when <c>FindDataLine</c> found nothing.</returns>
        /// <remarks>
        /// Both <c>double.TryParse</c> and <c>ToString("c")</c> use the current culture here,
        /// so the accepted input and the produced text depend on the host's culture settings.
        /// </remarks>
        private static DataField FindMoneyLine(ProcessedDocument document, string name, string title, MortgageApplicationDocument mortgageApplication, params string[] titlesToFind)
        {
            DataField moneyField = FindDataLine(document, name, title, mortgageApplication, titlesToFind);

            if (moneyField == null)
            {
                return null;
            }

            string cleanedValue = CLEANUPMONEY.Replace(moneyField.Value, "");
            double amount;

            // Leave the raw text untouched when it is not a parseable number.
            if (double.TryParse(cleanedValue, out amount))
            {
                moneyField.Value = amount.ToString("c");
            }

            return moneyField;
        }
        /// <summary>
        /// Classifies <paramref name="document"/>: stores it and the distinct-word count in
        /// instance fields, runs the per-topic probability calculation for each known topic
        /// and returns the result of <c>ChooseClass</c>.
        /// </summary>
        /// <param name="document">The pre-processed document to classify.</param>
        /// <returns>The class chosen by <c>ChooseClass</c>.</returns>
        public string Classify(ProcessedDocument document)
        {
            Console.WriteLine("## Start classifying document ##");

            _document = document;
            //todo keep this value in cache
            _nrDistinctWords = _iWordRepository.GetTotalCountOfDistinctWords();

            // Topics are evaluated in this fixed order.
            string[] topics = { "business", "entertainment", "politics", "sport", "tech" };
            foreach (string topic in topics)
            {
                CalculateProbabilityPerTopic(topic);
            }

            Console.WriteLine("## Finished classifying document ##");

            return ChooseClass();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Turns the text of <paramref name="document"/> into a word list and wraps it,
        /// together with the original document, in a <c>ProcessedDocument</c>.
        /// </summary>
        /// <param name="document">The document whose <c>Text</c> is processed.</param>
        /// <returns>A new <c>ProcessedDocument</c> built from the filtered words.</returns>
        public ProcessedDocument PreProcessDocument(Document document)
        {
            Console.WriteLine("## Pre processing the document ##");

            string rawText = document.Text;

            // NOTE(review): the results of these two calls are not captured. Strings are
            // immutable, so unless the helpers work through some other channel the text
            // reaching SplitInList is unchanged - confirm against the helper definitions.
            LowerText(rawText);
            ReplaceNonAlphaNumericCharacters(rawText);

            List <string> tokens = SplitInList(rawText);

            // Called for their effect on the list; nothing is returned to this method.
            RemoveSingleLetters(tokens);
            RemoveStopWords(tokens);

            ProcessedDocument result = new ProcessedDocument(document, BuildWordList(tokens, document));

            Console.WriteLine("## Finished pre processing the document ##");
            return result;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Blob-triggered entry point for parsed W-2 documents. Reads the parsed JSON, extracts the
        /// known W-2 fields into a <c>MortgageApplicationDocument</c>, uploads that document as JSON
        /// to the "final/" path and finally produces a marked-up copy of the original image.
        /// </summary>
        /// <param name="parsedBlob">Blob holding the serialized <c>ProcessedDocument</c> that triggered the run.</param>
        /// <param name="blobDirectory">Container used to obtain the reference to the final blob.</param>
        /// <param name="originalImage">Stream of the raw document image.</param>
        /// <param name="markedUpImage">Blob that receives the highlighted image.</param>
        /// <param name="log">Logger used to report failures.</param>
        /// <param name="context">Function execution context (not used in this method).</param>
        /// <param name="cancellationToken">Cancellation token (not used in this method).</param>
        /// <returns>A task representing the processing run.</returns>
        /// <remarks>
        /// Any exception is logged and not rethrown, so the method itself completes normally on failure.
        /// </remarks>
        public static async Task Run([BlobTrigger("parseddocuments/w2/{name}", Connection = "Documents")] CloudBlockBlob parsedBlob,
                                     [Blob("parseddocuments", Connection = "Documents")] CloudBlobContainer blobDirectory,
                                     [Blob("rawdocuments/{name}", FileAccess.Read)] Stream originalImage,
                                     [Blob("parseddocuments/markedup/{name}", FileAccess.ReadWrite)] CloudBlockBlob markedUpImage,
                                     ILogger log, Microsoft.Azure.WebJobs.ExecutionContext context, CancellationToken cancellationToken)
        {
            try
            {
                MortgageApplicationDocument mortgageApplicationDocument = new MortgageApplicationDocument();
                mortgageApplicationDocument.PopuplateFromBlobProperties(parsedBlob);
                mortgageApplicationDocument.HasParsedResults = true;
                mortgageApplicationDocument.DocumentType     = "w2";

                string jsonContents = await parsedBlob.DownloadTextAsync();

                ProcessedDocument document = JsonConvert.DeserializeObject <ProcessedDocument>(jsonContents);

                // The result lives next to the parsed input, under "final/" instead of "w2/".
                CloudBlockBlob finalBlob = blobDirectory.GetBlockBlobReference(parsedBlob.Name.Replace("w2/", "final/"));
                mortgageApplicationDocument.Status        = MortgageApplicationStatus.Processed;
                mortgageApplicationDocument.DateProcessed = DateTimeOffset.UtcNow;
                mortgageApplicationDocument.SetBlobProperties(finalBlob);

                // Each call appends the field to mortgageApplicationDocument when a matching label is found;
                // the returned DataField is not needed here.
                FindDataLine(document, "SSN", "Social Security Number", mortgageApplicationDocument, "a. Employee's social security number", "social security number", "Employee's soc. sec. number", "Employee's social security number");
                FindMoneyLine(document, "Wages", "Total Wages", mortgageApplicationDocument, "Wages");
                FindMoneyLine(document, "FedTax", "Federal Tax Witholding", mortgageApplicationDocument, "2. Federal income tax", "Federal income tax withheld");
                FindDataLine(document, "CopyType", "Copy Type", mortgageApplicationDocument, "Copy");
                FindMoneyLine(document, "SocialSecurityWage", "Social Security Wage", mortgageApplicationDocument, "Social security wages");
                FindMoneyLine(document, "SocialSecurityTax", "Social Security Tax", mortgageApplicationDocument, "Social security tax withheld");
                FindMoneyLine(document, "MedicareWage", "Medicare Wage", mortgageApplicationDocument, "Medicare wages and tips");
                FindMoneyLine(document, "MedicareTax", "Medicare Tax", mortgageApplicationDocument, "Medicare tax withheld");
                FindMoneyLine(document, "SocialSecurityTips", "Social Security Tips", mortgageApplicationDocument, "Social security tips");
                FindDataLine(document, "EmployerDetails", "Employer Details", mortgageApplicationDocument, "Employer's name, address, and ZIP code");
                // The misspelled "indial" variant is kept in case it mirrors an OCR misread; the correctly
                // spelled label is added last so it only applies when none of the earlier titles matched.
                FindDataLine(document, "EmployeeDetails", "Employee Details", mortgageApplicationDocument, "Employee's first name and initial", "Last name", "Employee's name, address, and ZIP code", "Employee's name (first, middle indial, last)", "Employee's name (first, middle initial, last)");

                await finalBlob.UploadTextAsync(JsonConvert.SerializeObject(mortgageApplicationDocument));

                await MarkupService.HighlightHits(document, originalImage, markedUpImage);
            }
            catch (Exception ex)
            {
                log.LogError(ex, "W2 Processor Failed");
            }
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Searches the document for the first line whose cleaned-up text contains one of the given
        /// titles (case-insensitive), takes the line returned by <c>FindClosestsBelow</c> for that
        /// label as the value, and records both as a <c>DataField</c> on the mortgage application.
        /// </summary>
        /// <param name="document">Processed document whose lines are searched.</param>
        /// <param name="name">Value for <c>DataField.FieldName</c>.</param>
        /// <param name="title">Value for <c>DataField.FieldTitle</c>.</param>
        /// <param name="mortgageApplication">Application whose <c>DataFields</c> list receives the new field.</param>
        /// <param name="titlesToFind">Label texts to try, in order; the first one that matches wins.</param>
        /// <returns>The newly added field, or <c>null</c> when no label or no value line was found.</returns>
        private static DataField FindDataLine(ProcessedDocument document, string name, string title, MortgageApplicationDocument mortgageApplication, params string[] titlesToFind)
        {
            ProcessedLine labelLine = null;

            foreach (string candidateTitle in titlesToFind)
            {
                // Titles and line texts are both passed through CLEANUP before being compared.
                string needle = CLEANUP.Replace(candidateTitle, "");

                labelLine = document.Lines.FirstOrDefault(
                    l => CLEANUP.Replace(l.Text, "").IndexOf(needle, StringComparison.OrdinalIgnoreCase) >= 0);

                if (labelLine != null)
                {
                    break;
                }
            }

            if (labelLine == null)
            {
                return null;
            }

            var valueLine = labelLine.BoundingBox.FindClosestsBelow(document.Lines);

            if (valueLine == null)
            {
                return null;
            }

            DataField result = new DataField();
            result.FieldName  = name;
            result.FieldTitle = title;
            result.LabelBox   = labelLine.BoundingBox;
            result.ValueBox   = valueLine.BoundingBox;
            result.Value      = valueLine.Text;

            // The list is created lazily on first use.
            if (mortgageApplication.DataFields == null)
            {
                mortgageApplication.DataFields = new List <DataField>();
            }

            mortgageApplication.DataFields.Add(result);

            return result;
        }
 //todo: build a dict with distinct words before insert in the db to save time
 /// <summary>
 /// Hands the words of <paramref name="document"/> to the word repository.
 /// </summary>
 /// <param name="document">Processed document whose <c>Words</c> are added.</param>
 public void Train(ProcessedDocument document)
 {
     var words = document.Words;

     _iWordRepository.AddMultipleWords(words);
 }