/// <summary>
/// Draws a half-opaque yellow rectangle over the axis-aligned bounds of every line in
/// <paramref name="procesedDocument"/> on top of <paramref name="originalImage"/>, then writes
/// the result to <paramref name="markedUpImage"/> as a JPEG.
/// </summary>
/// <param name="procesedDocument">Document whose <c>Lines</c> supply the bounding boxes to highlight.</param>
/// <param name="originalImage">Stream holding the source image to mark up.</param>
/// <param name="markedUpImage">Destination blob; its content type is set to <c>image/jpeg</c> before writing.</param>
public static async Task HighlightHits(ProcessedDocument procesedDocument, Stream originalImage, CloudBlockBlob markedUpImage)
{
    using (Image <Rgba32> image = Image.Load <Rgba32>(originalImage))
    {
        foreach (var line in procesedDocument.Lines)
        {
            // Collapse the four corner points into an axis-aligned rectangle.
            int maxX = max(line.BoundingBox.x1, line.BoundingBox.x2, line.BoundingBox.x3, line.BoundingBox.x4);
            int maxY = max(line.BoundingBox.y1, line.BoundingBox.y2, line.BoundingBox.y3, line.BoundingBox.y4);
            int minX = min(line.BoundingBox.x1, line.BoundingBox.x2, line.BoundingBox.x3, line.BoundingBox.x4);
            int minY = min(line.BoundingBox.y1, line.BoundingBox.y2, line.BoundingBox.y3, line.BoundingBox.y4);

            int width = maxX - minX;
            int height = maxY - minY;

            // A degenerate box (all corners on one row or column) has no area to highlight;
            // skip it so a single bad line does not abort the whole mark-up
            // (the overlay image is expected to reject non-positive dimensions).
            if (width <= 0 || height <= 0)
            {
                continue;
            }

            using (Image <Rgba32> highlight = new Image <Rgba32>(new Configuration(), width, height, Rgba32.Yellow))
            {
                // Blend the yellow patch at 50% opacity so the underlying text stays readable.
                image.Mutate(x => { x.DrawImage(highlight, .5f, new SixLabors.Primitives.Point(minX, minY)); });
            }
        }

        markedUpImage.Properties.ContentType = "image/jpeg";
        using (var outboundStream = await markedUpImage.OpenWriteAsync())
        {
            image.Save(outboundStream, ImageFormats.Jpeg);
        }
    }
}
/// <summary>
/// Pre-processes <paramref name="document"/> and returns the topic the learning service assigns to it.
/// </summary>
/// <param name="document">Document to classify.</param>
/// <returns>The topic returned by the learning service's <c>Classify</c> call.</returns>
public async Task <string> TestDocument(Document document)
{
    // Both steps are synchronous; the method stays async to keep its Task-based signature.
    ProcessedDocument preProcessed = _iPreProcessTextService.PreProcessDocument(document);
    return _iLearningService.Classify(preProcessed);
}
/// <summary>
/// Pre-processes <paramref name="document"/>, feeds the result to the learning service for training,
/// and then adds the original document to the document repository.
/// </summary>
/// <param name="document">Document to train on and store.</param>
/// <returns>The value produced by the repository's <c>AddAsync</c> call.</returns>
public async Task <Document> TrainDocument(Document document)
{
    // Train first; the document is only stored once training has returned.
    _iLearningService.Train(_iPreProcessTextService.PreProcessDocument(document));

    Document stored = await _documentRepository.AddAsync(document);
    return stored;
}
/// <summary>
/// Locates a data field via <c>FindDataLine</c> and, when its value parses as a number after
/// <c>CLEANUPMONEY</c> has been applied, rewrites the value using the currency ("c") format.
/// </summary>
/// <param name="document">Processed document to search.</param>
/// <param name="name">Field name passed through to <c>FindDataLine</c>.</param>
/// <param name="title">Field title passed through to <c>FindDataLine</c>.</param>
/// <param name="mortgageApplication">Application document passed through to <c>FindDataLine</c>.</param>
/// <param name="titlesToFind">Candidate label texts passed through to <c>FindDataLine</c>.</param>
/// <returns>The located field (value reformatted when parseable), or <c>null</c> when nothing was found.</returns>
private static DataField FindMoneyLine(ProcessedDocument document, string name, string title, MortgageApplicationDocument mortgageApplication, params string[] titlesToFind)
{
    DataField field = FindDataLine(document, name, title, mortgageApplication, titlesToFind);
    if (field == null)
    {
        return null;
    }

    // Parsing and formatting both use the current culture; an unparseable value is left untouched.
    string cleaned = CLEANUPMONEY.Replace(field.Value, "");
    double amount;
    bool parsed = double.TryParse(cleaned, out amount);
    if (parsed)
    {
        field.Value = amount.ToString("c");
    }

    return field;
}
/// <summary>
/// Classifies <paramref name="document"/>: stores it in <c>_document</c>, refreshes the distinct-word
/// count, calculates the probability for each of the five known topics and returns the chosen class.
/// </summary>
/// <param name="document">Pre-processed document to classify.</param>
/// <returns>The value returned by <c>ChooseClass</c>.</returns>
public string Classify(ProcessedDocument document)
{
    Console.WriteLine("## Start classifying document ##");

    _document = document;

    //todo keep this value in cache
    _nrDistinctWords = _iWordRepository.GetTotalCountOfDistinctWords();

    // Topics are evaluated in this fixed order.
    string[] topics = { "business", "entertainment", "politics", "sport", "tech" };
    foreach (string topic in topics)
    {
        CalculateProbabilityPerTopic(topic);
    }

    Console.WriteLine("## Finished classifying document ##");

    string chosen = ChooseClass();
    return chosen;
}
/// <summary>
/// Runs the text-cleaning helpers over <paramref name="document"/>'s text, splits it into a bag of
/// words, filters that bag, and wraps the resulting word list in a <c>ProcessedDocument</c>.
/// </summary>
/// <param name="document">Source document whose <c>Text</c> is processed.</param>
/// <returns>A new <c>ProcessedDocument</c> built from the document and its word list.</returns>
public ProcessedDocument PreProcessDocument(Document document)
{
    Console.WriteLine("## Pre processing the document ##");

    string rawText = document.Text;

    // NOTE(review): nothing returned by these two helpers is captured here. Strings are immutable,
    // so unless the helpers work through some other channel, rawText reaches SplitInList
    // unchanged -- confirm against their definitions.
    LowerText(rawText);
    ReplaceNonAlphaNumericCharacters(rawText);

    List <string> words = SplitInList(rawText);

    // No return value is used from the filters, so any filtering has to happen on the list itself.
    RemoveSingleLetters(words);
    RemoveStopWords(words);

    var result = new ProcessedDocument(document, BuildWordList(words, document));

    Console.WriteLine("## Finished pre processing the document ##");
    return result;
}
/// <summary>
/// Blob-triggered entry point for parsed W-2 documents. Downloads the parsed JSON, extracts the
/// known W-2 fields into a <c>MortgageApplicationDocument</c>, uploads that document as JSON to the
/// matching <c>final/</c> blob and writes a marked-up copy of the original image.
/// </summary>
/// <param name="parsedBlob">Triggering blob under <c>parseddocuments/w2/</c> containing the parsed JSON.</param>
/// <param name="blobDirectory">The <c>parseddocuments</c> container, used to resolve the <c>final/</c> blob.</param>
/// <param name="originalImage">Read stream of the raw document image with the same name.</param>
/// <param name="markedUpImage">Destination blob for the highlighted image.</param>
/// <param name="log">Logger that receives any failure.</param>
/// <param name="context">Function execution context (not used by this method).</param>
/// <param name="cancellationToken">Host cancellation token (not used by this method).</param>
public static async Task Run(
    [BlobTrigger("parseddocuments/w2/{name}", Connection = "Documents")] CloudBlockBlob parsedBlob,
    [Blob("parseddocuments", Connection = "Documents")] CloudBlobContainer blobDirectory,
    [Blob("rawdocuments/{name}", FileAccess.Read)] Stream originalImage,
    [Blob("parseddocuments/markedup/{name}", FileAccess.ReadWrite)] CloudBlockBlob markedUpImage,
    ILogger log,
    Microsoft.Azure.WebJobs.ExecutionContext context,
    CancellationToken cancellationToken)
{
    try
    {
        MortgageApplicationDocument mortgageApplicationDocument = new MortgageApplicationDocument();
        mortgageApplicationDocument.PopuplateFromBlobProperties(parsedBlob);
        mortgageApplicationDocument.HasParsedResults = true;
        mortgageApplicationDocument.DocumentType = "w2";

        string jsonContents = await parsedBlob.DownloadTextAsync();
        ProcessedDocument document = JsonConvert.DeserializeObject <ProcessedDocument>(jsonContents);

        // The result lives next to the input: same blob name with "w2/" swapped for "final/".
        CloudBlockBlob finalBlob = blobDirectory.GetBlockBlobReference(parsedBlob.Name.Replace("w2/", "final/"));
        mortgageApplicationDocument.Status = MortgageApplicationStatus.Processed;
        mortgageApplicationDocument.DateProcessed = DateTimeOffset.UtcNow;
        mortgageApplicationDocument.SetBlobProperties(finalBlob);

        // Each call tries its candidate labels in order; FindDataLine appends a field only when
        // both a label line and a value line were found.
        FindDataLine(document, "SSN", "Social Security Number", mortgageApplicationDocument, "a. Employee's social security number", "social security number", "Employee's soc. sec. number", "Employee's social security number");
        FindMoneyLine(document, "Wages", "Total Wages", mortgageApplicationDocument, "Wages");
        // The label literal must sit on a single line: a regular string literal cannot span a line break.
        FindMoneyLine(document, "FedTax", "Federal Tax Witholding", mortgageApplicationDocument, "2. Federal income tax", "Federal income tax withheld");
        FindDataLine(document, "CopyType", "Copy Type", mortgageApplicationDocument, "Copy");
        FindMoneyLine(document, "SocialSecurityWage", "Social Security Wage", mortgageApplicationDocument, "Social security wages");
        FindMoneyLine(document, "SocialSecurityTax", "Social Security Tax", mortgageApplicationDocument, "Social security tax withheld");
        FindMoneyLine(document, "MedicareWage", "Medicare Wage", mortgageApplicationDocument, "Medicare wages and tips");
        FindMoneyLine(document, "MedicareTax", "Medicare Tax", mortgageApplicationDocument, "Medicare tax withheld");
        FindMoneyLine(document, "SocialSecurityTips", "Social Security Tips", mortgageApplicationDocument, "Social security tips");
        FindDataLine(document, "EmployerDetails", "Employer Details", mortgageApplicationDocument, "Employer's name, address, and ZIP code");
        // "middle initial" (previously misspelled "indial") so this fallback label can match a correctly read form label.
        FindDataLine(document, "EmployeeDetails", "Employee Details", mortgageApplicationDocument, "Employee's first name and initial", "Last name", "Employee's name, address, and ZIP code", "Employee's name (first, middle initial, last)");

        await finalBlob.UploadTextAsync(JsonConvert.SerializeObject(mortgageApplicationDocument));
        await MarkupService.HighlightHits(document, originalImage, markedUpImage);
    }
    catch (Exception ex)
    {
        // Failures are logged and not rethrown, so this method itself never faults.
        log.LogError(ex, "W2 Processor Failed");
    }
}
/// <summary>
/// Searches <paramref name="document"/> for the first line whose cleaned text contains one of
/// <paramref name="titlesToFind"/> (case-insensitive, titles tried in order), takes the line
/// <c>FindClosestsBelow</c> returns for that label as the value, and records the pair as a
/// <c>DataField</c> on <paramref name="mortgageApplication"/>.
/// </summary>
/// <param name="document">Processed document whose <c>Lines</c> are searched.</param>
/// <param name="name">Value stored in the field's <c>FieldName</c>.</param>
/// <param name="title">Value stored in the field's <c>FieldTitle</c>.</param>
/// <param name="mortgageApplication">Document that receives the new field; its <c>DataFields</c> list is created on demand.</param>
/// <param name="titlesToFind">Candidate label texts, in priority order.</param>
/// <returns>The field that was added, or <c>null</c> when no label or no value line was found.</returns>
private static DataField FindDataLine(ProcessedDocument document, string name, string title, MortgageApplicationDocument mortgageApplication, params string[] titlesToFind)
{
    ProcessedLine labelLine = null;
    foreach (var candidate in titlesToFind)
    {
        // Both sides go through CLEANUP before the ordinal, case-insensitive containment check.
        string wanted = CLEANUP.Replace(candidate, "");
        labelLine = document.Lines.FirstOrDefault(
            l => CLEANUP.Replace(l.Text, "").IndexOf(wanted, StringComparison.OrdinalIgnoreCase) > -1);
        if (labelLine != null)
        {
            break;
        }
    }

    if (labelLine == null)
    {
        return null;
    }

    var valueLine = labelLine.BoundingBox.FindClosestsBelow(document.Lines);
    if (valueLine == null)
    {
        return null;
    }

    DataField field = new DataField();
    field.FieldName = name;
    field.FieldTitle = title;
    field.LabelBox = labelLine.BoundingBox;
    field.ValueBox = valueLine.BoundingBox;
    field.Value = valueLine.Text;

    if (mortgageApplication.DataFields == null)
    {
        mortgageApplication.DataFields = new List <DataField>();
    }

    mortgageApplication.DataFields.Add(field);
    return field;
}
//todo: build a dict with distinct words before insert in the db to save time
/// <summary>
/// Trains on <paramref name="document"/> by handing its <c>Words</c> to the word repository.
/// </summary>
/// <param name="document">Pre-processed document whose words are added.</param>
public void Train(ProcessedDocument document) => _iWordRepository.AddMultipleWords(document.Words);