/// <summary>
/// Creates a log event from a text artifact, flattening the artifact's sensitive data values,
/// competitor names, keywords and entities into comma-separated strings.
/// </summary>
/// <param name="artifact">The text artifact to summarise; it is also passed to the base constructor.</param>
public DocumentLogEvent(TextArtifact artifact) : base(artifact)
{
    // string.Join returns "" for an empty sequence, whereas an unseeded Aggregate throws,
    // so it is both safer and consistent with the KeyWords/Entities handling below.
    PotentialSensitiveData = artifact.HasSensitiveData
        ? string.Join(",", artifact.SensitiveData.Values)
        : "";
    CompetitorsNamePresent = string.Join(",", artifact.CompetitorNamesPresent);
    KeyWords = string.Join(",", artifact.KeyWords.ToArray());
    Entities = string.Join(",", artifact.Entities.ToArray());

    // Guard the cast: HasFileSource alone does not prove Source is a FileArtifact,
    // and dereferencing a failed 'as' cast would throw NullReferenceException.
    FileArtifact fileSource = artifact.HasFileSource ? artifact.Source as FileArtifact : null;
    UserOp = fileSource != null ? fileSource.UserOp.ToString() : string.Empty;

    Global.Logger.Debug(
        "Created Azure Log Analytics log event {0} for text artifact {1} from user op {2} at {3}.",
        Name, artifact.Id, UserOp, DateTime.Now);
}
/// <summary>
/// Runs Tesseract OCR over the bitmap carried by <paramref name="message"/>, collecting the
/// trimmed text of every non-empty paragraph. When at least seven paragraphs are recognized the
/// text is stored on the message and enqueued as a new <c>TextArtifact</c>; otherwise the image
/// is only logged as a likely photo / non-text image (or as containing no text at all).
/// </summary>
/// <param name="message">Image artifact whose bitmap is locked read-only for the duration of the OCR pass.</param>
/// <returns><c>ApiResult.Success</c> whenever the method completes without throwing.</returns>
protected override ApiResult ProcessClientQueueMessage(ImageArtifact message)
{
    // Fewer recognized paragraphs than this and the image is treated as a photo / non-text image.
    const int minParagraphs = 7;

    BitmapData bData = message.Image.LockBits(
        new Rectangle(0, 0, message.Image.Width, message.Image.Height),
        ImageLockMode.ReadOnly,
        message.Image.PixelFormat);
    List<string> text = new List<string>();

    try
    {
        int w = bData.Width, h = bData.Height, bpp = Image.GetPixelFormatSize(bData.PixelFormat) / 8;
        unsafe
        {
            // Tesseract reads straight from the locked pixel buffer.
            TesseractImage.SetImage(new UIntPtr(bData.Scan0.ToPointer()), w, h, bpp, bData.Stride);
        }
        Pix = TesseractImage.GetInputImage();
        Debug("Pix has width: {0} height: {1} depth: {2} xres: {3} yres: {4}.", Pix.Width, Pix.Height, Pix.Depth, Pix.XRes, Pix.YRes);

        using (var op = Begin("Tesseract OCR (fast)"))
        {
            TesseractImage.Recognize();
            ResultIterator resultIterator = TesseractImage.GetIterator();
            PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;
            do
            {
                string paragraph = resultIterator.GetUTF8Text(pageIteratorLevel);
                if (!paragraph.IsEmpty())
                {
                    text.Add(paragraph.Trim());
                }
            }
            while (resultIterator.Next(pageIteratorLevel));

            if (text.Count == 0)
            {
                Info("No text recognized in artifact id {0}.", message.Id);
            }
            else if (text.Count < minParagraphs)
            {
                Info("Artifact id {0} is likely a photo or non-text image.", message.Id);
            }
            else
            {
                message.OCRText = text;
                // Only build the joined string when it is actually logged.
                Info("OCR Text: {0}", string.Join(" ", text).Trim());
            }
            op.Complete();
        }
    }
    finally
    {
        // Always release the bitmap lock, even if OCR or logging throws.
        message.Image.UnlockBits(bData);
    }

    if (text.Count >= minParagraphs)
    {
        TextArtifact artifact = new TextArtifact(message.Name + ".txt", text);
        EnqueueMessage(artifact);
        Info("{0} added artifact id {1} of type {2} from artifact {3}.", Name, artifact.Id, artifact.GetType(), message.Id);
    }
    return ApiResult.Success;
}
/// <summary>
/// Runs Tesseract OCR over the bitmap carried by <paramref name="message"/>. Each recognized
/// paragraph is passed through <c>TextArtifact.GetAlphaNumericString</c> and then reduced to the
/// words that are numbers (per <c>TextArtifact.IsNumber</c>), are longer than three characters,
/// or appear in the "common_words_en_3grams" dictionary. When at least seven non-empty paragraphs
/// remain, the text is stored on the message and enqueued as a new <c>TextArtifact</c> that is
/// cross-linked with the source image artifact.
/// </summary>
/// <param name="message">Image artifact whose bitmap is locked read-only for the duration of the OCR pass.</param>
/// <returns><c>ApiResult.Success</c> whenever the method completes without throwing.</returns>
protected override ApiResult ProcessClientQueueMessage(ImageArtifact message)
{
    // Fewer retained paragraphs than this and the image is treated as a photo / non-text image.
    const int minParagraphs = 7;

    BitmapData bData = message.Image.LockBits(
        new Rectangle(0, 0, message.Image.Width, message.Image.Height),
        ImageLockMode.ReadOnly,
        message.Image.PixelFormat);
    List<string> text = new List<string>();

    try
    {
        int w = bData.Width, h = bData.Height, bpp = Image.GetPixelFormatSize(bData.PixelFormat) / 8;
        unsafe
        {
            // Tesseract reads straight from the locked pixel buffer.
            TesseractImage.SetImage(new UIntPtr(bData.Scan0.ToPointer()), w, h, bpp, bData.Stride);
        }
        Pix = TesseractImage.GetInputImage();
        Debug("Pix has width: {0} height: {1} depth: {2} xres: {3} yres: {4}.", Pix.Width, Pix.Height, Pix.Depth, Pix.XRes, Pix.YRes);

        using (var op = Begin("Tesseract OCR (fast)"))
        {
            TesseractImage.Recognize();
            ResultIterator resultIterator = TesseractImage.GetIterator();
            PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;
            do
            {
                string cleaned = TextArtifact.GetAlphaNumericString(resultIterator.GetUTF8Text(pageIteratorLevel));
                // Drop short tokens unless they are numbers or known common words;
                // presumably this filters OCR noise — confirm against the dictionary's contents.
                string filtered = string.Join(" ",
                        cleaned.Split(new[] { ' ', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                               .Where(word => TextArtifact.IsNumber(word)
                                              || word.Length > 3
                                              || Pipeline.Dictionaries["common_words_en_3grams"].Contains(word)))
                    .Trim();
                if (!filtered.IsEmpty())
                {
                    text.Add(filtered);
                }
            }
            while (resultIterator.Next(pageIteratorLevel));

            if (text.Count == 0)
            {
                Info("No text recognized in artifact id {0}.", message.Id);
            }
            else if (text.Count < minParagraphs)
            {
                Info("Artifact id {0} is likely a photo or non-text image.", message.Id);
            }
            else
            {
                message.OCRText = text;
                // Only build the joined string when it is actually logged.
                Info("OCR Text: {0}", string.Join(" ", text).Trim());
            }
            op.Complete();
        }
    }
    finally
    {
        // Always release the bitmap lock, even if OCR, filtering or logging throws.
        message.Image.UnlockBits(bData);
    }

    if (text.Count >= minParagraphs)
    {
        TextArtifact artifact = new TextArtifact(message.Name + ".txt", string.Join(Environment.NewLine, text));
        // Carry the capture context over from the image and link the two artifacts both ways.
        artifact.Source = message.Source;
        artifact.CurrentProcess = message.CurrentProcess;
        artifact.CurrentWindowTitle = message.CurrentWindowTitle;
        artifact.Image = message;
        message.TextArtifact = artifact;
        EnqueueMessage(artifact);
        Info("{0} added artifact id {1} of type {2} from artifact {3}.", Name, artifact.Id, artifact.GetType(), message.Id);
    }
    return ApiResult.Success;
}