/// <summary>
/// Builds a full <see cref="Document"/> from a <see cref="LightDocument"/>,
/// copying the document metadata and re-creating every sentence and word.
/// </summary>
/// <param name="document">Light document to convert.</param>
/// <param name="factory">Factory used to create a word item from each word's text and tag.</param>
/// <returns>The newly constructed <see cref="Document"/>.</returns>
public static Document Construct(this LightDocument document, IWordFactory factory)
{
    var result = new Document(document.Text);
    result.Author = document.Author;
    result.DocumentTime = document.DocumentTime;
    result.Id = document.Id;

    // Copy the title onto the result. The previous code assigned
    // document.Title to itself, so the title was never transferred.
    result.Title = document.Title;

    foreach (var sentence in document.Sentences)
    {
        var resultSentence = new SentenceItem(sentence.Text);
        result.Add(resultSentence);

        // A sentence may carry no word array at all; it is then added empty.
        if (sentence.Words == null)
        {
            continue;
        }

        for (var i = 0; i < sentence.Words.Length; i++)
        {
            var word = sentence.Words[i];
            var wordItem = factory.CreateWord(word.Text, word.Tag);

            // The word index is the position of the word inside its sentence.
            wordItem.WordIndex = i;

            WordEx wordData = WordExFactory.Construct(wordItem);
            wordData.Phrase = word.Phrase;
            resultSentence.Add(wordData);
        }
    }

    return result;
}
/// <summary>
/// Produces a light document from the request text using
/// <see cref="SimpleWordsExtraction"/> over a freshly created tokenizer.
/// </summary>
/// <param name="request">Parse request whose document text is processed.</param>
/// <returns>The light representation of the extracted document.</returns>
protected override LightDocument ActualProcess(ParseRequest request)
{
    var extraction = new SimpleWordsExtraction(sentenceTokenizer.Create(true, false));
    var extracted = extraction.GetDocument(request.Document.Text);
    return extracted.GetLight();
}
/// <summary>
/// Stores a JSON clone of the document in the memory cache under both its
/// id key and its text-id key, with a one-minute sliding expiration.
/// </summary>
/// <param name="document">Document to cache.</param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public Task<bool> Save(LightDocument document)
{
    // Fail fast with a descriptive exception, matching the argument
    // validation done by the persistent Save implementation.
    if (document is null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    var cacheEntryOptions = new MemoryCacheEntryOptions()
        .SetSlidingExpiration(TimeSpan.FromMinutes(1));

    // Cache a clone so later changes to the caller's instance do not
    // affect the cached copy.
    document = document.CloneJson();

    // Save data in cache.
    cache.Set(document.GetId(), document, cacheEntryOptions);
    cache.Set(document.GetTextId(), document, cacheEntryOptions);
    return Task.FromResult(true);
}
/// <summary>
/// Processes a parse request: normalises the text, assigns an id when one is
/// missing, returns a cached result if available and otherwise runs
/// <c>ActualProcess</c> and stores the outcome in the cache.
/// </summary>
/// <param name="request">Request holding the document to process.</param>
/// <returns>The processed (or cached) light document carrying the request's id.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="request"/> or its <c>Document</c> is null.
/// </exception>
public async Task<LightDocument> Process(ParseRequest request)
{
    if (request?.Document == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    using (new PerformanceTrace(item => log.LogDebug(item), "Process"))
    {
        // A null text is treated as an empty document instead of failing with
        // a NullReferenceException; the empty case is handled further below.
        var text = request.Document.Text?.Trim() ?? string.Empty;
        request.Document.Text = text;

        if (string.IsNullOrWhiteSpace(request.Document.Id))
        {
            var tag = request.Document.Id = Guid.NewGuid().ToString();
            log.LogDebug("Key not found on document. generating: {0}...", tag);
        }

        LightDocument document = await cache.GetCached(request.Document).ConfigureAwait(false);
        if (document != null)
        {
            log.LogDebug("Cache HIT");

            // Work on a clone so the cached instance is not modified when the
            // id of the current request is stamped onto the result.
            document = document.CloneJson();
            document.Id = request.Document.Id;
            return document;
        }

        if (!string.IsNullOrEmpty(text))
        {
            // Run the processing through Task.Run rather than inline.
            document = await Task.Run(() => ActualProcess(request)).ConfigureAwait(false);
        }
        else
        {
            document = new LightDocument();
            document.Sentences = Array.Empty<LightSentence>();
            log.LogInformation("Empty document detected");
        }

        document.Id = request.Document.Id;
        document.DocumentTime = request.Document.DocumentTime;
        document.Author = request.Document.Author;
        document.Title = request.Document.Title;

        // The document is returned whether or not the cache reports success,
        // so the result of Save is not inspected.
        await cache.Save(document).ConfigureAwait(false);
        return document;
    }
}
/// <summary>
/// Builds a full <see cref="Document"/> from a <see cref="LightDocument"/>,
/// copying the document metadata and re-creating every sentence and word,
/// including each word's phrase and entity information.
/// </summary>
/// <param name="document">Light document to convert.</param>
/// <param name="factory">Factory used to create a word item from each word's text and tag.</param>
/// <returns>The newly constructed <see cref="Document"/>.</returns>
public static Document Construct(this LightDocument document, IWordFactory factory)
{
    var result = new Document(document.Text);
    result.Author = document.Author;
    result.DocumentTime = document.DocumentTime;
    result.Id = document.Id;

    // Copy the title onto the result. The previous code assigned
    // document.Title to itself, so the title was never transferred.
    result.Title = document.Title;

    foreach (var sentence in document.Sentences)
    {
        var resultSentence = new SentenceItem(sentence.Text);
        result.Add(resultSentence, false);

        // A sentence may carry no word array at all; it is then added empty.
        if (sentence.Words == null)
        {
            continue;
        }

        for (var i = 0; i < sentence.Words.Length; i++)
        {
            var word = sentence.Words[i];
            var wordItem = factory.CreateWord(word.Text, word.Tag);

            // The word index is the position of the word inside its sentence.
            wordItem.WordIndex = i;

            WordEx wordData = WordExFactory.Construct(wordItem);
            wordData.Phrase = word.Phrase;

            if (string.IsNullOrEmpty(word.Entity))
            {
                wordData.EntityType = NamedEntities.None;
            }
            else if (entityCache.TryGetValue(word.Entity, out var entity))
            {
                // Entity name found in the cache: use the mapped entity type.
                wordData.EntityType = entity;
            }
            else
            {
                // Entity name not in the cache: keep the raw text as a custom entity.
                wordData.CustomEntity = word.Entity;
            }

            resultSentence.Add(wordData);
        }
    }

    return result;
}
/// <summary>
/// Delegates the request to the current splitter, creating one through the
/// factory when none exists, and disposes the splitter once the number of
/// processed requests reaches <c>maxProcessing</c>.
/// </summary>
/// <param name="request">Request to process.</param>
/// <returns>The document produced by the underlying splitter.</returns>
public async Task<LightDocument> Process(ParseRequest request)
{
    if (splitter == null)
    {
        log.LogInformation("Constructing NEW {0} splitter...", id);

        // A new splitter starts with a zeroed request counter.
        Interlocked.Exchange(ref current, 0);
        splitter = factory();
    }

    LightDocument processed = await splitter.Process(request).ConfigureAwait(false);

    // Count this request and decide whether the splitter has reached its limit.
    bool limitReached = Interlocked.Increment(ref current) >= maxProcessing;
    if (limitReached)
    {
        splitter.Dispose();
        splitter = null;
    }

    return processed;
}
/// <summary>
/// Saves the document to the local cache and then adds it as a record through
/// the manager's client, indexed under "Index:All", its id key and its
/// text-id key.
/// </summary>
/// <param name="document">Document to persist; must have a non-empty id.</param>
/// <returns><c>true</c> once both saves have completed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
/// <exception cref="ArgumentException">The document id is null or empty.</exception>
public async Task<bool> Save(LightDocument document)
{
    if (document is null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    if (string.IsNullOrEmpty(document.Id))
    {
        throw new ArgumentException("Value cannot be null or empty id.", nameof(document.Id));
    }

    // Local cache first, then the repository record.
    await local.Save(document).ConfigureAwait(false);

    var objectKey = new ObjectKey(document.Id);
    var repositoryKey = new RepositoryKey(this, objectKey);

    var allIndex = new IndexKey(this, "Index:All", false);
    repositoryKey.AddIndex(allIndex);

    var idIndex = new IndexKey(this, $"Index:{document.GetId()}", true);
    repositoryKey.AddIndex(idIndex);

    var textIndex = new IndexKey(this, $"Index:{document.GetTextId()}", true);
    repositoryKey.AddIndex(textIndex);

    await manager.Client.AddRecord(repositoryKey, document).ConfigureAwait(false);
    return true;
}
/// <summary>
/// Checks the cache round trip: a null argument is rejected, an unsaved
/// document is a miss, and a saved document comes back as a different
/// instance, including when the lookup document has another id but the same
/// text.
/// </summary>
public async Task Test()
{
    // Null input must be rejected.
    Assert.Throws<ArgumentNullException>(() => instance.GetCached(null));

    var source = new LightDocument();
    source.Id = "Test";
    source.Text = "Test";

    // Nothing has been saved yet, so the lookup is a miss.
    var cached = await instance.GetCached(source).ConfigureAwait(false);
    Assert.IsNull(cached);

    // After saving, the lookup returns a different instance with the same text.
    await instance.Save(source).ConfigureAwait(false);
    cached = await instance.GetCached(source).ConfigureAwait(false);
    Assert.AreNotSame(source, cached);
    Assert.AreEqual("Test", cached.Text);

    // A different id with unchanged text still yields the cached document.
    source.Id = "2";
    cached = await instance.GetCached(source).ConfigureAwait(false);
    Assert.AreNotSame(source, cached);
    Assert.AreEqual("Test", cached.Text);
}
/// <summary>
/// Splits the request text into sentences, repairs and processes each one,
/// and returns a light document holding only the sentences for which
/// <c>ProcessSentence</c> returned a result.
/// </summary>
/// <param name="request">Parse request whose document text is processed.</param>
/// <returns>Light document with the original text and the processed sentences.</returns>
protected override LightDocument ActualProcess(ParseRequest request)
{
    // NOT Thread Safe
    var rawSentences = sentenceSplitter.Split(request.Document.Text).ToArray();

    var document = new LightDocument();
    document.Text = request.Document.Text;

    var processed = new LightSentence[rawSentences.Length];
    var count = 0;
    for (var i = 0; i < rawSentences.Length; i++)
    {
        var original = rawSentences[i];
        var repaired = repairHandler.Repair(original);
        if (original != repaired)
        {
            log.LogTrace("Sentence repaired!");
        }

        var sentence = ProcessSentence(repaired);
        if (sentence == null)
        {
            // Sentences that produce no result are dropped.
            continue;
        }

        processed[count++] = sentence;
    }

    // Trim the unused tail left behind by dropped sentences.
    if (count < processed.Length)
    {
        Array.Resize(ref processed, count);
    }

    document.Sentences = processed;
    return document;
}
/// <summary>
/// Assigns a new, empty <see cref="LightDocument"/> to the <c>document</c> field.
/// </summary>
public void Setup() => document = new LightDocument();
/// <summary>
/// Splits the request text into sentences, then tokenizes, tags and chunks
/// each repaired sentence, and assembles the results into a light document.
/// Sentences with no tokens or with whitespace-only text are left out.
/// </summary>
/// <param name="request">Parse request whose document text is processed.</param>
/// <returns>
/// Light document with the original text and a sentence array that contains
/// no null entries.
/// </returns>
protected override LightDocument ActualProcess(ParseRequest request)
{
    var sentences = sentenceSplitter.Split(request.Document.Text).ToArray();
    var sentenceDataList = new List<SentenceData>(sentences.Length);
    foreach (var sentence in sentences)
    {
        var text = repairHandler.Repair(sentence);
        if (sentence != text)
        {
            log.LogDebug("Sentence repaired!");
        }

        var sentenceData = new SentenceData { Text = text };
        sentenceData.Tokens = tokenizer.Tokenize(sentenceData.Text);
        if (sentenceData.Tokens.Length <= 0)
        {
            // Nothing to tag or chunk for a sentence without tokens.
            continue;
        }

        sentenceData.Tags = posTagger.Tag(sentenceData.Tokens);
        sentenceData.Chunks = chunker.ChunkAsSpans(sentenceData.Tokens, sentenceData.Tags).ToArray();
        sentenceDataList.Add(sentenceData);
    }

    // Collect into a list instead of a pre-sized array: a skipped sentence
    // previously left a null slot in document.Sentences.
    var lightSentences = new List<LightSentence>(sentenceDataList.Count);
    foreach (SentenceData sentenceData in sentenceDataList)
    {
        if (string.IsNullOrWhiteSpace(sentenceData.Text))
        {
            continue;
        }

        lightSentences.Add(BuildLightSentence(sentenceData));
    }

    var document = new LightDocument();
    document.Text = request.Document.Text;
    document.Sentences = lightSentences.ToArray();
    return document;
}

/// <summary>
/// Converts one tokenized, tagged and chunked sentence into a light sentence
/// whose words carry the tag, the text and, where a chunk covers the token,
/// the chunk type as phrase.
/// </summary>
private static LightSentence BuildLightSentence(SentenceData sentenceData)
{
    var currentSentence = new LightSentence();
    currentSentence.Text = sentenceData.Text;

    // Map every token index in [chunk.Start, chunk.End) to its chunk;
    // a later chunk overwrites an earlier one for the same index.
    var chunks = new Dictionary<int, Span>();
    foreach (Span chunk in sentenceData.Chunks)
    {
        for (var i = chunk.Start; i < chunk.End; i++)
        {
            chunks[i] = chunk;
        }
    }

    currentSentence.Words = new LightWord[sentenceData.Tokens.Length];
    for (var i = 0; i < sentenceData.Tokens.Length; i++)
    {
        var wordData = new LightWord();
        wordData.Tag = sentenceData.Tags[i];
        wordData.Text = sentenceData.Tokens[i];
        currentSentence.Words[i] = wordData;

        if (chunks.TryGetValue(i, out Span tokenChunk))
        {
            wordData.Phrase = tokenChunk.Type;
        }
    }

    return currentSentence;
}