/// <summary>
/// Tokenizes and POS-tags a single sentence and wraps the result in a <c>LightSentence</c>.
/// </summary>
/// <param name="text">Raw sentence text handed to the tokenizer.</param>
/// <returns>
/// The populated sentence, or <c>null</c> when the tokenizer produces no tokens.
/// </returns>
private LightSentence ProcessSentence(string text)
{
    var tokens = tokenizer.Tokenize(text);
    if (tokens.Length < 1)
    {
        // Nothing to tag: callers receive null for an empty token stream.
        return null;
    }

    var tags = posTagger.Tag(tokens);
    var sentence = new LightSentence
    {
        Text = text,
        Words = new LightWord[tokens.Length]
    };

    // One word per token, carrying the tag found at the same position.
    var position = 0;
    while (position < tokens.Length)
    {
        sentence.Words[position] = new LightWord
        {
            Tag = tags[position],
            Text = tokens[position]
        };
        position++;
    }

    // Both extraction passes receive the freshly built sentence before it is returned.
    NERExtraction(sentence, tokens);
    PhraseExtraction(sentence, tokens, tags);
    return sentence;
}
/// <summary>
/// Creates a <c>WordEx</c> that wraps the given word and copies its text, tag and phrase.
/// </summary>
/// <param name="item">Source word; kept as <c>UnderlyingWord</c>.</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="item"/> is <c>null</c>.
/// </exception>
public WordEx(LightWord item)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised midway through initialization.
    if (item == null)
    {
        throw new System.ArgumentNullException(nameof(item));
    }

    UnderlyingWord = item;
    Text = item.Text;
    Type = item.Tag;
    Phrase = item.Phrase;
}
/// <summary>
/// Splits the request's document text into sentences, repairs, tokenizes, POS-tags and chunks
/// each one, and assembles the results into a <c>LightDocument</c>.
/// </summary>
/// <param name="request">Parse request whose <c>Document.Text</c> is processed.</param>
/// <returns>
/// A document holding the original text and one <c>LightSentence</c> per retained sentence.
/// Sentences that yield no tokens or have blank text are omitted, so the returned
/// <c>Sentences</c> array contains no <c>null</c> entries.
/// </returns>
protected override LightDocument ActualProcess(ParseRequest request)
{
    var sentences = sentenceSplitter.Split(request.Document.Text).ToArray();
    var sentenceDataList = new List<SentenceData>(sentences.Length);
    foreach (var sentence in sentences)
    {
        var text = repairHandler.Repair(sentence);
        if (sentence != text)
        {
            log.LogDebug("Sentence repaired!");
        }

        var sentenceData = new SentenceData { Text = text };
        sentenceData.Tokens = tokenizer.Tokenize(sentenceData.Text);
        if (sentenceData.Tokens.Length <= 0)
        {
            // Nothing to tag or chunk for an empty token stream.
            continue;
        }

        sentenceData.Tags = posTagger.Tag(sentenceData.Tokens);
        sentenceData.Chunks = chunker.ChunkAsSpans(sentenceData.Tokens, sentenceData.Tags).ToArray();
        sentenceDataList.Add(sentenceData);
    }

    // Collect into a list rather than a pre-sized array: a skipped sentence would
    // otherwise leave a null slot in document.Sentences.
    var builtSentences = new List<LightSentence>(sentenceDataList.Count);
    foreach (SentenceData data in sentenceDataList)
    {
        if (string.IsNullOrWhiteSpace(data.Text))
        {
            continue;
        }

        // Map every token index covered by a chunk to that chunk. The loop treats
        // End as exclusive; a later chunk overwrites an earlier one on overlap.
        var chunkByToken = new Dictionary<int, Span>();
        foreach (Span span in data.Chunks)
        {
            for (var tokenIndex = span.Start; tokenIndex < span.End; tokenIndex++)
            {
                chunkByToken[tokenIndex] = span;
            }
        }

        var currentSentence = new LightSentence();
        currentSentence.Text = data.Text;
        currentSentence.Words = new LightWord[data.Tokens.Length];
        for (var i = 0; i < data.Tokens.Length; i++)
        {
            var wordData = new LightWord();
            wordData.Tag = data.Tags[i];
            wordData.Text = data.Tokens[i];
            currentSentence.Words[i] = wordData;

            // Tokens outside every chunk keep their default Phrase value.
            if (chunkByToken.TryGetValue(i, out Span chunk))
            {
                wordData.Phrase = chunk.Type;
            }
        }

        builtSentences.Add(currentSentence);
    }

    var document = new LightDocument();
    document.Text = request.Document.Text;
    document.Sentences = builtSentences.ToArray();
    return document;
}