private LightSentence ProcessSentence(string text)
        {
            // Tokenize first: a sentence that yields no tokens carries no
            // information, so callers receive null for it.
            var tokens = tokenizer.Tokenize(text);
            if (tokens.Length == 0)
            {
                return null;
            }

            var tags = posTagger.Tag(tokens);

            // Build the sentence with one LightWord per token, pairing each
            // token with its POS tag by position.
            var sentence = new LightSentence
            {
                Text  = text,
                Words = new LightWord[tokens.Length]
            };

            for (var index = 0; index < tokens.Length; index++)
            {
                sentence.Words[index] = new LightWord
                {
                    Tag  = tags[index],
                    Text = tokens[index]
                };
            }

            // Enrich the assembled sentence with named entities and phrases.
            NERExtraction(sentence, tokens);
            PhraseExtraction(sentence, tokens, tags);
            return sentence;
        }
// Example #2
 /// <summary>
 /// Wraps a <see cref="LightWord"/>, copying its text, tag and phrase into
 /// this instance while keeping a reference to the underlying word.
 /// </summary>
 /// <param name="item">The lightweight word to wrap.</param>
 public WordEx(LightWord item)
 {
     UnderlyingWord = item;
     Phrase         = item.Phrase;
     Type           = item.Tag;
     Text           = item.Text;
 }
        /// <summary>
        /// Splits the request document into sentences, repairs and tags each
        /// one, and assembles a <see cref="LightDocument"/> whose words carry
        /// POS tags and (where available) phrase-chunk types.
        /// </summary>
        /// <param name="request">The parse request carrying the source document text.</param>
        /// <returns>A document whose <c>Sentences</c> array contains no null entries.</returns>
        protected override LightDocument ActualProcess(ParseRequest request)
        {
            // Phase 1: split, repair, and collect per-sentence NLP artifacts
            // (tokens, POS tags, chunk spans) for the build phase below.
            var sentences        = sentenceSplitter.Split(request.Document.Text).ToArray();
            var sentenceDataList = new List<SentenceData>(sentences.Length);

            foreach (var sentence in sentences)
            {
                var text = repairHandler.Repair(sentence);
                if (sentence != text)
                {
                    log.LogDebug("Sentence repaired!");
                }

                // Whitespace-only sentences carry no information; drop them
                // here so they never occupy a slot in the output.
                if (string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                var sentenceData = new SentenceData {
                    Text = text
                };
                sentenceData.Tokens = tokenizer.Tokenize(sentenceData.Text);
                if (sentenceData.Tokens.Length <= 0)
                {
                    continue;
                }

                sentenceData.Tags   = posTagger.Tag(sentenceData.Tokens);
                sentenceData.Chunks = chunker.ChunkAsSpans(sentenceData.Tokens, sentenceData.Tags).ToArray();
                sentenceDataList.Add(sentenceData);
            }

            // Phase 2: materialize LightSentence objects from the collected data.
            var lightSentences = new List<LightSentence>(sentenceDataList.Count);
            foreach (var sentenceData in sentenceDataList)
            {
                var currentSentence = new LightSentence();
                currentSentence.Text = sentenceData.Text;

                // Map each token index to its covering chunk span so the word
                // loop below can resolve phrases with an O(1) lookup.
                var chunks = new Dictionary<int, Span>();
                foreach (Span chunk in sentenceData.Chunks)
                {
                    for (var i = chunk.Start; i < chunk.End; i++)
                    {
                        chunks[i] = chunk;
                    }
                }

                currentSentence.Words = new LightWord[sentenceData.Tokens.Length];
                for (var i = 0; i < sentenceData.Tokens.Length; i++)
                {
                    var wordData = new LightWord();
                    wordData.Tag  = sentenceData.Tags[i];
                    wordData.Text = sentenceData.Tokens[i];
                    if (chunks.TryGetValue(i, out Span chunk))
                    {
                        wordData.Phrase = chunk.Type;
                    }

                    currentSentence.Words[i] = wordData;
                }

                lightSentences.Add(currentSentence);
            }

            // BUG FIX: the previous version pre-sized Sentences to the
            // candidate count and `continue`d past whitespace entries,
            // leaving null holes in the array that would NRE any consumer
            // iterating it. Building the array from accepted sentences
            // only guarantees every element is populated.
            var document = new LightDocument();
            document.Text      = request.Document.Text;
            document.Sentences = lightSentences.ToArray();
            return document;
        }