Example #1
        public void Parse(IDocument document)
        {
            if (!document.Spans.Any())
            {
                document.AddSpan(0, document.Length - 1);
            }

            foreach (ISpan s in document.Spans)
            {
                try
                {
                    Parse(s);
                }
                catch (InvalidOperationException ex)
                {
                    Logger.LogError(ex, "Error tokenizing document:\n'{TEXT}'", document.Value);
                    document.Clear();
                    return; //Spans was just cleared; continuing this foreach over document.Spans would enumerate a modified collection
                }
            }
        }
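
A note on Example #1: document.Clear() is called while a foreach is still enumerating document.Spans, which is why the catch block returns immediately afterwards. If Spans is backed by a standard .NET collection such as List<T> (an assumption; the IDocument implementation is not shown here), mutating it mid-enumeration makes the next MoveNext() throw. The self-contained sketch below uses only a plain List<string> to demonstrate the hazard:

        using System;
        using System.Collections.Generic;

        public static class EnumerationHazardDemo
        {
            public static void Main()
            {
                var spans = new List<string> { "span1", "span2" };
                try
                {
                    foreach (var s in spans)
                    {
                        spans.Clear(); //Mirrors document.Clear() inside the foreach in Example #1
                    }
                }
                catch (InvalidOperationException ex)
                {
                    //List<T> enumerators detect the mutation on the next MoveNext() and throw:
                    //"Collection was modified; enumeration operation may not execute."
                    Console.WriteLine(ex.Message);
                }
            }
        }
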
Example #2
        public void Parse(IDocument document)
        {
            if (document.Length == 0)
            {
                return;
            }

            if (document.Spans.Count() != 1)
            {
                return; //The document has already been tokenized and passed through sentence detection once, so ignore the second call
            }

            var tokens = document.Spans.First().Tokens.ToArray();

            if (tokens.Length == 0)
            {
                return;
            }

            bool hasReplacements = false;

            //NOTE: The result of this loop is not used here; accessing Replacement forces each token
            //      to cache its replacement value, which it cannot retrieve later once re-added to the document.
            for (int i = 0; i < tokens.Length; i++)
            {
                hasReplacements |= (tokens[i].Replacement is not null);
            }

            var text = document.Value.AsSpan();

            const int padding = 2;

            var paddedTokens = new List<IToken>(tokens.Length + 2 * padding);
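            //Two sentinel tokens on each side keep the feature window used below within
            //bounds at the first and last real tokens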

            paddedTokens.Add(SpecialToken.BeginToken);
            paddedTokens.Add(SpecialToken.BeginToken);
            paddedTokens.AddRange(tokens);
            paddedTokens.Add(SpecialToken.EndToken);
            paddedTokens.Add(SpecialToken.EndToken);
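            //Resulting layout: [Begin, Begin, tokens[0..n-1], End, End]; real token k sits at padded index k + padding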

            int N = paddedTokens.Count;

            var isSentenceEnd = new bool[N]; //Indexed by padded-token position; true where a sentence boundary is predicted

            for (int i = padding + 1; i < N - padding - 1; i++) //Skip BeginTokens and EndTokens, and first and last token of sentence
            {
                if (paddedTokens[i].ValueAsSpan.IsSentencePunctuation())
                {
                    var features = GetFeatures(paddedTokens, i);
                    isSentenceEnd[i] = PredictTagFromFeatures(features, Data.Weights);
                }
            }
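
            //With boundary predictions in hand, rebuild the document: Clear() below detaches
            //every span, and new spans are then added at the predicted sentence boundaries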

            document.Clear();

            //Now split the original document at the right places

            //If any sentence end was detected within the single span (i.e. ignoring the first and last tokens)
            if (isSentenceEnd.AsSpan().Slice(padding + 1, tokens.Length - 1).IndexOf(true) >= 0)
            {
                int offset = 0;
                for (int i = padding; i < N - padding; i++)
                {
                    if (isSentenceEnd[i])
                    {
                        int b = offset;
                        int e = tokens[i - padding].End;
                        if (e < b)
                        {
                            continue;
                        }
                        while (b < e && char.IsWhiteSpace(text[b]))
                        {
                            b++;
                        }

                        while (e > b && char.IsWhiteSpace(text[e]))
                        {
                            e--;
                        }

                        try
                        {
                            if (!text.Slice(b, e - b + 1).IsNullOrWhiteSpace())
                            {
                                var span = document.AddSpan(b, e);
                                foreach (var t in tokens)
                                {
                                    if (t.Begin >= span.Begin && t.End <= span.End)
                                    {
                                        span.AddToken(t); //Re-add the tokens back in the document
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            Logger.LogCritical(ex, "Failed to tokenize: b={b} e={e} l={l} offset={offset} tEnd={tEnd} i={i} tCount={tCount}", b, e, text.Length, offset, tokens[i - padding].End, i, tokens.Length);
                            throw;
                        }
                        offset = e + 1;
                    }
                }
                if (offset <= document.Length - 1)
                {
                    int b = offset;
                    int e = document.Length - 1;
                    while (b < e && char.IsWhiteSpace(text[b]))
                    {
                        b++;
                    }
                    while (e > b && char.IsWhiteSpace(text[e]))
                    {
                        e--;
                    }

                    if (!text.Slice(b, e - b + 1).IsNullOrWhiteSpace())
                    {
                        var span = document.AddSpan(b, e);
                        foreach (var t in tokens)
                        {
                            if (t.Begin >= span.Begin && t.End <= span.End)
                            {
                                span.AddToken(t);
                            }
                        }
                    }
                }
            }
            else
            {
                int b = 0;
                int e = document.Length - 1;
                while (b < e && char.IsWhiteSpace(text[b]))
                {
                    b++;
                }
                while (e > b && char.IsWhiteSpace(text[e]))
                {
                    e--;
                }

                var span = document.AddSpan(b, e);
                foreach (var t in tokens)
                {
                    if (t.Begin >= span.Begin && t.End <= span.End)
                    {
                        span.AddToken(t); //Re-add the tokens back in the document
                    }
                }
            }
        }
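
The begin/end whitespace trimming in Example #2 appears three times with identical logic. A small helper like the hypothetical TrimWhitespace below (not part of the original code, just a sketch using the same inclusive-range convention) could factor it out:

        //Hypothetical helper: trims whitespace from both ends of the inclusive
        //character range [b, e] over the document text
        private static (int Begin, int End) TrimWhitespace(ReadOnlySpan<char> text, int b, int e)
        {
            while (b < e && char.IsWhiteSpace(text[b])) { b++; }
            while (e > b && char.IsWhiteSpace(text[e])) { e--; }
            return (b, e);
        }

Each call site would then reduce to a single deconstructing assignment, e.g. (b, e) = TrimWhitespace(text, offset, tokens[i - padding].End);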