Ejemplo n.º 1
0
        /// <summary>
        /// Produces the next token for the stream. Lemmas remaining on the stack from the
        /// previously lemmatized word are drained first (one unique lemma per call); once the
        /// stack is exhausted the next word is pulled from the stream lemmatizer and emitted
        /// as either a non-Hebrew term, an OOV marked-original term, a single exact-match
        /// lemma, or a marked-original term whose lemmas follow on subsequent calls.
        /// </summary>
        /// <returns><c>true</c> if a token was produced; <c>false</c> at end of stream.</returns>
        public override bool IncrementToken()
        {
            // Index all unique lemmas at the same position
            while (index < stack.Count)
            {
                HebMorph.HebrewToken res = stack[index++] as HebMorph.HebrewToken;

                // Skip non-Hebrew entries and duplicate lemmas (we will merge morph properties later)
                if (res == null || previousLemmas.ContainsKey(res.Lemma))
                {
                    continue;
                }

                previousLemmas.Add(res.Lemma, true);

                if (CreateHebrewToken(res))
                {
                    return true;
                }
            }

            // Reset per-word state before pulling the next word from the stream
            ClearAttributes();
            index = 0;
            stack.Clear();
            previousLemmas.Clear();

            // Lemmatize next word in stream. The HebMorph lemmatizer will always return a token,
            // unless an unrecognized Hebrew word is hit, then an empty tokens array will be returned.
            string word = string.Empty; // to hold the original word from the stream

            if (_streamLemmatizer.LemmatizeNextToken(out word, stack) == 0)
            {
                return false; // EOS
            }

            // Store the location of the word in the original stream
            offsetAtt.SetOffset(CorrectOffset(_streamLemmatizer.StartOffset), CorrectOffset(_streamLemmatizer.EndOffset));

            // A non-Hebrew word
            if (stack.Count == 1 && !(stack[0] is HebMorph.HebrewToken))
            {
                SetTermText(word);

                HebMorph.Token tkn = stack[0];
                typeAtt.Type = HebrewTokenizer.TokenTypeSignature(tkn.IsNumeric
                    ? HebrewTokenizer.TOKEN_TYPES.Numeric
                    : HebrewTokenizer.TOKEN_TYPES.NonHebrew);

                // Applying LowerCaseFilter for Non-Hebrew terms.
                // FIX: lower-case with the invariant culture so the indexed term does not depend
                // on the current thread culture (e.g. the Turkish 'I' -> dotless 'ı' problem).
                char[] buffer = termAtt.TermBuffer();
                int    length = termAtt.TermLength();
                for (int i = 0; i < length; i++)
                {
                    buffer[i] = System.Char.ToLowerInvariant(buffer[i]);
                }

                stack.Clear();
                return true;
            }

            // If we arrived here, we hit a Hebrew word
            // Do some filtering if requested...
            if (lemmaFilter != null && lemmaFilter.FilterCollection(stack, filterCache) != null)
            {
                stack.Clear();
                stack.AddRange(filterCache);
            }

            // OOV case -- for now store word as-is (marked with '$') and return true
            if (stack.Count == 0)
            {
                // TODO: To allow for more advanced uses, fill stack with processed tokens and
                // SetPositionIncrement(0)

                SetTermText(word + "$");
                typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);
                return true;
            }

            // If only one lemma was returned for this word
            if (stack.Count == 1)
            {
                HebMorph.HebrewToken hebToken = stack[0] as HebMorph.HebrewToken;

                // FIX: the lemma filter above may leave a non-HebrewToken on the stack, in which
                // case the 'as' cast yields null; the original code dereferenced it unchecked
                // (NullReferenceException). Fall back to indexing the marked original instead.
                //
                // Index the lemma alone if it exactly matches the word minus prefixes
                if (hebToken != null && !alwaysSaveMarkedOriginal &&
                    hebToken.Lemma.Equals(word.Substring(hebToken.PrefixLength)))
                {
                    CreateHebrewToken(hebToken);
                    posIncrAtt.PositionIncrement = 1;
                    stack.Clear();
                    return true;
                }

                // Otherwise, index the lemma plus the original word marked with a unique flag to
                // increase precision.
                // DILEMMA: Does indexing word.Substring(hebToken.PrefixLength) + "$" make more or less sense?
                // For now this is kept the way it is below to support duality of SimpleAnalyzer and MorphAnalyzer
                SetTermText(word + "$");
            }
            else
            {
                // More than one lemma exists. Mark and store the original term to increase
                // precision, while all lemmas will be popped out of the stack and get stored
                // at the next calls to IncrementToken.
                SetTermText(word + "$");
            }

            typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);

            return true;
        }