Example #1
0
        /// <summary>
        /// Populates the current token's attributes from a lemmatized Hebrew token.
        /// Emits the lemma when one exists, otherwise the surface text with its
        /// prefix letters stripped. The token is stacked at position increment 0,
        /// i.e. on the same position as the previously emitted token.
        /// </summary>
        /// <param name="hebToken">Lemmatizer output for the current word.</param>
        /// <returns>Always true; virtual so subclasses may veto token creation.</returns>
        protected virtual bool CreateHebrewToken(HebMorph.HebrewToken hebToken)
        {
            // Prefer the lemma; fall back to the surface form minus its prefixes.
            string termText = hebToken.Lemma;
            if (termText == null)
            {
                termText = hebToken.Text.Substring(hebToken.PrefixLength);
            }
            SetTermText(termText);

            // Stack this lemma on the same position as the preceding token.
            posIncrAtt.PositionIncrement = 0;

            // TODO: typeAtt.SetType(TokenTypeSignature(TOKEN_TYPES.Acronym));
            typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);

            /*
             * Morph payload
             *
             * byte[] data = new byte[1];
             * data[0] = (byte)morphResult.Mask; // TODO: Set bits selectively
             * Payload payload = new Payload(data);
             * payAtt.SetPayload(payload);
             */

            return true;
        }
Example #2
0
        /// <summary>
        /// Builds the analysis chain for a field:
        /// Hebrew tokenizer -> niqqud normalization -> stop-word removal -> lowercasing,
        /// optionally followed by a per-token-type suffix filter.
        /// </summary>
        /// <param name="fieldName">Field being analyzed (unused by this chain).</param>
        /// <param name="reader">Source of the raw text to tokenize.</param>
        /// <returns>The fully composed token stream.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream chain = new HebrewTokenizer(reader, PrefixTree);

            // Niqqud normalization
            chain = new NiqqudFilter(chain);

            // TODO: should we ignoreCase in StopFilter?
            chain = new StopFilter(enableStopPositionIncrements, chain, STOP_WORDS_SET);

            // TODO: Apply LowerCaseFilter to NonHebrew tokens only
            chain = new LowerCaseFilter(chain);

            // Only append the suffix filter when suffixes were actually configured.
            bool hasSuffixes = suffixByTokenType != null && suffixByTokenType.Count > 0;
            if (hasSuffixes)
            {
                chain = new AddSuffixFilter(chain, suffixByTokenType);
            }

            return chain;
        }
Example #3
0
        /// <summary>
        /// Composes the token stream for a field: Hebrew tokenization with niqqud
        /// normalization, then stop-word filtering and lowercasing, and finally an
        /// optional suffix filter keyed by token type.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not consulted here).</param>
        /// <param name="reader">Reader supplying the text to analyze.</param>
        /// <returns>The assembled token stream.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // Tokenize, then normalize niqqud marks in a single nested construction.
            TokenStream stream = new NiqqudFilter(new HebrewTokenizer(reader, PrefixTree));

            // TODO: should we ignoreCase in StopFilter?
            stream = new StopFilter(enableStopPositionIncrements, stream, STOP_WORDS_SET);

            // TODO: Apply LowerCaseFilter to NonHebrew tokens only
            stream = new LowerCaseFilter(stream);

            // Suffixes are optional; skip the extra filter when none were supplied.
            if (suffixByTokenType != null && suffixByTokenType.Count > 0)
            {
                stream = new AddSuffixFilter(stream, suffixByTokenType);
            }

            return stream;
        }
Example #4
0
        /// <summary>
        /// Advances the stream by one token. Operates in two phases: first it drains
        /// any lemmas left on <c>stack</c> from a previous call (one unique lemma per
        /// call, all at position increment 0), then it pulls the next word from the
        /// lemmatizer and emits either the word itself (non-Hebrew / OOV / ambiguous
        /// cases) or its single exact lemma.
        /// </summary>
        /// <returns>True if a token was produced; false at end of stream.</returns>
        public override bool IncrementToken()
        {
            // Phase 1: drain lemmas queued from the previous word.
            // Index all unique lemmas at the same position
            while (index < stack.Count)
            {
                // Non-HebrewToken entries yield null here and are skipped below.
                HebMorph.HebrewToken res = stack[index++] as HebMorph.HebrewToken;

                if (res == null || previousLemmas.ContainsKey(res.Lemma)) // Skip multiple lemmas (we will merge morph properties later)
                {
                    continue;
                }

                // previousLemmas is used purely as a set; the bool value is ignored.
                previousLemmas.Add(res.Lemma, true);

                // CreateHebrewToken sets term text, type, and position increment 0.
                if (CreateHebrewToken(res))
                {
                    return(true);
                }
            }

            // Phase 2: the stack is exhausted — reset per-word state before
            // lemmatizing the next word.
            // Reset state
            ClearAttributes();
            index = 0;
            stack.Clear();
            previousLemmas.Clear();

            // Lemmatize next word in stream. The HebMorph lemmatizer will always return a token, unless
            // an unrecognized Hebrew word is hit, then an empty tokens array will be returned.
            string word = string.Empty; // to hold the original word from the stream

            // A return of 0 from the lemmatizer means no more input.
            if (_streamLemmatizer.LemmatizeNextToken(out word, stack) == 0)
            {
                return(false); // EOS
            }
            // Store the location of the word in the original stream
            offsetAtt.SetOffset(CorrectOffset(_streamLemmatizer.StartOffset), CorrectOffset(_streamLemmatizer.EndOffset));

            // A non-Hebrew word
            if (stack.Count == 1 && !(stack[0] is HebMorph.HebrewToken))
            {
                SetTermText(word);

                HebMorph.Token tkn = stack[0];
                if (tkn.IsNumeric)
                {
                    typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Numeric);
                }
                else
                {
                    typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.NonHebrew);
                }

                // Applying LowerCaseFilter for Non-Hebrew terms
                // NOTE(review): Char.ToLower is culture-sensitive (current culture);
                // ToLowerInvariant may be intended here — confirm before changing.
                char[] buffer = termAtt.TermBuffer();
                int    length = termAtt.TermLength();
                for (int i = 0; i < length; i++)
                {
                    buffer[i] = System.Char.ToLower(buffer[i]);
                }

                stack.Clear();
                return(true);
            }

            // If we arrived here, we hit a Hebrew word
            // Do some filtering if requested...
            // A non-null result means the filter produced output into filterCache,
            // which then replaces the stack contents wholesale.
            if (lemmaFilter != null && lemmaFilter.FilterCollection(stack, filterCache) != null)
            {
                stack.Clear();
                stack.AddRange(filterCache);
            }

            // OOV case -- for now store word as-is and return true
            if (stack.Count == 0)
            {
                // TODO: To allow for more advanced uses, fill stack with processed tokens and
                // SetPositionIncrement(0)

                // "$" marks the original (non-lemmatized) form in the index.
                SetTermText(word + "$");
                typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);
                return(true);
            }

            // If only one lemma was returned for this word
            if (stack.Count == 1)
            {
                // NOTE(review): the cast can yield null (and Lemma may be null) if the
                // lemma filter repopulated the stack — assumed non-null here; verify.
                HebMorph.HebrewToken hebToken = stack[0] as HebMorph.HebrewToken;

                // Index the lemma alone if it exactly matches the word minus prefixes
                if (!alwaysSaveMarkedOriginal && hebToken.Lemma.Equals(word.Substring(hebToken.PrefixLength)))
                {
                    CreateHebrewToken(hebToken);
                    // Override the increment-0 set by CreateHebrewToken: this lemma is
                    // the only token for the word, so it advances the position.
                    posIncrAtt.PositionIncrement = 1;
                    stack.Clear();
                    return(true);
                }
                // Otherwise, index the lemma plus the original word marked with a unique flag to increase precision
                else
                {
                    // DILEMMA: Does indexing word.Substring(hebToken.PrefixLength) + "$" make more or less sense?
                    // For now this is kept the way it is below to support duality of SimpleAnalyzer and MorphAnalyzer
                    SetTermText(word + "$");
                }
            }

            // More than one lemma exist. Mark and store the original term to increase precision, while all
            // lemmas will be popped out of the stack and get stored at the next call to IncrementToken.
            else
            {
                SetTermText(word + "$");
            }

            // The stack is intentionally left populated here; the next call's Phase 1
            // emits the remaining lemmas at position increment 0.
            typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);

            return(true);
        }