Code Example #1
        [Fact] // xUnit test method; the [Fact] attribute is assumed, as the scrape omitted it
        public void IncrementsOffsetCorrectlyWithAnotherReader()
        {
            int[] expectedOffsets = { 0, 5, 10, 15 };
            int   curPos          = 0;

            string    token = string.Empty;
            Tokenizer t     =
                new Tokenizer(
                    new HTMLStripCharFilter(CharReader.Get(new System.IO.StringReader(@"test<a href=""foo"">test</a>test test"))));

            while (true)
            {
                Tokenizer.TokenType tokenType = t.NextToken(out token);
                if (tokenType == 0) // NextToken returns 0 at end of stream
                {
                    break;
                }

                // Each "test" token should be reported at offsets 0, 5, 10, 15
                // and occupy 4 characters in the source.
                Assert.Equal(expectedOffsets[curPos++], t.Offset);
                Assert.Equal(4, t.LengthInSource);
            }
        }
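
The same pipeline can be driven outside a test. The sketch below uses only the API visible in the example above (Tokenizer, HTMLStripCharFilter, CharReader.Get, NextToken, Offset, LengthInSource); the enclosing class and usings are assumed.

        // Minimal sketch: print every token with its reported offset and source length.
        public static void PrintTokens(string html)
        {
            Tokenizer t = new Tokenizer(
                new HTMLStripCharFilter(CharReader.Get(new System.IO.StringReader(html))));

            string token;
            while (t.NextToken(out token) != 0) // 0 signals end of stream
            {
                System.Console.WriteLine("{0} @ {1} (+{2})", token, t.Offset, t.LengthInSource);
            }
        }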
Code Example #2
File: StreamLemmatizer.cs  Project: srdee/HebMorph
        public int LemmatizeNextToken(out string nextToken, IList<Token> retTokens)
        {
            retTokens.Clear();

            int currentPos = 0;

            // Used to loop over certain noise cases
            while (true)
            {
                Tokenizer.TokenType tokenType = _tokenizer.NextToken(out nextToken);
                if (tokenType == 0)
                {
                    return 0; // end of stream
                }
                _startOffset = _tokenizer.Offset;
                _endOffset   = _startOffset + _tokenizer.LengthInSource;

                ++currentPos;

                if ((tokenType & Tokenizer.TokenType.Hebrew) > 0)
                {
                    // Right now we blindly remove all Niqqud characters. Later we will try to make
                    // some use of Niqqud in certain cases. We do this before everything else to
                    // allow correct identification of prefixes.
                    nextToken = RemoveNiqqud(nextToken);

                    // Ignore "words" which are actually only prefixes in a single word.
                    // This first case is easy to spot, since the prefix and the following word will be
                    // separated by a dash, and marked as a construct (סמיכות) by the Tokenizer
                    if ((tokenType & Tokenizer.TokenType.Construct) > 0 ||
                        (tokenType & Tokenizer.TokenType.Acronym) > 0)
                    {
                        if (IsLegalPrefix(nextToken))
                        {
                            --currentPos; // this should be treated as a word prefix
                            continue;
                        }
                    }

                    // This second case is a bit more complex. We take the risk of splitting a valid
                    // acronym or abbreviated word into two, so we send it to an external function to
                    // analyze the word and get a possibly corrected one. Examples of words we expect
                    // this operation to simplify are ה"שטיח", ש"המידע.
                    if ((tokenType & Tokenizer.TokenType.Acronym) > 0)
                    {
                        nextToken = TryStrippingPrefix(nextToken);

                        // Re-detect acronym, in case it was a false positive
                        if (nextToken.IndexOf('"') == -1)
                        {
                            tokenType &= ~Tokenizer.TokenType.Acronym;
                        }
                    }

                    // TODO: Perhaps by easily identifying the prefixes above we can also rule out some of the
                    // stem ambiguities retrieved later...

                    // Support for external dictionaries, for preventing OOV words or providing synonyms
                    string correctedWord = LookupWordCorrection(nextToken);
                    if (!string.IsNullOrEmpty(correctedWord))
                    {
                        retTokens.Add(new HebrewToken(correctedWord, 0, DMask.D_CUSTOM, correctedWord, 1.0f));
                        nextToken = correctedWord;
                        break;
                    }

                    IList<HebrewToken> lemmas = Lemmatize(nextToken);
                    if (lemmas.Count > 0)
                    {
                        // TODO: Filter Stop Words based on morphological data (hspell 'x' identification)
                        // TODO: Check for worthy lemmas, if there are none then perform tolerant lookup and check again...
                        if ((tokenType & Tokenizer.TokenType.Construct) > 0)
                        {
                            // TODO: Test for (lemma.Mask & DMask.D_OSMICHUT) > 0
                        }

                        foreach (Token t in lemmas) // temp catch-all
                        {
                            retTokens.Add(t);
                        }
                    }

                    if (retTokens.Count == 0 && (tokenType & Tokenizer.TokenType.Acronym) > 0)
                    {
                        // TODO: Perform Gimatria test
                        // TODO: Treat an acronym as a noun and strip affixes accordingly?
                        retTokens.Add(new HebrewToken(nextToken, 0, DMask.D_ACRONYM, nextToken, 1.0f));
                    }
                    else if (TolerateWhenLemmatizingStream && retTokens.Count == 0)
                    {
                        lemmas = LemmatizeTolerant(nextToken);
                        if (lemmas.Count > 0)
                        {
                            // TODO: Keep only worthy lemmas, based on characteristics and score / confidence

                            if ((tokenType & Tokenizer.TokenType.Construct) > 0)
                            {
                                // TODO: Test for (lemma.Mask & DMask.D_OSMICHUT) > 0
                            }

                            foreach (Token t in lemmas) // temp catch-all
                            {
                                retTokens.Add(t);
                            }
                        }
                        else // Word unknown to hspell - OOV case
                        {
                            // TODO: Right now we store the word as-is. Perhaps we can assume this is a Noun or a name,
                            // and try removing prefixes and suffixes based on that?
                            //retTokens.Add(new HebrewToken(nextToken, 0, 0, null, 1.0f));
                        }
                    }
                }
                else if ((tokenType & Tokenizer.TokenType.Numeric) > 0)
                {
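                    // Non-Hebrew numeric token; the boolean argument presumably marks the
                    // Token as numeric (an assumption, since the Token constructor is not shown here).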
                    retTokens.Add(new Token(nextToken, true));
                }
                else
                {
                    retTokens.Add(new Token(nextToken));
                }

                break;
            }

            return currentPos;
        }
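
A typical caller drives LemmatizeNextToken in a loop until it returns 0 (end of stream), reusing one token list across calls. Below is a minimal consumption sketch; it relies only on the contract visible above (retTokens is cleared on entry, the return value counts tokenizer positions consumed) and takes an already-constructed StreamLemmatizer, since its constructors are not shown here. The usings (System, System.Collections.Generic) are assumed.

        // Minimal sketch: consume every word in the stream and report how many
        // lemma candidates were produced for each. Uses only the method above.
        public static void LemmatizeAll(StreamLemmatizer lemmatizer)
        {
            string word;
            IList<Token> tokens = new List<Token>();

            while (lemmatizer.LemmatizeNextToken(out word, tokens) != 0)
            {
                // tokens now holds only this word's lemmas (the method clears it per call)
                System.Console.WriteLine("{0}: {1} lemma(s)", word, tokens.Count);
            }
        }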
Code Example #3
 /// <summary>
 /// Creates a new token with the given token string and type.
 /// </summary>
 /// <param name="token">The token string to be saved.</param>
 /// <param name="type">The TokenType associated with the information in token.</param>
 public FormulaToken(string token, Tokenizer.TokenType type)
 {
     Token = token;
     Type  = type;
 }
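
Constructing one simply pairs the raw token text with the type the tokenizer reported for it. A hedged example (Tokenizer.TokenType.Numeric is borrowed from Example #2 and may not exist in this project's enum):

     // Hypothetical usage: wrap a token string together with its reported type.
     FormulaToken numberToken = new FormulaToken("42", Tokenizer.TokenType.Numeric);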