public void IncrementsOffsetCorrectlyWithAnotherReader()
{
    int[] expectedOffsets = { 0, 5, 10, 15 };
    int curPos = 0;
    string token = string.Empty;

    Tokenizer t = new Tokenizer(
        new HTMLStripCharFilter(
            CharReader.Get(new System.IO.StringReader(@"test<a href=""foo"">test</a>test test"))));

    while (true)
    {
        Tokenizer.TokenType token_type = t.NextToken(out token);
        if (token_type == 0)
        {
            break;
        }

        Assert.Equal(expectedOffsets[curPos++], t.Offset);
        Assert.Equal(4, t.LengthInSource);
    }
}
public int LemmatizeNextToken(out string nextToken, IList<Token> retTokens)
{
    retTokens.Clear();
    int currentPos = 0;

    // Used to loop over certain noise cases
    while (true)
    {
        Tokenizer.TokenType tokenType = _tokenizer.NextToken(out nextToken);
        if (tokenType == 0)
        {
            return 0; // EOS
        }

        _startOffset = _tokenizer.Offset;
        _endOffset = _startOffset + _tokenizer.LengthInSource;
        ++currentPos;

        if ((tokenType & Tokenizer.TokenType.Hebrew) > 0)
        {
            // Right now we are blindly removing all Niqqud characters. Later we will try and make some
            // use of Niqqud for some cases. We do this before everything else to allow for a correct
            // identification of prefixes.
            nextToken = RemoveNiqqud(nextToken);

            // Ignore "words" which are actually only prefixes in a single word.
            // This first case is easy to spot, since the prefix and the following word will be
            // separated by a dash, and marked as a construct (סמיכות) by the Tokenizer
            if ((tokenType & Tokenizer.TokenType.Construct) > 0 ||
                (tokenType & Tokenizer.TokenType.Acronym) > 0)
            {
                if (IsLegalPrefix(nextToken))
                {
                    --currentPos; // this should be treated as a word prefix
                    continue;
                }
            }

            // This second case is a bit more complex. We take a risk of splitting a valid acronym or
            // abbreviated word into two, so we send it to an external function to analyze the word, and
            // get a possibly corrected word. Examples for words we expect to simplify by this operation
            // are ה"שטיח", ש"המידע.
            if ((tokenType & Tokenizer.TokenType.Acronym) > 0)
            {
                nextToken = TryStrippingPrefix(nextToken);

                // Re-detect acronym, in case it was a false positive
                if (nextToken.IndexOf('"') == -1)
                {
                    tokenType &= ~Tokenizer.TokenType.Acronym;
                }
            }

            // TODO: Perhaps by easily identifying the prefixes above we can also rule out some of the
            // stem ambiguities retrieved later...

            // Support for external dictionaries, for preventing OOV words or providing synonyms
            string correctedWord = LookupWordCorrection(nextToken);
            if (!string.IsNullOrEmpty(correctedWord))
            {
                retTokens.Add(new HebrewToken(correctedWord, 0, DMask.D_CUSTOM, correctedWord, 1.0f));
                nextToken = correctedWord;
                break;
            }

            IList<HebrewToken> lemmas = Lemmatize(nextToken);
            if (lemmas.Count > 0)
            {
                // TODO: Filter Stop Words based on morphological data (hspell 'x' identification)
                // TODO: Check for worthy lemmas, if there are none then perform tolerant lookup and check again...

                if ((tokenType & Tokenizer.TokenType.Construct) > 0)
                {
                    // TODO: Test for (lemma.Mask & DMask.D_OSMICHUT) > 0
                }

                foreach (Token t in lemmas) // temp catch-all
                {
                    retTokens.Add(t);
                }
            }

            if (retTokens.Count == 0 && (tokenType & Tokenizer.TokenType.Acronym) > 0)
            {
                // TODO: Perform Gimatria test
                // TODO: Treat an acronym as a noun and strip affixes accordingly?
                retTokens.Add(new HebrewToken(nextToken, 0, DMask.D_ACRONYM, nextToken, 1.0f));
            }
            else if (TolerateWhenLemmatizingStream && retTokens.Count == 0)
            {
                lemmas = LemmatizeTolerant(nextToken);
                if (lemmas.Count > 0)
                {
                    // TODO: Keep only worthy lemmas, based on characteristics and score / confidence

                    if ((tokenType & Tokenizer.TokenType.Construct) > 0)
                    {
                        // TODO: Test for (lemma.Mask & DMask.D_OSMICHUT) > 0
                    }

                    foreach (Token t in lemmas) // temp catch-all
                    {
                        retTokens.Add(t);
                    }
                }
                else // Word unknown to hspell - OOV case
                {
                    // TODO: Right now we store the word as-is. Perhaps we can assume this is a Noun or a name,
                    // and try removing prefixes and suffixes based on that?
                    //retTokens.Add(new HebrewToken(nextToken, 0, 0, null, 1.0f));
                }
            }
        }
        else if ((tokenType & Tokenizer.TokenType.Numeric) > 0)
        {
            retTokens.Add(new Token(nextToken, true));
        }
        else
        {
            retTokens.Add(new Token(nextToken));
        }

        break;
    }

    return currentPos;
}
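A minimal calling sketch for the streaming lemmatizer above. Only the LemmatizeNextToken contract visible in the method itself is relied upon (a return value of 0 signals end of stream, and retTokens is repopulated on each call); the lemmatizer instance, its construction, and the property names printed are illustrative assumptions.

// Usage sketch, assuming `lemmatizer` exposes the LemmatizeNextToken method shown above.
// How the lemmatizer is constructed is not shown here and is an assumption.
var lemmaCandidates = new List<Token>();
string word;
while (lemmatizer.LemmatizeNextToken(out word, lemmaCandidates) != 0)
{
    // lemmaCandidates now holds the Token / HebrewToken objects produced for the current word
    Console.WriteLine("{0}: {1} candidate(s)", word, lemmaCandidates.Count);
}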
/// <summary>
/// Creates a new token with the given token string and type.
/// </summary>
/// <param name="token">The token string to be saved.</param>
/// <param name="type">The TokenType associated with the information in token.</param>
public FormulaToken(String token, Tokenizer.TokenType type)
{
    Token = token;
    Type = type;
}
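For illustration only, a hypothetical construction call for the constructor above. The token string, the way a Tokenizer.TokenType value is obtained, and the readability of the Token and Type members are all assumptions, not part of the original code.

// Hypothetical usage: `tokenType` stands in for whatever Tokenizer.TokenType value the
// tokenizer assigned to this piece of text; no specific enum member is assumed here.
Tokenizer.TokenType tokenType = default(Tokenizer.TokenType);
FormulaToken ft = new FormulaToken("SUM", tokenType);
// Assumes Token and Type are readable members, as suggested by the constructor assignments.
Console.WriteLine("{0} ({1})", ft.Token, ft.Type);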