// Emits one Hebrew lemma as the current token. Prefers the lemma itself;
// when no lemma is available, falls back to the surface text with the
// prefix characters stripped. The position increment is forced to zero so
// that all lemmas produced for a single word stack at that word's position.
protected virtual bool CreateHebrewToken(HebMorph.HebrewToken hebToken)
{
    string termText = hebToken.Lemma;
    if (termText == null)
    {
        termText = hebToken.Text.Substring(hebToken.PrefixLength);
    }
    SetTermText(termText);

    // Stack this lemma on the same position as the original word.
    posIncrAtt.PositionIncrement = 0;

    // TODO: typeAtt.SetType(TokenTypeSignature(TOKEN_TYPES.Acronym));
    typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);

    /*
     * Morph payload (currently disabled):
     *
     * byte[] data = new byte[1];
     * data[0] = (byte)morphResult.Mask; // TODO: Set bits selectively
     * Payload payload = new Payload(data);
     * payAtt.SetPayload(payload);
     */

    return true;
}
// Produces the next token for the stream.
//
// Operates in two phases:
//   1. Drain any lemmas still queued from the previously lemmatized word;
//      each unique lemma is emitted at the same position (CreateHebrewToken
//      sets a zero position increment).
//   2. When the queue is empty, pull and lemmatize the next word from the
//      stream, emitting either the word itself (non-Hebrew / numeric), the
//      word marked with "$" (OOV or ambiguous), or the bare lemma.
//
// Fixes over the previous revision:
//   - Guards against a null HebrewToken.Lemma before the dictionary lookup
//     and the Equals() call. CreateHebrewToken's `Lemma ?? ...` fallback
//     shows null lemmas occur, and Dictionary.ContainsKey(null) throws
//     ArgumentNullException.
//   - Lower-casing of non-Hebrew terms now uses ToLowerInvariant so indexed
//     terms do not depend on the machine's locale (e.g. the Turkish
//     dotted/dotless I problem with the culture-sensitive Char.ToLower).
public override bool IncrementToken()
{
    // Phase 1: index all unique lemmas of the previous word at the same position.
    while (index < stack.Count)
    {
        HebMorph.HebrewToken res = stack[index++] as HebMorph.HebrewToken;

        // Skip non-Hebrew entries and lemmas already emitted for this word
        // (we will merge morph properties later). A null Lemma cannot be used
        // as a dictionary key, so such tokens bypass the duplicate check.
        if (res == null || (res.Lemma != null && previousLemmas.ContainsKey(res.Lemma)))
        {
            continue;
        }

        if (res.Lemma != null)
        {
            previousLemmas.Add(res.Lemma, true);
        }

        if (CreateHebrewToken(res))
        {
            return true;
        }
    }

    // Phase 2: reset per-word state before pulling the next word.
    ClearAttributes();
    index = 0;
    stack.Clear();
    previousLemmas.Clear();

    // Lemmatize next word in stream. The HebMorph lemmatizer will always return a token,
    // unless an unrecognized Hebrew word is hit, then an empty tokens array will be returned.
    string word = string.Empty; // to hold the original word from the stream
    if (_streamLemmatizer.LemmatizeNextToken(out word, stack) == 0)
    {
        return false; // EOS
    }

    // Store the location of the word in the original stream
    offsetAtt.SetOffset(CorrectOffset(_streamLemmatizer.StartOffset), CorrectOffset(_streamLemmatizer.EndOffset));

    // A non-Hebrew word
    if (stack.Count == 1 && !(stack[0] is HebMorph.HebrewToken))
    {
        SetTermText(word);

        HebMorph.Token tkn = stack[0];
        if (tkn.IsNumeric)
        {
            typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Numeric);
        }
        else
        {
            typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.NonHebrew);
        }

        // Applying LowerCaseFilter for Non-Hebrew terms. Invariant casing keeps
        // indexed terms stable across machine locales.
        char[] buffer = termAtt.TermBuffer();
        int length = termAtt.TermLength();
        for (int i = 0; i < length; i++)
        {
            buffer[i] = System.Char.ToLowerInvariant(buffer[i]);
        }

        stack.Clear();
        return true;
    }

    // If we arrived here, we hit a Hebrew word
    // Do some filtering if requested...
    if (lemmaFilter != null && lemmaFilter.FilterCollection(stack, filterCache) != null)
    {
        stack.Clear();
        stack.AddRange(filterCache);
    }

    // OOV case -- for now store word as-is and return true
    if (stack.Count == 0)
    {
        // TODO: To allow for more advanced uses, fill stack with processed tokens and
        // SetPositionIncrement(0)
        SetTermText(word + "$");
        typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);
        return true;
    }

    // If only one lemma was returned for this word
    if (stack.Count == 1)
    {
        HebMorph.HebrewToken hebToken = stack[0] as HebMorph.HebrewToken;

        // Index the lemma alone if it exactly matches the word minus prefixes.
        // A null Lemma falls through to the marked-original path below instead
        // of throwing NullReferenceException on the Equals() call.
        if (!alwaysSaveMarkedOriginal && hebToken.Lemma != null &&
            hebToken.Lemma.Equals(word.Substring(hebToken.PrefixLength)))
        {
            CreateHebrewToken(hebToken);
            posIncrAtt.PositionIncrement = 1; // first token of the word, not a stacked lemma
            stack.Clear();
            return true;
        }
        // Otherwise, index the lemma plus the original word marked with a unique flag to increase precision
        else
        {
            // DILEMMA: Does indexing word.Substring(hebToken.PrefixLength) + "$" make more or less sense?
            // For now this is kept the way it is below to support duality of SimpleAnalyzer and MorphAnalyzer
            SetTermText(word + "$");
        }
    }
    // More than one lemma exist. Mark and store the original term to increase precision, while all
    // lemmas will be popped out of the stack and get stored at the next call to IncrementToken.
    else
    {
        SetTermText(word + "$");
    }

    typeAtt.Type = HebrewTokenizer.TokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew);
    return true;
}