/// <summary>
/// Parses synonym rules and adds the resulting mappings to <paramref name="map"/>.
/// Each rule is either "a,b => c,d" (explicit mapping, split on
/// <paramref name="mappingSep"/>) or "a,b,c" (a synonym group, split on
/// <paramref name="synSep"/>). For a group, <paramref name="expansion"/> decides
/// whether every entry maps to every entry, or all entries reduce to the first.
/// </summary>
/// <param name="rules">the raw rule strings, one rule per element</param>
/// <param name="map">the synonym map to populate</param>
/// <param name="mappingSep">separator between the source and target side of a rule (e.g. "=>")</param>
/// <param name="synSep">separator between synonyms within one side (e.g. ",")</param>
/// <param name="expansion">if <c>true</c>, a rule without a mapping separator expands to all of its entries; otherwise it reduces to the first entry</param>
/// <param name="tokFactory">optional tokenizer used to analyze each synonym entry; may be <c>null</c></param>
/// <exception cref="ArgumentException">if a rule contains more than one mapping separator</exception>
internal static void ParseRules(IEnumerable<string> rules, SlowSynonymMap map, string mappingSep, string synSep, bool expansion, TokenizerFactory tokFactory)
{
    foreach (string rule in rules)
    {
        // To use regexes, we need an expression that specifies an odd number of chars.
        // This can't really be done with string.split(), and since we need to
        // do unescaping at some point anyway, we wouldn't be saving any effort
        // by using regexes.

        IList<string> mapping = SplitSmart(rule, mappingSep, false);

        IList<IList<string>> source;
        IList<IList<string>> target;

        if (mapping.Count > 2)
        {
            // more than one mappingSep in a single rule is malformed
            throw new ArgumentException("Invalid Synonym Rule:" + rule);
        }
        else if (mapping.Count == 2)
        {
            source = GetSynList(mapping[0], synSep, tokFactory);
            target = GetSynList(mapping[1], synSep, tokFactory);
        }
        else
        {
            source = GetSynList(mapping[0], synSep, tokFactory);
            if (expansion)
            {
                // expand to all arguments
                target = source;
            }
            else
            {
                // reduce to first argument
                target = new List<IList<string>>(1) { source[0] };
            }
        }

        // original tokens are never emitted alongside replacements here
        bool includeOrig = false;
        foreach (IList<string> fromToks in source)
        {
            foreach (IList<string> toToks in target)
            {
                map.Add(fromToks, SlowSynonymMap.MakeTokens(toToks), includeOrig, true);
            }
        }
    }
}
/// <summary>
/// Loads the synonym rules from the configured resource and compiles them
/// into a fresh <c>SlowSynonymMap</c> stored in <c>synMap</c>.
/// </summary>
/// <param name="loader">resource loader used to resolve the rules file and the optional tokenizer factory</param>
public void Inform(IResourceLoader loader)
{
    // Optional tokenizer used to analyze each synonym entry.
    TokenizerFactory tokFactory = tf is null ? null : LoadTokenizerFactory(loader, tf);

    // Read the raw rule lines and compile them into the map.
    IEnumerable<string> ruleLines = LoadRules(synonyms, loader);

    synMap = new SlowSynonymMap(ignoreCase);
    ParseRules(ruleLines, synMap, "=>", ",", expand, tokFactory);
}
private IEnumerator<AttributeSource> replacement; // iterator over generated tokens

/// <summary>
/// Creates a synonym filter over <paramref name="in"/> that matches token
/// sequences against <paramref name="map"/>.
/// </summary>
/// <param name="in">the input token stream</param>
/// <param name="map">the synonym map to match against; must not be <c>null</c></param>
/// <exception cref="ArgumentNullException">if <paramref name="map"/> is <c>null</c></exception>
public SlowSynonymFilter(TokenStream @in, SlowSynonymMap map)
    : base(@in)
{
    // ArgumentNullException (a subclass of ArgumentException, so existing
    // catch blocks still work) is the idiomatic exception for a missing
    // required argument; nameof keeps the parameter name refactor-safe.
    this.map = map ?? throw new ArgumentNullException(nameof(map), "map is required");

    // just ensuring these attributes exist...
    AddAttribute<ICharTermAttribute>();
    AddAttribute<IPositionIncrementAttribute>();
    AddAttribute<IOffsetAttribute>();
    AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Recursively extends the current match by one token. Reads the next token
/// from the stream, looks it up in <paramref name="map"/>'s submap, and
/// recurses to find the longest matching sequence. Matched tokens are
/// collected (in stream order) into the <c>matched</c> list; tokens that do
/// not extend the match are pushed back for re-reading.
/// </summary>
/// <param name="map">the synonym-trie node reached by the tokens consumed so far</param>
/// <returns>the deepest node with synonyms along the matched path, or <c>null</c> if no node on the path has synonyms</returns>
private SlowSynonymMap Match(SlowSynonymMap map)
{
    SlowSynonymMap result = null;

    if (map.Submap != null)
    {
        AttributeSource tok = NextTok();
        if (tok != null)
        {
            // clone ourselves so the buffered token is independent of the
            // live attribute state of this stream.
            if (tok == this)
            {
                tok = CloneAttributes();
            }
            // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
            var termAtt = tok.GetAttribute<ICharTermAttribute>();
            SlowSynonymMap subMap = map.Submap.Get(termAtt.Buffer, 0, termAtt.Length);

            if (subMap != null)
            {
                // recurse: try to extend the match with further tokens
                result = Match(subMap);
            }

            if (result != null)
            {
                // token is part of the winning match; record it at the front
                // so `matched` ends up in original stream order.
                matched.AddFirst(tok);
            }
            else
            {
                // push back unmatched token so it can be re-read later
                PushTok(tok);
            }
        }
    }

    // no longer sequence matched beyond this node, so if this node itself
    // has synonyms, it is the (longest) match.
    if (result is null && map.Synonyms != null)
    {
        result = map;
    }

    return (result);
}
/// <summary>
/// Adds a mapping from a token sequence to a list of replacement tokens,
/// building out the trie of submaps one node per match token.
/// </summary>
/// <param name="singleMatch"> <see cref="IList{String}"/>, the sequence of strings to match </param>
/// <param name="replacement"> <see cref="IList{Token}"/> the list of tokens to use on a match </param>
/// <param name="includeOrig"> sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens </param>
/// <param name="mergeExisting"> merge the replacement tokens with any other mappings that exist </param>
/// <exception cref="ArgumentException">if a mapping for <paramref name="singleMatch"/> already exists and <paramref name="mergeExisting"/> is <c>false</c></exception>
public virtual void Add(IList<string> singleMatch, IList<Token> replacement, bool includeOrig, bool mergeExisting)
{
    var currMap = this;

    // Walk (and lazily create) the trie nodes, one per match token.
    foreach (string str in singleMatch)
    {
        if (currMap.submap == null)
        {
            // for now hardcode at 4.0, as its what the old code did.
            // would be nice to fix, but shouldn't store a version in each submap!!!
            currMap.submap = new CharArrayMap<SlowSynonymMap>(LuceneVersion.LUCENE_CURRENT, 1, IgnoreCase);
        }

        var map = currMap.submap.Get(str);
        if (map == null)
        {
            map = new SlowSynonymMap();
            // propagate only the case-sensitivity flag to child nodes
            map.flags |= flags & IGNORE_CASE;
            currMap.submap.Put(str, map);
        }
        currMap = map;
    }

    if (currMap.synonyms != null && !mergeExisting)
    {
        // Join the tokens so the message shows the actual sequence; concatenating
        // the IList directly would only print the collection's type name.
        throw new ArgumentException("SynonymFilter: there is already a mapping for " + string.Join(",", singleMatch));
    }

    IList<Token> superset = currMap.synonyms == null ? replacement : MergeTokens(currMap.synonyms, replacement);
    currMap.synonyms = superset.ToArray();
    if (includeOrig)
    {
        currMap.flags |= INCLUDE_ORIG;
    }
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *      b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            Copy(this, replacement.Current);
            return (true);
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok is null)
        {
            // end of stream
            return (false);
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.Submap != null ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;
        if (result is null)
        {
            Copy(this, firstTok);
            return (true);
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }

        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();
        result = Match(result);

        if (result is null)
        {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return (true);
        }

        // reuse, or create new one each time?
        IList<AttributeSource> generated = new JCG.List<AttributeSource>(result.Synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.Synonyms.Length; i++)
        {
            Token repTok = result.Synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();
            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();

            // replacement span covers the whole matched sequence: keep the start
            // offset of the first token, end at the last matched token's end.
            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
            newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.Remove(origTok);
                }
                if (origTok != null)
                {
                    // advance the original-stream position to the next token
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                origTok = matched.First.Value;
                matched.Remove(origTok);
            }
            if (origTok != null)
            {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}