/// <summary>
/// Builds DFAs for several languages in a single pass.
///
/// Every language is given as a set of <typeparamref name="TResult"/> values; the DFA for that language
/// contains the patterns registered for each result in the set.
///
/// Languages built together are minimized globally and share as many states as possible.
/// </summary>
/// <param name="languages">The result sets that define the languages to build.</param>
/// <param name="ambiguityResolver">Called to merge results when patterns for more than one result match the
/// same string. If this is null, a DfaAmbiguityException is thrown in that situation.</param>
/// <returns>One DFA start state per entry of <paramref name="languages"/>, in the same order. An empty list
/// is returned when no languages are supplied.</returns>
public IList<DfaState<TResult>> Build(IList<ISet<TResult>> languages, DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    // nothing requested, nothing to build
    if (languages.Count < 1)
    {
        return new List<DfaState<TResult>>();
    }

    // without a cache we always build from scratch
    if (cache == null)
    {
        return _build(languages, ambiguityResolver).GetStartStates();
    }

    // with a cache: look up first, build and offer the result to the cache on a miss
    var key = GetCacheKey(DfaTypeMatcher, languages, ambiguityResolver);
    var dfa = (SerializableDfa<TResult>)cache.GetCachedItem(key);
    if (dfa == null)
    {
        dfa = _build(languages, ambiguityResolver);
        cache.MaybeCacheItem(key, dfa);
    }

    return dfa.GetStartStates();
}
/// <summary>
/// Constructs the NFA for the requested languages, converts it to a DFA, minimizes it and wraps the
/// result in a <see cref="SerializableDfa{TResult}"/>.
/// </summary>
/// <param name="languages">The result sets that define the languages to build.</param>
/// <param name="ambiguityResolver">Resolver for strings matched by several results; when null,
/// DefaultAmbiguityResolver is used instead.</param>
/// <returns>The minimized DFA in serializable form.</returns>
private SerializableDfa<TResult> _build(IList<ISet<TResult>> languages, DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    var nfa = new Nfa<TResult>();
    var languageCount = languages.Count;

    // one NFA start state per language, allocated before any pattern states
    var startStates = new int[languageCount];
    for (var lang = 0; lang < languageCount; lang++)
    {
        startStates[lang] = nfa.AddState();
    }

    if (ambiguityResolver == null)
    {
        ambiguityResolver = DefaultAmbiguityResolver;
    }

    foreach (var entry in patterns)
    {
        var result = entry.Key;
        var alternatives = entry.Value;
        if (alternatives == null || alternatives.Count < 1)
        {
            continue;
        }

        // NFA state at which matching of this result begins; created lazily the first time
        // some language asks for the result, then shared by every later language
        var entryState = -1;
        for (var lang = 0; lang < languageCount; lang++)
        {
            if (languages[lang].Contains(result))
            {
                if (entryState < 0)
                {
                    // final state accepting this result
                    var acceptState = nfa.AddState(result);
                    if (alternatives.Count == 1)
                    {
                        // a single pattern needs no union state
                        entryState = alternatives[0].AddToNfa(nfa, acceptState);
                    }
                    else
                    {
                        // several patterns: union them behind a fresh state
                        entryState = nfa.AddState();
                        foreach (var alternative in alternatives)
                        {
                            var alternativeStart = alternative.AddToNfa(nfa, acceptState);
                            nfa.AddEpsilon(entryState, alternativeStart);
                        }
                    }
                }

                // this language matches the patterns for the result
                nfa.AddEpsilon(startStates[lang], entryState);
            }
        }
    }

    var converter = new DfaFromNfa<TResult>(nfa, startStates, ambiguityResolver);
    var minimizer = new DfaMinimizer<TResult>(converter.GetDfa());
    return new SerializableDfa<TResult>(minimizer.GetMinimizedDfa());
}
/// <summary>
/// Builds the boolean DFA used for reverse finding: it is constructed from the reversed form of every
/// pattern whose result appears in at least one of the given languages, with the empty string
/// excluded and any run of characters permitted in front.
/// </summary>
/// <param name="languages">The result sets whose patterns should be included.</param>
/// <returns>The minimized DFA in serializable form.</returns>
private SerializableDfa<bool> _buildReverseFinders(IList<ISet<TResult>> languages)
{
    var nfa = new Nfa<bool>();
    var startState = nfa.AddState();
    var endState = nfa.AddState(true);
    DfaAmbiguityResolver<bool> ambiguityResolver = DefaultAmbiguityResolver;

    // First, make an NFA that matches the reverse of all the patterns
    foreach (var patEntry in patterns)
    {
        var patList = patEntry.Value;
        if (patList == null || patList.Count < 1)
        {
            continue;
        }

        // All languages feed the same start state, so a result's patterns only need to be added once
        // no matter how many languages contain it; adding them once per language would only create
        // redundant NFA fragments for the same strings.
        var included = false;
        foreach (var language in languages)
        {
            if (language.Contains(patEntry.Key))
            {
                included = true;
                break;
            }
        }

        if (!included)
        {
            continue;
        }

        foreach (var pat in patList)
        {
            var st = pat.Reversed.AddToNfa(nfa, endState);
            nfa.AddEpsilon(startState, st);
        }
    }

    // omit the empty string
    startState = nfa.Disemptify(startState);

    // allow anything first
    startState = Pattern.MaybeRepeat(CharRange.All).AddToNfa(nfa, startState);

    // build the DFA
    var rawDfa = new DfaFromNfa<bool>(nfa, new[] { startState }, ambiguityResolver).GetDfa();
    var minimalDfa = new DfaMinimizer<bool>(rawDfa).GetMinimizedDfa();
    return new SerializableDfa<bool>(minimalDfa);
}
/// <summary>
/// Computes the cache key for a build request: a Base32-encoded SHA-256 digest over the DFA type, the
/// number of languages, every included pattern list (with its language membership bits and result)
/// and the ambiguity resolver.
/// </summary>
/// <param name="dfaType">Discriminator for the kind of DFA being built; it is the first value hashed.</param>
/// <param name="languages">The result sets that define the languages being built.</param>
/// <param name="ambiguityResolver">The resolver in use; null is hashed as a placeholder value.</param>
/// <returns>The digest of the serialized request, encoded with Base32.</returns>
private string GetCacheKey(int dfaType, IList<ISet<TResult>> languages, DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    // Everything must be written THROUGH the CryptoStream: the hash only sees bytes that pass through it.
    // Writing to the underlying stream directly would leave the digest computed over no data at all,
    // giving every request the same key. The pass-through output is not needed, so it goes to Stream.Null.
    using var hashAlg = new SHA256Managed();
    using var cs = new CryptoStream(Stream.Null, hashAlg, CryptoStreamMode.Write);

    // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted input; it is only used here to
    // produce bytes for hashing, but it should be replaced when the serialization layer is revisited.
    var bf = new BinaryFormatter();
    bf.Serialize(cs, dfaType);
    var numLangs = languages.Count;
    bf.Serialize(cs, numLangs);

    // write key stuff out in the iteration order of the pattern map, for deterministic serialization
    foreach (var patEntry in patterns)
    {
        var patList = patEntry.Value;
        if (patList == null || patList.Count == 0)
        {
            continue;
        }

        // skip results that no requested language uses
        var included = false;
        for (var i = 0; i < numLangs; ++i)
        {
            if (languages[i].Contains(patEntry.Key))
            {
                included = true;
                break;
            }
        }

        if (!included)
        {
            continue;
        }

        bf.Serialize(cs, patList.Count);

        if (numLangs > 1)
        {
            // language membership as a bit set, emitted in 32-bit words
            var bits = languages[0].Contains(patEntry.Key) ? 1 : 0;
            for (var i = 1; i < numLangs; ++i)
            {
                if ((i & 31) == 0)
                {
                    bf.Serialize(cs, bits);
                    bits = 0;
                }

                if (languages[i].Contains(patEntry.Key))
                {
                    bits |= 1 << (i & 31);
                }
            }

            bf.Serialize(cs, bits);
        }

        foreach (var pat in patList)
        {
            bf.Serialize(cs, pat);
        }

        bf.Serialize(cs, patEntry.Key);
    }

    bf.Serialize(cs, 0); // 0-size pattern list terminates pattern map
    bf.Serialize(cs, ambiguityResolver ?? (object)0);

    // finalize the hash before reading it
    cs.FlushFinalBlock();
    return Base32.GetDigest(hashAlg.Hash);
}
/// <summary>
/// Build DFAs from a provided NFA.
///
/// This method is used when you want to build the NFA yourself instead of letting this class do it.
///
/// Languages built simultaneously will be globally minimized and will share as many states as possible.
/// </summary>
/// <param name="nfa">The NFA</param>
/// <param name="nfaStartStates">The return value will include the DFA states corresponding to these NFA
/// states, in the same order</param>
/// <param name="ambiguityResolver">When patterns for multiple results match the same string, this is called to
/// combine the multiple results into one. If this is null, then a DfaAmbiguityException will be thrown in that
/// case.</param>
/// <param name="cache">If this cache is non-null, it will be checked for a memoized result for this NFA, and
/// will be populated with a memoized result when the call is complete.</param>
/// <returns>DFA start states that are equivalent to the given NFA start states. This will have the same length
/// as nfaStartStates, with corresponding start states in corresponding positions.</returns>
public static IList<DfaState<TResult>> BuildFromNfa(Nfa<TResult> nfa, int[] nfaStartStates, DfaAmbiguityResolver<TResult> ambiguityResolver, IBuilderCache cache)
{
    string cacheKey = null;
    SerializableDfa<TResult> serializableDfa = null;

    if (cache != null)
    {
        // The key is a digest of the serialized inputs. The data has to be written THROUGH the CryptoStream,
        // otherwise the hash is computed over nothing and every NFA maps to the same key. The pass-through
        // output is not needed, so it goes to Stream.Null.
        using (var hashAlg = new SHA256Managed())
        using (var cs = new CryptoStream(Stream.Null, hashAlg, CryptoStreamMode.Write))
        {
            // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted input; here it only
            // produces bytes for hashing, but it should be replaced when serialization is revisited.
            var bf = new BinaryFormatter();
            bf.Serialize(cs, nfaStartStates);
            bf.Serialize(cs, nfa);
            // BinaryFormatter rejects a null graph, and a null resolver is a documented input
            bf.Serialize(cs, ambiguityResolver ?? (object)0);
            cs.FlushFinalBlock();
            cacheKey = Base32.GetDigest(hashAlg.Hash);
        }

        serializableDfa = (SerializableDfa<TResult>)cache.GetCachedItem(cacheKey);
    }

    if (serializableDfa == null)
    {
        var rawDfa = new DfaFromNfa<TResult>(nfa, nfaStartStates, ambiguityResolver).GetDfa();
        var minimalDfa = new DfaMinimizer<TResult>(rawDfa).GetMinimizedDfa();
        serializableDfa = new SerializableDfa<TResult>(minimalDfa);
        if (cacheKey != null)
        {
            cache.MaybeCacheItem(cacheKey, serializableDfa);
        }
    }

    return serializableDfa.GetStartStates();
}
/// <summary>
/// Creates a <see cref="StringSearcher{TResult}"/> covering every pattern added to this builder.
/// </summary>
/// <param name="ambiguityResolver">Called to merge results when patterns for more than one result match the
/// same string. If this is null, a <see cref="DfaAmbiguityException{TResult}"/> is thrown in that
/// situation.</param>
/// <returns>A <see cref="StringSearcher{TResult}"/> constructed from the DFA for all patterns and the
/// builder's reverse finder.</returns>
public StringSearcher<TResult> BuildStringSearcher(DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    // the matcher is built first, then the reverse finder
    var matcher = Build(ambiguityResolver);
    var reverseFinder = BuildReverseFinder();
    return new StringSearcher<TResult>(matcher, reverseFinder);
}
/// <summary>
/// Builds the DFA for one language.
///
/// The language is given as a set of <typeparamref name="TResult"/> values; the DFA contains the
/// patterns registered for each result in the set.
/// </summary>
/// <param name="language">The result set that defines the language to build.</param>
/// <param name="ambiguityResolver">Called to merge results when patterns for more than one result match the
/// same string. If this is null, a DfaAmbiguityException is thrown in that situation.</param>
/// <returns>The start state of a DFA matching the patterns selected by
/// <paramref name="language"/>.</returns>
public DfaState<TResult> Build(ISet<TResult> language, DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    // delegate to the multi-language overload with a one-element list
    var singleLanguage = new List<ISet<TResult>>();
    singleLanguage.Add(language);
    var startStates = Build(singleLanguage, ambiguityResolver);
    return startStates[0];
}
/// <summary>
/// Builds a single DFA that matches ALL patterns that have been added to this builder.
/// </summary>
/// <param name="ambiguityResolver">Called to merge results when patterns for more than one result match the
/// same string. If this is null, a <see cref="DfaAmbiguityException{TResult}"/> is thrown in that
/// situation.</param>
/// <returns>The start state of a DFA matching the patterns of every result known to this builder.</returns>
public DfaState<TResult> Build(DfaAmbiguityResolver<TResult> ambiguityResolver)
{
    // the language is simply the set of every result that has patterns registered
    var everyResult = new HashSet<TResult>(patterns.Keys);
    var languages = new List<ISet<TResult>>();
    languages.Add(everyResult);
    return Build(languages, ambiguityResolver)[0];
}