// Convert to an obligatory rewrite transducer.
//
// Construction: N (fst N)*, where N is built from the words over `alphabet`
// that do not contain any word from the domain of `fst`; N is turned into a
// transducer with Identity() and then passed through Optional().
public static Fst ToRewriter(this Fst fst, ISet<char> alphabet)
{
    var sigmaStar = FsaBuilder.All(alphabet);

    // Words that contain an occurrence of something from the domain of the rule.
    var containsMatch = sigmaStar.Concat(fst.Domain()).Concat(sigmaStar);

    // Everything else is passed through untouched between the rewrites.
    var passThrough = sigmaStar
        .Difference(containsMatch)
        .Identity()
        .Optional();

    // One rewrite step followed by unmatched text, repeated any number of times.
    var rewriteSteps = fst.Concat(passThrough).Star();

    return passThrough.Concat(rewriteSteps);
}
// FsaBuilder.All over {a, b, c}: checks the shape of the built automaton and
// that it accepts exactly the words made of those three symbols.
public void AllFsaTest()
{
    var alphabet = new HashSet<char> { 'a', 'b', 'c' };
    var automaton = FsaBuilder.All(alphabet);

    // Structure of the constructed automaton.
    Assert.Equal(3, automaton.States.Count);
    Assert.Equal(1, automaton.Initial.Count);
    Assert.Equal(2, automaton.Final.Count);

    // Any word with a symbol outside the alphabet is rejected.
    Assert.False(automaton.Recognize("d"));
    Assert.False(automaton.Recognize("ad"));

    // Every word over the alphabet, including the empty one, is accepted.
    var accepted = new[] { "ab", string.Empty, "abc", "bbbac", "cba", "cbcbbcaaaaacb" };
    foreach (var word in accepted)
    {
        Assert.True(automaton.Recognize(word));
    }
}
// Builds an automaton for the pattern  .*@.*\.com  where ".*" ranges over the
// lowercase letters 'a'..'z', determinizes it, and checks sample words.
public void ComplexFsaConstructionTest2()
{
    // 26 code points starting at 'a' give exactly 'a'..'z'
    // (a count of 27 would also admit '{').
    var letters = Enumerable.Range('a', 26).Select(Convert.ToChar).ToHashSet();
    var all = FsaBuilder.All(letters);

    var fsa = all
        .Concat(
            FsaBuilder.FromWord("@"),
            all,
            FsaBuilder.FromWord(".com"))
        .Determinize();

    // The rejected and accepted samples must be disjoint, otherwise the two
    // assertions below contradict each other and the test can never pass.
    var rejected = new[]
    {
        "usergmail.com",       // no '@'
        "you@@gmail.com",      // second '@' is not a letter
        "user@gmail.org",      // does not end in ".com"
        "user@mail.gmail.com", // '.' is not a letter, so it cannot appear before ".com"
    };
    var accepted = new[]
    {
        "user@gmail.com",
        "me@example.com",
        "@.com",               // both ".*" parts may be empty
    };

    Assert.DoesNotContain(rejected, fsa.Recognize);
    Assert.True(accepted.All(fsa.Recognize));
}
// Convert to an obligatory leftmost-longest match rewrite transducer (Karttunen 1996).
//
// The result is the composition of four stages:
//   1. initialMatch  - insert "cb" at the beginning of every rewrite occurrence
//   2. leftToRight   - put "lb"/"rb" around the leftmost occurrences, drop leftover "cb"
//   3. longestMatch  - among occurrences with the same start keep only the longest
//   4. replacement   - apply `fst` between the markers and delete the markers
//
// Throws ArgumentException when `alphabet` shares a symbol with `markers`.
public static Fst ToLmlRewriter(this Fst fst, ISet<char> alphabet)
{
    var reservedInAlphabet = alphabet.Intersect(markers);
    if (reservedInAlphabet.Any())
    {
        throw new ArgumentException("The alphabet contains invalid symbols.");
    }

    var sigmaStar = FsaBuilder.All(alphabet).Minimal();
    var extendedAlphabet = alphabet.Concat(markers).ToHashSet();
    var extendedStar = FsaBuilder.All(extendedAlphabet).Minimal();

    // Complement: all words (over alphabet + markers) that are not in `lang`.
    Fsa Complement(Fsa lang)
    {
        return extendedStar.Difference(lang);
    }

    // All words that contain an occurrence of a word from `lang`.
    Fsa Containing(Fsa lang)
    {
        return extendedStar.Concat(lang, extendedStar);
    }

    // All words w where each prefix of w that is in "P" is followed by a suffix that is in "S".
    Fsa PrefixImpliesSuffix(Fsa p, Fsa s)
    {
        return Complement(p.Concat(Complement(s)));
    }

    // All words for which each suffix from "S" is preceded by a prefix from "P".
    Fsa SuffixImpliesPrefix(Fsa p, Fsa s)
    {
        return Complement(Complement(p).Concat(s));
    }

    // Both implications at once: a prefix is in "P" if and only if the rest is in "S".
    Fsa PrefixIffSuffix(Fsa p, Fsa s)
    {
        return PrefixImpliesSuffix(p, s).Intersect(SuffixImpliesPrefix(p, s));
    }

    // Words where every position is preceded by a string with a suffix in "L"
    // if and only if it is followed by a string with a prefix in "R".
    Fsa LeftIffRight(Fsa l, Fsa r)
    {
        return PrefixIffSuffix(extendedStar.Concat(l), r.Concat(extendedStar));
    }

    var ruleDomain = fst.Domain();

    // Stage 1: mark the beginnings of all rewrite occurrences by inserting "cb".
    var freelyInsertedCb = Intro(extendedAlphabet, new HashSet<char> { cb });
    var cbExactlyBeforeMatches =
        LeftIffRight(
            FsaBuilder.FromSymbol(cb),
            XIgnore(ruleDomain, extendedAlphabet, new HashSet<char> { cb }))
        .Identity();
    var initialMatch = freelyInsertedCb.Compose(cbExactlyBeforeMatches);

    // Stage 2: insert boundary markers ("lb", "rb") around the leftmost rewrite occurrences.
    // Arbitrary text that is not matched by the rule comes first.
    var unmatchedPrefix = sigmaStar.Identity();
    // Replace the initial match marker with the left boundary marker.
    var cbToLb = FstBuilder.FromWordPair(cb.ToString(), lb.ToString());
    // Recognize matches, tolerating leftover "cb" symbols in between the markers.
    var matchBody = IgnoreX(ruleDomain, extendedAlphabet, new HashSet<char> { cb }).Identity();
    // Insert the right boundary marker at the end of the matched substring.
    var insertRb = FstBuilder.FromWordPair(string.Empty, rb.ToString());
    // Star() handles multiple rewrite occurrences.
    var bracketedOccurrences = unmatchedPrefix.Concat(cbToLb, matchBody, insertRb).Star();
    // Trailing text that is not matched by the rule.
    var unmatchedSuffix = sigmaStar.Identity();
    // Delete the remaining initial match markers.
    var dropLeftoverCb =
        FstBuilder.FromWordPair(cb.ToString(), string.Empty).ToRewriter(extendedAlphabet);
    var leftToRight = bracketedOccurrences.Concat(unmatchedSuffix).Compose(dropLeftoverCb);

    // Stage 3: amongst occurrences with the same starting point, preserve only the longest ones.
    var matchSpanningRb =
        IgnoreX(ruleDomain, extendedAlphabet, new HashSet<char> { lb, rb })
            .Intersect(Containing(FsaBuilder.FromSymbol(rb)));
    var includesNotLongestMatches =
        Containing(FsaBuilder.FromSymbol(lb).Concat(matchSpanningRb));
    var longestMatch = Complement(includesNotLongestMatches).Identity();

    // Stage 4: replace the rewrite occurrence and delete the left and right markers.
    var replacement =
        FstBuilder.FromWordPair(lb.ToString(), string.Empty)          // delete the left boundary marker
            .Concat(
                fst,                                                  // perform the replacement
                FstBuilder.FromWordPair(rb.ToString(), string.Empty)) // delete the right boundary marker
            .ToRewriter(extendedAlphabet);

    return initialMatch.Compose(leftToRight, longestMatch, replacement);
}
// Convert an FST to an optional rewrite transducer.
//
// Construction: Id (fst Id)*, where Id is the identity transducer over all
// words of `alphabet`.
public static Fst ToOptionalRewriter(this Fst fst, ISet<char> alphabet)
{
    // Identity over every word of the alphabet: text that is left as it is.
    var copyAnything = FsaBuilder.All(alphabet).Identity();

    // One application of the rule followed by copied text, repeated any number of times.
    var rewriteSteps = fst.Concat(copyAnything).Star();

    return copyAnything.Concat(rewriteSteps);
}