// Builds a transducer that may insert symbols from "symbols" at any position of an
// input string which itself contains no symbols from "symbols".
// Shape: (Identity(alphabet \ symbols) | (epsilon x symbols))*
static Fst Intro(ISet<char> alphabet, ISet<char> symbols)
{
    // Copies a single symbol that is not one of the introduced symbols.
    var copyPlainSymbol = FsaBuilder.FromSymbolSet(alphabet.Except(symbols)).Identity();

    // Pairs the empty word with one of the introduced symbols.
    var insertSymbol = FsaBuilder.FromEpsilon().Product(FsaBuilder.FromSymbolSet(symbols));

    return copyPlainSymbol.Union(insertSymbol).Star();
}
// Variant of "Intro" whose result is built as
// (plain . Intro(alphabet, symbols) . plain | plain)?
// where "plain" copies one symbol that is not in "symbols".
static Fst XintroX(ISet<char> alphabet, ISet<char> symbols)
{
    var plain = FsaBuilder.FromSymbolSet(alphabet.Except(symbols)).Identity();
    var inner = Intro(alphabet, symbols);

    // Either a plain symbol on both sides of the free insertion part, or a single plain symbol.
    var nonEmpty = plain.Concat(inner, plain).Union(plain);

    return nonEmpty.Optional();
}
// Checks that epsilon removal on a* leaves no empty-labelled transitions
// and keeps the recognized language intact.
public void EpsilonFreeSimpleConstructionTest()
{
    var rejected = new[] { "ca", "aaba", "b", "cc" };
    var accepted = new[] { "aaaa", "a", "aa", string.Empty, "aaaaaaaa" };

    // a*
    var automaton = FsaBuilder.FromWord("a").Star().EpsilonFree();

    Assert.DoesNotContain(automaton.Transitions, tr => string.IsNullOrEmpty(tr.Label));
    Assert.DoesNotContain(rejected, word => automaton.Recognize(word));
    Assert.True(accepted.All(word => automaton.Recognize(word)));
}
// Parses a term: a (possibly empty) sequence of factors that ends at the end of
// the input, at ')' or at '|'. An empty sequence yields the epsilon automaton.
Fsa Term()
{
    // Peek() is only consulted when there is input left (short-circuit evaluation).
    var atEndOfTerm = !this.HasMoreChars() || this.Peek() == ')' || this.Peek() == '|';
    if (atEndOfTerm)
    {
        return FsaBuilder.FromEpsilon();
    }

    // The factor must be consumed before the remainder of the term is parsed.
    var head = this.Factor();
    var tail = this.Term();

    return head.Concat(tail);
}
// The epsilon automaton has a single state and recognizes only the empty word.
public void EpsilonFsaBuilderTest()
{
    var epsilon = FsaBuilder.FromEpsilon();

    Assert.Single(epsilon.States);

    foreach (var nonEmptyWord in new[] { "a", "abc" })
    {
        Assert.False(epsilon.Recognize(nonEmptyWord));
    }

    Assert.True(epsilon.Recognize(string.Empty));
}
/// <summary>
/// Builds a bimachine that tokenizes English text over printable ASCII.
/// The pipeline composes three rewriters: runs of whitespace are collapsed to a
/// single space, every token is followed by an inserted "\n", and a space that
/// directly follows a "\n" is removed.
/// </summary>
/// <returns>The bimachine obtained from the composed rewrite transducer.</returns>
public static Bimachine CreateForEnglish()
{
    // Printable ASCII (' ' .. '~', 95 symbols) plus the control whitespace characters.
    var alphabet = Enumerable.Range(' ', '~' - ' ' + 1).Select(x => (char)x)
        .Concat(new[] { '\t', '\n', '\v', '\f', '\r' })
        .ToHashSet();
    var whitespaces = new[] { ' ', '\t', '\n' };

    // Exactly the 26 Latin letters per case. A count of 27 would also pull in
    // '[' (after 'Z') and '{' (after 'z') and treat them as letters.
    var upperCaseLetters = Enumerable.Range('A', 26).Select(x => (char)x);
    var lowerCaseLetters = Enumerable.Range('a', 26).Select(x => (char)x);
    var digits = new[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
    var letters = upperCaseLetters.Concat(lowerCaseLetters);

    // Maps every word over the alphabet to its upper-case form. The invariant
    // culture keeps the result inside the alphabet regardless of the current
    // thread culture; symbols without an upper-case form map to themselves.
    var riseCase = alphabet
        .Select(symbol => FstBuilder.FromWordPair(
            symbol.ToString(),
            char.ToUpperInvariant(symbol).ToString()))
        .Aggregate((aggr, fst) => aggr.Union(fst))
        .Star();

    var multiWordExprList = new[] { "AT LEAST", "IN SPITE OF", "HEAD OVER HEELS" };
    var multiWordExpr = multiWordExprList
        .Select(exp => FsaBuilder.FromWord(exp))
        .Aggregate((aggr, fsa) => aggr.Union(fsa));

    // A token is a run of letters, a run of digits, any casing of a listed
    // multi-word expression, or a single non-whitespace symbol.
    var token = FsaBuilder.FromSymbolSet(letters)
        .Plus()
        .Union(
            FsaBuilder.FromSymbolSet(digits).Plus(),
            riseCase.Compose(multiWordExpr.Identity()).Domain(),
            FsaBuilder.FromSymbolSet(alphabet.Except(whitespaces)));

    // Prepends "\n" to the whole text; its inverse removes that newline again.
    var insertLeadingNewLine = FstBuilder.FromWordPair(string.Empty, "\n")
        .Concat(FsaBuilder.FromSymbolSet(alphabet).Star().Identity());

    // Replaces each leftmost-longest run of whitespace with a single space.
    var clearSpaces = FsaBuilder.FromSymbolSet(whitespaces)
        .Plus()
        .Product(FsaBuilder.FromWord(" "))
        .ToLmlRewriter(alphabet);

    // Inserts "\n" after each leftmost-longest token.
    var markTokens = token.Identity()
        .Concat(FstBuilder.FromWordPair(string.Empty, "\n"))
        .ToLmlRewriter(alphabet);

    // Rewrites "\n " to "\n"; the temporary leading newline lets the rule also
    // apply to a space at the very start of the text.
    var clearLeadingSpace = insertLeadingNewLine.Compose(
        FstBuilder.FromWordPair("\n ", "\n").ToRewriter(alphabet),
        insertLeadingNewLine.Inverse());

    return clearSpaces.Compose(markTokens, clearLeadingSpace).ToBimachine(alphabet);
}
// An automaton built from the word "abc" has four states and recognizes exactly that word.
public void WordFsaBuilderTest()
{
    var wordFsa = FsaBuilder.FromWord("abc");

    Assert.Equal(4, wordFsa.States.Count);

    foreach (var other in new[] { string.Empty, "a", "abca" })
    {
        Assert.False(wordFsa.Recognize(other));
    }

    Assert.True(wordFsa.Recognize("abc"));
}
/// <summary>
/// Converts a transducer to an obligatory rewrite transducer: text that contains
/// no word from the domain of <paramref name="fst"/> is copied unchanged, and it
/// may be interleaved with any number of applications of <paramref name="fst"/>.
/// </summary>
/// <param name="fst">The rewrite rule.</param>
/// <param name="alphabet">The symbols the surrounding text may consist of.</param>
public static Fst ToRewriter(this Fst fst, ISet<char> alphabet)
{
    var anyText = FsaBuilder.All(alphabet);

    // Words with at least one occurrence of a word from the rule's domain.
    var containsMatch = anyText.Concat(fst.Domain()).Concat(anyText);

    // Identity on the remaining words, made optional.
    var untouched = anyText.Difference(containsMatch).Identity().Optional();

    var rewriteThenUntouched = fst.Concat(untouched);

    return untouched.Concat(rewriteThenUntouched.Star());
}
// Kleene star over the single-letter word "a": checks structure and language.
public void StarFsaTest()
{
    var accepted = new[] { "aaaa", "a", "aa", string.Empty, "aaaaaaaa" };

    var star = FsaBuilder.FromWord("a").Star();

    Assert.Equal(3, star.States.Count);
    Assert.Single(star.Initial);
    Assert.Equal(2, star.Final.Count);
    Assert.False(star.Recognize("ab"));
    Assert.True(accepted.All(word => star.Recognize(word)));
}
// Concatenation of several automata in one call: ab . cde . f*
public void ConcatMultipleFsaTest()
{
    var concatenated = FsaBuilder.FromWord("ab").Concat(
        FsaBuilder.FromWord("cde"),
        FsaBuilder.FromWord("f").Star());

    foreach (var accepted in new[] { "abcdef", "abcdefffffff" })
    {
        Assert.True(concatenated.Recognize(accepted));
    }

    Assert.False(concatenated.Recognize("abcdff"));
}
// Language under test: (a|b)*c
public void ComplexFsaConstructionTest1()
{
    var rejected = new[] { "ca", "aaba", string.Empty, "cc" };
    var accepted = new[] { "abbac", "ac", "bc", "ababbbbac", "c" };

    var aOrB = FsaBuilder.FromWord("a").Union(FsaBuilder.FromWord("b"));
    var automaton = aOrB.Star().Concat(FsaBuilder.FromWord("c"));

    Assert.DoesNotContain(rejected, word => automaton.Recognize(word));
    Assert.True(accepted.All(word => automaton.Recognize(word)));
}
// Optional over the word "ab": checks structure and that only "ab" and the empty word pass.
public void OptionFsaTest()
{
    var optional = FsaBuilder.FromWord("ab").Optional();

    Assert.Equal(4, optional.States.Count);
    Assert.Equal(2, optional.Initial.Count);
    Assert.Equal(2, optional.Final.Count);

    foreach (var rejected in new[] { "b", "a" })
    {
        Assert.False(optional.Recognize(rejected));
    }

    var accepted = new[] { "ab", string.Empty };
    Assert.True(accepted.All(word => optional.Recognize(word)));
}
// An automaton built from a symbol set has two states and recognizes exactly
// the one-symbol words over that set.
public void FromSymbolSetFsaTest()
{
    var symbols = new HashSet<char> { 'a', 'b', 'c' };
    var symbolFsa = FsaBuilder.FromSymbolSet(symbols);

    Assert.Equal(2, symbolFsa.States.Count);

    foreach (var rejected in new[] { string.Empty, "d", "ab" })
    {
        Assert.False(symbolFsa.Recognize(rejected));
    }

    var accepted = new[] { "b", "a", "c" };
    Assert.True(accepted.All(word => symbolFsa.Recognize(word)));
}
// FsaBuilder.All over {a, b, c}: checks structure and that every word over the
// set (including the empty word) is recognized, while words with foreign symbols are not.
public void AllFsaTest()
{
    var fsa = FsaBuilder.All(new HashSet<char> { 'a', 'b', 'c' });

    Assert.Equal(3, fsa.States.Count);

    // Assert.Single instead of comparing Count with 1: same check, consistent
    // with the other tests in this file and with xUnit analyzer rule xUnit2013.
    Assert.Single(fsa.Initial);
    Assert.Equal(2, fsa.Final.Count);

    Assert.False(fsa.Recognize("d"));
    Assert.False(fsa.Recognize("ad"));
    Assert.True(new[] { "ab", string.Empty, "abc", "bbbac", "cba", "cbcbbcaaaaacb" }.All(fsa.Recognize));
}
// Epsilon removal on (a|b)+c must leave no empty-labelled transitions and
// must not change the recognized language.
public void EpsilonFreeConstructionTest()
{
    var accepted = new[] { "abbac", "ac", "bc", "ababbbbac", "aac" };
    var rejected = new[] { "ca", "aaba", string.Empty, "cc", "c" };

    // (a|b)+c
    var aOrB = FsaBuilder.FromWord("a").Union(FsaBuilder.FromWord("b"));
    var automaton = aOrB
        .Plus()
        .Concat(FsaBuilder.FromWord("c"))
        .EpsilonFree();

    Assert.DoesNotContain(automaton.Transitions, tr => string.IsNullOrEmpty(tr.Label));
    Assert.True(accepted.All(word => automaton.Recognize(word)));
    Assert.DoesNotContain(rejected, word => automaton.Recognize(word));
}
// Kleene star over the multi-letter word "abc": checks structure and language.
public void StarFsaTest1()
{
    var star = FsaBuilder.FromWord("abc").Star();

    Assert.Equal(5, star.States.Count);
    Assert.Single(star.Initial);
    Assert.Equal(2, star.Final.Count);

    foreach (var rejected in new[] { "abcabcabcb", "ab" })
    {
        Assert.False(star.Recognize(rejected));
    }

    foreach (var accepted in new[] { string.Empty, "abc", "abcabcabc" })
    {
        Assert.True(star.Recognize(accepted));
    }
}
// Language under test: ab*c
public void ComplexFsaConstructionTest()
{
    var a = FsaBuilder.FromWord("a");
    var automaton = a.Concat(
        FsaBuilder.FromWord("b").Star(),
        FsaBuilder.FromWord("c"));

    foreach (var rejected in new[] { string.Empty, "ab" })
    {
        Assert.False(automaton.Recognize(rejected));
    }

    foreach (var accepted in new[] { "abc", "ac", "abbbbc" })
    {
        Assert.True(automaton.Recognize(accepted));
    }
}
// Union of the word automaton for "abc" with the epsilon automaton:
// checks structure and that exactly "abc" and the empty word are recognized.
public void UnionEpsilonFsaTest()
{
    var word = FsaBuilder.FromWord("abc");
    var epsilon = FsaBuilder.FromEpsilon();

    var union = word.Union(epsilon);

    Assert.Equal(5, union.States.Count);
    Assert.Equal(2, union.Initial.Count);
    Assert.Equal(2, union.Final.Count);

    Assert.True(union.Recognize(string.Empty));
    Assert.False(union.Recognize("a"));
    Assert.True(union.Recognize("abc"));
    Assert.False(union.Recognize("abca"));
}
// Concatenation of two word automata: abc . de
public void ConcatFsaTest()
{
    var left = FsaBuilder.FromWord("abc");
    var right = FsaBuilder.FromWord("de");

    var concatenated = left.Concat(right);

    Assert.Equal(7, concatenated.States.Count);
    Assert.Single(concatenated.Initial);
    Assert.Single(concatenated.Final);

    foreach (var rejected in new[] { string.Empty, "a", "abc", "de" })
    {
        Assert.False(concatenated.Recognize(rejected));
    }

    Assert.True(concatenated.Recognize("abcde"));
}
// Language under test: [a-z]*@[a-z]*\.com, checked on the determinized automaton.
public void ComplexFsaConstructionTest2()
{
    // Exactly the 26 lower-case Latin letters; a count of 27 would also add '{'.
    var all = FsaBuilder.All(
        Enumerable.Range('a', 26).Select(Convert.ToChar).ToHashSet());

    var fsa = all
        .Concat(
            FsaBuilder.FromWord("@"),
            all,
            FsaBuilder.FromWord(".com"))
        .Determinize();

    // Concrete samples: the accepted and rejected sets must be disjoint and the
    // accepted ones must consist of letters, a single '@' and the ".com" suffix.
    // Rejected: no '@', a doubled '@', a wrong suffix, a '.' before the '@'.
    Assert.DoesNotContain(
        new[] { "example.com", "you@@gmail.com", "me@yahoo.co", "john.doe@mail.com" },
        fsa.Recognize);
    Assert.True(
        new[] { "you@gmail.com", "me@yahoo.com", "info@example.com" }.All(fsa.Recognize));
}
// Convert to an obligatory leftmost-longest match rewrite transducer (van Noord, Gerdemann 1999).
//
// fst      - the rewrite rule to apply.
// alphabet - the symbols of the text being rewritten.
//
// Every input symbol is first followed by a flag character: '0' for an ordinary
// symbol, '1' for a bracket/marker symbol. The returned transducer is the
// composition nonMarkersFst . r . f . leftToRight . longestMatch . auxReplace .
// l1 . l2 . nonMarkersFst^-1 assembled at the bottom of the method.
//
// NOTE(review): unlike ToLmlRewriter, there is no check here that "alphabet" is
// disjoint from the flag and bracket characters used below - confirm callers guarantee it.
// NOTE(review): sigStarFsa, b1, b2, brack, trueFsa, falseFsa, cond, then and @else
// are assigned but not referenced by any live code in this method (only by the
// commented-out "If" construction).
public static Fst ToLmlRewriter2(this Fst fst, ISet <char> alphabet)
{
    // Flag characters appended after each symbol.
    const char notMarkerSymbol = '0';
    const char isMarkerSymbol = '1';
    var markers = new[] { notMarkerSymbol, isMarkerSymbol };

    // One alphabet symbol followed by the "not a marker" flag.
    var sigFsa = FsaBuilder.FromSymbolSet(alphabet)
        .Concat(FsaBuilder.FromSymbolSet(new[] { notMarkerSymbol }));
    var sigStarFsa = sigFsa.Star().Minimal();

    // Alphabet extended with the two flag characters.
    var xSig = alphabet.Concat(markers).ToHashSet();

    // sigFsa followed by one more flag character.
    // NOTE(review): this makes each unit "symbol, '0', flag" (three characters) - confirm
    // that this, rather than "any symbol followed by a flag", is the intended extended alphabet.
    var xSigFsa = sigFsa.Concat(FsaBuilder.FromSymbolSet(markers));
    var xSigStarFsa = xSigFsa.Star().Minimal();

    // Bracket characters; each bracket automaton below is the character followed by the '1' flag.
    const char lb1Marker = '<'; // <1
    const char lb2Marker = '≪'; // <2
    const char rb1Marker = '>'; // 1>
    const char rb2Marker = '≫'; // 2>
    var lb1 = FsaBuilder.FromSymbol(lb1Marker).Concat(FsaBuilder.FromSymbol(isMarkerSymbol));
    var lb2 = FsaBuilder.FromSymbol(lb2Marker).Concat(FsaBuilder.FromSymbol(isMarkerSymbol));
    var rb2 = FsaBuilder.FromSymbol(rb2Marker).Concat(FsaBuilder.FromSymbol(isMarkerSymbol));
    var rb1 = FsaBuilder.FromSymbol(rb1Marker).Concat(FsaBuilder.FromSymbol(isMarkerSymbol));
    var lb = lb1.Union(lb2);
    var rb = rb1.Union(rb2);
    var b1 = lb1.Union(rb1);
    var b2 = lb2.Union(rb2);
    var brack = lb.Union(rb);

    // Complement relative to xSigStarFsa.
    Fsa Not(Fsa lang) => xSigStarFsa.Difference(lang);

    // Words of the form xSigStarFsa . lang . xSigStarFsa.
    Fsa Contain(Fsa lang) => xSigStarFsa.Concat(lang, xSigStarFsa);

    // Excludes words that split into a prefix in l1 and a suffix not in l2.
    Fsa IfPThenS(Fsa l1, Fsa l2) => Not(l1.Concat(Not(l2)));

    // Excludes words that split into a prefix not in l1 and a suffix in l2.
    Fsa IfSThenP(Fsa l1, Fsa l2) => Not(Not(l1).Concat(l2));

    // Both of the above conditions at once.
    Fsa PiffS(Fsa l1, Fsa l2) => IfPThenS(l1, l2).Intersect(IfSThenP(l1, l2));

    // PiffS with l1 allowed any preceding text and l2 allowed any following text.
    Fsa LiffR(Fsa l1, Fsa l2) => PiffS(xSigStarFsa.Concat(l1), l2.Concat(xSigStarFsa));

    var trueFsa = xSigStarFsa;
    var falseFsa = FsaBuilder.FromEpsilon();

    // Disabled conditional construction, kept for reference:
    // Fsa CoerceToBoolean(Fsa l) => l.Identity()
    //     .Compose(trueFsa.Product(trueFsa)).Range();
    // Fst If(Fsa cond, Fst then, Fst @else) =>
    //     CoerceToBoolean(cond).Identity().Compose(then)
    //         .Union(Not(CoerceToBoolean(cond)).Identity().Compose(@else));

    // Both contexts are fixed to the empty word in this implementation.
    var leftCtx = FsaBuilder.FromEpsilon();
    var rightCtx = FsaBuilder.FromEpsilon();
    var domainT = fst.Domain();

    // Copies one alphabet symbol and appends the '0' flag after it.
    var nonMarkersFst = FsaBuilder.FromSymbolSet(alphabet)
        .Identity()
        .Concat(FstBuilder.FromWordPair(string.Empty, notMarkerSymbol.ToString()));

    // Image of the language "l" under nonMarkersFst.
    Fsa NonMarkers(Fsa l) => l.Identity().Compose(nonMarkersFst).Range();

    // begin R
    var cond = FsaBuilder.FromEpsilon().Intersect(rightCtx);
    var then = FsaBuilder.FromEpsilon().Product(rb2).Concat(sigFsa.Identity()).Star()
        .Concat(FsaBuilder.FromEpsilon().Product(rb2));
    var @else = Intro(xSig, new HashSet <char> { rb2Marker }).Compose(
        LiffR(rb2, XIgnore(NonMarkers(rightCtx), xSig, new HashSet <char> { rb2Marker })).Identity());
    // var r = If(cond, then, @else);
    // The live "r" is the same expression as "then": insert rb2 before every
    // flagged symbol and once more at the end.
    var r = FsaBuilder.FromEpsilon().Product(rb2).Concat(sigFsa.Identity()).Star()
        .Concat(FsaBuilder.FromEpsilon().Product(rb2));
    // end R

    // Introduce lb2 and keep only the words satisfying the LiffR constraint between
    // lb2 and the (marker-ignoring) domain followed by an optional lb2 and rb2.
    var f = Intro(xSig, new HashSet <char> { lb2Marker })
        .Compose(
            LiffR(lb2, XIgnoreX(NonMarkers(domainT), xSig, new HashSet <char> { lb2Marker, rb2Marker })
                .Concat(lb2.Optional(), rb2)).Identity());

    // begin lr
    // One bracketed occurrence: lb2 becomes lb1, the domain word (ignoring lb2/rb2)
    // is passed through the inverse of the lb2 introduction, and rb2 becomes rb1.
    var leftToRightBody = lb2.Product(lb1)
        .Concat(
            Ignore(NonMarkers(domainT), xSig, new HashSet <char> { lb2Marker, rb2Marker }).Identity()
                .Compose(Intro(xSig, new HashSet <char> { lb2Marker }).Inverse()))
        .Concat(rb2.Product(rb1));
    var leftToRight = xSigStarFsa.Identity()
        .Concat(leftToRightBody)
        .Star()
        .Concat(xSigStarFsa.Identity());
    // end lr

    // begin longest match
    // Pattern of an occurrence opened by lb1 whose (bracket-ignoring) domain word
    // contains an rb1 and is followed by a right bracket; such words are excluded.
    var longestBody = lb1
        .Concat(
            IgnoreX(NonMarkers(domainT), xSig, new HashSet <char> { lb1Marker, lb2Marker, rb1Marker, rb2Marker })
                .Intersect(Contain(rb1)))
        .Concat(rb);
    var longestMatch = Not(Contain(longestBody)).Identity()
        .Compose(Intro(xSig, new HashSet <char> { rb2Marker }).Inverse());
    // end longest match

    // Copies flagged symbols and lb2 unchanged; between lb1 and rb1 it strips the
    // flags, applies "fst", re-adds the flags and maps rb1 to the empty word.
    var auxReplace = sigFsa.Union(lb2).Identity()
        .Union(lb1.Identity()
            .Concat(nonMarkersFst.Inverse().Compose(fst, nonMarkersFst))
            .Concat(rb1.Product(FsaBuilder.FromEpsilon())))
        .Star();

    // Left-context constraint for lb1, followed by the inverse of the lb1 introduction.
    var l1 = Ignore(
            IfSThenP(
                IgnoreX(xSigStarFsa.Concat(NonMarkers(leftCtx)), xSig, new HashSet <char> { lb1Marker }),
                lb1.Concat(xSigStarFsa)),
            xSig,
            new HashSet <char> { lb2Marker })
        .Identity()
        .Compose(Intro(xSig, new HashSet <char> { lb1Marker }).Inverse());

    // Same constraint for lb2 against the complement of the left context,
    // followed by the inverse of the lb2 introduction.
    var l2 = IfSThenP(
            IgnoreX(Not(xSigStarFsa.Concat(NonMarkers(leftCtx))), xSig, new HashSet <char> { lb2Marker }),
            lb2.Concat(xSigStarFsa))
        .Identity()
        .Compose(Intro(xSig, new HashSet <char> { lb2Marker }).Inverse());

    // Add the flags, run all stages in order, then strip the flags again.
    var replace = nonMarkersFst.Compose(
        r, f, leftToRight, longestMatch, auxReplace, l1, l2, nonMarkersFst.Inverse());
    return(replace);
}
// Like "Intro", but the result must start by copying a symbol that is not in
// "symbols", so an introduced symbol can never be first. The whole transducer is
// optional, so the empty input is still accepted.
static Fst Xintro(ISet<char> alphabet, ISet<char> symbols)
{
    var leadingPlainSymbol = FsaBuilder.FromSymbolSet(alphabet.Except(symbols)).Identity();
    var freeInsertion = Intro(alphabet, symbols);

    return leadingPlainSymbol.Concat(freeInsertion).Optional();
}
/// <summary>
/// Converts a transducer to an obligatory leftmost-longest match rewrite
/// transducer (Karttunen 1996). The result is the composition of four stages:
/// mark candidate match starts, bracket matches left to right, filter out
/// non-longest matches, and perform the replacement while deleting the brackets.
/// </summary>
/// <param name="fst">The rewrite rule.</param>
/// <param name="alphabet">The text alphabet; must not contain any marker symbol.</param>
/// <exception cref="ArgumentException">The alphabet shares a symbol with the markers.</exception>
public static Fst ToLmlRewriter(this Fst fst, ISet<char> alphabet)
{
    var alphabetClashesWithMarkers = alphabet.Intersect(markers).Any();
    if (alphabetClashesWithMarkers)
    {
        throw new ArgumentException("The alphabet contains invalid symbols.");
    }

    var sigmaStar = FsaBuilder.All(alphabet).Minimal();
    var extendedAlphabet = alphabet.Concat(markers).ToHashSet();
    var extendedStar = FsaBuilder.All(extendedAlphabet).Minimal();

    // Complement: every word over the extended alphabet that is not in "lang".
    Fsa Complement(Fsa lang)
    {
        return extendedStar.Difference(lang);
    }

    // Every word that contains an occurrence of a word from "lang".
    Fsa Containing(Fsa lang)
    {
        return extendedStar.Concat(lang, extendedStar);
    }

    // Words in which every prefix from "prefix" is followed by a suffix from "suffix".
    Fsa PrefixImpliesSuffix(Fsa prefix, Fsa suffix)
    {
        return Complement(prefix.Concat(Complement(suffix)));
    }

    // Words in which every suffix from "suffix" is preceded by a prefix from "prefix".
    Fsa SuffixImpliesPrefix(Fsa prefix, Fsa suffix)
    {
        return Complement(Complement(prefix).Concat(suffix));
    }

    Fsa PrefixIffSuffix(Fsa prefix, Fsa suffix)
    {
        return PrefixImpliesSuffix(prefix, suffix).Intersect(SuffixImpliesPrefix(prefix, suffix));
    }

    // Words where a position is preceded by a string ending in "left"
    // if and only if it is followed by a string starting with "right".
    Fsa LeftIffRight(Fsa left, Fsa right)
    {
        return PrefixIffSuffix(extendedStar.Concat(left), right.Concat(extendedStar));
    }

    var ruleDomain = fst.Domain();

    // Stage 1: insert "cb" exactly at the beginnings of all rewrite occurrences.
    var insertStartMarks = Intro(extendedAlphabet, new HashSet<char> { cb });
    var startMarkConstraint = LeftIffRight(
        FsaBuilder.FromSymbol(cb),
        XIgnore(ruleDomain, extendedAlphabet, new HashSet<char> { cb }));
    var initialMatch = insertStartMarks.Compose(startMarkConstraint.Identity());

    // Stage 2: put "lb" / "rb" around the leftmost occurrences, then drop leftover "cb" marks.
    var bracketOccurrences = sigmaStar.Identity() // unmatched text before an occurrence
        .Concat(
            FstBuilder.FromWordPair(cb.ToString(), lb.ToString()), // start mark becomes the left bracket
            IgnoreX(ruleDomain, extendedAlphabet, new HashSet<char> { cb }).Identity(), // the match, possibly with inner "cb" marks
            FstBuilder.FromWordPair(string.Empty, rb.ToString())) // right bracket after the match
        .Star() // any number of occurrences
        .Concat(sigmaStar.Identity()); // unmatched text after the last occurrence
    var deleteStartMarks = FstBuilder.FromWordPair(cb.ToString(), string.Empty).ToRewriter(extendedAlphabet);
    var leftToRight = bracketOccurrences.Compose(deleteStartMarks);

    // Stage 3: among occurrences with the same start, keep only the longest one.
    // A bracketed match that has an "rb" inside it is not the longest.
    var matchWithInnerRightBracket = IgnoreX(ruleDomain, extendedAlphabet, new HashSet<char> { lb, rb })
        .Intersect(Containing(FsaBuilder.FromSymbol(rb)));
    var hasShorterMatch = Containing(FsaBuilder.FromSymbol(lb).Concat(matchWithInnerRightBracket));
    var longestMatch = Complement(hasShorterMatch).Identity();

    // Stage 4: delete "lb", apply the rule, delete "rb".
    var replacement = FstBuilder.FromWordPair(lb.ToString(), string.Empty)
        .Concat(
            fst,
            FstBuilder.FromWordPair(rb.ToString(), string.Empty))
        .ToRewriter(extendedAlphabet);

    return initialMatch.Compose(leftToRight, longestMatch, replacement);
}
/// <summary>
/// Builds the Kleene star of the automaton created from the given symbol set.
/// </summary>
/// <param name="alphabet">The symbols the words may consist of.</param>
public static Fsa All(IEnumerable<char> alphabet)
{
    var anySingleSymbol = FsaBuilder.FromSymbolSet(alphabet);

    return anySingleSymbol.Star();
}
/// <summary>
/// Converts a transducer to an optional rewrite transducer: arbitrary text over
/// the alphabet is copied, interleaved with any number of applications of
/// <paramref name="fst"/>.
/// </summary>
/// <param name="fst">The rewrite rule.</param>
/// <param name="alphabet">The symbols the surrounding text may consist of.</param>
public static Fst ToOptionalRewriter(this Fst fst, ISet<char> alphabet)
{
    var copyAnyText = FsaBuilder.All(alphabet).Identity();
    var rewriteThenCopy = fst.Concat(copyAnyText);

    return copyAnyText.Concat(rewriteThenCopy.Star());
}