/// <summary>
/// Appends a rule to this set. Rules are kept in the order in which they
/// are added, and that order is significant.
/// </summary>
/// <remarks>
/// Besides storing the rule, this raises <c>maxContextLength</c> when the
/// new rule has a longer ante context than any rule seen so far, and it
/// resets <c>rules</c> to null so that a previously built array is not
/// reused for the changed rule list.
/// </remarks>
/// <param name="rule">The rule to add.</param>
public virtual void AddRule(TransliterationRule rule)
{
    ruleVector.Add(rule);

    // Keep maxContextLength equal to the longest ante context of any rule.
    int anteLength = rule.AnteContextLength;
    if (maxContextLength < anteLength)
    {
        maxContextLength = anteLength;
    }

    // Whatever array was built from the old rule list is now out of date.
    rules = null;
}
// TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
// TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
/// <summary>
/// Asks every rule, in order, to contribute to <paramref name="sourceSet"/>
/// and <paramref name="targetSet"/>.
/// </summary>
/// <remarks>
/// The caller's <paramref name="filter"/> is copied, never modified. After
/// each rule has run, everything it left in the scratch set is added to the
/// copied filter, so later rules see a filter that has grown accordingly.
/// </remarks>
/// <param name="filter">Initial filter; a private copy is made of it.</param>
/// <param name="sourceSet">Set handed to each rule as its source set.</param>
/// <param name="targetSet">Set handed to each rule as its target set.</param>
internal virtual void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet)
{
    UnicodeSet runningFilter = new UnicodeSet(filter);
    UnicodeSet scratch = new UnicodeSet();

    foreach (TransliterationRule rule in ruleVector)
    {
        // The scratch set is emptied before every rule, so it only ever
        // carries what the current rule put into it.
        rule.AddSourceTargetSet(runningFilter, sourceSet, targetSet, scratch.Clear());
        runningFilter.AddAll(scratch);
    }
}
/// <summary>
/// Builds the rule source text for this rule set: the text of each rule,
/// in order, with consecutive rules separated by a single '\n'.
/// </summary>
/// <param name="escapeUnprintable">Passed through unchanged to each rule's
/// <c>ToRule</c> call.</param>
/// <returns>The joined rule text; an empty string when there are no
/// rules.</returns>
internal virtual string ToRules(bool escapeUnprintable)
{
    StringBuilder text = new StringBuilder();
    int total = ruleVector.Count;

    if (total > 0)
    {
        // The first rule is written without a leading separator ...
        text.Append(ruleVector[0].ToRule(escapeUnprintable));

        // ... every later rule is preceded by one.
        for (int pos = 1; pos < total; ++pos)
        {
            text.Append('\n');
            text.Append(ruleVector[pos].ToRule(escapeUnprintable));
        }
    }

    return text.ToString();
}
/// <summary>
/// Closes this rule set to further additions, checks it for masked rules,
/// and builds the bin index that is used to optimize performance.
/// </summary>
/// <exception cref="ArgumentException">If some rules are masked.</exception>
public virtual void Freeze()
{
    /* The rules are redistributed into 256 bins, one per index value.
     * A rule belongs to a bin when a string whose first key character
     * has a low byte equal to that bin's index value can match the rule.
     *
     * Inside a bin the rules keep the relative order they had in the
     * original vector. A rule whose first key character is a variable
     * (a set) generally lands in several bins, so the binned total can
     * be larger than the original rule count.
     */
    int ruleCount = ruleVector.Count;

    // 257 slots rather than 256: index[b] is where bin b starts and
    // index[b + 1] is where it ends, so the last slot marks the end of
    // bin 255.
    index = new int[257];

    // The initial capacity is only a heuristic; adjust as needed.
    List<TransliterationRule> binned = new List<TransliterationRule>(2 * ruleCount);

    /* Ask every rule for its index value once, up front, instead of
     * once per bin. This saves a lot of time. */
    int[] firstKeyIndex = new int[ruleCount];
    for (int i = 0; i < ruleCount; ++i)
    {
        firstKeyIndex[i] = ruleVector[i].GetIndexValue();
    }

    for (int bin = 0; bin < 256; ++bin)
    {
        index[bin] = binned.Count;

        for (int i = 0; i < ruleCount; ++i)
        {
            TransliterationRule candidate = ruleVector[i];
            int keyIndex = firstKeyIndex[i];

            // A negative index value means the first key character is a
            // set, which needs the more time-consuming MatchesIndexValue
            // check. In practice that is rare.
            bool belongsToBin = keyIndex >= 0
                ? keyIndex == bin
                : candidate.MatchesIndexValue(bin);

            if (belongsToBin)
            {
                binned.Add(candidate);
            }
        }
    }
    index[256] = binned.Count;

    /* Freeze the binned rules into an array. */
    rules = binned.ToArray();

    /* Check for masking. Only rules that share a bin are compared with
     * each other, which is much cheaper than comparing every rule with
     * every following rule because a bin normally holds far fewer rules
     * than the whole set. */
    StringBuilder errors = null;
    for (int bin = 0; bin < 256; ++bin)
    {
        int binEnd = index[bin + 1];

        for (int first = index[bin]; first < binEnd - 1; ++first)
        {
            TransliterationRule earlier = rules[first];

            for (int second = first + 1; second < binEnd; ++second)
            {
                TransliterationRule later = rules[second];
                if (!earlier.Masks(later))
                {
                    continue;
                }

                // One line per masking pair, separated by newlines.
                string complaint = "Rule " + earlier + " masks " + later;
                if (errors == null)
                {
                    errors = new StringBuilder(complaint);
                }
                else
                {
                    errors.Append("\n").Append(complaint);
                }
            }
        }
    }

    if (errors != null)
    {
        throw new ArgumentException(errors.ToString());
    }
}
/// <summary>
/// Returns true if this rule masks another rule. If r1 masks r2 then r1
/// matches any input string that r2 matches. If r1 masks r2 and r2 masks
/// r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
/// "[c]a>x" masks "[dc]a>y".
/// </summary>
/// <param name="r2">The rule that may be masked by this rule.</param>
/// <returns>true if this rule (r1) masks <paramref name="r2"/>.</returns>
public virtual bool Masks(TransliterationRule r2)
{
    /* Here r1 is this rule. r1 masks r2 if the strings formed of ante
     * context, key, and post context overlap like this:
     *
     * r1:      aakkkpppp
     * r2:     aaakkkkkpppp
     *            ^
     *
     * The strings are aligned at the first character of the key. The
     * part of r1 to the left of the alignment point must not be longer
     * than the part of r2 to the left; likewise for the right. The
     * characters of r1 must equal (or be a superset of) the
     * corresponding characters of r2. The superset operation should be
     * performed to check for UnicodeSet masking.
     *
     * Anchors: two patterns that differ only in anchors mask one
     * another only if they are exactly equal, and r2 has all the
     * anchors r1 has (optionally, plus some). Y means the row masks the
     * column, N means it does not.
     *
     *         ab   ^ab    ab$  ^ab$
     *   ab    Y     Y     Y     Y
     *  ^ab    N     Y     N     Y
     *   ab$   N     N     Y     Y
     *  ^ab$   N     N     N     Y
     *
     * Post context: {a}b masks ab, but not vice versa, since {a}b
     * matches everything ab matches, and {a}b matches {|a|}b but ab
     * does not. Pre context is different (a{b} does not align with ab).
     */

    /* LIMITATION of the current mask algorithm: some rule maskings are
     * currently not detected. For example, "{Lu}]a>x" masks "A]a>y".
     * This can be added later. TODO
     */
    int ownLength = pattern.Length;
    int ownLeft = anteContextLength;
    int otherLeft = r2.anteContextLength;
    int ownRight = pattern.Length - ownLeft;
    int otherRight = r2.pattern.Length - otherLeft;
    bool keyNotLonger = keyLength <= r2.keyLength;

    // TODO Clean this up -- some logic might be combinable with the
    // general test below.

    // Anchor masking: same extent on both sides of the alignment point
    // and the same pattern text.
    if (ownLeft == otherLeft
        && ownRight == otherRight
        && keyNotLonger
        && r2.pattern.RegionMatches(0, pattern, 0, ownLength))
    {
        bool ownStart = (flags & ANCHOR_START) != 0;
        bool ownEnd = (flags & ANCHOR_END) != 0;
        bool otherStart = (r2.flags & ANCHOR_START) != 0;
        bool otherEnd = (r2.flags & ANCHOR_END) != 0;

        // Diagonal of the table above: identical flags always mask.
        if (flags == r2.flags)
        {
            return true;
        }

        // First row: a rule without anchors masks every anchor variant.
        if (!ownStart && !ownEnd)
        {
            return true;
        }

        // Last column: a rule anchored at both ends is masked by every
        // variant. All remaining cells are N.
        return otherStart && otherEnd;
    }

    // General case: r1 must not reach further than r2 on either side of
    // the alignment point ...
    if (ownLeft > otherLeft)
    {
        return false;
    }
    if (ownRight > otherRight)
    {
        return false;
    }

    // ... with equal right-hand extent, r1's key must not be the longer ...
    if (ownRight == otherRight && !keyNotLonger)
    {
        return false;
    }

    // ... and r1's pattern must occur in r2's pattern at the aligned
    // offset.
    return r2.pattern.RegionMatches(otherLeft - ownLeft, pattern, 0, ownLength);
}