/// <summary>
/// Determine whether this rule (r1) masks another rule (r2). r1 masks r2
/// when r1 matches every input string that r2 matches. If each rule masks
/// the other, the two rules are equal. Examples: "a&gt;x" masks "ab&gt;y".
/// "a&gt;x" masks "a[b]&gt;y". "[c]a&gt;x" masks "[dc]a&gt;y".
/// </summary>
///
/// <param name="r2">the rule that may be masked by this rule</param>
/// <returns>true if this rule masks <paramref name="r2"/></returns>
public bool Masks(TransliterationRule r2)
{
    /*
     * Rule r1 masks rule r2 if the string formed of the antecontext, key,
     * and postcontext overlaps in the following way:
     *
     *   r1:  aakkkpppp
     *   r2: aaakkkkkpppp
     *          ^
     *
     * The strings must be aligned at the first character of the key. The
     * length of r1 to the left of the alignment point must be <= the length
     * of r2 to the left; ditto for the right. The characters of r1 must
     * equal (or be a superset of) the corresponding characters of r2. The
     * superset operation should be performed to check for UnicodeSet
     * masking.
     *
     * Anchors: Two patterns that differ only in anchors only mask one
     * another if they are exactly equal, and r2 has all the anchors r1 has
     * (optionally, plus some). Here Y means the row masks the column, N
     * means it doesn't.
     *
     *          ab   ^ab  ab$  ^ab$
     *   ab     Y    Y    Y    Y
     *   ^ab    N    Y    N    Y
     *   ab$    N    N    Y    Y
     *   ^ab$   N    N    N    Y
     *
     * Post context: {a}b masks ab, but not vice versa, since {a}b matches
     * everything ab matches, and {a}b matches {|a|}b but ab does not. Pre
     * context is different (a{b} does not align with ab).
     */

    /*
     * LIMITATION of the current mask algorithm: Some rule maskings are
     * currently not detected. For example, "{Lu}]a>x" masks "A]a>y". This
     * can be added later. TODO
     */
    int patternLength = pattern.Length;
    int before = anteContextLength;
    int otherBefore = r2.anteContextLength;
    int after = patternLength - before;
    int otherAfter = r2.pattern.Length - otherBefore;

    // TODO Clean this up -- some logic might be combinable with the
    // general test at the bottom.

    // Test for anchor masking: both patterns have the same extent on each
    // side of the alignment point and the same text.
    bool sameExtentAndText = before == otherBefore
        && after == otherAfter
        && keyLength <= r2.keyLength
        && StringUtil.RegionMatches(r2.pattern, 0, pattern, 0, patternLength);

    if (sameExtentAndText)
    {
        // The following boolean logic implements the table above.
        if (flags == r2.flags)
        {
            return true;
        }

        bool thisHasNoAnchors = (flags & ANCHOR_START) == 0
            && (flags & ANCHOR_END) == 0;
        if (thisHasNoAnchors)
        {
            return true;
        }

        return (r2.flags & ANCHOR_START) != 0
            && (r2.flags & ANCHOR_END) != 0;
    }

    // General case. This rule may not reach further left than r2 ...
    if (before > otherBefore)
    {
        return false;
    }

    // ... nor further right; on a tie the key may not be longer than r2's.
    bool fitsOnRight = after < otherAfter
        || (after == otherAfter && keyLength <= r2.keyLength);
    if (!fitsOnRight)
    {
        return false;
    }

    // Compare the text with the two patterns aligned at the first key
    // character; the offset is non-negative because before <= otherBefore.
    return StringUtil.RegionMatches(r2.pattern, otherBefore - before, pattern, 0, patternLength);
}
/// <summary>
/// Add a rule to this set. Rules are added in order, and order is
/// significant.
/// </summary>
///
/// <param name="rule">the rule to add</param>
/// <exception cref="ArgumentNullException">if <paramref name="rule"/> is null</exception>
public void AddRule(TransliterationRule rule)
{
    // Validate before touching any state: otherwise a null would be stored
    // in ruleVector and only then fail on the context-length lookup,
    // leaving a null entry behind in the rule list.
    if (rule == null)
    {
        throw new ArgumentNullException("rule");
    }

    ruleVector.Add(rule);

    // Track the largest ante-context length seen across all added rules.
    int contextLength = rule.GetAnteContextLength();
    if (contextLength > maxContextLength)
    {
        maxContextLength = contextLength;
    }

    // Discard the rule array built by Freeze(); it no longer reflects
    // the contents of ruleVector.
    rules = null;
}
/// <summary>
/// Build the source text for this rule set: the rule string of every rule,
/// in the order the rules were added, separated by newline characters.
/// </summary>
///
/// <param name="escapeUnprintable">passed through to each rule's <c>ToRule</c> call</param>
/// <returns>the concatenated rule strings; empty if the set has no rules</returns>
internal String ToRules(bool escapeUnprintable)
{
    StringBuilder text = new StringBuilder();
    bool needSeparator = false;

    foreach (TransliterationRule rule in ruleVector)
    {
        // A newline goes between rules, never before the first one.
        if (needSeparator)
        {
            text.Append('\n');
        }
        needSeparator = true;

        text.Append(rule.ToRule(escapeUnprintable));
    }

    return text.ToString();
}
/// <summary>
/// Collect the set of all characters that may be modified
/// (getTarget=false) or emitted (getTarget=true) by this rule set.
/// </summary>
///
/// <param name="getTarget">true to gather each rule's target set, false to gather each rule's source set</param>
/// <returns>a new set holding the contribution of every rule</returns>
internal UnicodeSet GetSourceTargetSet(bool getTarget)
{
    UnicodeSet result = new UnicodeSet();

    foreach (TransliterationRule rule in ruleVector)
    {
        if (getTarget)
        {
            rule.AddTargetSetTo(result);
        }
        else
        {
            rule.AddSourceSetTo(result);
        }
    }

    return result;
}
/// <summary>
/// Close this rule set to further additions, check it for masked rules, and
/// index it to optimize performance.
/// </summary>
///
/// <exception cref="ArgumentException">if some rules are masked</exception>
public void Freeze()
{
    /*
     * Construct the rule array and index table. We reorder the rules by
     * sorting them into 256 bins. Each bin contains all rules matching the
     * index value for that bin. A rule matches an index value if a string
     * whose first key character has a low byte equal to the index value can
     * match the rule.
     *
     * Each bin contains zero or more rules, in the same order they were
     * found originally. However, the total rules in the bins may exceed the
     * number in the original vector, since rules that have a variable as
     * their first key character will generally fall into more than one bin.
     *
     * That is, each bin contains all rules that either have that first
     * index value as their first key character, or have a set containing
     * the index value as their first character.
     */
    int ruleCount = ruleVector.Count;
    index = new int[257]; // [sic] -- 256 bin starts plus one end marker
    ArrayList binned = new ArrayList(2 * ruleCount); // heuristic; adjust as needed

    /*
     * Precompute the index values. This saves a LOT of time.
     */
    int[] firstKeyValue = new int[ruleCount];
    for (int i = 0; i < ruleCount; i++)
    {
        firstKeyValue[i] = ((TransliterationRule)ruleVector[i]).GetIndexValue();
    }

    for (int bin = 0; bin < 256; bin++)
    {
        // Bin 'bin' occupies positions index[bin] .. index[bin + 1] - 1.
        index[bin] = binned.Count;

        for (int i = 0; i < ruleCount; i++)
        {
            int value = firstKeyValue[i];
            if (value >= 0)
            {
                if (value == bin)
                {
                    binned.Add(ruleVector[i]);
                }
                continue;
            }

            // If the index value is < 0, then the first key character is
            // a set, and we must use the more time-consuming
            // MatchesIndexValue check. In practice this happens
            // rarely, so we seldom tread this code path.
            TransliterationRule candidate = (TransliterationRule)ruleVector[i];
            if (candidate.MatchesIndexValue(bin))
            {
                binned.Add(candidate);
            }
        }
    }
    index[256] = binned.Count;

    /*
     * Freeze things into an array.
     */
    rules = new TransliterationRule[binned.Count];
    binned.CopyTo(rules);

    /*
     * Check for masking. This is MUCH faster than our old check, which was
     * each rule against each following rule, since we only have to check
     * for masking within each bin now. It's 256*O(n2^2) instead of O(n1^2),
     * where n1 is the total rule count, and n2 is the per-bin rule count.
     * But n2<<n1, so it's a big win.
     */
    StringBuilder problems = null;
    for (int bin = 0; bin < 256; bin++)
    {
        int binEnd = index[bin + 1];

        for (int first = index[bin]; first < binEnd - 1; first++)
        {
            TransliterationRule r1 = rules[first];

            for (int second = first + 1; second < binEnd; second++)
            {
                TransliterationRule r2 = rules[second];
                if (!r1.Masks(r2))
                {
                    continue;
                }

                // One message per masked pair, newline-separated.
                if (problems == null)
                {
                    problems = new StringBuilder();
                }
                else
                {
                    problems.Append("\n");
                }
                problems.Append("Rule " + r1 + " masks " + r2);
            }
        }
    }

    if (problems != null)
    {
        throw new ArgumentException(problems.ToString());
    }
}