Ejemplo n.º 1
0
        /// <summary>
        /// Determines whether this rule (r1) masks the given rule (r2). If r1
        /// masks r2 then r1 matches any input string that r2 matches; if each
        /// masks the other, the two rules are equal. Examples: "a>x" masks
        /// "ab>y". "a>x" masks "a[b]>y". "[c]a>x" masks "[dc]a>y".
        /// </summary>
        ///
        /// <param name="r2">the rule that may be masked by this rule</param>
        /// <returns>true if this rule masks <paramref name="r2"/></returns>
        public bool Masks(TransliterationRule r2)
        {
            /*
             * Rule r1 masks rule r2 if the string formed of the antecontext, key,
             * and postcontext overlaps in the following way:
             *
             *   r1:  aakkkpppp
             *   r2: aaakkkkkpppp
             *          ^
             *
             * The strings are aligned at the first character of the key. The
             * length of r1 to the left of the alignment point must be <= the
             * length of r2 to the left; ditto for the right. The characters of r1
             * must equal (or be a superset of) the corresponding characters of r2.
             * The superset operation should be performed to check for UnicodeSet
             * masking.
             *
             * Anchors: two patterns that differ only in anchors mask one another
             * only if they are exactly equal, or r2 has all the anchors r1 has
             * (optionally, plus some). Y means the row masks the column, N means
             * it does not:
             *
             *          ab   ^ab  ab$  ^ab$
             *   ab     Y    Y    Y    Y
             *   ^ab    N    Y    N    Y
             *   ab$    N    N    Y    Y
             *   ^ab$   N    N    N    Y
             *
             * Post context: {a}b masks ab, but not vice versa, since {a}b matches
             * everything ab matches, and {a}b matches {|a|}b but ab does not. Pre
             * context is different (a{b} does not align with ab).
             *
             * LIMITATION of the current mask algorithm: some rule maskings are
             * currently not detected. For example, "{Lu}]a>x" masks "A]a>y". This
             * can be added later. TODO
             */

            int  patternLength = pattern.Length;
            int  before        = anteContextLength;
            int  otherBefore   = r2.anteContextLength;
            int  after         = patternLength - before;
            int  otherAfter    = r2.pattern.Length - otherBefore;
            bool keyFits       = keyLength <= r2.keyLength;

            // TODO Clean this up -- some logic might be combinable with the
            // general test below.

            // Anchor masking: both patterns have the same shape and text, so
            // the outcome is decided purely by the anchor flags.
            if (before == otherBefore && after == otherAfter && keyFits &&
                StringUtil.RegionMatches(r2.pattern, 0, pattern, 0, patternLength))
            {
                // Diagonal of the table above: identical anchors.
                if (flags == r2.flags)
                {
                    return(true);
                }

                // First row of the table: r1 carries no anchors at all.
                bool selfUnanchored = (flags & ANCHOR_START) == 0 &&
                                      (flags & ANCHOR_END) == 0;
                if (selfUnanchored)
                {
                    return(true);
                }

                // Last column of the table: r2 carries both anchors.
                return((r2.flags & ANCHOR_START) != 0 &&
                       (r2.flags & ANCHOR_END) != 0);
            }

            // General case: r1 must not extend further left than r2 ...
            if (before > otherBefore)
            {
                return(false);
            }

            // ... nor further right (ties are broken on key length) ...
            bool rightFits = after < otherAfter || (after == otherAfter && keyFits);
            if (!rightFits)
            {
                return(false);
            }

            // ... and r1's text must match r2's text at the aligned offset.
            return(StringUtil.RegionMatches(r2.pattern, otherBefore - before, pattern, 0, patternLength));
        }
        /// <summary>
        /// Add a rule to this set. Rules are added in order, and order is
        /// significant. The maximum context length is raised if the new rule's
        /// ante context is longer, and the frozen rule array is cleared.
        /// </summary>
        ///
        /// <param name="rule">the rule to add; must not be null</param>
        /// <exception cref="ArgumentNullException">if <paramref name="rule"/> is null</exception>
        public void AddRule(TransliterationRule rule)
        {
            // Validate before mutating: otherwise a null would be stored in
            // ruleVector before the dereference below fails, leaving the set
            // with an unusable element.
            if (rule == null)
            {
                throw new ArgumentNullException("rule");
            }

            ruleVector.Add(rule);

            int anteLength = rule.GetAnteContextLength();
            if (anteLength > maxContextLength)
            {
                maxContextLength = anteLength;
            }

            // Drop the array built by Freeze; it no longer reflects ruleVector.
            rules = null;
        }
        /// <summary>
        /// Build the source text for this rule set: the rule string of every
        /// rule, in the order the rules were added, separated by '\n'.
        /// </summary>
        ///
        /// <param name="escapeUnprintable">passed through unchanged to each
        /// rule's ToRule call</param>
        /// <returns>the concatenated rule strings; empty if there are no rules</returns>
        internal String ToRules(bool escapeUnprintable)
        {
            int           total  = ruleVector.Count;
            StringBuilder buffer = new StringBuilder();

            for (int pos = 0; pos < total; pos++)
            {
                // Separator goes between rules, never before the first one.
                if (pos > 0)
                {
                    buffer.Append('\n');
                }

                TransliterationRule current = (TransliterationRule)ruleVector[pos];
                buffer.Append(current.ToRule(escapeUnprintable));
            }

            return(buffer.ToString());
        }
        /// <summary>
        /// Return the set of all characters that may be modified (getTarget=false)
        /// or emitted (getTarget=true) by this set, accumulated over every rule.
        /// </summary>
        ///
        /// <param name="getTarget">true to collect each rule's target set,
        /// false to collect each rule's source set</param>
        /// <returns>a new set holding the union of the per-rule contributions</returns>
        internal UnicodeSet GetSourceTargetSet(bool getTarget)
        {
            UnicodeSet result = new UnicodeSet();

            for (int pos = 0, total = ruleVector.Count; pos < total; pos++)
            {
                TransliterationRule rule = (TransliterationRule)ruleVector[pos];

                if (!getTarget)
                {
                    rule.AddSourceSetTo(result);
                    continue;
                }

                rule.AddTargetSetTo(result);
            }

            return(result);
        }
        /// <summary>
        /// Close this rule set to further additions, check it for masked rules, and
        /// index it to optimize performance.
        /// </summary>
        ///
        /// <exception cref="ArgumentException">if some rules are masked; the
        /// message lists every masking pair found, one per line</exception>
        public void Freeze()
        {
            /*
             * Build the rule array and its index table. Rules are distributed
             * into 256 bins, one per possible low byte of the first key
             * character. A rule belongs to a bin if a string whose first key
             * character has that low byte can match the rule.
             *
             * Within a bin, rules keep their original relative order. The bins
             * together may hold more entries than the original vector, because a
             * rule whose first key character is a set generally lands in several
             * bins.
             *
             * index[b] is the offset of bin b's first rule and index[b + 1] is
             * one past its last, hence 257 entries for 256 bins.
             */
            int ruleCount = ruleVector.Count;

            index = new int[257];                            // [sic]
            ArrayList binned = new ArrayList(2 * ruleCount); // heuristic; adjust as needed

            /*
             * Precompute each rule's index value once. This saves a LOT of time.
             */
            int[] firstKeyValues = new int[ruleCount];
            for (int ruleIdx = 0; ruleIdx < ruleCount; ruleIdx++)
            {
                firstKeyValues[ruleIdx] = ((TransliterationRule)ruleVector[ruleIdx]).GetIndexValue();
            }

            for (int bin = 0; bin < 256; bin++)
            {
                index[bin] = binned.Count;

                for (int ruleIdx = 0; ruleIdx < ruleCount; ruleIdx++)
                {
                    int keyValue = firstKeyValues[ruleIdx];

                    // A negative value means the first key character is a set,
                    // so the slower MatchesIndexValue check is required. In
                    // practice this happens rarely.
                    bool belongsToBin = keyValue >= 0
                        ? keyValue == bin
                        : ((TransliterationRule)ruleVector[ruleIdx]).MatchesIndexValue(bin);

                    if (belongsToBin)
                    {
                        binned.Add(ruleVector[ruleIdx]);
                    }
                }
            }
            index[256] = binned.Count;

            /*
             * Freeze things into an array.
             */
            rules = new TransliterationRule[binned.Count];
            binned.CopyTo(rules);

            /*
             * Check for masking. Only rules sharing a bin need to be compared,
             * which is MUCH faster than comparing each rule against every
             * following rule: 256*O(n2^2) instead of O(n1^2), where n1 is the
             * total rule count and n2 is the per-bin rule count. n2<<n1, so it's
             * a big win.
             */
            StringBuilder errors = null;

            for (int bin = 0; bin < 256; bin++)
            {
                int binStart = index[bin];
                int binEnd   = index[bin + 1];

                for (int earlier = binStart; earlier < binEnd - 1; earlier++)
                {
                    TransliterationRule masker = rules[earlier];

                    for (int later = earlier + 1; later < binEnd; later++)
                    {
                        TransliterationRule candidate = rules[later];
                        if (!masker.Masks(candidate))
                        {
                            continue;
                        }

                        // One line per masking pair, separated by newlines.
                        if (errors != null)
                        {
                            errors.Append("\n");
                        }
                        else
                        {
                            errors = new StringBuilder();
                        }
                        errors.Append("Rule " + masker + " masks " + candidate);
                    }
                }
            }

            if (errors != null)
            {
                throw new ArgumentException(errors.ToString());
            }
        }