Beispiel #1
0
        /// <summary>
        /// Appends a rule to this set. Rules are stored in the order in which
        /// they are added, and that order is significant.
        /// </summary>
        /// <param name="rule">The rule to append.</param>
        public virtual void AddRule(TransliterationRule rule)
        {
            ruleVector.Add(rule);

            // Keep maxContextLength equal to the largest ante-context length
            // among the rules added so far.
            int anteLength = rule.AnteContextLength;
            if (maxContextLength < anteLength)
            {
                maxContextLength = anteLength;
            }

            // Clear the rules array reference now that ruleVector has changed.
            rules = null;
        }
Beispiel #2
0
        // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
        // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
        /// <summary>
        /// Asks every rule, in order, to contribute to <paramref name="sourceSet"/>
        /// and <paramref name="targetSet"/>. The caller's filter is copied, and
        /// after each rule the copy is widened by whatever that rule left in the
        /// scratch "revisiting" set, so later rules see the widened filter.
        /// </summary>
        /// <param name="filter">Initial filter; it is copied, not modified here.</param>
        /// <param name="sourceSet">Set handed to each rule as its source set.</param>
        /// <param name="targetSet">Set handed to each rule as its target set.</param>
        internal virtual void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet)
        {
            var workingFilter = new UnicodeSet(filter);
            var revisitScratch = new UnicodeSet();

            // Snapshot the rule count once; the loop walks exactly that many rules.
            int ruleCount = ruleVector.Count;
            for (int position = 0; position < ruleCount; position++)
            {
                TransliterationRule current = ruleVector[position];

                // The scratch set is cleared immediately before each rule uses it.
                current.AddSourceTargetSet(workingFilter, sourceSet, targetSet, revisitScratch.Clear());

                workingFilter.AddAll(revisitScratch);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Builds the rule source text for this rule set: the string form of
        /// every rule, in order, separated by a newline character.
        /// </summary>
        /// <param name="escapeUnprintable">Passed through unchanged to each rule's <c>ToRule</c> call.</param>
        /// <returns>The joined rule strings; empty when the set holds no rules.</returns>
        internal virtual string ToRules(bool escapeUnprintable)
        {
            var text = new StringBuilder();
            int total = ruleVector.Count;

            for (int position = 0; position < total; position++)
            {
                // The separator goes between rules only, never before the first.
                if (position > 0)
                {
                    text.Append('\n');
                }

                text.Append(ruleVector[position].ToRule(escapeUnprintable));
            }

            return text.ToString();
        }
Beispiel #4
0
        /// <summary>
        /// Freezes this rule set: sorts the rules into 256 indexed bins to
        /// speed up matching, then checks every bin for masked rules.
        /// </summary>
        /// <exception cref="ArgumentException">If some rules are masked.</exception>
        public virtual void Freeze()
        {
            /* The rules are regrouped into 256 bins, one per index value.
             * A rule belongs to a bin when a string whose first key
             * character has a low byte equal to that index value can match
             * the rule.
             *
             * Within a bin the rules keep their original relative order.
             * The bins together may hold more entries than the original
             * vector, because a rule whose first key character is a set
             * generally lands in more than one bin.
             */
            int ruleCount = ruleVector.Count;

            // 257 entries: one start offset per bin plus a final end offset.
            index = new int[257];
            var binned = new List<TransliterationRule>(2 * ruleCount); // heuristic; adjust as needed

            /* Query each rule's index value once up front instead of once
             * per bin.  This saves a LOT of time.
             */
            int[] firstKeyValue = new int[ruleCount];
            for (int i = 0; i < ruleCount; ++i)
            {
                firstKeyValue[i] = ruleVector[i].GetIndexValue();
            }

            for (int bin = 0; bin < 256; ++bin)
            {
                index[bin] = binned.Count;

                for (int i = 0; i < ruleCount; ++i)
                {
                    TransliterationRule candidate = ruleVector[i];
                    int key = firstKeyValue[i];

                    // A negative index value means the first key character is
                    // a set, which needs the more time-consuming
                    // MatchesIndexValue check.  In practice this is rare.
                    bool belongsToBin = key >= 0
                        ? key == bin
                        : candidate.MatchesIndexValue(bin);

                    if (belongsToBin)
                    {
                        binned.Add(candidate);
                    }
                }
            }
            index[256] = binned.Count;

            /* Freeze the binned rules into an array.
             */
            rules = binned.ToArray();

            /* Check for masking.  Only rules that share a bin need to be
             * compared, which is much cheaper than comparing each rule
             * against every following rule: 256*O(n2^2) instead of O(n1^2),
             * where n1 is the total rule count and n2 is the per-bin rule
             * count, and n2<<n1.
             */
            StringBuilder maskingReport = null;

            for (int bin = 0; bin < 256; ++bin)
            {
                int binStart = index[bin];
                int binEnd = index[bin + 1];

                for (int first = binStart; first < binEnd - 1; ++first)
                {
                    TransliterationRule earlier = rules[first];

                    for (int second = first + 1; second < binEnd; ++second)
                    {
                        TransliterationRule later = rules[second];
                        if (!earlier.Masks(later))
                        {
                            continue;
                        }

                        // One line per masked pair, newline-separated.
                        if (maskingReport == null)
                        {
                            maskingReport = new StringBuilder();
                        }
                        else
                        {
                            maskingReport.Append("\n");
                        }
                        maskingReport.Append("Rule " + earlier + " masks " + later);
                    }
                }
            }

            if (maskingReport != null)
            {
                throw new ArgumentException(maskingReport.ToString());
            }
        }
Beispiel #5
0
        /// <summary>
        /// Returns true if this rule masks another rule.  If r1 masks r2 then
        /// r1 matches any input string that r2 matches.  If r1 masks r2 and r2
        /// masks r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks
        /// "a[b]>y".  "[c]a>x" masks "[dc]a>y".
        /// </summary>
        /// <param name="r2">The rule that may be masked by this rule (r1).</param>
        /// <returns>True if this rule masks <paramref name="r2"/>.</returns>
        public virtual bool Masks(TransliterationRule r2)
        {
            /* r1 (this rule) masks r2 when the strings formed from the
             * antecontext, key, and postcontext overlap like this:
             *
             * r1:      aakkkpppp
             * r2:     aaakkkkkpppp
             *            ^
             *
             * The two strings are aligned at the first character of the
             * key.  r1 may not extend further than r2 on either side of
             * that alignment point, and each character of r1 must equal
             * (or be a superset of) the corresponding character of r2.
             * The superset operation should be performed to check for
             * UnicodeSet masking.
             *
             * Anchors:  two patterns that differ only in their anchors mask
             * one another only if r2 has every anchor r1 has (optionally
             * plus some).  Y means the row masks the column, N means it
             * does not.
             *
             *         ab   ^ab    ab$  ^ab$
             *   ab    Y     Y     Y     Y
             *  ^ab    N     Y     N     Y
             *   ab$   N     N     Y     Y
             *  ^ab$   N     N     N     Y
             *
             * Post context: {a}b masks ab, but not vice versa, since {a}b
             * matches everything ab matches, and {a}b matches {|a|}b but ab
             * does not.  Pre context is different (a{b} does not align with
             * ab).
             */

            /* LIMITATION of the current mask algorithm: some rule maskings
             * are currently not detected.  For example, "{Lu}]a>x" masks
             * "A]a>y".  This can be added later. TODO
             */

            int patternLength = pattern.Length;
            int anteLength = anteContextLength;
            int otherAnteLength = r2.anteContextLength;
            int tailLength = patternLength - anteLength;
            int otherTailLength = r2.pattern.Length - otherAnteLength;

            // TODO Clean this up -- some logic might be combinable with the
            // general test below.

            // Anchor masking: both patterns have the same extent on each side
            // of the alignment point and r2's pattern region matches ours.
            bool sameExtent = anteLength == otherAnteLength && tailLength == otherTailLength;
            if (sameExtent &&
                keyLength <= r2.keyLength &&
                r2.pattern.RegionMatches(0, pattern, 0, patternLength))
            {
                // The checks below implement the anchor table above.
                if (flags == r2.flags)
                {
                    return true;
                }

                bool thisHasNoAnchors = (flags & ANCHOR_START) == 0 && (flags & ANCHOR_END) == 0;
                if (thisHasNoAnchors)
                {
                    return true;
                }

                return (r2.flags & ANCHOR_START) != 0 && (r2.flags & ANCHOR_END) != 0;
            }

            // General case: we must not reach further left than r2 ...
            if (anteLength > otherAnteLength)
            {
                return false;
            }

            // ... nor further right; on equal right extent our key must not be longer ...
            bool fitsOnRight = tailLength < otherTailLength ||
                               (tailLength == otherTailLength && keyLength <= r2.keyLength);
            if (!fitsOnRight)
            {
                return false;
            }

            // ... and the overlapping region of r2's pattern must match ours.
            return r2.pattern.RegionMatches(otherAnteLength - anteLength, pattern, 0, patternLength);
        }