/// <summary>
        /// Ordering function for stemming rules: rules with a higher priority come
        /// first, and among rules of equal priority the one with the greater length
        /// comes first.
        /// </summary>
        /// <param name="a">The first rule to compare.</param>
        /// <param name="b">The second rule to compare.</param>
        /// <returns>
        /// A negative value if <paramref name="a"/> sorts before <paramref name="b"/>,
        /// a positive value if it sorts after, and zero if both rank equally.
        /// </returns>
        public static int Compare(StemmingRule a, StemmingRule b)
        {
            // primary: decreasing by priority.
            // CompareTo is used instead of "b - a" because a subtraction can overflow
            // for operands of opposite sign and large magnitude, which would flip the
            // sign of the result and silently corrupt the sort order.
            int result = b._Priority.CompareTo(a._Priority);

            if (result == 0)
            {
                // secondary: decreasing by length
                result = b._Length.CompareTo(a._Length);
            }

            return result;
        }
Beispiel #2
0
 /// <summary>
 /// Appends a rule to the rule list and re-sorts the list.
 /// </summary>
 /// <param name="r">The rule to add; must not be null.</param>
 /// <exception cref="System.ArgumentNullException">
 /// Thrown when <paramref name="r"/> is null.
 /// </exception>
 public void Add(StemmingRule r)
 {
     // Reject null before touching the list: otherwise the null entry would
     // already be stored by the time the subsequent sort runs.
     if (r == null)
     {
         throw new System.ArgumentNullException("r");
     }

     _Rules.Add(r);
     Sort();
 }
        /// <summary>
        /// Parses a single textual stemming rule (old TRADOS format) and applies it
        /// to the current rule set.
        /// </summary>
        /// <remarks>
        /// Two statement forms are recognised, both terminated by a semicolon:
        /// <para>
        /// <c>&lt;action&gt; priority &lt;n&gt; and (continue|restart) [at &lt;n&gt;];</c> or
        /// <c>&lt;action&gt; priority &lt;n&gt; and stop;</c> where the action is one of
        /// <c>stripdiacritics</c>, <c>tolower</c>, <c>deletelastdoublevowels</c>,
        /// <c>deletelastdoubleconsonants</c>, <c>testonbaseword</c>, or
        /// <c>replace &lt;type&gt; &lt;affix&gt; with &lt;replacement&gt;</c>. Such a rule is
        /// added to the rule set.
        /// </para>
        /// <para>
        /// <c>set &lt;variable&gt; &lt;n&gt;;</c> which assigns one of the rule set's numeric
        /// settings and does not add a rule.
        /// </para>
        /// Keywords are matched case-insensitively. The method overwrites the
        /// parser's current rule text and read position.
        /// </remarks>
        /// <param name="rule">The textual rule; must not be null.</param>
        /// <exception cref="System.ArgumentNullException">
        /// Thrown when <paramref name="rule"/> is null.
        /// </exception>
        public void Add(string rule)
        {
            // Fail before the parser state is overwritten.
            if (rule == null)
            {
                throw new System.ArgumentNullException("rule");
            }

            _Rule  = rule;
            _RuleP = 0;

            StemmingRule result = new StemmingRule();

            // Small state machine; state 99 is the accepting state.
            int state = 0;
            int len   = _Rule.Length;

            while (state != 99)
            {
                // skip whitespace in front of the next token
                while (_RuleP < len && System.Char.IsWhiteSpace(_Rule, _RuleP))
                {
                    ++_RuleP;
                }

                switch (state)
                {
                case 0:
                    // waiting for initial keyword
                    switch (GetIdentifier().ToLowerInvariant())
                    {
                    case "replace":
                        state = 6;
                        break;

                    case "stripdiacritics":
                        state         = 1;
                        result.Action = StemmingRule.StemAction.StripDiacritics;
                        break;

                    case "tolower":
                        result.Action = StemmingRule.StemAction.MapToLower;
                        state         = 1;
                        break;

                    case "deletelastdoublevowels":
                        result.Action = StemmingRule.StemAction.DeleteLastDoubleVowels;
                        state         = 1;
                        break;

                    case "deletelastdoubleconsonants":
                        result.Action = StemmingRule.StemAction.DeleteLastDoubleConsonants;
                        state         = 1;
                        break;

                    case "testonbaseword":
                        result.Action = StemmingRule.StemAction.TestOnBaseWord;
                        state         = 1;
                        break;

                    case "set":
                        // Variable Setting
                        state = 12;
                        break;

                    default:
                        throw new Core.LanguagePlatformException(Core.ErrorCode.SegmentationIllegalKeywordInRule, _Rule);
                    }
                    break;

                case 1:
                    // priority...
                    Expect("priority");
                    state = 2;
                    break;

                case 2:
                    // priority number
                    result.Priority = GetNumber();
                    state           = 3;
                    break;

                case 3:
                    // "and"
                    Expect("and");
                    state = 4;
                    break;

                case 4:
                    // continuation keyword; "continue" and "restart" may be followed
                    // by an optional "at <priority>" (state 10)
                    state = 10;
                    switch (GetIdentifier().ToLowerInvariant())
                    {
                    case "continue":
                        result.ContinuationOnSuccess = StemmingRule.StemContinuation.Continue;
                        break;

                    case "restart":
                        result.ContinuationOnSuccess = StemmingRule.StemContinuation.Restart;
                        break;

                    case "stop":
                        result.ContinuationOnSuccess = StemmingRule.StemContinuation.Stop;
                        state = 5;         // no continuation priority for stop
                        break;

                    default:
                        throw new Core.LanguagePlatformException(Core.ErrorCode.SegmentationIllegalContinuation, _Rule);
                    }

                    result.ContinuationOnFail   = StemmingRule.StemContinuation.Continue;
                    result.ContinuationPriority = 0;
                    break;

                case 5:
                    // final semicolon; the bounds check turns a rule that ends
                    // without ';' into a parse error instead of an index fault
                    if (_RuleP < len && _Rule[_RuleP] == ';')
                    {
                        state = 99;     // success
                    }
                    else
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.SegmentationTrailingJunk, _Rule);
                    }
                    break;

                case 6:
                    // rule/affix type
                    switch (GetIdentifier().ToLowerInvariant())
                    {
                    case "prefix":
                        result.Action = StemmingRule.StemAction.Prefix;
                        break;

                    case "suffix":
                        result.Action = StemmingRule.StemAction.Suffix;
                        break;

                    case "infix":
                        result.Action = StemmingRule.StemAction.Infix;
                        break;

                    case "properinfix":
                        result.Action = StemmingRule.StemAction.ProperInfix;
                        break;

                    case "circumfix":
                        result.Action = StemmingRule.StemAction.Circumfix;
                        break;

                    case "form":
                        result.Action = StemmingRule.StemAction.Form;
                        break;

                    case "prefixedinfix":
                        result.Action = StemmingRule.StemAction.PrefixedInfix;
                        break;

                    default:
                        throw new Core.LanguagePlatformException(Core.ErrorCode.SegmentationUnknownRuleType, _Rule);
                    }
                    state = 7;
                    break;

                case 7:
                    // affix pattern
                    result.Affix = GetQuotedString();
                    state        = 8;
                    break;

                case 8:
                    // "with"
                    Expect("with");
                    state = 9;
                    break;

                case 9:
                    // replacement pattern
                    result.Replacement = GetQuotedString();
                    state = 1;
                    break;

                case 10:
                    // optional "at" <cprio>; at end of input fall through to
                    // Expect so the missing token is handled there rather than
                    // by an out-of-range index
                    if (_RuleP < len && _Rule[_RuleP] == ';')
                    {
                        state = 5;
                    }
                    else
                    {
                        Expect("at");
                        state = 11;
                    }
                    break;

                case 11:
                    // continuation priority number
                    result.ContinuationPriority = GetNumber();
                    state = 5;
                    break;

                case 12:     // Variable Setting - got "set", expect variable name
                {
                    // NOTE(review): whitespace between the variable name and its
                    // value is not skipped here; presumably GetNumber handles it - confirm.
                    string id = GetIdentifier().ToLowerInvariant();
                    switch (id)
                    {
                    case "minwordlength":
                        _RuleSet.MinimumWordLength = GetNumber();
                        break;

                    case "minstemlength":
                        _RuleSet.MinimumStemLength = GetNumber();
                        break;

                    case "minstempercentage":
                        _RuleSet.MinimumStemPercentage = GetNumber();
                        break;

                    case "maxruleapplications":
                        _RuleSet.MaximumRuleApplications = GetNumber();
                        break;

                    default:
                        throw new Core.LanguagePlatformException(Core.ErrorCode.SegmentationInvalidVariableName, id);
                    }
                    state = 5;
                }
                break;
                }
            }

            // "set" statements never assign an action, so they are not added as rules.
            if (result.Action != StemmingRule.StemAction.None)
            {
                _RuleSet.Add(result);
            }
        }
Beispiel #4
0
        /// <summary>
        /// Applies one stemming rule to a word form, updating the form in place.
        /// </summary>
        /// <param name="form">The word form; replaced with the transformed form where the rule changes it.</param>
        /// <param name="shortestStemLength">
        /// Affix replacements whose resulting length would fall below this value are rejected.
        /// </param>
        /// <param name="specialRulesOnly">
        /// When true, only the lower-casing and diacritic-stripping actions are executed;
        /// every other action reports failure.
        /// </param>
        /// <param name="rule">The rule to apply.</param>
        /// <returns>True if the rule is considered to have been applied, false otherwise.</returns>
        private bool ApplyRule(ref string form, int shortestStemLength, bool specialRulesOnly,
                               StemmingRule rule)
        {
            int affixLength      = String.IsNullOrEmpty(rule.Affix) ? 0 : rule.Affix.Length;
            int originalLength   = form.Length;
            int substituteLength = String.IsNullOrEmpty(rule.Replacement) ? 0 : rule.Replacement.Length;

            // The two "special" actions always run, even when specialRulesOnly is set.
            if (rule.Action == StemmingRule.StemAction.MapToLower)
            {
                form = form.ToLowerInvariant();
                return true;
            }

            if (rule.Action == StemmingRule.StemAction.StripDiacritics)
            {
                string baseForm = Core.CharacterProperties.ToBase(form);
                form = StripPeripheralPunctuation(baseForm);
                return true;
            }

            if (specialRulesOnly)
            {
                return false;
            }

            if (rule.Action == StemmingRule.StemAction.TestOnBaseWord)
            {
                return _Resources.IsStopword(form);
            }

            bool dropConsonants = rule.Action == StemmingRule.StemAction.DeleteLastDoubleConsonants;
            if (dropConsonants || rule.Action == StemmingRule.StemAction.DeleteLastDoubleVowels)
            {
                // Words of one or two characters are left alone, but the rule
                // still counts as applied.
                if (originalLength > 2)
                {
                    form = TruncateAtLastDoubledLetter(form, !dropConsonants);
                }
                return true;
            }

            // Remaining actions are affix replacements. A pattern as long as the whole
            // word is allowed, to support substitutions such as "went"/Prefix->"go".
            bool patternLongerThanWord = originalLength < affixLength;
            bool stemWouldBeTooShort   = originalLength - affixLength + substituteLength < shortestStemLength;
            if (patternLongerThanWord || stemWouldBeTooShort)
            {
                return false;
            }

            return ReplaceAffix(ref form, rule.Action, rule.Affix, rule.Replacement);
        }

        /// <summary>
        /// Scans from the end of the text for the last pair of identical adjacent
        /// characters of the requested kind and cuts the text at the second character
        /// of that pair. Returns the text unchanged if there is no such pair.
        /// </summary>
        private static string TruncateAtLastDoubledLetter(string text, bool wantVowel)
        {
            for (int i = text.Length - 1; i > 0; --i)
            {
                char current = text[i];
                if (current == text[i - 1] && Core.CharacterProperties.IsVowel(current) == wantVowel)
                {
                    // string.Remove(int) drops everything from index i to the end, so
                    // any characters following the pair are removed as well.
                    // NOTE(review): confirm this is intended rather than Remove(i, 1).
                    return text.Remove(i);
                }
            }

            return text;
        }