Esempio n. 1
0
 /// <summary>
 /// Looks up a morph alias by its one-based position in <c>aliasM</c>.
 /// </summary>
 /// <param name="number">One-based alias number.</param>
 /// <param name="result">
 /// The alias stored at <paramref name="number"/>, or <c>MorphSet.Empty</c> when the number is out of range.
 /// </param>
 /// <returns><c>true</c> when <paramref name="number"/> refers to an existing alias; otherwise <c>false</c>.</returns>
 public bool TryGetAliasM(int number, out MorphSet result)
 {
     // Alias numbers are one-based, so anything below 1 or past the end is a miss.
     if (number < 1 || number > aliasM.Count)
     {
         result = MorphSet.Empty;
         return false;
     }

     result = aliasM[number - 1];
     return true;
 }
Esempio n. 2
0
        /// <summary>
        /// Splits an alias line into its morph values and appends the resulting set to <paramref name="entries"/>.
        /// </summary>
        /// <param name="parameterText">The text to split; it is reversed first when complex prefixes are enabled.</param>
        /// <param name="entries">The list that receives the parsed morph set.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool TryParseAliasM(string parameterText, List<MorphSet> entries)
        {
            var text = parameterText;
            if (EnumEx.HasFlag(Builder.Options, AffixConfigOptions.ComplexPrefixes))
            {
                text = text.Reverse();
            }

            var morphValues = text.SplitOnTabOrSpace();
            Builder.DedupInPlace(morphValues);

            var morphSet = MorphSet.TakeArray(morphValues);
            entries.Add(Builder.Dedup(morphSet));

            return true;
        }
Esempio n. 3
0
 /// <summary>
 /// Builds a new affix entry of type <typeparamref name="TEntry"/> from its parsed parts.
 /// </summary>
 /// <param name="strip">Value assigned to <c>Strip</c>.</param>
 /// <param name="affixText">Value assigned to <c>Append</c>.</param>
 /// <param name="conditions">Value assigned to <c>Conditions</c>.</param>
 /// <param name="morph">Value assigned to <c>MorphCode</c>; <c>null</c> is replaced by <c>MorphSet.Empty</c>.</param>
 /// <param name="contClass">Value assigned to <c>ContClass</c>; <c>null</c> is replaced by <c>FlagSet.Empty</c>.</param>
 /// <returns>The populated entry.</returns>
 public static TEntry Create<TEntry>(
     string strip,
     string affixText,
     CharacterConditionGroup conditions,
     MorphSet morph,
     FlagSet contClass)
     where TEntry : AffixEntry, new()
 {
     var entry = new TEntry();

     entry.Strip = strip;
     entry.Append = affixText;
     entry.Conditions = conditions;
     entry.MorphCode = morph ?? MorphSet.Empty;
     entry.ContClass = contClass ?? FlagSet.Empty;

     return entry;
 }
Esempio n. 4
0
 /// <summary>
 /// Returns the shared instance equal to <paramref name="value"/> from <c>MorphSetDeduper</c>, adding it when absent.
 /// </summary>
 /// <param name="value">The set to deduplicate; may be <c>null</c>.</param>
 /// <returns><c>null</c> when <paramref name="value"/> is <c>null</c>; otherwise the result of <c>MorphSetDeduper.GetEqualOrAdd</c>.</returns>
 public MorphSet Dedup(MorphSet value)
 {
     if (value == null)
     {
         return null;
     }

     return MorphSetDeduper.GetEqualOrAdd(value);
 }
Esempio n. 5
0
        /// <summary>
        /// Parses a single affix line and records it in <paramref name="groups"/>.
        /// A line is either a group header (regex groups 2 and 3 matched), which creates a new group,
        /// or a rule line (regex groups 4, 5 and 6 matched), which appends an entry to the group
        /// that has the same flag.
        /// </summary>
        /// <typeparam name="TEntry">The entry type; only <c>PrefixEntry</c> and <c>SuffixEntry</c> are supported for the redundancy check.</typeparam>
        /// <param name="parameterText">The text of the affix line to parse.</param>
        /// <param name="groups">The list of groups built so far; it is created when <c>null</c>.</param>
        /// <returns><c>true</c> when the line was parsed and recorded; <c>false</c> after logging a warning otherwise.</returns>
        /// <exception cref="NotSupportedException">
        /// Thrown when a rule with a non-empty strip needs a redundancy check and <typeparamref name="TEntry"/>
        /// is neither <c>PrefixEntry</c> nor <c>SuffixEntry</c>.
        /// </exception>
        private bool TryParseAffixIntoList <TEntry>(string parameterText, ref List <AffixEntryGroup.Builder <TEntry> > groups)
            where TEntry : AffixEntry, new()
        {
            if (groups == null)
            {
                groups = new List <AffixEntryGroup.Builder <TEntry> >();
            }

            var lineMatch = AffixLineRegex.Match(parameterText);

            if (!lineMatch.Success)
            {
                Builder.LogWarning("Failed to parse affix line: " + parameterText);
                return(false);
            }

            var lineMatchGroups = lineMatch.Groups;

            FlagValue characterFlag;

            if (!TryParseFlag(lineMatchGroups[1].Value, out characterFlag))
            {
                Builder.LogWarning($"Failed to parse affix flag for {lineMatchGroups[1].Value} from: {parameterText}");
                return(false);
            }

            var affixGroup = groups.FindLast(g => g.AFlag == characterFlag);
            var contClass  = FlagSet.Empty;

            if (lineMatchGroups[2].Success && lineMatchGroups[3].Success)
            {
                // Header line: declares a new group for this flag.
                if (affixGroup != null)
                {
                    Builder.LogWarning($"Duplicate affix group definition for {affixGroup.AFlag} from: {parameterText}");
                    return(false);
                }

                var options = AffixEntryOptions.None;
                if (lineMatchGroups[2].Value.StartsWith('Y'))
                {
                    options |= AffixEntryOptions.CrossProduct;
                }
                if (Builder.IsAliasM)
                {
                    options |= AffixEntryOptions.AliasM;
                }
                if (Builder.IsAliasF)
                {
                    options |= AffixEntryOptions.AliasF;
                }

                // The count is only a capacity hint. An unparsable or negative value must not
                // reach the List constructor, which rejects negative capacities.
                int expectedEntryCount;
                if (!IntEx.TryParseInvariant(lineMatchGroups[3].Value, out expectedEntryCount) || expectedEntryCount < 0)
                {
                    expectedEntryCount = 0;
                }

                affixGroup = new AffixEntryGroup.Builder <TEntry>
                {
                    AFlag   = characterFlag,
                    Options = options,
                    Entries = new List <TEntry>(expectedEntryCount)
                };

                groups.Add(affixGroup);

                return(true);
            }
            else if (lineMatchGroups[4].Success && lineMatchGroups[5].Success && lineMatchGroups[6].Success)
            {
                var complexPrefixes = EnumEx.HasFlag(Builder.Options, AffixConfigOptions.ComplexPrefixes);

                // piece 3 - is string to strip or 0 for null
                var strip = lineMatchGroups[4].Value;
                if (strip == "0")
                {
                    strip = string.Empty;
                }
                else if (complexPrefixes)
                {
                    strip = strip.Reverse();
                }

                // piece 4 - is affix string or 0 for null
                var           affixInput      = lineMatchGroups[5].Value;
                var           affixSlashIndex = affixInput.IndexOf('/');
                StringBuilder affixTextBuilder;
                if (affixSlashIndex >= 0)
                {
                    var slashPartOffset = affixSlashIndex + 1;
                    var slashPartLength = affixInput.Length - slashPartOffset;

                    // Resolve the continuation classes before renting a pooled builder, so the
                    // failure exit below does not leave a rented builder unreturned.
                    if (Builder.IsAliasF)
                    {
                        int aliasNumber;
                        if (IntEx.TryParseInvariant(affixInput.Subslice(slashPartOffset, slashPartLength), out aliasNumber) && aliasNumber > 0 && aliasNumber <= Builder.AliasF.Count)
                        {
                            contClass = Builder.AliasF[aliasNumber - 1];
                        }
                        else
                        {
                            Builder.LogWarning($"Failed to parse contclasses from : {parameterText}");
                            return(false);
                        }
                    }
                    else
                    {
                        contClass = ParseFlags(affixInput.Subslice(slashPartOffset, slashPartLength));
                    }

                    affixTextBuilder = StringBuilderPool.Get(affixInput, 0, affixSlashIndex);
                }
                else
                {
                    affixTextBuilder = StringBuilderPool.Get(affixInput);
                }

                if (Builder.IgnoredChars != null && Builder.IgnoredChars.HasItems)
                {
                    affixTextBuilder.RemoveChars(Builder.IgnoredChars);
                }

                if (complexPrefixes)
                {
                    affixTextBuilder.Reverse();
                }

                if (affixTextBuilder.Length == 1 && affixTextBuilder[0] == '0')
                {
                    affixTextBuilder.Clear();
                }

                // Materialize the text and hand the builder back to the pool right away, so the
                // later failure exits (morph parsing, unsupported entry type) cannot skip the return.
                var affixText = StringBuilderPool.GetStringAndReturn(affixTextBuilder);

                // piece 5 - is the conditions descriptions
                var conditionText = lineMatchGroups[6].Value;
                if (complexPrefixes)
                {
                    conditionText = ReverseCondition(conditionText);
                }

                var conditions = CharacterCondition.Parse(conditionText);

                if (!string.IsNullOrEmpty(strip) && !conditions.AllowsAnySingleCharacter)
                {
                    bool isRedundant;
                    if (typeof(TEntry) == typeof(PrefixEntry))
                    {
                        isRedundant = RedundantConditionPrefix(strip, conditions);
                    }
                    else if (typeof(TEntry) == typeof(SuffixEntry))
                    {
                        isRedundant = RedundantConditionSuffix(strip, conditions);
                    }
                    else
                    {
                        throw new NotSupportedException();
                    }

                    if (isRedundant)
                    {
                        conditions = CharacterConditionGroup.AllowAnySingleCharacter;
                    }
                }

                // piece 6 - optional morph data, either an alias number or literal values
                MorphSet morph;
                if (lineMatchGroups[7].Success)
                {
                    var morphAffixText = lineMatchGroups[7].Value;
                    if (Builder.IsAliasM)
                    {
                        int morphNumber;
                        if (IntEx.TryParseInvariant(morphAffixText, out morphNumber) && morphNumber > 0 && morphNumber <= Builder.AliasM.Count)
                        {
                            morph = Builder.AliasM[morphNumber - 1];
                        }
                        else
                        {
                            Builder.LogWarning($"Failed to parse morph {morphAffixText} from: {parameterText}");
                            return(false);
                        }
                    }
                    else
                    {
                        if (complexPrefixes)
                        {
                            morphAffixText = morphAffixText.Reverse();
                        }

                        morph = Builder.Dedup(MorphSet.TakeArray(Builder.DedupInPlace(morphAffixText.SplitOnTabOrSpace())));
                    }
                }
                else
                {
                    morph = MorphSet.Empty;
                }

                if (affixGroup == null)
                {
                    // Rule line without a preceding header: create the group AND register it.
                    // Without the registration the entry added below would be dropped and every
                    // later rule for this flag would create yet another orphaned group.
                    affixGroup = new AffixEntryGroup.Builder <TEntry>
                    {
                        AFlag   = characterFlag,
                        Options = AffixEntryOptions.None,
                        Entries = new List <TEntry>()
                    };

                    groups.Add(affixGroup);
                }

                if (!Builder.HasContClass && contClass.HasItems)
                {
                    Builder.HasContClass = true;
                }

                affixGroup.Entries.Add(AffixEntry.Create <TEntry>(
                                           Builder.Dedup(strip),
                                           Builder.Dedup(affixText),
                                           Builder.Dedup(conditions),
                                           morph,
                                           contClass));

                return(true);
            }
            else
            {
                Builder.LogWarning("Affix line not fully parsed: " + parameterText);
                return(false);
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Normalizes a word and its morph data and stores the resulting entry in <c>Builder.EntriesByRoot</c>.
        /// </summary>
        /// <param name="word">The word to add; ignored characters are removed and it is reversed when complex prefixes are enabled.</param>
        /// <param name="flags">The flags for the new entry.</param>
        /// <param name="morphs">The morph data; alias numbers are expanded when morph aliases are enabled.</param>
        /// <param name="onlyUpperCase">
        /// When <c>true</c>, the entry is only added if no entry already exists for the word.
        /// When <c>false</c>, the first existing entry carrying <c>SpecialFlags.OnlyUpcaseFlag</c> is replaced in place instead of adding a new one.
        /// </param>
        /// <returns>Always <c>false</c>.</returns>
        private bool AddWord(string word, FlagSet flags, MorphSet morphs, bool onlyUpperCase)
        {
            if (Affix.IgnoredChars.HasItems)
            {
                word = word.RemoveChars(Affix.IgnoredChars);
            }

            if (Affix.ComplexPrefixes)
            {
                word = word.Reverse();

                if (morphs.HasItems && !Affix.IsAliasM)
                {
                    // Reverse both the order of the morph values and the text of each value.
                    var morphCount     = morphs.Count;
                    var reversedMorphs = new string[morphCount];
                    for (var target = 0; target < morphCount; target++)
                    {
                        reversedMorphs[target] = morphs[morphCount - target - 1].Reverse();
                    }

                    morphs = MorphSet.TakeArray(reversedMorphs);
                }
            }

            var options = WordEntryOptions.None;

            if (morphs.HasItems)
            {
                if (Affix.IsAliasM)
                {
                    options = WordEntryOptions.AliasM;

                    // Replace each alias number by the values it stands for; anything that is
                    // not a known alias number is kept verbatim.
                    var expandedMorphs = new List<string>();
                    foreach (var morphValue in morphs)
                    {
                        int      aliasNumber;
                        MorphSet aliasValues;
                        if (IntEx.TryParseInvariant(morphValue, out aliasNumber) && Affix.TryGetAliasM(aliasNumber, out aliasValues))
                        {
                            expandedMorphs.AddRange(aliasValues);
                        }
                        else
                        {
                            expandedMorphs.Add(morphValue);
                        }
                    }

                    morphs = MorphSet.Create(expandedMorphs);
                }

                if (morphs.AnyStartsWith(MorphologicalTags.Phon))
                {
                    options |= WordEntryOptions.Phon;
                }
            }

            word = Builder.Dedup(word);

            WordEntrySet existingEntries;
            var mustStoreEntries = !Builder.EntriesByRoot.TryGetValue(word, out existingEntries);
            if (mustStoreEntries)
            {
                existingEntries = WordEntrySet.Empty;
            }

            var hasUpperCaseHomonym = false;

            for (var index = 0; index < existingEntries.Count; index++)
            {
                var current = existingEntries[index];

                if (onlyUpperCase)
                {
                    hasUpperCaseHomonym = true;
                    continue;
                }

                if (current.ContainsFlag(SpecialFlags.OnlyUpcaseFlag))
                {
                    // Overwrite the placeholder entry with the real flags and stop.
                    var replacement = new WordEntry(
                        current.Word,
                        flags,
                        current.Morphs,
                        current.Options);
                    existingEntries.DestructiveReplace(index, replacement);
                    return false;
                }
            }

            if (!hasUpperCaseHomonym)
            {
                var addedEntry = new WordEntry(word, flags, Builder.Dedup(morphs), options);
                existingEntries  = WordEntrySet.CopyWithItemAdded(existingEntries, addedEntry);
                mustStoreEntries = true;
            }

            if (mustStoreEntries)
            {
                Builder.EntriesByRoot[word] = existingEntries;
            }

            return false;
        }
Esempio n. 7
0
 /// <summary>
 /// Adds a word via the four-parameter overload and, when that returns <c>false</c>,
 /// continues with <c>AddWordCapitalized</c> using the word's capitalization type.
 /// </summary>
 /// <param name="word">The word to add.</param>
 /// <param name="flags">The flags for the word.</param>
 /// <param name="morphs">The morph data for the word.</param>
 /// <returns><c>true</c> when either call returns <c>true</c>; otherwise <c>false</c>.</returns>
 private bool AddWord(string word, FlagSet flags, MorphSet morphs)
 {
     if (AddWord(word, flags, morphs, false))
     {
         return true;
     }

     // Only computed when the first call returned false, matching short-circuit evaluation.
     var capitalization = CapitalizationTypeEx.GetCapitalizationType(word, Affix);
     return AddWordCapitalized(word, flags, morphs, capitalization);
 }
Esempio n. 8
0
        /// <summary>
        /// Parses one line of the word list and adds the word it describes.
        /// </summary>
        /// <param name="line">The line to parse.</param>
        /// <returns>
        /// <c>true</c> for an empty line or a line consumed as the initialization line;
        /// <c>false</c> when the line has no word or its flag alias cannot be resolved;
        /// otherwise the result of <c>AddWord</c>.
        /// </returns>
        private bool ParseLine(string line)
        {
            if (string.IsNullOrEmpty(line) || (!hasInitialized && AttemptToProcessInitializationLine(line)))
            {
                return true;
            }

            if (Builder.EntriesByRoot == null)
            {
                Builder.InitializeEntriesByRoot(-1);
            }

            var parsed = ParsedWordLine.Parse(line);

            if (string.IsNullOrEmpty(parsed.Word))
            {
                return false;
            }

            FlagSet flags;

            if (string.IsNullOrEmpty(parsed.Flags))
            {
                flags = FlagSet.Empty;
            }
            else if (Affix.IsAliasF)
            {
                // With flag aliases enabled the flag text must be a valid alias number.
                int     aliasNumber;
                FlagSet aliasFlags;
                if (!IntEx.TryParseInvariant(parsed.Flags, out aliasNumber) || !Affix.TryGetAliasF(aliasNumber, out aliasFlags))
                {
                    // TODO: warn
                    return false;
                }

                flags = aliasFlags;
            }
            else if (Affix.FlagMode == FlagMode.Uni)
            {
                // Re-read the flag text as UTF-8 by round-tripping it through the affix encoding.
                var rawBytes    = Affix.Encoding.GetBytes(parsed.Flags);
                var decodedText = Encoding.UTF8.GetString(rawBytes, 0, rawBytes.Length);
                flags = Builder.Dedup(FlagValue.ParseFlags(decodedText, FlagMode.Char));
            }
            else
            {
                flags = Builder.Dedup(FlagValue.ParseFlags(parsed.Flags, Affix.FlagMode));
            }

            MorphSet morphs;

            if (parsed.Morphs == null || parsed.Morphs.Length == 0)
            {
                morphs = MorphSet.Empty;
            }
            else
            {
                // Copy the values so the set owns its own array.
                var copiedMorphs = new string[parsed.Morphs.Length];
                for (var position = 0; position < copiedMorphs.Length; position++)
                {
                    copiedMorphs[position] = parsed.Morphs[position];
                }

                morphs = Builder.Dedup(MorphSet.TakeArray(copiedMorphs));
            }

            return AddWord(parsed.Word, flags, morphs);
        }