コード例 #1
0
        /// <summary>
        /// Determines the lemma for a tagged token.
        /// </summary>
        /// <remarks>
        /// Resolution order:
        /// 1. If the token's tag equals the id of the "LE" tag, the token's original value is returned.
        /// 2. Otherwise the lexicon is queried with the lower-cased text; the first entry with the
        ///    same tag and a non-null lemma wins.
        /// 3. Otherwise progressively shorter suffixes (at most 16 characters long, never the whole
        ///    word) are looked up; on a hit the unmatched prefix is glued to the lower-cased entry
        ///    lemma and passed through <c>CapitalizeLemma</c>.
        /// 4. Otherwise the lower-cased text itself is passed through <c>CapitalizeLemma</c>.
        /// </remarks>
        /// <param name="token">The tagged token to lemmatize.</param>
        /// <returns>The lemma chosen by the rules above.</returns>
        protected override string GetLemma(TaggedToken token)
        {
            int tagId = token.PosTag;

            string text = token.LowerCaseText;

            try
            {
                int rawValueTagId = TaggedData.PosTagSet.GetTagId("LE");

                if (rawValueTagId == tagId)
                {
                    return token.Token.Value;
                }
            }
            catch (TagNameException)
            {
                // The "LE" tag name could not be resolved: ignore and continue with the lexicon lookup.
            }

            // Exact match on the whole lower-cased word.
            Entry[] wholeWordEntries = PosLexicon.GetEntries(text);

            if (wholeWordEntries != null)
            {
                foreach (Entry candidate in wholeWordEntries)
                {
                    if (candidate.TagId != tagId || candidate.Lemma == null)
                    {
                        continue;
                    }

                    return candidate.Lemma;
                }
            }

            // Suffix match: start with the longest allowed suffix and shrink it one character at a time.
            int textLength = text.Length;

            int cut;

            if (textLength <= 16)
            {
                cut = 1;
            }
            else
            {
                cut = textLength - 16;
            }

            while (cut < textLength)
            {
                Entry[] suffixEntries = PosLexicon.GetEntries(text.Substring(cut));

                if (suffixEntries != null)
                {
                    foreach (Entry candidate in suffixEntries)
                    {
                        if (candidate.TagId != tagId || candidate.Lemma == null)
                        {
                            continue;
                        }

                        string prefix = text.Substring(0, cut);

                        return CapitalizeLemma(prefix + candidate.Lemma.ToLower(), tagId);
                    }
                }

                cut++;
            }

            // Nothing in the lexicon helped: fall back to the word itself.
            return CapitalizeLemma(text, tagId);
        }
コード例 #2
0
ファイル: Tagger.cs プロジェクト: Sojaner/NStagger
        /// <summary>
        /// Fills the POS lexicon from the given sentences and rebuilds <c>TokenTypeTags</c>.
        /// </summary>
        /// <remarks>
        /// Every token with a non-negative tag is added to <c>PosLexicon</c> (value, lemma, tag, count 1)
        /// and its (token type, tag) pair is recorded. Afterwards each token type is assigned the
        /// subset of <c>OpenTags</c> that was recorded for it, in <c>OpenTags</c> order; a token type
        /// with no recorded open tag gets the <c>OpenTags</c> array itself (same reference, not a copy).
        /// </remarks>
        /// <param name="sentences">The tagged sentences to build the lexicons from.</param>
        public virtual void BuildLexicons(TaggedToken[][] sentences)
        {
            const int typeCount = (int)TokenType.Types;

            int tagSetSize = TaggedData.PosTagSet.Size;

            bool[,] observed = new bool[typeCount, tagSetSize];

            foreach (TaggedToken[] sentence in sentences)
            {
                foreach (TaggedToken token in sentence)
                {
                    int tag = token.PosTag;

                    // Tokens without a tag (negative id) contribute nothing.
                    if (tag < 0)
                    {
                        continue;
                    }

                    observed[(int)token.Token.Type, tag] = true;

                    PosLexicon.AddEntry(token.Token.Value, token.Lemma, tag, 1);
                }
            }

            TokenTypeTags = new int[typeCount][];

            for (int type = 0; type < typeCount; type++)
            {
                // Evaluated immediately, so capturing the loop variable is safe here.
                int[] observedOpenTags = OpenTags.Where(tag => observed[type, tag]).ToArray();

                if (observedOpenTags.Length == 0)
                {
                    TokenTypeTags[type] = OpenTags;

                    continue;
                }

                // The filtered tags are expected to be strictly increasing.
                for (int k = 1; k < observedOpenTags.Length; k++)
                {
                    Debug.Assert(observedOpenTags[k - 1] < observedOpenTags[k]);
                }

                TokenTypeTags[type] = observedOpenTags;
            }
        }
コード例 #3
0
        /// <summary>
        /// Runs the base lexicon build, then applies two adjustments: the tag list for
        /// <c>TokenType.Smiley</c> is replaced by a single-element array holding the id of the "LE"
        /// tag, and <c>PosLexicon.Interpolate</c> is called with the ids of the
        /// "NN|NEU|PLU|IND|NOM" and "NN|NEU|SIN|IND|NOM" tags (in that order).
        /// </summary>
        /// <remarks>
        /// A <c>TagNameException</c> raised by any of the tag lookups is caught and reported through
        /// a failing debug assertion; it is not propagated to the caller.
        /// NOTE(review): the <c>true</c> flag passed to <c>GetTagId</c> for "LE" presumably allows
        /// the tag to be created when missing; confirm against the tag set implementation.
        /// </remarks>
        /// <param name="sentences">The tagged sentences to build the lexicons from.</param>
        public override void BuildLexicons(TaggedToken[][] sentences)
        {
            base.BuildLexicons(sentences);

            try
            {
                // Install the new array before resolving the tag id, so a failed lookup
                // leaves the same state behind as it always did.
                int[] smileyTags = new int[1];

                TokenTypeTags[(int)TokenType.Smiley] = smileyTags;

                smileyTags[0] = TaggedData.PosTagSet.GetTagId("LE", true);

                int firstTagId = TaggedData.PosTagSet.GetTagId("NN|NEU|PLU|IND|NOM");

                int secondTagId = TaggedData.PosTagSet.GetTagId("NN|NEU|SIN|IND|NOM");

                PosLexicon.Interpolate(firstTagId, secondTagId);
            }
            catch (TagNameException)
            {
                Debug.Assert(false);
            }
        }
コード例 #4
0
ファイル: Tagger.cs プロジェクト: Sojaner/NStagger
        /// <summary>
        /// Returns the POS tag ids to consider for the token at <paramref name="index"/>.
        /// </summary>
        /// <remarks>
        /// Outside training mode <c>GuessTags</c> is invoked first for the token's value
        /// (NOTE(review): presumably it extends the lexicon; confirm). If the lexicon has no
        /// entries for the lower-cased text, the tag list of the token's type is returned as is.
        /// Otherwise the distinct tag ids of the entries are collected, skipping entries with zero
        /// occurrences when the word has been seen at all and <c>ExtendLexicon</c> is off.
        /// Outside training mode, or when the word was seen at least <c>CountLimit</c> times, that
        /// list is the result. In the remaining case it is merged (as a duplicate-free union) with
        /// the tag list of the token's type. The de-duplication and the merge assume both inputs
        /// are in ascending order; debug assertions check the results.
        /// </remarks>
        /// <param name="sentence">The sentence containing the token.</param>
        /// <param name="index">Position of the token within <paramref name="sentence"/>.</param>
        /// <returns>An array of tag ids sized exactly to its content.</returns>
        protected int[] PossiblePosTags(TaggedToken[] sentence, int index)
        {
            TaggedToken current = sentence[index];

            string loweredText = current.LowerCaseText;

            if (!TrainingMode)
            {
                bool isFirstToken = index == 0;

                GuessTags(current.Token.Value, isFirstToken);
            }

            Entry[] entries = PosLexicon.GetEntries(loweredText);

            if (entries == null)
            {
                return TokenTypeTags[(int)current.Token.Type];
            }

            int totalSeen = entries.Sum(entry => entry.NumberOfOccurence);

            int[] lexiconTags = new int[entries.Length];

            int lexiconCount = 0;

            int previousTag = -1;

            foreach (Entry entry in entries)
            {
                bool dropUnseen = totalSeen > 0 && !ExtendLexicon && entry.NumberOfOccurence == 0;

                // Consecutive entries with the same tag collapse into one.
                if (dropUnseen || entry.TagId == previousTag)
                {
                    continue;
                }

                previousTag = entry.TagId;

                lexiconTags[lexiconCount++] = previousTag;
            }

            for (int t = 1; t < lexiconCount; t++)
            {
                Debug.Assert(lexiconTags[t - 1] < lexiconTags[t]);
            }

            if (!TrainingMode || totalSeen >= CountLimit)
            {
                if (lexiconCount == lexiconTags.Length)
                {
                    return lexiconTags;
                }

                return Arrays.CopyOf(lexiconTags, lexiconCount);
            }

            // Rarely seen word during training: also allow every tag of the token's type.
            int[] typeTags = TokenTypeTags[(int)current.Token.Type];

            int[] merged = new int[lexiconCount + typeTags.Length];

            int mergedCount = 0;

            int typePos = 0;

            int lexiconPos = 0;

            while (typePos < typeTags.Length && lexiconPos < lexiconCount)
            {
                int typeTag = typeTags[typePos];

                int lexiconTag = lexiconTags[lexiconPos];

                if (typeTag < lexiconTag)
                {
                    merged[mergedCount++] = typeTag;

                    typePos++;
                }
                else if (typeTag > lexiconTag)
                {
                    merged[mergedCount++] = lexiconTag;

                    lexiconPos++;
                }
                else
                {
                    // Present on both sides: emit once, advance both.
                    merged[mergedCount++] = typeTag;

                    typePos++;

                    lexiconPos++;
                }
            }

            // At most one of the two inputs still has elements left.
            while (typePos < typeTags.Length)
            {
                merged[mergedCount++] = typeTags[typePos++];
            }

            while (lexiconPos < lexiconCount)
            {
                merged[mergedCount++] = lexiconTags[lexiconPos++];
            }

            for (int t = 1; t < mergedCount; t++)
            {
                Debug.Assert(merged[t - 1] < merged[t]);
            }

            if (mergedCount == merged.Length)
            {
                return merged;
            }

            return Arrays.CopyOf(merged, mergedCount);
        }