/// <summary>
/// Finds the lemma for a tagged token: smileys lemmatize to their literal
/// text, known word/tag pairs come from the lexicon, and unknown words fall
/// back to a suffix-based lexicon lookup before using the text itself.
/// </summary>
/// <param name="token">Token whose POS tag has already been assigned.</param>
/// <returns>The lemma string, capitalized via <c>CapitalizeLemma</c> when derived.</returns>
protected override string GetLemma(TaggedToken token)
{
    int posTag = token.PosTag;
    string lowerCaseText = token.LowerCaseText;

    // Smileys (tag "LE") keep their surface form as the lemma.
    try
    {
        if (posTag == TaggedData.PosTagSet.GetTagId("LE"))
        {
            return token.Token.Value;
        }
    }
    catch (TagNameException)
    {
        // Tag set has no "LE" tag; fall through to the lexicon lookup.
    }

    // Exact lexicon match on the full lowercased word for this tag.
    Entry[] entries = PosLexicon.GetEntries(lowerCaseText);
    if (entries != null)
    {
        foreach (Entry entry in entries)
        {
            if (entry.TagId == posTag && entry.Lemma != null)
            {
                return entry.Lemma;
            }
        }
    }

    // Back off to progressively shorter suffixes, considering at most the
    // last 16 characters, and graft the matched lemma onto the prefix.
    int length = lowerCaseText.Length;
    int start = length <= 16 ? 1 : length - 16;
    for (int i = start; i < length; i++)
    {
        entries = PosLexicon.GetEntries(lowerCaseText.Substring(i));
        if (entries == null)
        {
            continue;
        }
        foreach (Entry entry in entries)
        {
            if (entry.TagId == posTag && entry.Lemma != null)
            {
                string prefix = lowerCaseText.Substring(0, i);
                return CapitalizeLemma(prefix + entry.Lemma.ToLower(), posTag);
            }
        }
    }

    // No lexicon evidence at all: treat the lowercased text as its own lemma.
    return CapitalizeLemma(lowerCaseText, posTag);
}
/// <summary>
/// Populates the POS lexicon from gold-standard sentences and computes, for
/// each token type, the subset of open tags actually observed with that type
/// (falling back to the full open-tag set when none were seen).
/// </summary>
/// <param name="sentences">Tagged training sentences; tokens with a negative
/// POS tag are skipped.</param>
public virtual void BuildLexicons(TaggedToken[][] sentences)
{
    const int types = (int)TokenType.Types;
    bool[,] hasTag = new bool[types, TaggedData.PosTagSet.Size];

    // Record every observed (token type, tag) pair and feed each token
    // occurrence into the lexicon with count 1.
    foreach (TaggedToken[] sentence in sentences)
    {
        foreach (TaggedToken token in sentence)
        {
            if (token.PosTag < 0)
            {
                continue;
            }
            hasTag[(int)token.Token.Type, token.PosTag] = true;
            PosLexicon.AddEntry(token.Token.Value, token.Lemma, token.PosTag, 1);
        }
    }

    TokenTypeTags = new int[types][];
    for (int tokenType = 0; tokenType < types; tokenType++)
    {
        int tagsCount = OpenTags.Count(openTag => hasTag[tokenType, openTag]);
        if (tagsCount == 0)
        {
            // Nothing observed for this type: share the full open-tag array.
            TokenTypeTags[tokenType] = OpenTags;
            continue;
        }

        // Copy only the observed open tags, preserving OpenTags' order.
        TokenTypeTags[tokenType] = new int[tagsCount];
        int j = 0;
        foreach (int openTag in OpenTags)
        {
            if (hasTag[tokenType, openTag])
            {
                TokenTypeTags[tokenType][j++] = openTag;
            }
        }
        Debug.Assert(j == tagsCount);

        // Sanity check: OpenTags is sorted, so the filtered copy must be
        // strictly increasing as well.
        for (int k = 0; k < j - 1; k++)
        {
            Debug.Assert(TokenTypeTags[tokenType][k] < TokenTypeTags[tokenType][k + 1]);
        }
    }
}
/// <summary>
/// Builds the base lexicons, then restricts the smiley token type to the
/// "LE" tag (created on demand) and interpolates lexicon statistics from the
/// neuter singular noun tag into the neuter plural one.
/// </summary>
/// <param name="sentences">Tagged training sentences passed to the base implementation.</param>
public override void BuildLexicons(TaggedToken[][] sentences)
{
    base.BuildLexicons(sentences);
    try
    {
        // Smileys may only ever carry the "LE" tag. The array is installed
        // before GetTagId so state on a TagNameException matches expectations.
        int[] smileyTags = new int[1];
        TokenTypeTags[(int)TokenType.Smiley] = smileyTags;
        smileyTags[0] = TaggedData.PosTagSet.GetTagId("LE", true);

        PosLexicon.Interpolate(
            TaggedData.PosTagSet.GetTagId("NN|NEU|PLU|IND|NOM"),
            TaggedData.PosTagSet.GetTagId("NN|NEU|SIN|IND|NOM"));
    }
    catch (TagNameException)
    {
        // These noun tags are expected to exist in the tag set; reaching
        // here indicates a programming/configuration error.
        Debug.Assert(false);
    }
}
/// <summary>
/// Computes the sorted candidate POS tag IDs for the token at
/// <paramref name="index"/>: lexicon-derived tags, optionally merged with the
/// open tags allowed for the token's type when lexicon evidence is sparse
/// during training.
/// </summary>
/// <param name="sentence">Sentence containing the token.</param>
/// <param name="index">Position of the token within the sentence.</param>
/// <returns>A strictly increasing array of tag IDs.</returns>
protected int[] PossiblePosTags(TaggedToken[] sentence, int index)
{
    string textLower = sentence[index].LowerCaseText;
    // NOTE(review): the return value of GuessTags is discarded — presumably it
    // is called for side effects (e.g. caching guesses); confirm against its
    // implementation.
    if (!TrainingMode) { GuessTags(sentence[index].Token.Value, (index == 0)); }
    Entry[] entries = PosLexicon.GetEntries(textLower);
    // Word unknown to the lexicon: fall back to every tag allowed for this
    // token type.
    if (entries == null) { return(TokenTypeTags[(int)sentence[index].Token.Type]); }
    int[] tags = new int[entries.Length];
    int tagsCount = 0;
    int lastTag = -1;
    // Total corpus occurrences across all entries for this word form.
    int seenCount = entries.Sum(entry => entry.NumberOfOccurence);
    // Collect distinct tag IDs; entries appear grouped by tag, so comparing
    // against lastTag deduplicates consecutive duplicates.
    foreach (Entry entry in entries)
    {
        // If the word was seen in the corpus and the lexicon must not be
        // extended, skip tags that have zero observed occurrences.
        if (seenCount > 0 && !ExtendLexicon && entry.NumberOfOccurence == 0) { continue; }
        if (entry.TagId != lastTag) { tags[tagsCount++] = entry.TagId; lastTag = entry.TagId; }
    }
    // Lexicon entries are expected in strictly increasing tag order.
    for (int t = 0; t < tagsCount - 1; t++) { Debug.Assert(tags[t] < tags[t + 1]); }
    // Outside training, or with enough corpus evidence, the lexicon tags
    // alone suffice (trimmed to the used length).
    if (!TrainingMode || seenCount >= CountLimit) { return(tagsCount != tags.Length ? Arrays.CopyOf(tags, tagsCount) : tags); }
    // Sparse evidence during training: merge the lexicon tags with the
    // open tags for this token type (both sorted) into one sorted union.
    int[] possibleTags = TokenTypeTags[(int)sentence[index].Token.Type];
    int[] lexiconTags = tags;
    int i = 0, j = 0, k = 0;
    tags = new int[tagsCount + possibleTags.Length];
    // Classic two-pointer sorted merge; equal elements are emitted once.
    for (; j < possibleTags.Length && k < tagsCount; i++)
    {
        if (possibleTags[j] < lexiconTags[k]) { tags[i] = possibleTags[j++]; }
        else if (possibleTags[j] == lexiconTags[k]) { tags[i] = possibleTags[j++]; k++; }
        else { tags[i] = lexiconTags[k++]; }
    }
    // Drain whichever input still has elements left.
    if (j < possibleTags.Length) { for (; j < possibleTags.Length; j++) { tags[i++] = possibleTags[j]; } }
    else { for (; k < tagsCount; k++) { tags[i++] = lexiconTags[k]; } }
    tagsCount = i;
    // The merged result must remain strictly increasing.
    for (int t = 0; t < tagsCount - 1; t++) { Debug.Assert(tags[t] < tags[t + 1]); }
    return(tagsCount != tags.Length ? Arrays.CopyOf(tags, tagsCount) : tags);
}