/// <summary>
/// Builds the initial parser state from a sentence whose words already carry tags.
/// </summary>
/// <remarks>
/// Every input word is turned into a preterminal: a tag node with a single
/// word node as its child. Inputs that are already <c>CoreLabel</c>s are reused
/// (and modified in place: their index and head annotations are overwritten);
/// any other input is copied into a fresh <c>CoreLabel</c>.
/// </remarks>
/// <param name="words">The tagged words of the sentence, in order.</param>
/// <returns>A new <c>State</c> constructed from the list of preterminals.</returns>
/// <exception cref="ArgumentException">
/// If a word is neither a <c>CoreLabel</c> nor an <c>IHasTag</c>, or if its tag is null.
/// </exception>
public static State InitialStateFromTaggedSentence<_T0>(IList<_T0> words)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = Generics.NewArrayList();
    for (int i = 0; i < words.Count; i++)
    {
        IHasWord token = words[i];
        string tag;
        CoreLabel wordLabel = token as CoreLabel;
        if (wordLabel != null)
        {
            // Already a CoreLabel: reuse it and read its tag directly.
            tag = wordLabel.Tag();
        }
        else
        {
            // Copy the word into a new label; the tag must come from IHasTag.
            wordLabel = new CoreLabel();
            wordLabel.SetValue(token.Word());
            wordLabel.SetWord(token.Word());
            IHasTag tagged = token as IHasTag;
            if (tagged == null)
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = tagged.Tag();
            wordLabel.SetTag(tag);
        }
        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }

        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);

        // Indices are 1-based because tools downstream from the parser expect that.
        // The parser itself relies on the index, so any index already present on
        // the label is overwritten here.
        int oneBasedIndex = i + 1;
        wordLabel.SetIndex(oneBasedIndex);
        tagLabel.SetIndex(oneBasedIndex);

        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);

        // Both labels point at the same head word / head tag labels.
        // TODO: can we get away with not setting these on the wordLabel?
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

        preterminals.Add(tagNode);
    }
    return new State(preterminals);
}
/// <summary>Extracts the text of a token object.</summary>
/// <remarks>
/// The checks run in this order: <c>IHasWord</c> (its word), <c>string</c> (itself),
/// <c>ICoreMap</c> (its <c>CoreAnnotations.TextAnnotation</c> value).
/// </remarks>
/// <param name="o">The token to convert.</param>
/// <returns>The text obtained from the token.</returns>
/// <exception cref="Exception">If <paramref name="o"/> is none of the supported types.</exception>
private static string GetString(object o)
{
    IHasWord hasWord = o as IHasWord;
    if (hasWord != null)
    {
        return hasWord.Word();
    }

    string text = o as string;
    if (text != null)
    {
        return text;
    }

    ICoreMap map = o as ICoreMap;
    if (map != null)
    {
        return map.Get(typeof(CoreAnnotations.TextAnnotation));
    }

    throw new Exception("Expected token to be either Word or String.");
}
/// <summary>Adds a sentence final punctuation mark to sentences that lack one.</summary>
/// <remarks>
/// This method adds the first sentence final punctuation word of the parser's
/// language pack to sentences that don't have punctuation within the last
/// 3 words (to allow for close parentheses, etc.). It checks tags for
/// punctuation, if available, otherwise words.
/// </remarks>
/// <param name="sentence">The sentence to check; a punctuation word may be appended to it.</param>
/// <param name="length">The length of the sentence (just to avoid recomputation)</param>
/// <returns>
/// <c>true</c> if a punctuation word was actually appended to <paramref name="sentence"/>;
/// <c>false</c> if punctuation was already present or the language pack offers
/// no sentence final punctuation word to add.
/// </returns>
private bool AddSentenceFinalPunctIfNeeded(IList<IHasWord> sentence, int length)
{
    ITreebankLanguagePack tlp = op.tlpParams.TreebankLanguagePack();

    // Only the last three tokens (or fewer, for short sentences) are inspected.
    int start = length > 3 ? length - 3 : 0;
    for (int i = length - 1; i >= start; i--)
    {
        IHasWord item = sentence[i];

        // An object (e.g., CoreLabel) can implement IHasTag but not actually store
        // a tag, so we need to check that there is something there for this case.
        // If there is, use only it, since word tokens can be ambiguous.
        IHasTag tagged = item as IHasTag;
        string tag = tagged != null ? tagged.Tag() : null;

        if (!string.IsNullOrEmpty(tag))
        {
            if (tlp.IsSentenceFinalPunctuationTag(tag))
            {
                return false;
            }
        }
        else if (tlp.IsPunctuationWord(item.Word()))
        {
            // NOTE(review): the word fallback calls IsPunctuationWord rather than a
            // sentence-final-specific check — confirm this asymmetry is intended.
            return false;
        }
    }

    // None found, so add one — if the language pack has one to offer.
    string[] sfpWords = tlp.SentenceFinalPunctuationWords();
    if (sfpWords.Length == 0)
    {
        // Nothing can be appended; report that the sentence is unchanged so that
        // callers do not assume an extra token is now present.
        return false;
    }

    if (op.testOptions.verbose)
    {
        log.Info("Adding missing final punctuation to sentence.");
    }
    sentence.Add(new Word(sfpWords[0]));
    return true;
}
/// <summary>
/// Writes one line per position of <c>document</c>: the word at that position
/// followed by the value each sample holds at the same position.
/// </summary>
/// <remarks>
/// The word is padded or trimmed to 10 characters (the literal text "null" is
/// used for a null entry). Each sample is cast to <c>int[]</c>, and its value at
/// the current position is written after a single space, left-padded to width 2.
/// </remarks>
/// <param name="samples">The samples to print; each element must be an <c>int[]</c>.</param>
/// <param name="out">The writer that receives the output.</param>
public virtual void PrintSamples(IList samples, TextWriter @out)
{
    int positions = document.Count;
    for (int pos = 0; pos < positions; pos++)
    {
        IHasWord token = (IHasWord)document[pos];
        string text = token != null ? token.Word() : "null";
        @out.Write(StringUtils.PadOrTrim(text, 10));

        foreach (object entry in samples)
        {
            int[] labels = (int[])entry;
            @out.Write(" " + StringUtils.PadLeft(labels[pos], 2));
        }

        @out.WriteLine();
    }
}
/// <summary>Splits the Word w on the last occurrence of the character splitChar.</summary>
/// <remarks>
/// The text before the split character becomes the word and the text after it
/// becomes the tag. The input is returned unchanged when <c>splitChar</c> is 0,
/// when the character does not occur, or when it occurs only at position 0
/// (which would produce an empty word).
/// </remarks>
/// <param name="w">The token whose word may contain an attached tag.</param>
/// <returns>A new <c>TaggedWord</c> if a split was made; otherwise <paramref name="w"/> itself.</returns>
private IHasWord SplitTag(IHasWord w)
{
    if (splitChar != 0)
    {
        string token = w.Word();
        int cut = token.LastIndexOf(splitChar);
        // cut == 0 isn't allowed - no empty words!
        if (cut > 0)
        {
            string wordPart = token.Substring(0, cut);
            string tagPart = token.Substring(cut + 1);
            return new TaggedWord(wordPart, tagPart);
        }
    }
    return w;
}
/// <summary>
/// Rewrites plain double-quote tokens as alternating opening (``) and closing ('') quotes.
/// </summary>
/// <remarks>
/// If the last token is a double quote, the alternation runs from the end of the
/// sentence backwards, starting with a closing quote. Otherwise it runs from the
/// beginning forwards, starting with an opening quote. The tokens of
/// <paramref name="input"/> are modified in place via <c>SetWord</c>; the returned
/// list holds the same token objects in their original order.
/// </remarks>
/// <param name="input">The tokens of the sentence.</param>
/// <returns>A new list containing the (possibly modified) input tokens.</returns>
private static IList<IHasWord> FixQuotes(IList<IHasWord> input)
{
    int count = input.Count;
    LinkedList<IHasWord> fixedTokens = new LinkedList<IHasWord>();
    if (count == 0)
    {
        return fixedTokens;
    }

    bool endsWithQuote = input[count - 1].Word().Equals("\"");
    if (endsWithQuote)
    {
        // Walk backwards: the final quote must be a closing one.
        bool opening = false;
        for (int i = count; i-- > 0;)
        {
            IHasWord token = input[i];
            if (token.Word().Equals("\""))
            {
                token.SetWord(opening ? "``" : "\'\'");
                opening = !opening;
            }
            // Prepending keeps the result in the original order.
            fixedTokens.AddFirst(token);
        }
    }
    else
    {
        // Walk forwards: the first quote encountered is an opening one.
        bool opening = true;
        for (int i = 0; i < count; i++)
        {
            IHasWord token = input[i];
            if (token.Word().Equals("\""))
            {
                token.SetWord(opening ? "``" : "\'\'");
                opening = !opening;
            }
            fixedTokens.AddLast(token);
        }
    }
    return fixedTokens;
}
/// <summary>
/// Creates the initial items for a sentence: one item per tagging that the
/// lexicon proposes for each word position.
/// </summary>
/// <remarks>
/// As side effects this method reallocates and fills the fields <c>words</c>,
/// <c>taggedWordList</c> and <c>originalLabels</c>, and overwrites the shared
/// <c>tempEdge</c> before every scorer call. Words missing from <c>wordIndex</c>
/// are replaced by <c>LexiconConstants.UnknownWord</c>.
/// </remarks>
/// <param name="wordList">The words of the sentence, in order.</param>
/// <returns>The list of initial items, in the order they were generated.</returns>
protected internal virtual IList<Item> MakeInitialItems<_T0>(IList<_T0> wordList)
    where _T0 : IHasWord
{
    IList<Item> initialItems = new List<Item>();
    int length = wordList.Count;
    int numTags = tagIndex.Size();
    words = new int[length];
    taggedWordList = new IList[length];
    int terminalCount = 0;
    originalLabels = new CoreLabel[length];

    for (int pos = 0; pos < length; pos++)
    {
        taggedWordList[pos] = new List<IntTaggedWord>(numTags);
        IHasWord token = wordList[pos];

        // Remember the caller's label when there is one.
        CoreLabel label = token as CoreLabel;
        if (label != null)
        {
            originalLabels[pos] = label;
        }

        string wordStr = token.Word();

        // Word context (e.g., morphosyntactic info); an empty context counts as none.
        string context = null;
        IHasContext withContext = token as IHasContext;
        if (withContext != null)
        {
            context = withContext.OriginalText();
            if (string.Empty.Equals(context))
            {
                context = null;
            }
        }

        if (!wordIndex.Contains(wordStr))
        {
            wordStr = LexiconConstants.UnknownWord;
        }
        int word = wordIndex.IndexOf(wordStr);
        words[pos] = word;

        IEnumerator<IntTaggedWord> taggings = lex.RuleIteratorByWord(word, pos, context);
        while (taggings.MoveNext())
        {
            int tag = taggings.Current.tag;
            int state = stateIndex.IndexOf(tagIndex.Get(tag));

            // THIS WILL CAUSE BUGS!!! Don't use with another A* scorer:
            // the single shared tempEdge is reused for every scoring call.
            tempEdge.state = state;
            tempEdge.head = pos;
            tempEdge.start = pos;
            tempEdge.end = pos + 1;
            tempEdge.tag = tag;

            initialItems.Add(MakeInitialItem(pos, tag, state, scorer.IScore(tempEdge)));
            terminalCount++;
            taggedWordList[pos].Add(new IntTaggedWord(word, tag));
        }
    }

    if (op.testOptions.verbose)
    {
        log.Info("Terminals (# of tag edges in chart): " + terminalCount);
    }
    return initialItems;
}