/// <summary>
/// Rewrites the POS tag of each preterminal in <paramref name="tree"/> with the
/// alternate tag produced by the French morphological feature specification.
/// </summary>
/// <remarks>
/// The morphological string is the leaf's original text. When that is null or
/// empty, it is built from the preterminal value and the leaf's category
/// instead. A preterminal is only modified when a non-empty alternate tag is
/// returned.
/// </remarks>
/// <param name="tree">Tree to update in place; its leaf labels are cast to CoreLabel.</param>
private static void ReplacePOSTags(Tree tree)
{
    IList<ILabel> leafLabels = tree.Yield();
    IList<ILabel> tagLabels = tree.PreTerminalYield();
    System.Diagnostics.Debug.Assert(leafLabels.Count == tagLabels.Count);
    MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification();

    for (int pos = 0; pos < leafLabels.Count; pos++)
    {
        CoreLabel leaf = (CoreLabel)leafLabels[pos];

        // Morphological analysis
        string analysis = leaf.OriginalText();
        if (string.IsNullOrEmpty(analysis))
        {
            analysis = tagLabels[pos].Value();

            // POS subcategory
            string subCategory = leaf.Category();
            analysis += string.IsNullOrEmpty(subCategory)
                ? "---"
                : "-" + subCategory + "--";
        }

        string altTag = spec.StrToFeatures(analysis).GetAltTag();
        if (string.IsNullOrEmpty(altTag))
        {
            continue;
        }

        CoreLabel preterminal = (CoreLabel)tagLabels[pos];
        preterminal.SetValue(altTag);
        preterminal.SetTag(altTag);
    }
}
/// <summary>
/// Set the tags of the original tokens and the leaves if they
/// aren't already set.
/// </summary>
/// <remarks>
/// The tree's tagged yield and leaf list are fetched lazily, the first time a
/// token without a tag is found. Tokens and leaves are matched by position.
/// </remarks>
/// <param name="sentence">Sentence whose TokensAnnotation holds the tokens to fill in.</param>
/// <param name="tree">Tree that supplies the tags and the leaves.</param>
private static void SetMissingTags(ICoreMap sentence, Tree tree)
{
    IList<TaggedWord> taggedWords = null;
    IList<ILabel> leaves = null;
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

    // The loop bound was previously an undeclared identifier; bind it to the token count.
    int size = tokens.Count;
    for (int i = 0; i < size; ++i)
    {
        CoreLabel token = tokens[i];
        if (token.Tag() != null)
        {
            continue;
        }

        if (taggedWords == null)
        {
            taggedWords = tree.TaggedYield();
        }
        if (leaves == null)
        {
            leaves = tree.Yield();
        }

        token.SetTag(taggedWords[i].Tag());

        ILabel leaf = leaves[i];
        if (leaf is IHasTag)
        {
            ((IHasTag)leaf).SetTag(taggedWords[i].Tag());
        }
    }
}
/// <summary>
/// Runs <c>DoOneSentence</c> over copies of the given tokens and writes the resulting
/// NER tag and normalized NER annotation back onto the original tokens.
/// </summary>
/// <remarks>
/// The type parameter and the parameter type previously differed in case
/// (<c>Token</c> vs <c>TOKEN</c>), leaving the parameter type undeclared.
/// </remarks>
/// <param name="tokens">Tokens to annotate in place.</param>
private void AnnotateTokens<Token>(IList<Token> tokens)
    where Token : CoreLabel
{
    // Make a copy of the tokens before annotating because QuantifiableEntityNormalizer may change the POS too
    IList<CoreLabel> words = new List<CoreLabel>();
    foreach (CoreLabel token in tokens)
    {
        CoreLabel word = new CoreLabel();
        word.SetWord(token.Word());
        word.SetNER(token.Ner());
        word.SetTag(token.Tag());
        // copy fields potentially set by SUTime
        NumberSequenceClassifier.TransferAnnotations(token, word);
        words.Add(word);
    }

    DoOneSentence(words);

    // TODO: If collapsed is set, tokens for entities are collapsed into one node, so
    // words.Count != tokens.Count and the positional copy-back below does not work.
    for (int i = 0; i < words.Count; i++)
    {
        string ner = words[i].Ner();
        tokens[i].SetNER(ner);
        tokens[i].Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), words[i].Get(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation)));
    }
}
/// <summary>
/// Tags the given tokens and stems each tagged word with the Morphology class.
/// Only works on English, because Morphology is English-only.
/// </summary>
/// <remarks>
/// Tags come from the loaded tagger when the <c>preTag</c> test option is set;
/// otherwise they come from the tagged yield of a full parse of the tokens.
/// </remarks>
/// <param name="tokens">Words to lemmatize.</param>
/// <returns>One new CoreLabel per tagged word, carrying word and tag, after stemming.</returns>
public virtual IList<CoreLabel> Lemmatize<_T0>(IList<_T0> tokens)
    where _T0 : IHasWord
{
    IList<TaggedWord> taggedTokens;
    if (!GetOp().testOptions.preTag)
    {
        Tree parsed = Parse(tokens);
        taggedTokens = parsed.TaggedYield();
    }
    else
    {
        IFunction<IList<IHasWord>, IList<TaggedWord>> posTagger = LoadTagger();
        taggedTokens = posTagger.Apply(tokens);
    }

    Morphology stemmer = new Morphology();
    IList<CoreLabel> result = Generics.NewArrayList();
    foreach (TaggedWord tw in taggedTokens)
    {
        CoreLabel stemmed = new CoreLabel();
        stemmed.SetWord(tw.Word());
        stemmed.SetTag(tw.Tag());
        stemmer.Stem(stemmed);
        result.Add(stemmed);
    }
    return result;
}
/// <summary>Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).</summary>
/// <remarks>
/// Each non-blank line is split on runs of whitespace. The columns read are:
/// 0 = token index, 1 = word, 2 = parent index ("0" marks a root), 3 = relation name,
/// and, when present, 4 = tag, 5 = NER, 6 = lemma.
/// A plain <c>string.Split("\\s+")</c> treats the pattern as a literal separator in .NET,
/// so a regular-expression split is used to tokenize the columns.
/// </remarks>
/// <param name="conll">The CoNLL formatted tree.</param>
/// <returns>
/// A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
/// and to tokens in the sentence.
/// </returns>
protected internal virtual Pair<SemanticGraph, IList<CoreLabel>> MkTree(string conll)
{
    // Tokenize every non-blank line once; both passes below walk the same rows.
    List<string[]> rows = new List<string[]>();
    foreach (string line in conll.Split('\n'))
    {
        string trimmed = line.Trim();
        if (trimmed.Length == 0)
        {
            continue;
        }
        rows.Add(System.Text.RegularExpressions.Regex.Split(trimmed, "\\s+"));
    }

    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();

    // Pass 1: create one vertex per row.
    foreach (string[] fields in rows)
    {
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = IETestUtils.MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }

    // Pass 2: add edges once every token exists, so a parent may appear after its child.
    for (int i = 0; i < rows.Count; i++)
    {
        string[] fields = rows[i];
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            // Parent indices are 1-based; the token list is 0-based.
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.UniversalEnglish, reln, null, null), 1.0, false);
        }
    }

    return Pair.MakePair(tree, sentence);
}
/// <summary>
/// Builds the initial parser State from a tagged sentence: one preterminal
/// (tag node with a single word child) per input word.
/// </summary>
/// <remarks>
/// A CoreLabel input is reused as the word label. Any other input is copied
/// into a new CoreLabel and must implement IHasTag.
/// </remarks>
/// <param name="words">Tagged words of the sentence.</param>
/// <returns>A State constructed from the list of preterminal trees.</returns>
/// <exception cref="ArgumentException">
/// If a non-CoreLabel word does not implement IHasTag, or if any word has a null tag.
/// </exception>
public static State InitialStateFromTaggedSentence<_T0>(IList<_T0> words)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = Generics.NewArrayList();
    for (int pos = 0; pos < words.Count; ++pos)
    {
        IHasWord item = words[pos];
        string tag;
        CoreLabel wordLabel = item as CoreLabel;
        if (wordLabel != null)
        {
            tag = wordLabel.Tag();
        }
        else
        {
            wordLabel = new CoreLabel();
            wordLabel.SetValue(item.Word());
            wordLabel.SetWord(item.Word());
            IHasTag tagged = item as IHasTag;
            if (tagged == null)
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = tagged.Tag();
            wordLabel.SetTag(tag);
        }

        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }

        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);

        // Index from 1.  Tools downstream from the parser expect that.
        // Internally this parser uses the index, so we have to
        // overwrite incorrect indices if the label is already indexed.
        int oneBased = pos + 1;
        wordLabel.SetIndex(oneBased);
        tagLabel.SetIndex(oneBased);

        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);

        // TODO: can we get away with not setting these on the wordLabel?
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

        preterminals.Add(tagNode);
    }
    return new State(preterminals);
}
/// <summary>
/// Test fixture setup: writes the regex-NER pattern file once (under a lock on the
/// test type) and rebuilds the plain and NER-annotated sentence fixtures from the
/// <c>words</c>, <c>tags</c> and <c>ner</c> arrays.
/// </summary>
public virtual void SetUp()
{
    lock (typeof(RegexNERSequenceClassifierTest))
    {
        if (tempFile == null)
        {
            tempFile = File.CreateTempFile("regexnertest.patterns", "txt");
            FileWriter fileOut = new FileWriter(tempFile);
            BufferedWriter buffered = new BufferedWriter(fileOut);
            string[] patternLines =
            {
                "sausage\tfood\n",
                "(avocet|curlew)(s?)\tshorebird\n",
                "shoreline park\tpark\n"
            };
            foreach (string patternLine in patternLines)
            {
                buffered.Write(patternLine);
            }
            buffered.Flush();
            fileOut.Close();
        }
    }

    sentences = new List<IList<CoreLabel>>();
    NERsentences = new List<IList<CoreLabel>>();

    // The three parallel arrays must describe the same number of sentences.
    NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
    NUnit.Framework.Assert.AreEqual(words.Length, ner.Length);

    for (int s = 0; s < words.Length; ++s)
    {
        string[] wordPieces = words[s].Split(" ");
        string[] tagPieces = tags[s].Split(" ");
        string[] nerPieces = ner[s].Split(" ");
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, tagPieces.Length);
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, nerPieces.Length, "Input " + s + " " + words[s] + " of different length than " + ner[s]);

        IList<CoreLabel> plainSentence = new List<CoreLabel>();
        IList<CoreLabel> nerSentence = new List<CoreLabel>();
        for (int w = 0; w < wordPieces.Length; ++w)
        {
            string wordText = wordPieces[w];
            string tagText = tagPieces[w];

            // Token without NER information
            CoreLabel plainToken = new CoreLabel();
            plainToken.SetWord(wordText);
            plainToken.SetTag(tagText);
            plainSentence.Add(plainToken);

            // Same token with the NER label attached
            CoreLabel nerToken = new CoreLabel();
            nerToken.SetWord(wordText);
            nerToken.SetTag(tagText);
            nerToken.SetNER(nerPieces[w]);
            nerSentence.Add(nerToken);
        }
        sentences.Add(plainSentence);
        NERsentences.Add(nerSentence);
    }
}
/// <summary>
/// Replaces the POS tag on a single preterminal with the alternate tag computed by
/// <paramref name="morpho"/> from the child's morphological string.
/// </summary>
/// <remarks>
/// The morphological string is the child's original text. When that is null or empty,
/// it is built from the preterminal value and the child's category instead. The label is
/// left unchanged when no non-empty alternate tag is produced.
/// </remarks>
/// <param name="t">A preterminal whose label, and whose first child's label, are CoreLabels.</param>
/// <param name="morpho">Specification used to turn the morphological string into features.</param>
/// <exception cref="ArgumentException">
/// If <paramref name="t"/> is not a preterminal, or if either label is not a CoreLabel.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }

    CoreLabel tagLabel = t.Label() as CoreLabel;
    if (tagLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    Tree leaf = t.Children()[0];
    CoreLabel leafLabel = leaf.Label() as CoreLabel;
    if (leafLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological analysis
    string analysis = leafLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        analysis = tagLabel.Value();

        // POS subcategory
        string subCategory = leafLabel.Category();
        analysis += string.IsNullOrEmpty(subCategory)
            ? "---"
            : "-" + subCategory + "--";
    }

    string altTag = morpho.StrToFeatures(analysis).GetAltTag();
    if (!string.IsNullOrEmpty(altTag))
    {
        tagLabel.SetValue(altTag);
        tagLabel.SetTag(altTag);
    }
}
/// <summary>Builds a new CoreLabel describing this node of the wrapped parse.</summary>
/// <returns>
/// For a leaf: a label carrying the node value as word and value, plus the span's start
/// and end positions. Otherwise: a label carrying the node type as category and value,
/// and also as tag when <c>Depth()</c> is 1.
/// </returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    CoreLabel result = new CoreLabel();

    if (!this.parse.IsLeaf)
    {
        var nodeType = this.parse.Type;
        result.SetCategory(nodeType);
        result.SetValue(nodeType);
        if (this.Depth() == 1)
        {
            result.SetTag(nodeType);
        }
        return result;
    }

    var text = this.parse.Value;
    result.SetWord(text);
    result.SetBeginPosition(this.parse.Span.Start);
    result.SetEndPosition(this.parse.Span.End);
    result.SetValue(text);
    return result;
}
/// <summary>Builds a new CoreLabel describing this node of the wrapped parse.</summary>
/// <returns>
/// For a leaf: a label carrying the node value as word and value, plus the span's start
/// and end positions. Otherwise: a label carrying the node type as category and value,
/// and also as tag when <c>Depth()</c> is 1.
/// </returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    CoreLabel result = new CoreLabel();

    if (!this.parse.IsLeaf)
    {
        var nodeType = this.parse.Type;
        result.SetCategory(nodeType);
        result.SetValue(nodeType);
        if (this.Depth() == 1)
        {
            result.SetTag(nodeType);
        }
        return result;
    }

    var text = this.parse.Value;
    result.SetWord(text);
    result.SetBeginPosition(this.parse.Span.Start);
    result.SetEndPosition(this.parse.Span.End);
    result.SetValue(text);
    return result;
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <remarks>
/// Token character offsets are computed by laying the words out one after another with a
/// single character between consecutive words. The graph produced by <paramref name="tree"/>
/// is stored under the basic, collapsed and CC-processed dependency keys; the graph produced
/// by <paramref name="maltTree"/> is stored under the alternative dependencies key.
/// </remarks>
/// <param name="docid">Document id; "???" is used when absent.</param>
/// <param name="sentenceIndex">Sentence index; -1 is used when absent.</param>
/// <param name="gloss">Sentence text; the escaped sequences "\n" and "\t" are unescaped.</param>
/// <param name="tree">Builds the main dependency graph from the tokens.</param>
/// <param name="maltTree">Builds the alternative dependency graph from the tokens.</param>
/// <param name="words">Token words.</param>
/// <param name="lemmas">Token lemmas; must be as long as <paramref name="words"/>.</param>
/// <param name="pos">Token POS tags; must be as long as <paramref name="words"/>.</param>
/// <param name="ner">Token NER tags; must be as long as <paramref name="words"/>.</param>
/// <param name="sentenceid">Sentence id, used only in error messages.</param>
/// <returns>An Annotation holding the tokens and a singleton sentence list.</returns>
/// <exception cref="ArgumentException">If any parallel list differs in length from <paramref name="words"/>.</exception>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Error checks: lemmas, then POS tags, then NER tags must line up with the words.
    foreach (IList<string> column in new[] { lemmas, pos, ner })
    {
        if (column.Count != words.Count)
        {
            throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + column.Count + " (sentence " + sentenceid.OrElse("???") + ")");
        }
    }

    var docIdValue = docid.OrElse("???");
    var sentenceIndexValue = sentenceIndex.OrElse(-1);

    // Create structure
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int charOffset = 0;
    for (int k = 0; k < words.Count; ++k)
    {
        string wordText = words[k];
        int charEnd = charOffset + wordText.Length;

        CoreLabel tok = new CoreLabel(12);
        tok.SetWord(wordText);
        tok.SetValue(wordText);
        tok.SetBeginPosition(charOffset);
        tok.SetEndPosition(charEnd);
        charOffset = charEnd + 1;
        tok.SetLemma(lemmas[k]);
        tok.SetTag(pos[k]);
        tok.SetNER(ner[k]);
        tok.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
        tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndexValue);
        tok.Set(typeof(CoreAnnotations.IndexAnnotation), k + 1);
        tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), k);
        tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), k + 1);
        tokens.Add(tok);
    }

    string text = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

    // Sentence-level map
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

    SemanticGraph mainGraph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), mainGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), mainGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), mainGraph);

    SemanticGraph alternativeGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternativeGraph);

    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndexValue);
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), text);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

    // Document-level annotation wrapping the single sentence
    Annotation document = new Annotation(text);
    document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    document.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
    document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndexValue);
    return document;
}
/// <summary>
/// Builds a SemanticGraph from a CoNLL formatted tree and runs a clause splitter search over it.
/// </summary>
/// <remarks>
/// Each non-blank line is split on runs of whitespace. The columns read are:
/// 0 = token index, 1 = word, 2 = parent index ("0" marks a root), 3 = relation name,
/// and, when present, 4 = tag, 5 = NER, 6 = lemma.
/// A plain <c>string.Split("\\s+")</c> treats the pattern as a literal separator in .NET,
/// so a regular-expression split is used to tokenize the columns.
/// </remarks>
/// <param name="conll">The CoNLL formatted tree.</param>
/// <returns>The set of clauses collected for the tree (see the review note in the body).</returns>
protected internal virtual ICollection<string> Clauses(string conll)
{
    // Tokenize every non-blank line once; both passes below walk the same rows.
    List<string[]> rows = new List<string[]>();
    foreach (string line in conll.Split('\n'))
    {
        string trimmed = line.Trim();
        if (trimmed.Length == 0)
        {
            continue;
        }
        rows.Add(System.Text.RegularExpressions.Regex.Split(trimmed, "\\s+"));
    }

    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();

    // Pass 1: create one vertex per row.
    foreach (string[] fields in rows)
    {
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }

    // Pass 2: add edges once every token exists, so a parent may appear after its child.
    for (int i = 0; i < rows.Count; i++)
    {
        string[] fields = rows[i];
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            // Parent indices are 1-based; the token list is 0-based.
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.English, reln, null, null), 1.0, false);
        }
    }

    // Run extractor
    ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
    ICollection<string> clauses = new HashSet<string>();
    // NOTE(review): nothing in this method adds to `clauses`, and the first and fourth
    // arguments to Search are null. Presumably the first argument was meant to be a
    // callback that collects results into `clauses` - confirm against the intended API.
    problem.Search(null, new LinearClassifier<ClauseSplitter.ClauseClassifierLabel, string>(new ClassicCounter<Pair<string, ClauseSplitter.ClauseClassifierLabel>>()), ClauseSplitterSearchProblem.HardSplits, null, 100000);
    return clauses;
}
/// <summary>Parse a sentence represented as a List of tokens.</summary>
/// <remarks>
/// The text must already have been tokenized and normalized into tokens that are
/// appropriate to the treebank which was used to train the parser. The tokens can be of
/// multiple types, and the list items need not be homogeneous as to type
/// (in particular, only some words might be given tags):
/// <ul>
/// <li>If a token implements HasWord, then the word to be parsed is
/// given by its word() value.</li>
/// <li>If a token implements HasTag and the tag() value is not
/// null or the empty String, then the parser is strongly advised to assign
/// a part of speech tag that <i>begins</i> with this String.</li>
/// </ul>
/// All per-parse status flags are reset on entry. When a word function is configured,
/// the tokens are copied and the function is applied to each copy's word; otherwise the
/// input list is copied as is. A boundary token is appended before the parsers run.
/// </remarks>
/// <param name="sentence">The sentence to parse</param>
/// <returns>
/// The current value of <c>parseSucceeded</c> as soon as one of the enabled parsers
/// rejects the input; otherwise true.
/// </returns>
/// <exception cref="System.NotSupportedException">
/// If the sentence is of zero length or longer than the configured maximum length.
/// </exception>
/// <exception cref="RuntimeInterruptedException">
/// If the thread is found interrupted between parsing stages.
/// </exception>
private bool ParseInternal<_T0>(IList<_T0> sentence)
    where _T0 : IHasWord
{
    // Reset the status of the previous parse.
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    addedPunct = false;
    originalSentence = sentence;

    int tokenCount = sentence.Count;
    if (tokenCount == 0)
    {
        parseSkipped = true;
        throw new NotSupportedException("Can't parse a zero-length sentence!");
    }

    // Working copy of the sentence that is handed to the parsers.
    IList<IHasWord> parserInput;
    if (op.wordFunction == null)
    {
        parserInput = new List<IHasWord>(sentence);
    }
    else
    {
        parserInput = Generics.NewArrayList();
        foreach (IHasWord source in originalSentence)
        {
            ILabel sourceLabel = source as ILabel;
            if (sourceLabel != null)
            {
                // Copy through the label's own factory so the original is not modified.
                IHasWord copy = sourceLabel.LabelFactory().NewLabel(sourceLabel) as IHasWord;
                if (copy == null)
                {
                    throw new AssertionError("This should have been a HasWord");
                }
                parserInput.Add(copy);
                continue;
            }

            IHasTag taggedSource = source as IHasTag;
            if (taggedSource != null)
            {
                parserInput.Add(new TaggedWord(source.Word(), taggedSource.Tag()));
            }
            else
            {
                parserInput.Add(new Word(source.Word()));
            }
        }

        foreach (IHasWord copied in parserInput)
        {
            copied.SetWord(op.wordFunction.Apply(copied.Word()));
        }
    }

    if (op.testOptions.addMissingFinalPunctuation)
    {
        addedPunct = AddSentenceFinalPunctIfNeeded(parserInput, tokenCount);
    }

    if (tokenCount > op.testOptions.maxLength)
    {
        parseSkipped = true;
        throw new NotSupportedException("Sentence too long: length " + tokenCount);
    }

    TreePrint printer = GetTreePrint();
    PrintWriter output = op.tlpParams.Pw();

    // Insert the boundary symbol
    if (!(sentence[0] is CoreLabel))
    {
        parserInput.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
    }
    else
    {
        CoreLabel boundary = new CoreLabel();
        boundary.SetWord(LexiconConstants.Boundary);
        boundary.SetValue(LexiconConstants.Boundary);
        boundary.SetTag(LexiconConstants.BoundaryTag);
        // 1-based indexing used in the parser
        boundary.SetIndex(sentence.Count + 1);
        parserInput.Add(boundary);
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    // PCFG stage
    if (op.doPCFG)
    {
        if (!pparser.Parse(parserInput))
        {
            return parseSucceeded;
        }
        if (op.testOptions.verbose)
        {
            output.Println("PParser output");
            // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
            // The call below prints without scores on nodes.
            printer.PrintTree(GetBestPCFGParse(false), output);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    // Dependency stage (skipped when the fast factored parser is in use)
    if (op.doDep && !op.testOptions.useFastFactored)
    {
        if (!dparser.Parse(parserInput))
        {
            return parseSucceeded;
        }
        // cdm nov 2006: should move these printing bits to the main printing section,
        // so don't calculate the best parse twice!
        if (op.testOptions.verbose)
        {
            output.Println("DParser output");
            printer.PrintTree(dparser.GetBestParse(), output);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    // Combined stage, only when both the PCFG and dependency parsers are enabled
    if (op.doPCFG && op.doDep)
    {
        if (!bparser.Parse(parserInput))
        {
            return parseSucceeded;
        }
        parseSucceeded = true;
    }

    return true;
}
// TODO replace with GrammaticalStructure#readCoNLLGrammaticalStructureCollection
/// <summary>
/// Reads a tab-separated CoNLL file and appends one sentence and one dependency tree
/// per sentence to the supplied lists.
/// </summary>
/// <remarks>
/// A line with fewer than 10 tab-separated fields ends the current sentence. On token
/// lines the columns read are: 1 = word, 3 or 4 = POS tag, 6 = head index,
/// 7 = dependency type. Token lines whose head column is not a valid integer are skipped.
/// A sentence is only emitted when a short line follows it, so tokens after the last
/// short line are not added to the output lists.
/// </remarks>
/// <param name="inFile">Source passed to <c>IOUtils.ReaderFromString</c>.</param>
/// <param name="sents">Receives one ICoreMap per sentence, holding its tokens.</param>
/// <param name="trees">Receives one DependencyTree per sentence.</param>
/// <param name="unlabeled">When true, arcs are stored with <c>Config.Unknown</c> instead of the dependency type.</param>
/// <param name="cPOS">When true, the POS tag is read from column 3 instead of column 4.</param>
/// <exception cref="RuntimeIOException">Wraps any IOException raised while reading.</exception>
public static void LoadConllFile(string inFile, IList<ICoreMap> sents, IList<DependencyTree> trees, bool unlabeled, bool cPOS)
{
    CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false);
    try
    {
        using (BufferedReader reader = IOUtils.ReaderFromString(inFile))
        {
            IList<CoreLabel> sentenceTokens = new List<CoreLabel>();
            DependencyTree tree = new DependencyTree();
            foreach (string line in IOUtils.GetLineIterable(reader, false))
            {
                string[] splits = line.Split('\t');
                if (splits.Length < 10)
                {
                    // Separator line: flush the sentence collected so far, if any.
                    if (sentenceTokens.Count > 0)
                    {
                        trees.Add(tree);
                        ICoreMap sentence = new CoreLabel();
                        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), sentenceTokens);
                        sents.Add(sentence);
                        tree = new DependencyTree();
                        sentenceTokens = new List<CoreLabel>();
                    }
                    continue;
                }

                string word = splits[1];
                string pos = cPOS ? splits[3] : splits[4];
                string depType = splits[7];

                // System.Convert.ToInt32 reports bad input with FormatException/OverflowException,
                // which the previous catch clause did not cover; TryParse skips such rows
                // without relying on exceptions.
                int head;
                if (!int.TryParse(splits[6], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out head))
                {
                    continue;
                }

                CoreLabel token = tf.MakeToken(word, 0, 0);
                token.SetTag(pos);
                token.Set(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation), head);
                token.Set(typeof(CoreAnnotations.CoNLLDepTypeAnnotation), depType);
                sentenceTokens.Add(token);

                if (!unlabeled)
                {
                    tree.Add(head, depType);
                }
                else
                {
                    tree.Add(head, Config.Unknown);
                }
            }
        }
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}