/// <summary>
/// Converts one whitespace-separated input line into an <see cref="IndexedWord"/>.
/// </summary>
/// <remarks>
/// Three kinds of line are distinguished by their first column:
/// <list type="bullet">
/// <item><description>A line starting with <c>#</c> is stored whole as the word, tagged with <c>CommentPos</c>.</description></item>
/// <item><description>An index containing <c>-</c> (e.g. <c>3-4</c>) is a multiword token: only the text, the token span and the start index are set.</description></item>
/// <item><description>An index containing <c>.</c> (e.g. <c>8.1</c>) sets the index and copy count; any other index sets the full set of columns. Both also get features and extra dependencies.</description></item>
/// </list>
/// </remarks>
/// <param name="line">A single line of input; must not be null.</param>
/// <returns>A new <see cref="IndexedWord"/> populated from the columns of <paramref name="line"/>.</returns>
public virtual IndexedWord Apply(string line)
{
    IndexedWord word = new IndexedWord();
    if (line.StartsWith("#", System.StringComparison.Ordinal))
    {
        word.SetWord(line);
        word.SetTag(CommentPos);
        return word;
    }

    // Indices are machine-readable, so parse them independently of the current culture.
    System.IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    // "\\s+" is a regular expression; string.Split would treat it as the literal
    // text "\s+" and never split the columns, so a regex split is required here.
    string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
    word.Set(typeof(CoreAnnotations.TextAnnotation), bits[1]);

    /* Check if it is a multiword token. */
    if (bits[0].Contains("-"))
    {
        string[] span = bits[0].Split('-');
        int start = System.Convert.ToInt32(span[0], invariant);
        int end = System.Convert.ToInt32(span[1], invariant);
        word.Set(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation), new IntPair(start, end));
        word.Set(typeof(CoreAnnotations.IndexAnnotation), start);
        return word;
    }

    if (bits[0].Contains("."))
    {
        // Split on the literal dot: "<index>.<copyCount>".
        string[] indexParts = bits[0].Split('.');
        int index = System.Convert.ToInt32(indexParts[0], invariant);
        int copyCount = System.Convert.ToInt32(indexParts[1], invariant);
        word.Set(typeof(CoreAnnotations.IndexAnnotation), index);
        word.SetIndex(index);
        word.SetCopyCount(copyCount);
        word.SetValue(bits[1]);
    }
    else
    {
        int index = System.Convert.ToInt32(bits[0], invariant);
        word.Set(typeof(CoreAnnotations.IndexAnnotation), index);
        word.Set(typeof(CoreAnnotations.LemmaAnnotation), bits[2]);
        word.Set(typeof(CoreAnnotations.CoarseTagAnnotation), bits[3]);
        word.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), bits[4]);
        word.Set(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation), System.Convert.ToInt32(bits[6], invariant));
        word.Set(typeof(CoreAnnotations.CoNLLDepTypeAnnotation), bits[7]);
        word.Set(typeof(CoreAnnotations.CoNLLUMisc), bits[9]);
        word.SetIndex(index);
        word.SetValue(bits[1]);
    }

    /* Parse features (shared by both non-multiword kinds of line). */
    Dictionary<string, string> features = CoNLLUUtils.ParseFeatures(bits[5]);
    word.Set(typeof(CoreAnnotations.CoNLLUFeats), features);

    /* Parse extra dependencies. */
    Dictionary<string, string> extraDeps = CoNLLUUtils.ParseExtraDeps(bits[8]);
    word.Set(typeof(CoreAnnotations.CoNLLUSecondaryDepsAnnotation), extraDeps);

    return word;
}
/// <summary>
/// Reads <c>FeatureMapFile</c> line by line and rebuilds <c>posFeatureMap</c> and
/// <c>wordPosFeatureMap</c> from it.
/// </summary>
/// <remarks>
/// Each usable line has at least three whitespace-separated fields. When the first
/// field is <c>*</c>, the parsed features of the third field are stored in
/// <c>posFeatureMap</c> under the second field; otherwise they are stored in
/// <c>wordPosFeatureMap</c> under <c>first_second</c>. Lines with fewer than three
/// fields are skipped.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
private void LoadFeatureMap()
{
    using (Reader r = IOUtils.ReaderFromString(FeatureMapFile))
    {
        BufferedReader br = new BufferedReader(r);
        posFeatureMap = new Dictionary<string, Dictionary<string, string>>();
        wordPosFeatureMap = new Dictionary<string, Dictionary<string, string>>();

        string line;
        while ((line = br.ReadLine()) != null)
        {
            // "\\s+" is a regular expression; string.Split would look for the literal
            // text "\s+" and return the whole line as a single field. Trailing
            // whitespace is trimmed first so it cannot produce an empty last field
            // that would slip past the length check below.
            string[] parts = System.Text.RegularExpressions.Regex.Split(line.TrimEnd(), "\\s+");
            if (parts.Length < 3)
            {
                continue;
            }

            Dictionary<string, string> features = CoNLLUUtils.ParseFeatures(parts[2]);
            if (parts[0].Equals("*"))
            {
                posFeatureMap[parts[1]] = features;
            }
            else
            {
                wordPosFeatureMap[parts[0] + '_' + parts[1]] = features;
            }
        }
    }
}
/// <summary>
/// Outputs a partial CONLL-U file with token information (form, lemma, POS)
/// but without any dependency information.
/// </summary>
/// <param name="sentence">The sentence whose <c>TokensAnnotation</c> tokens are printed.</param>
/// <returns>
/// One tab-separated, ten-column row per token (head, relation and extra-dependency
/// columns are always <c>_</c>), followed by a final <c>"\n"</c>.
/// </returns>
public virtual string PrintPOSAnnotations(ICoreMap sentence)
{
    // .NET composite formatting uses {n} placeholders; Java-style "%s" specifiers
    // are not substituted and would be emitted verbatim.
    const string RowFormat = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}";

    StringBuilder sb = new StringBuilder();
    foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        string upos = token.GetString<CoreAnnotations.CoarseTagAnnotation>("_");
        string lemma = token.GetString<CoreAnnotations.LemmaAnnotation>("_");
        string pos = token.GetString<CoreAnnotations.PartOfSpeechAnnotation>("_");
        string featuresString = CoNLLUUtils.ToFeatureString(token.Get(typeof(CoreAnnotations.CoNLLUFeats)));
        string misc = token.GetString<CoreAnnotations.CoNLLUMisc>("_");

        // Invariant culture keeps the numeric index machine-readable on every locale.
        sb.Append(string.Format(
            System.Globalization.CultureInfo.InvariantCulture,
            RowFormat,
            token.Index(), token.Word(), lemma, upos, pos, featuresString, "_", "_", "_", misc));
        // Row terminator: the platform line separator (what "%n" denoted).
        sb.Append(System.Environment.NewLine);
    }
    sb.Append("\n");
    return sb.ToString();
}
/// <summary>
/// Renders a semantic graph as tab-separated, ten-column rows, one per vertex,
/// preceded by the graph's comment lines.
/// </summary>
/// <remarks>
/// For each vertex in sorted order:
/// <list type="bullet">
/// <item><description>If it carries a token span whose start equals its index, a multiword-token row (<c>start-end</c>, original text, then <c>_</c> columns) is written first.</description></item>
/// <item><description>The governor is the first parent reached through a non-extra edge; every parent edge is also recorded as an enhanced dependency.</description></item>
/// <item><description>A vertex without a governor that is one of the graph's roots gets head <c>0</c> and the root relation; any other vertex without a governor gets <c>_</c> for both.</description></item>
/// <item><description>The extra-dependencies column is <c>_</c> whenever the graph is a tree.</description></item>
/// </list>
/// </remarks>
/// <param name="sg">The graph to print.</param>
/// <param name="unescapeParenthesis">
/// When true, matches of <c>LrbPattern</c> and <c>RrbPattern</c> in the word and
/// lemma are replaced by <c>(</c> and <c>)</c> respectively.
/// </param>
/// <returns>The formatted rows followed by a final <c>"\n"</c>.</returns>
public virtual string PrintSemanticGraph(SemanticGraph sg, bool unescapeParenthesis)
{
    // .NET composite formatting uses {n} placeholders; Java-style "%s"/"%d"
    // specifiers are not substituted and would be emitted verbatim.
    const string RowFormat = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}";
    const string MultiwordRowFormat = "{0}\t{1}\t_\t_\t_\t_\t_\t_\t_\t_";
    System.IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    bool isTree = SemanticGraphUtils.IsTree(sg);
    StringBuilder sb = new StringBuilder();

    /* Print comments. */
    foreach (string comment in sg.GetComments())
    {
        sb.Append(comment).Append("\n");
    }

    foreach (IndexedWord token in sg.VertexListSorted())
    {
        /* Check for multiword tokens. */
        if (token.ContainsKey(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation)))
        {
            IntPair tokenSpan = token.Get(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation));
            // Only the first word of the span emits the multiword-token row.
            if (tokenSpan.GetSource() == token.Index())
            {
                string range = string.Format(invariant, "{0}-{1}", tokenSpan.GetSource(), tokenSpan.GetTarget());
                sb.Append(string.Format(invariant, MultiwordRowFormat, range, token.OriginalText()));
                sb.Append(System.Environment.NewLine);
            }
        }

        /* Try to find main governor and additional dependencies. */
        string govIdx = null;
        GrammaticalRelation reln = null;
        Dictionary<string, string> enhancedDependencies = new Dictionary<string, string>();
        foreach (IndexedWord parent in sg.GetParents(token))
        {
            SemanticGraphEdge edge = sg.GetEdge(parent, token);
            if (govIdx == null && !edge.IsExtra())
            {
                govIdx = parent.ToCopyIndex();
                reln = edge.GetRelation();
            }
            enhancedDependencies[parent.ToCopyIndex()] = edge.GetRelation().ToString();
        }

        string additionalDepsString = isTree ? "_" : CoNLLUUtils.ToExtraDepsString(enhancedDependencies);
        string word = token.Word();
        string featuresString = CoNLLUUtils.ToFeatureString(token.Get(typeof(CoreAnnotations.CoNLLUFeats)));
        string pos = token.GetString<CoreAnnotations.PartOfSpeechAnnotation>("_");
        string upos = token.GetString<CoreAnnotations.CoarseTagAnnotation>("_");
        string misc = token.GetString<CoreAnnotations.CoNLLUMisc>("_");
        string lemma = token.GetString<CoreAnnotations.LemmaAnnotation>("_");
        string relnName = reln == null ? "_" : reln.ToString();

        if (govIdx == null && sg.GetRoots().Contains(token))
        {
            /* Root. */
            govIdx = "0";
            relnName = GrammaticalRelation.Root.ToString();
            additionalDepsString = isTree ? "_" : "0:" + relnName;
        }
        else if (govIdx == null)
        {
            // Neither governed nor a root: leave head and relation unspecified.
            govIdx = "_";
            relnName = "_";
        }

        if (unescapeParenthesis)
        {
            word = word.ReplaceAll(LrbPattern, "(");
            word = word.ReplaceAll(RrbPattern, ")");
            lemma = lemma.ReplaceAll(LrbPattern, "(");
            lemma = lemma.ReplaceAll(RrbPattern, ")");
        }

        sb.Append(string.Format(
            invariant,
            RowFormat,
            token.ToCopyIndex(), word, lemma, upos, pos, featuresString, govIdx, relnName, additionalDepsString, misc));
        // Row terminator: the platform line separator (what "%n" denoted).
        sb.Append(System.Environment.NewLine);
    }

    sb.Append("\n");
    return sb.ToString();
}