Пример #1
0
            /// <summary>
            /// Converts one line of a CoNLL-U file into an <c>IndexedWord</c>.
            /// </summary>
            /// <remarks>
            /// A line starting with "#" is returned as a comment word: the whole line
            /// becomes the word and its tag is set to <c>CommentPos</c>. Any other line
            /// is split on runs of whitespace; column 0 is the token ID and decides
            /// how the remaining columns are read:
            /// <list type="bullet">
            /// <item>"start-end" (contains "-"): only the text, the token span and the start index are stored;</item>
            /// <item>"index.copy" (contains "."): index, copy count, value, features and extra dependencies are stored;</item>
            /// <item>anything else: all ten columns are stored.</item>
            /// </list>
            /// </remarks>
            /// <param name="line">A single line of CoNLL-U input.</param>
            /// <returns>The word populated from <paramref name="line"/>.</returns>
            public virtual IndexedWord Apply(string line)
            {
                IndexedWord word = new IndexedWord();

                if (line.StartsWith("#", System.StringComparison.Ordinal))
                {
                    word.SetWord(line);
                    word.SetTag(CommentPos);
                    return(word);
                }

                /* string.Split treats its argument literally, so a regular expression
                 * split is required to break the line on runs of whitespace. */
                string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
                System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
                string id = bits[0];

                word.Set(typeof(CoreAnnotations.TextAnnotation), bits[1]);

                /* Check if it is a multiword token. */
                if (id.Contains("-"))
                {
                    string[] span  = id.Split('-');
                    int      start = System.Convert.ToInt32(span[0], invariant);
                    int      end   = System.Convert.ToInt32(span[1], invariant);
                    word.Set(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation), new IntPair(start, end));
                    word.Set(typeof(CoreAnnotations.IndexAnnotation), start);
                    return(word);
                }

                if (id.Contains("."))
                {
                    /* ID of the form "index.copyCount": split on the literal dot. */
                    string[] indexParts = id.Split('.');
                    int      index      = System.Convert.ToInt32(indexParts[0], invariant);
                    int      copyCount  = System.Convert.ToInt32(indexParts[1], invariant);
                    word.Set(typeof(CoreAnnotations.IndexAnnotation), index);
                    word.SetIndex(index);
                    word.SetCopyCount(copyCount);
                }
                else
                {
                    int index = System.Convert.ToInt32(id, invariant);
                    word.Set(typeof(CoreAnnotations.IndexAnnotation), index);
                    word.Set(typeof(CoreAnnotations.LemmaAnnotation), bits[2]);
                    word.Set(typeof(CoreAnnotations.CoarseTagAnnotation), bits[3]);
                    word.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), bits[4]);
                    word.Set(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation), System.Convert.ToInt32(bits[6], invariant));
                    word.Set(typeof(CoreAnnotations.CoNLLDepTypeAnnotation), bits[7]);
                    word.Set(typeof(CoreAnnotations.CoNLLUMisc), bits[9]);
                    word.SetIndex(index);
                }

                /* Shared tail of both non-multiword cases. */
                word.SetValue(bits[1]);
                /* Parse features. */
                Dictionary <string, string> features = CoNLLUUtils.ParseFeatures(bits[5]);
                word.Set(typeof(CoreAnnotations.CoNLLUFeats), features);
                /* Parse extra dependencies. */
                Dictionary <string, string> extraDeps = CoNLLUUtils.ParseExtraDeps(bits[8]);
                word.Set(typeof(CoreAnnotations.CoNLLUSecondaryDepsAnnotation), extraDeps);
                return(word);
            }
Пример #2
0
 /// <summary>
 /// Rebuilds <c>posFeatureMap</c> and <c>wordPosFeatureMap</c> from the text
 /// obtained through <c>IOUtils.ReaderFromString(FeatureMapFile)</c>.
 /// </summary>
 /// <remarks>
 /// Each line is split on runs of whitespace and lines with fewer than three
 /// columns are skipped. When the first column is "*", the parsed features of
 /// the third column are stored in <c>posFeatureMap</c> under the second
 /// column; otherwise they are stored in <c>wordPosFeatureMap</c> under
 /// "first_second".
 /// </remarks>
 /// <exception cref="System.IO.IOException"/>
 private void LoadFeatureMap()
 {
     using (Reader r = IOUtils.ReaderFromString(FeatureMapFile))
     {
         BufferedReader br = new BufferedReader(r);
         posFeatureMap     = new Dictionary <string, Dictionary <string, string> >();
         wordPosFeatureMap = new Dictionary <string, Dictionary <string, string> >();
         string line;
         while ((line = br.ReadLine()) != null)
         {
             /* string.Split would look for the literal text "\s+"; a regular
              * expression split is needed to separate the columns. */
             string[] parts = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
             if (parts.Length < 3)
             {
                 continue;
             }

             Dictionary <string, string> features = CoNLLUUtils.ParseFeatures(parts[2]);
             if (parts[0].Equals("*"))
             {
                 /* Wildcard word: the entry is keyed by the second column only. */
                 posFeatureMap[parts[1]] = features;
             }
             else
             {
                 wordPosFeatureMap[parts[0] + '_' + parts[1]] = features;
             }
         }
     }
 }
Пример #3
0
        /// <summary>
        /// Outputs a partial CONLL-U file with token information (form, lemma, POS)
        /// but without any dependency information.
        /// </summary>
        /// <remarks>
        /// One tab-separated, ten-column row is written per token of the sentence's
        /// <c>TokensAnnotation</c>; the three dependency columns are always "_".
        /// Missing lemma, tag and misc values are printed as "_". A single "\n"
        /// is appended after the last row.
        /// </remarks>
        /// <param name="sentence">The sentence whose tokens are printed.</param>
        /// <returns>The rows for all tokens followed by a "\n".</returns>
        public virtual string PrintPOSAnnotations(ICoreMap sentence)
        {
            StringBuilder sb = new StringBuilder();

            foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                string upos           = token.GetString <CoreAnnotations.CoarseTagAnnotation>("_");
                string lemma          = token.GetString <CoreAnnotations.LemmaAnnotation>("_");
                string pos            = token.GetString <CoreAnnotations.PartOfSpeechAnnotation>("_");
                string featuresString = CoNLLUUtils.ToFeatureString(token.Get(typeof(CoreAnnotations.CoNLLUFeats)));
                string misc           = token.GetString <CoreAnnotations.CoNLLUMisc>("_");

                /* .NET composite formatting uses {n} placeholders; Java-style "%s"
                 * specifiers would be emitted verbatim. AppendLine() supplies the
                 * platform line terminator that "%n" stood for. */
                sb.AppendFormat(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}",
                    token.Index(), token.Word(), lemma, upos, pos, featuresString, "_", "_", "_", misc)
                  .AppendLine();
            }
            sb.Append("\n");
            return(sb.ToString());
        }
Пример #4
0
        /// <summary>
        /// Renders a semantic graph as tab-separated, ten-column CoNLL-U rows.
        /// </summary>
        /// <remarks>
        /// The graph's comments are written first, one per line. Vertices are then
        /// visited in the order returned by <c>VertexListSorted()</c>. A vertex that
        /// carries a token span whose source equals its own index is preceded by a
        /// "start-end" multiword row. The first non-extra incoming edge supplies the
        /// governor and relation columns; all incoming edges are collected for the
        /// extra-dependencies column, which is "_" when the graph is a tree. A vertex
        /// without such a governor is printed as a root (governor "0") if it is among
        /// the graph's roots, and with "_" for governor and relation otherwise.
        /// </remarks>
        /// <param name="sg">The graph to print.</param>
        /// <param name="unescapeParenthesis">
        /// When true, matches of <c>LrbPattern</c> and <c>RrbPattern</c> in the word
        /// and lemma are replaced by "(" and ")".
        /// </param>
        /// <returns>The comments and token rows followed by a "\n".</returns>
        public virtual string PrintSemanticGraph(SemanticGraph sg, bool unescapeParenthesis)
        {
            bool          isTree = SemanticGraphUtils.IsTree(sg);
            StringBuilder sb     = new StringBuilder();

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            /* Print comments. */
            foreach (string comment in sg.GetComments())
            {
                sb.Append(comment).Append("\n");
            }
            foreach (IndexedWord token in sg.VertexListSorted())
            {
                /* Check for multiword tokens. */
                if (token.ContainsKey(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation)))
                {
                    IntPair tokenSpan = token.Get(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation));
                    if (tokenSpan.GetSource() == token.Index())
                    {
                        /* .NET composite formatting uses {n} placeholders; the
                         * Java-style "%d"/"%s" specifiers would be printed verbatim. */
                        sb.AppendFormat(
                            invariant,
                            "{0}-{1}\t{2}\t_\t_\t_\t_\t_\t_\t_\t_",
                            tokenSpan.GetSource(), tokenSpan.GetTarget(), token.OriginalText())
                          .AppendLine();
                    }
                }
                /* Try to find main governor and additional dependencies. */
                string govIdx = null;
                GrammaticalRelation         reln = null;
                Dictionary <string, string> enhancedDependencies = new Dictionary <string, string>();
                foreach (IndexedWord parent in sg.GetParents(token))
                {
                    SemanticGraphEdge edge = sg.GetEdge(parent, token);
                    if (govIdx == null && !edge.IsExtra())
                    {
                        govIdx = parent.ToCopyIndex();
                        reln   = edge.GetRelation();
                    }
                    enhancedDependencies[parent.ToCopyIndex()] = edge.GetRelation().ToString();
                }
                string additionalDepsString = isTree ? "_" : CoNLLUUtils.ToExtraDepsString(enhancedDependencies);
                string word           = token.Word();
                string featuresString = CoNLLUUtils.ToFeatureString(token.Get(typeof(CoreAnnotations.CoNLLUFeats)));
                string pos            = token.GetString <CoreAnnotations.PartOfSpeechAnnotation>("_");
                string upos           = token.GetString <CoreAnnotations.CoarseTagAnnotation>("_");
                string misc           = token.GetString <CoreAnnotations.CoNLLUMisc>("_");
                string lemma          = token.GetString <CoreAnnotations.LemmaAnnotation>("_");
                string relnName       = reln == null ? "_" : reln.ToString();
                /* Root. */
                if (govIdx == null && sg.GetRoots().Contains(token))
                {
                    govIdx               = "0";
                    relnName             = GrammaticalRelation.Root.ToString();
                    additionalDepsString = isTree ? "_" : "0:" + relnName;
                }
                else if (govIdx == null)
                {
                    /* Neither governed nor a root: leave both columns unspecified. */
                    govIdx   = "_";
                    relnName = "_";
                }
                if (unescapeParenthesis)
                {
                    /* NOTE(review): ReplaceAll and the pattern fields are defined
                     * outside this method; left unchanged. */
                    word  = word.ReplaceAll(LrbPattern, "(");
                    word  = word.ReplaceAll(RrbPattern, ")");
                    lemma = lemma.ReplaceAll(LrbPattern, "(");
                    lemma = lemma.ReplaceAll(RrbPattern, ")");
                }
                /* AppendLine() supplies the platform line terminator that "%n" stood for. */
                sb.AppendFormat(
                    invariant,
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}",
                    token.ToCopyIndex(), word, lemma, upos, pos, featuresString, govIdx, relnName, additionalDepsString, misc)
                  .AppendLine();
            }
            sb.Append("\n");
            return(sb.ToString());
        }