예제 #1
0
        /// <summary>
        /// Builds the lemma for every token by handing the invariant lower-cased token and its
        /// predicted lemma class to <c>LemmatizerUtils.DecodeShortestEditScript</c>.
        /// </summary>
        /// <param name="tokens">The tokens whose lemmas should be decoded.</param>
        /// <param name="preds">The predicted lemma classes; must contain one entry per token.</param>
        /// <returns>
        /// An array with one lemma per token. Positions for which decoding produced <c>null</c> or an
        /// empty string hold the placeholder <c>"_"</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="tokens"/> or <paramref name="preds"/> is <c>null</c>.
        /// </exception>
        /// <exception cref="ArgumentException">The two arrays differ in length.</exception>
        public string[] DecodeLemmas(string[] tokens, string[] preds)
        {
            if (tokens == null)
            {
                throw new ArgumentNullException(nameof(tokens));
            }

            if (preds == null)
            {
                throw new ArgumentNullException(nameof(preds));
            }

            var count = tokens.Length;
            if (count != preds.Length)
            {
                throw new ArgumentException("The arguments must have the same length.");
            }

            // The result size is known up front, so fill a fixed-size array directly.
            var decoded = new string[count];

            for (var index = 0; index < count; index++)
            {
                var lowered   = tokens[index].ToLowerInvariant();
                var candidate = LemmatizerUtils.DecodeShortestEditScript(lowered, preds[index]);

                // An empty decoding result is replaced by the "_" placeholder.
                decoded[index] = string.IsNullOrEmpty(candidate) ? "_" : candidate;
            }

            return decoded;
        }
예제 #2
0
        /// <summary>
        /// Returns the next lemma sample object. Calling this method repeatedly until it returns <c>null</c> will return each
        /// object from the underlying source exactly once.
        /// </summary>
        /// <remarks>
        /// Lines are pulled from <c>Samples</c>. Each line is split on tab characters and must yield exactly
        /// three parts; the first is stored as the token, the second as the tag, and the first and third are
        /// passed to <c>LemmatizerUtils.GetShortestEditScript</c> to obtain the stored lemma class. Lines with
        /// a different number of parts are skipped. An empty line terminates the current sample; empty lines
        /// encountered before any token has been collected are skipped, so <c>null</c> is only returned once
        /// the underlying stream itself returns <c>null</c>.
        /// </remarks>
        /// <returns>The next lemma sample or <c>null</c> to signal that the stream is exhausted.</returns>
        public override LemmaSample Read()
        {
            var tokens = new List <string>();
            var tags   = new List <string>();
            var lemmas = new List <string>();

            string line;
            while ((line = Samples.Read()) != null)
            {
                if (line.Length == 0)
                {
                    if (tokens.Count > 0)
                    {
                        break; // blank line closes the sample collected so far
                    }

                    // Nothing collected yet (leading/repeated separators or a block of only
                    // corrupt lines): keep reading instead of reporting a false end of stream.
                    continue;
                }

                var parts = line.Split('\t');
                if (parts.Length != 3)
                {
                    continue; // skip corrupt line
                }

                tokens.Add(parts[0]);
                tags.Add(parts[1]);
                lemmas.Add(LemmatizerUtils.GetShortestEditScript(parts[0], parts[2]));
            }

            // Reaching this point without tokens means the underlying stream returned null.
            return(tokens.Count > 0 ? new LemmaSample(tokens.ToArray(), tags.ToArray(), lemmas.ToArray()) : null);
        }