/// <summary>
/// Turns each token and its predicted lemma class into a lemma by passing the
/// lower-cased (invariant culture) token and the class to
/// <c>LemmatizerUtils.DecodeShortestEditScript</c>.
/// </summary>
/// <param name="tokens">The tokens to decode lemmas for.</param>
/// <param name="preds">The predicted lemma classes, one per token.</param>
/// <returns>
/// An array with one lemma per token, in token order. A null or empty decoding
/// result is replaced by <c>"_"</c>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="tokens"/> or <paramref name="preds"/> is <c>null</c>.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="tokens"/> and <paramref name="preds"/> differ in length.
/// </exception>
public string[] DecodeLemmas(string[] tokens, string[] preds)
{
    if (tokens == null)
    {
        throw new ArgumentNullException(nameof(tokens));
    }

    if (preds == null)
    {
        throw new ArgumentNullException(nameof(preds));
    }

    if (tokens.Length != preds.Length)
    {
        throw new ArgumentException("The arguments must have the same length.");
    }

    // The result size is known up front, so fill a fixed-size array directly.
    var decoded = new string[tokens.Length];

    for (var index = 0; index < decoded.Length; index++)
    {
        var candidate = LemmatizerUtils.DecodeShortestEditScript(
            tokens[index].ToLowerInvariant(),
            preds[index]);

        // "_" stands in for a lemma that could not be decoded.
        decoded[index] = string.IsNullOrEmpty(candidate) ? "_" : candidate;
    }

    return decoded;
}
/// <summary>
/// Returns the next lemma sample object. Calling this method repeatedly until it returns
/// <c>null</c> will return each object from the underlying source exactly once.
/// </summary>
/// <remarks>
/// A sample is built from consecutive tab-separated lines of exactly three fields
/// (token, tag, lemma) and is terminated by an empty line or by the end of the source.
/// Lines that do not have exactly three fields are skipped. Empty lines that occur before
/// any valid line has been collected are skipped as well, so repeated blank lines or a
/// sentence consisting only of corrupt lines do not cause a premature <c>null</c>.
/// </remarks>
/// <returns>The next lemma sample or <c>null</c> to signal that the stream is exhausted.</returns>
public override LemmaSample Read()
{
    var tokens = new List<string>();
    var tags = new List<string>();
    var lemmas = new List<string>();

    // Only a null line means the underlying source is exhausted.
    for (var line = Samples.Read(); line != null; line = Samples.Read())
    {
        if (line.Length == 0)
        {
            if (tokens.Count > 0)
            {
                break; // blank line closes the current sample
            }

            // Nothing collected yet: returning null here would wrongly signal
            // end of stream while the source still has data, so keep reading.
            continue;
        }

        var parts = line.Split('\t');
        if (parts.Length != 3)
        {
            continue; // skip corrupt line
        }

        tokens.Add(parts[0]);
        tags.Add(parts[1]);

        // The stored lemma entry is the edit script computed from the token and its lemma.
        lemmas.Add(LemmatizerUtils.GetShortestEditScript(parts[0], parts[2]));
    }

    return tokens.Count > 0
        ? new LemmaSample(tokens.ToArray(), tags.ToArray(), lemmas.ToArray())
        : null;
}