/// <summary>
/// Turns a line of text into an IOB token sequence, labels every token with
/// the configured domain, and passes the sequence through the classifier.
/// </summary>
/// <param name="line">The text to segment.</param>
/// <returns>The token list returned by the classifier.</returns>
private IList<CoreLabel> SegmentStringToIOB(string line)
{
    IList<CoreLabel> iobTokens;
    if (tf != null)
    {
        // A tokenizer factory is configured: tokenize first, then convert.
        IList<CoreLabel> rawTokens = tf.GetTokenizer(new StringReader(line)).Tokenize();
        iobTokens = IOBUtils.StringToIOB(rawTokens, null, false, tf, line);
    }
    else
    {
        // Whitespace tokenization.
        iobTokens = IOBUtils.StringToIOB(line);
    }

    IOBUtils.LabelDomain(iobTokens, domain);
    return classifier.Classify(iobTokens);
}
/// <summary>
/// Converts one input line into a list of IOB tokens.
/// </summary>
/// <remarks>
/// Depending on the enclosing object's settings, the line may begin with a
/// domain label and may consist of word/tag pairs whose word part optionally
/// carries a rewritten form after the rewrite delimiter.
/// </remarks>
/// <param name="in">The raw input line.</param>
/// <returns>The token list produced by <c>IOBUtils.StringToIOB</c>.</returns>
public IList<CoreLabel> Apply(string @in)
{
    var outer = this._enclosing;

    // Work out which domain applies to this line.
    string lineDomain = string.Empty;
    if (!outer.inputHasDomainLabels)
    {
        lineDomain = outer.inputDomain;
    }
    else
    {
        // NOTE(review): "\\s+" reads like a regex pattern; confirm that the
        // Split overloads used in this method treat their separator as one.
        string[] domainAndData = @in.Split("\\s+", 2);
        if (domainAndData.Length >= 2)
        {
            lineDomain = domainAndData[0];
            @in = domainAndData[1];
        }
        else
        {
            ArabicDocumentReaderAndWriter.log.Info("Missing domain label or text: ");
            ArabicDocumentReaderAndWriter.log.Info(@in);
        }
    }

    IList<CoreLabel> tokenList;
    if (!outer.inputHasTags)
    {
        // Untagged input: convert the text directly, or tokenize it first when a factory exists.
        if (outer.tf != null)
        {
            IList<CoreLabel> line = outer.tf.GetTokenizer(new StringReader(@in)).Tokenize();
            tokenList = IOBUtils.StringToIOB(line, outer.segMarker, false);
        }
        else
        {
            tokenList = IOBUtils.StringToIOB(@in, outer.segMarker);
        }
    }
    else
    {
        // Tagged input: each whitespace-separated item is word[rewriteDelimiter rewritten] tagDelimiter tag.
        string[] toks = @in.Split("\\s+");
        IList<CoreLabel> input = new List<CoreLabel>(toks.Length);
        string tagDelim = Pattern.Quote(ArabicDocumentReaderAndWriter.tagDelimiter);
        string rewDelim = Pattern.Quote(ArabicDocumentReaderAndWriter.rewriteDelimiter);

        foreach (string wordTag in toks)
        {
            string[] wordTagPair = wordTag.Split(tagDelim);
            System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
            string[] rewritePair = wordTagPair[0].Split(rewDelim);
            System.Diagnostics.Debug.Assert(rewritePair.Length == 1 || rewritePair.Length == 2);

            // The rewritten form defaults to the raw form when no rewrite is given.
            string raw = rewritePair[0];
            string rewritten = (rewritePair.Length == 2) ? rewritePair[1] : raw;

            CoreLabel label = new CoreLabel();
            if (outer.tf != null)
            {
                IList<CoreLabel> rawLex = outer.tf.GetTokenizer(new StringReader(raw)).Tokenize();
                IList<CoreLabel> rewrittenLex = outer.tf.GetTokenizer(new StringReader(rewritten)).Tokenize();
                if (rewrittenLex.Count != rawLex.Count)
                {
                    System.Console.Error.Printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.GetType().FullName, raw, rewritten);
                    // Fall back to the raw tokens so both lists can be indexed in parallel.
                    rewrittenLex = rawLex;
                }

                if (rawLex.IsEmpty())
                {
                    // The tokenizer produced nothing for this item; skip it.
                    continue;
                }

                if (rawLex.Count == 1)
                {
                    raw = rawLex[0].Value();
                    rewritten = rewrittenLex[0].Value();
                }
                else if (rawLex.Count > 1)
                {
                    string secondWord = rawLex[1].Value();
                    if (secondWord.Equals(outer.segMarker.ToString()))
                    {
                        // Special case for the null marker in the vocalized section
                        raw = rawLex[0].Value() + outer.segMarker;
                        rewritten = rewrittenLex[0].Value() + outer.segMarker;
                    }
                    else
                    {
                        System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", this.GetType().FullName, raw);
                        raw = rawLex[0].Value();
                        rewritten = rewrittenLex[0].Value();
                    }
                }
            }

            label.SetValue(raw);
            label.SetWord(raw);
            label.SetTag(wordTagPair[1]);
            label.Set(typeof(CoreAnnotations.DomainAnnotation), lineDomain);
            label.Set(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation), rewritten);
            input.Add(label);
        }

        tokenList = IOBUtils.StringToIOB(input, outer.segMarker, true, outer.shouldStripRewrites);
    }

    // Tagged input with per-line domain labels already had the domain set on each label above.
    if (!outer.inputHasDomainLabels)
    {
        IOBUtils.LabelDomain(tokenList, outer.inputDomain);
    }
    else if (!outer.inputHasTags)
    {
        IOBUtils.LabelDomain(tokenList, lineDomain);
    }

    return tokenList;
}