Пример #1
0
        /// <summary>
        /// Turns a raw line into IOB-style <c>CoreLabel</c> tokens, stamps them with
        /// the configured <c>domain</c>, and returns the classifier's output for them.
        /// </summary>
        /// <param name="line">The text to convert and classify.</param>
        /// <returns>The list returned by <c>classifier.Classify</c>.</returns>
        private IList <CoreLabel> SegmentStringToIOB(string line)
        {
            IList <CoreLabel> labeled;

            if (tf != null)
            {
                // A tokenizer factory is configured: tokenize first, then convert the tokens.
                IList <CoreLabel> pretokenized = tf.GetTokenizer(new StringReader(line)).Tokenize();
                labeled = IOBUtils.StringToIOB(pretokenized, null, false, tf, line);
            }
            else
            {
                // No tokenizer factory: fall back to whitespace tokenization.
                labeled = IOBUtils.StringToIOB(line);
            }

            IOBUtils.LabelDomain(labeled, domain);
            return classifier.Classify(labeled);
        }
Пример #2
0
            /// <summary>
            /// Converts one input line into a list of IOB-style <c>CoreLabel</c> tokens.
            /// </summary>
            /// <remarks>
            /// Processing depends on three settings of the enclosing object:
            /// <list type="bullet">
            /// <item><description><c>inputHasDomainLabels</c>: the first whitespace-delimited field of
            /// the line is taken as the domain and the remainder as the text; otherwise
            /// <c>inputDomain</c> is used.</description></item>
            /// <item><description><c>inputHasTags</c>: every whitespace-delimited token is split on
            /// <c>tagDelimiter</c> into a word and a tag, and the word is optionally split on
            /// <c>rewriteDelimiter</c> into a raw and a rewritten form.</description></item>
            /// <item><description><c>tf</c>: when non-null, its tokenizer is applied to the text
            /// (or to each raw/rewritten word when tags are present).</description></item>
            /// </list>
            /// All splitting is done with real regular expressions. Calling
            /// <c>string.Split(string, ...)</c> with a pattern such as <c>\s+</c> would treat the
            /// pattern as literal separator text on current .NET runtimes.
            /// </remarks>
            /// <param name="in">The input line.</param>
            /// <returns>The IOB token list produced by <c>IOBUtils.StringToIOB</c>.</returns>
            public IList <CoreLabel> Apply(string @in)
            {
                IList <CoreLabel> tokenList;
                string            lineDomain = string.Empty;

                if (this._enclosing.inputHasDomainLabels)
                {
                    // At most two pieces: the domain label and the rest of the line.
                    string[] domainAndData = SplitJavaStyle(@in, "\\s+", 2);
                    if (domainAndData.Length < 2)
                    {
                        // Best effort: report the line and carry on with an empty domain.
                        ArabicDocumentReaderAndWriter.log.Info("Missing domain label or text: ");
                        ArabicDocumentReaderAndWriter.log.Info(@in);
                    }
                    else
                    {
                        lineDomain = domainAndData[0];
                        @in        = domainAndData[1];
                    }
                }
                else
                {
                    lineDomain = this._enclosing.inputDomain;
                }

                if (this._enclosing.inputHasTags)
                {
                    string[]          toks  = SplitJavaStyle(@in, "\\s+", 0);
                    IList <CoreLabel> input = new List <CoreLabel>(toks.Length);
                    // The delimiters are literal text, so escape them before using them as patterns.
                    string tagDelim = System.Text.RegularExpressions.Regex.Escape(ArabicDocumentReaderAndWriter.tagDelimiter);
                    string rewDelim = System.Text.RegularExpressions.Regex.Escape(ArabicDocumentReaderAndWriter.rewriteDelimiter);
                    foreach (string wordTag in toks)
                    {
                        if (wordTag.Length == 0)
                        {
                            // Leading whitespace (or an empty line) yields an empty piece; there is
                            // no word/tag pair to read from it.
                            continue;
                        }
                        string[] wordTagPair = SplitJavaStyle(wordTag, tagDelim, 0);
                        System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
                        string[] rewritePair = SplitJavaStyle(wordTagPair[0], rewDelim, 0);
                        System.Diagnostics.Debug.Assert(rewritePair.Length == 1 || rewritePair.Length == 2);
                        string raw       = rewritePair[0];
                        string rewritten = rewritePair.Length == 2 ? rewritePair[1] : raw;

                        if (this._enclosing.tf != null)
                        {
                            // NOTE(review): Printf and IsEmpty below are presumably project-provided
                            // extension methods; they are kept exactly as in the original code.
                            IList <CoreLabel> lexListRaw       = this._enclosing.tf.GetTokenizer(new StringReader(raw)).Tokenize();
                            IList <CoreLabel> lexListRewritten = this._enclosing.tf.GetTokenizer(new StringReader(rewritten)).Tokenize();
                            if (lexListRewritten.Count != lexListRaw.Count)
                            {
                                System.Console.Error.Printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.GetType().FullName, raw, rewritten);
                                // Fall back to the raw tokens so both lists can be indexed in step.
                                lexListRewritten = lexListRaw;
                            }
                            if (lexListRaw.IsEmpty())
                            {
                                // The tokenizer produced nothing for this word; drop it.
                                continue;
                            }
                            else if (lexListRaw.Count == 1)
                            {
                                raw       = lexListRaw[0].Value();
                                rewritten = lexListRewritten[0].Value();
                            }
                            else if (lexListRaw.Count > 1)
                            {
                                string secondWord = lexListRaw[1].Value();
                                if (secondWord.Equals(this._enclosing.segMarker.ToString()))
                                {
                                    // Special case for the null marker in the vocalized section
                                    raw       = lexListRaw[0].Value() + this._enclosing.segMarker;
                                    rewritten = lexListRewritten[0].Value() + this._enclosing.segMarker;
                                }
                                else
                                {
                                    // Only the first segment is kept; the rest are reported and dropped.
                                    System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", this.GetType().FullName, raw);
                                    raw       = lexListRaw[0].Value();
                                    rewritten = lexListRewritten[0].Value();
                                }
                            }
                        }

                        CoreLabel cl = new CoreLabel();
                        cl.SetValue(raw);
                        cl.SetWord(raw);
                        cl.SetTag(wordTagPair[1]);
                        cl.Set(typeof(CoreAnnotations.DomainAnnotation), lineDomain);
                        cl.Set(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation), rewritten);
                        input.Add(cl);
                    }
                    tokenList = IOBUtils.StringToIOB(input, this._enclosing.segMarker, true, this._enclosing.shouldStripRewrites);
                }
                else if (this._enclosing.tf == null)
                {
                    tokenList = IOBUtils.StringToIOB(@in, this._enclosing.segMarker);
                }
                else
                {
                    IList <CoreLabel> line = this._enclosing.tf.GetTokenizer(new StringReader(@in)).Tokenize();
                    tokenList = IOBUtils.StringToIOB(line, this._enclosing.segMarker, false);
                }

                // The list is labelled unless the input had both domain labels and tags. When
                // there are no domain labels, lineDomain was already set to inputDomain above,
                // so one call covers both labelling cases.
                if (!this._enclosing.inputHasDomainLabels || !this._enclosing.inputHasTags)
                {
                    IOBUtils.LabelDomain(tokenList, lineDomain);
                }
                return tokenList;
            }

            /// <summary>
            /// Splits <paramref name="input"/> around matches of the regular expression
            /// <paramref name="pattern"/>, following the conventions of Java's
            /// <c>String.split(regex, limit)</c> for a limit of zero or greater.
            /// </summary>
            /// <remarks>
            /// Intended for patterns without capture groups, because <c>Regex.Split</c> adds
            /// captured text to its result. Note that <c>\s</c> in .NET also matches Unicode
            /// separator characters.
            /// </remarks>
            /// <param name="input">The string to split.</param>
            /// <param name="pattern">A .NET regular expression.</param>
            /// <param name="limit">
            /// A positive value caps the number of pieces and keeps trailing empty pieces;
            /// zero means no cap, with trailing empty pieces removed.
            /// </param>
            /// <returns>The pieces of <paramref name="input"/>; the whole input when nothing matches.</returns>
            private static string[] SplitJavaStyle(string input, string pattern, int limit)
            {
                System.Text.RegularExpressions.Regex splitter = new System.Text.RegularExpressions.Regex(pattern);
                if (limit > 0)
                {
                    return splitter.Split(input, limit);
                }

                string[] pieces = splitter.Split(input);
                if (pieces.Length == 1)
                {
                    // No match: the input itself is the only piece, even when it is empty.
                    return pieces;
                }

                // Drop trailing empty pieces; an all-separator input ends up with no pieces.
                int keep = pieces.Length;
                while (keep > 0 && pieces[keep - 1].Length == 0)
                {
                    keep--;
                }
                if (keep == pieces.Length)
                {
                    return pieces;
                }
                string[] trimmed = new string[keep];
                System.Array.Copy(pieces, trimmed, keep);
                return trimmed;
            }