示例#1
0
        /// <summary>
        /// Builds a tags corpus from line-oriented input in which every non-empty line holds a word
        /// followed by its tag label, the two separated by <c>Const.SpaceC</c>.
        /// </summary>
        /// <remarks>
        /// Consecutive words whose parsed tag type passes <c>_tagTypeValidator</c> and equals the type of
        /// the tag currently being built are folded into that tag. An empty line, or a word whose type
        /// the validator rejects, ends the current tag. Tag offsets follow the length of the text
        /// appended so far (assuming <c>Const.SpaceC</c> is a single character): every word is written
        /// followed by <c>Const.SpaceC</c>, and every empty input line is written as "\n".
        /// </remarks>
        /// <param name="input">Text to parse, one word/label pair per line.</param>
        /// <param name="maxTagLength">Maximum tag length; this implementation does not read it.</param>
        /// <returns>Corpus carrying the collected tags and the rebuilt text in <c>ClearedText</c>.</returns>
        public TagsCorpus <TTag> Parse(string input, int maxTagLength = -1)
        {
            var corpus = new TagsCorpus<TTag>();
            var text   = new StringBuilder();
            var offset = 0;

            // Tag that the next word may extend; null whenever the run has been broken.
            Tag<TTag> openTag = null;

            using (var reader = new StringReader(input))
            {
                for (var line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    if (string.IsNullOrEmpty(line))
                    {
                        // Blank line: emit a newline and make sure no tag spans across it.
                        text.Append("\n");
                        offset++;
                        openTag = null;
                        continue;
                    }

                    var columns  = line.Split(Const.SpaceC);
                    var token    = columns[0];
                    var tagType  = _tagTypeParser(columns[1]);
                    var tokenEnd = offset + token.Length;

                    if (!_tagTypeValidator(tagType))
                    {
                        // Untagged word: it only contributes text and breaks the current run.
                        openTag = null;
                    }
                    else if (openTag != null && openTag.Type.Equals(tagType))
                    {
                        // Same type as the running tag: stretch it over this word.
                        openTag.End = tokenEnd;
                    }
                    else
                    {
                        openTag = new Tag<TTag>(offset, tokenEnd, tagType);
                        corpus.Tags.Add(openTag);
                    }

                    // Every word, tagged or not, is written out followed by the separator.
                    text.Append(token).Append(Const.SpaceC);
                    offset = tokenEnd + 1;
                }
            }

            corpus.ClearedText = text.ToString();
            return corpus;
        }
示例#2
0
        /// <summary>
        /// Parses text with inline <c>&lt;tag&gt;</c> / <c>&lt;/tag&gt;</c> markup into a tags corpus.
        /// Markup is stripped; all remaining characters are copied to <c>ClearedText</c>.
        /// </summary>
        /// <remarks>
        /// The input is scanned one character at a time by a small state machine:
        /// '&lt;' in plain text pushes a new tag part, a '/' seen while in the open-tag state turns that
        /// part into a closing one, characters up to the first whitespace form the tag name, everything
        /// after that whitespace is skipped, and '&gt;' completes the part and passes the stack to
        /// <c>TryMergeTags</c> together with <c>model.Tags</c>.
        /// Any exception raised while handling a character (including this parser's own
        /// <c>ModelParsingException</c>) is rewrapped with the current line and positions.
        /// </remarks>
        /// <param name="input">Marked-up text to parse.</param>
        /// <param name="maxTagLength">Maximum tag length; passed through to <c>TryMergeTags</c>.</param>
        /// <returns>Corpus with whatever <c>TryMergeTags</c> added to <c>Tags</c> and the markup-free text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        /// <exception cref="ModelParsingException{TTag}">
        /// The markup is malformed, or tag parts are still on the stack when the input ends.
        /// </exception>
        public TagsCorpus <TTag> Parse(string input, int maxTagLength = -1)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            var model      = new TagsCorpus <TTag>();
            var context    = new Stack <TagPart <TTag> >();   // tag parts not yet consumed by TryMergeTags
            var state      = ParsingState.Text;
            var tagBuilder = new StringBuilder(8);            // name of the tag currently being read
            var builder    = new StringBuilder(input.Length); // cleared (markup-free) text
            var pos        = 0;                               // index into the cleared text
            var lineNumber = 0;                               // counts '\n' characters seen in plain text

            for (var i = 0; i < input.Length; i++)
            {
                try
                {
                    var c = input[i];
                    switch (c)
                    {
                    case '<':
                        switch (state)
                        {
                        case ParsingState.Text:
                            state = ParsingState.OpenTag;
                            context.Push(new TagPart <TTag> {
                                Start = pos, State = ParsingState.OpenTag, ModelStart = i, Line = lineNumber
                            });
                            break;

                        case ParsingState.CloseTag:
                            throw new ModelParsingException <TTag>("Incorrect markup", lineNumber, pos, i, builder, context);

                        default:
                            throw new ModelParsingException <TTag>("Incorrect state={0}".FormatWith(state), lineNumber, pos, i, builder, context);
                        }
                        break;

                    case '/':
                        switch (state)
                        {
                        case ParsingState.OpenTag:
                            // "</" or "<name/": the part on top of the stack becomes a closing tag.
                            state = ParsingState.CloseTag;
                            context.Peek().State = ParsingState.CloseTag;
                            break;

                        case ParsingState.InsideTagAfterTagName:
                            // Still inside the tag: skip it like every other character after the tag
                            // name, so that e.g. <a href="x/y"> does not leak '/' into the cleared text.
                            break;

                        default:
                            builder.Append(c);
                            pos++;
                            break;
                        }
                        break;

                    case '>':
                        switch (state)
                        {
                        case ParsingState.OpenTag:
                        case ParsingState.CloseTag:
                        case ParsingState.InsideTagAfterTagName:
                            // End of a tag: finalize the part on top of the stack and try to merge.
                            var last = context.Peek();
                            last.Type = _tagTypeParser(tagBuilder);
                            tagBuilder.Clear();
                            last.End = pos;
                            state    = ParsingState.Text;
                            TryMergeTags(context, model.Tags, maxTagLength);
                            break;

                        default:
                            // A '>' in plain text is ordinary content.
                            builder.Append(c);
                            pos++;
                            break;
                        }
                        break;

                    default:
                        switch (state)
                        {
                        case ParsingState.OpenTag:
                        case ParsingState.CloseTag:
                            if (char.IsWhiteSpace(c))
                            {
                                // First whitespace ends the tag name; the rest of the tag is ignored.
                                state = ParsingState.InsideTagAfterTagName;
                            }
                            else
                            {
                                tagBuilder.Append(c);
                            }
                            break;

                        case ParsingState.InsideTagAfterTagName:
                            break;

                        default:
                            if (c == '\n')
                            {
                                lineNumber++;
                            }
                            builder.Append(c);
                            pos++;
                            break;
                        }
                        break;
                    }
                }
                catch (Exception exc)
                {
                    // Attach the current position to whatever went wrong while handling this character.
                    throw new ModelParsingException <TTag>(lineNumber, pos, i, builder, context, exc);
                }
            }
            if (context.Count > 0)
            {
                // Input ended while tag parts were still pending on the stack.
                var last = context.Pop();
                throw new ModelParsingException <TTag>(last, GetText(builder, last.Start));
            }
            model.ClearedText = builder.ToString();
            return(model);
        }