/// <summary>
/// Builds a training model from a line-oriented string where every non-empty line
/// holds a word followed by its tag, separated by <c>Const.SpaceC</c>.
/// </summary>
/// <remarks>
/// Words are written to the cleared text, each followed by <c>Const.SpaceC</c>.
/// An empty line contributes a single "\n" and ends the current tag run.
/// Consecutive words whose parsed tag type passes <c>_tagTypeValidator</c> and equals
/// the type of the tag currently being built extend that tag instead of starting a new one.
/// </remarks>
/// <param name="input">Model text, one "word tag" pair per line.</param>
/// <param name="maxTagLength">Maximum tag length. This implementation never reads it.</param>
/// <returns>The corpus with the collected tags and the reconstructed cleared text.</returns>
/// <exception cref="IndexOutOfRangeException">
/// A non-empty line does not contain a second component after splitting.
/// </exception>
public TagsCorpus<TTag> Parse(string input, int maxTagLength = -1)
{
    var corpus = new TagsCorpus<TTag>();
    var text = new StringBuilder();
    var offset = 0;

    // Tag that the next word may extend; null whenever the run has been broken.
    Tag<TTag> openTag = null;

    using (var reader = new StringReader(input))
    {
        for (var row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            if (string.IsNullOrEmpty(row))
            {
                // An empty line takes one character in the cleared text and breaks the run.
                offset++;
                text.Append("\n");
                openTag = null;
                continue;
            }

            var columns = row.Split(Const.SpaceC);
            var token = columns[0];
            var tagType = _tagTypeParser(columns[1]);
            var tokenEnd = offset + token.Length;

            if (!_tagTypeValidator(tagType))
            {
                // Word without an accepted tag: it only interrupts the current run.
                openTag = null;
            }
            else if (openTag != null && openTag.Type.Equals(tagType))
            {
                // Same type as the running tag: stretch it over this word.
                openTag.End = tokenEnd;
            }
            else
            {
                openTag = new Tag<TTag>(offset, tokenEnd, tagType);
                corpus.Tags.Add(openTag);
            }

            // Every word is emitted with a trailing separator, tagged or not.
            offset = tokenEnd + 1;
            text.Append(token).Append(Const.SpaceC);
        }
    }

    corpus.ClearedText = text.ToString();
    return corpus;
}
/// <summary>
/// Parses a model string with inline markup (<c>&lt;tag&gt;text&lt;/tag&gt;</c>) into a training model.
/// </summary>
/// <remarks>
/// The input is scanned character by character with a small state machine.
/// Characters outside of tags are copied to the cleared text; characters that form a
/// tag name are collected separately and handed to <c>_tagTypeParser</c> when the tag
/// is closed by '&gt;'. Everything between the first whitespace after the tag name and
/// the closing '&gt;' is discarded.
/// </remarks>
/// <param name="input">Marked-up model text. Must not be <c>null</c>.</param>
/// <param name="maxTagLength">Maximum tag length; passed unchanged to <c>TryMergeTags</c>.</param>
/// <returns>The corpus with the collected tags and the text with markup removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
/// <exception cref="ModelParsingException{TTag}">
/// The markup is malformed, any other exception occurred while processing a character,
/// or a tag is still pending on the context stack when the input ends.
/// </exception>
public TagsCorpus<TTag> Parse(string input, int maxTagLength = -1)
{
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    var model = new TagsCorpus<TTag>();
    var context = new Stack<TagPart<TTag>>();
    var state = ParsingState.Text;
    var tagBuilder = new StringBuilder(8);
    var builder = new StringBuilder(input.Length);

    // pos is the offset in the cleared text; i is the offset in the raw input.
    var pos = 0;
    var lineNumber = 0;

    for (var i = 0; i < input.Length; i++)
    {
        try
        {
            var c = input[i];
            switch (c)
            {
                case '<':
                    switch (state)
                    {
                        case ParsingState.Text:
                            state = ParsingState.OpenTag;
                            context.Push(new TagPart<TTag>
                            {
                                Start = pos,
                                State = ParsingState.OpenTag,
                                ModelStart = i,
                                Line = lineNumber
                            });
                            break;
                        case ParsingState.CloseTag:
                            throw new ModelParsingException<TTag>("Incorrect markup", lineNumber, pos, i, builder, context);
                        default:
                            throw new ModelParsingException<TTag>("Incorrect state={0}".FormatWith(state), lineNumber, pos, i, builder, context);
                    }
                    break;

                case '/':
                    switch (state)
                    {
                        case ParsingState.OpenTag:
                            // "</" : the tag just pushed is a closing tag.
                            state = ParsingState.CloseTag;
                            context.Peek().State = ParsingState.CloseTag;
                            break;
                        case ParsingState.InsideTagAfterTagName:
                            // Still inside the tag: drop it like every other character in this
                            // state, so it neither leaks into the cleared text nor shifts pos.
                            break;
                        default:
                            builder.Append(c);
                            pos++;
                            break;
                    }
                    break;

                case '>':
                    switch (state)
                    {
                        case ParsingState.OpenTag:
                        case ParsingState.CloseTag:
                        case ParsingState.InsideTagAfterTagName:
                            var last = context.Peek();
                            last.Type = _tagTypeParser(tagBuilder);
                            tagBuilder.Clear();
                            last.End = pos;
                            state = ParsingState.Text;
                            // NOTE(review): presumably pairs open/close parts and adds finished tags
                            // to model.Tags; TryMergeTags is defined outside this block - confirm.
                            TryMergeTags(context, model.Tags, maxTagLength);
                            break;
                        default:
                            builder.Append(c);
                            pos++;
                            break;
                    }
                    break;

                default:
                    switch (state)
                    {
                        case ParsingState.OpenTag:
                        case ParsingState.CloseTag:
                            if (char.IsWhiteSpace(c))
                            {
                                // The tag name ends at the first whitespace.
                                state = ParsingState.InsideTagAfterTagName;
                            }
                            else
                            {
                                tagBuilder.Append(c);
                            }
                            break;
                        case ParsingState.InsideTagAfterTagName:
                            // Content after the tag name is ignored.
                            break;
                        default:
                            if (c == '\n')
                            {
                                lineNumber++;
                            }
                            builder.Append(c);
                            pos++;
                            break;
                    }
                    break;
            }
        }
        catch (Exception exc)
        {
            // Attaches the current position to any failure. This also catches and wraps the
            // ModelParsingException instances thrown above, which then appear as the inner exception.
            throw new ModelParsingException<TTag>(lineNumber, pos, i, builder, context, exc);
        }
    }

    if (context.Count > 0)
    {
        // A tag part is still pending at the end of the input.
        var last = context.Pop();
        throw new ModelParsingException<TTag>(last, GetText(builder, last.Start));
    }

    model.ClearedText = builder.ToString();
    return model;
}