/// <summary>
/// Feeds a chunk of raw bytes to the parser. The chunk is decoded as UTF-8 and
/// appended to the buffered text; every complete line (text up to the next
/// <c>Delimiter</c>) is then handed to <c>ParseLine</c> in order.
/// </summary>
/// <remarks>
/// When <c>ParseLine</c> reports completion, <c>ParserComplete</c> is raised and
/// scanning stops; the completing line is left in the buffer. Once parsing is
/// complete, further calls only write a debug message. Any exception thrown
/// while processing is forwarded to <c>ParserError</c> instead of propagating.
/// NOTE(review): each chunk is decoded independently, so a multi-byte UTF-8
/// sequence split across two calls would not decode as one character — confirm
/// whether callers can split mid-character.
/// </remarks>
/// <param name="data">The next chunk of bytes received.</param>
public void Parse(byte[] data)
{
    try
    {
        if (_isParsingComplete)
        {
            Debug.WriteLine("Parsing already complete !");
            return;
        }

        _remainingData += Encoding.UTF8.GetString(data);

        // Re-scan from the start of the buffer after each consumed line.
        for (var lineEnd = _remainingData.IndexOf(Delimiter, StringComparison.Ordinal);
             lineEnd != -1;
             lineEnd = _remainingData.IndexOf(Delimiter, StringComparison.Ordinal))
        {
            _isParsingComplete = ParseLine(_remainingData.Substring(0, lineEnd));

            if (_isParsingComplete)
            {
                ParserComplete?.Invoke(this, new ParserCompleteEventArgs<AmcpParsedData>(_amcpParsedData));
                break;
            }

            // Drop the consumed line together with its delimiter.
            _remainingData = _remainingData.Substring(lineEnd + Delimiter.Length);
        }
    }
    catch (Exception failure)
    {
        ParserError?.Invoke(this, new ParserErrorEventArgs(failure));
    }
}
/// <summary>
/// Parses a raw HTML string and converts every stream-item node found in it
/// into a result object.
/// </summary>
/// <param name="html">Raw HTML to load into an <c>HtmlDocument</c>.</param>
/// <returns>
/// <c>null</c> when the HTML could not be loaded (after raising
/// <c>ParserError</c>); an empty list when no stream-item nodes are found;
/// otherwise the non-null results of parsing each node, in node order.
/// </returns>
public virtual List<R> Parse(string html)
{
    HtmlDocument document;

    try
    {
        document = new HtmlDocument();
        document.LoadHtml(html);
    }
    catch (Exception loadFailure)
    {
        ParserError?.Invoke(this, new TwitterItemParserErrorEventArgs("Failed to create HtmlDocument from raw html string.", loadFailure));
        return null;
    }

    var streamItemNodes = XPathHelper.GetJsStreamItemNodes(document.DocumentNode);
    var parsedItems = new List<R>();

    // Nothing to convert: hand back the empty list.
    if (streamItemNodes == null || streamItemNodes.Count == 0)
    {
        return parsedItems;
    }

    foreach (var itemNode in streamItemNodes)
    {
        var item = Parse(itemNode);
        if (item == null)
        {
            // Nodes that produce no result are skipped.
            continue;
        }

        parsedItems.Add(item);
    }

    return parsedItems;
}
/// <summary>
/// Raises the <c>ParserError</c> event with this instance as the sender;
/// does nothing when there are no subscribers.
/// </summary>
/// <param name="error">The error text passed to subscribers.</param>
private void OnParserError(string error) => ParserError?.Invoke(this, error);
/// <summary>
/// Parses the given data, which should be 1 or more lines, multiple
/// calls can be made with multiple lines.
/// Events are triggered upon reading a line, or on an error.
/// If TagCollection and XrefCollection haven't been set
/// prior to calling default IndexedKeyCollection objects will be
/// used. To support replacing XRefs you need to set XrefCollection
/// to an instance of XRefIndexedKeyCollection before calling.
/// </summary>
/// <remarks>
/// The parser is a state machine driven by <c>State</c>
/// (Level, XrefID, Tag, LineValue). Parsing stops at the first error:
/// <c>ErrorState</c> is set, <c>ParserError</c> is raised and the loop exits.
/// Input that ends in the middle of a line (after the level, after a
/// delimiter, or inside an xref / pointer) is reported through
/// <c>ErrorState</c> or treated as end of data rather than indexing past
/// the end of <paramref name="data"/>.
/// </remarks>
/// <param name="data">Data to parse, expected to be unicode</param>
/// <returns>The last error encountered.</returns>
public GedcomErrorState GedcomParse(string data)
{
    ErrorState = GedcomErrorState.NoError;

    int i = 0;
    int len = data.Length;

    // Tags are always the same, data.Substring was allocating lots
    // of memory, instead use a special collection which matches via
    // array index, e.g tagCollection[str, index, length] to avoid
    // the extra allocations, and caches the resulting string for
    // use again without having to substring
    if (TagCollection == null)
    {
        TagCollection = new IndexedKeyCollection();
    }

    // same for Xrefs
    if (XrefCollection == null)
    {
        XrefCollection = new IndexedKeyCollection();
    }

    while (i < len)
    {
        int temp = i;

        switch (State)
        {
            case GedcomState.Level:
                // eat up leading white space
                while (temp < len && char.IsWhiteSpace(data[temp]))
                {
                    temp++;
                }

                bool hadLevel = false;
                int lvl = 0;
                while (temp < len && IsDigit(data[temp]))
                {
                    hadLevel = true;
                    int digit = data[temp++] - '0';

                    // Saturate instead of silently wrapping on absurdly long
                    // digit runs, so the MaxLevel check below sees a huge
                    // value rather than a wrapped-around small one.
                    lvl = lvl > (int.MaxValue - digit) / 10 ? int.MaxValue : (lvl * 10) + digit;
                }

                // possible we had data after eating white space
                // but that it wasn't a digit
                if (!hadLevel)
                {
                    if (ApplyConcContOnNewLineHack && (previousTag == "CONC" || previousTag == "CONT"))
                    {
                        Level = previousLevel;
                        Tag = "CONC";
                        State = GedcomState.LineValue;
                    }
                    else
                    {
                        ErrorState = GedcomErrorState.LevelExpected;
                    }
                }
                else
                {
                    // hadLevel implies at least one digit was consumed, so
                    // temp is always past i here.
                    Level = lvl;
                    if (Level > MaxLevel)
                    {
                        ErrorState = GedcomErrorState.LevelInvalid;
                        Level = -1;
                    }
                    else
                    {
                        i = temp;
                    }
                }

                // move to next state if we have a level
                // and we are still in a level state (may not be
                // if we have some hacks active)
                if (Level != -1 && State == GedcomState.Level)
                {
                    // i can equal len when the data ends right after the level
                    if (i < len && IsDelim(data[i]))
                    {
                        i++;
                        if (IgnoreInvalidDelim)
                        {
                            while (i < len && IsDelim(data[i]))
                            {
                                i++;
                            }

                            State = GedcomState.XrefID;
                        }
                        else if (i < len && IsDelim(data[i]))
                        {
                            ErrorState = GedcomErrorState.InvalidDelim;
                        }
                        else
                        {
                            State = GedcomState.XrefID;
                        }
                    }
                    else
                    {
                        ErrorState = GedcomErrorState.LevelMissingDelim;
                    }
                }

                break;

            case GedcomState.XrefID:
                // no optional xref id just move to next state
                // otherwise extract pointer
                if (IsXrefID(data, temp))
                {
                    // bypass first @
                    i++;
                    temp = i;

                    while (temp < len && data[temp] != '@')
                    {
                        temp++;
                    }

                    if ((temp - i) > MaxXRefLength)
                    {
                        ErrorState = GedcomErrorState.XrefIDTooLong;
                    }
                    else
                    {
                        XrefID = XrefCollection[data, i, temp - i];

                        // step over the closing @; this lands on or past len
                        // when the xref is the last thing in the data
                        i = temp + 1;

                        if (i < len && IsDelim(data[i]))
                        {
                            i++;
                            if (IgnoreInvalidDelim)
                            {
                                while (i < len && IsDelim(data[i]))
                                {
                                    i++;
                                }

                                State = GedcomState.Tag;
                            }
                            else if (i < len && IsDelim(data[i]))
                            {
                                ErrorState = GedcomErrorState.InvalidDelim;
                            }
                            else
                            {
                                State = GedcomState.Tag;
                            }
                        }
                        else
                        {
                            ErrorState = GedcomErrorState.XrefIDMissingDelim;
                        }
                    }
                }
                else
                {
                    State = GedcomState.Tag;
                }

                break;

            case GedcomState.Tag:
                while (temp < len &&
                       (IsAlphaNum(data[temp]) ||
                        (AllowHyphenOrUnderscoreInTag && (data[temp] == '-' || data[temp] == '_'))))
                {
                    temp++;
                }

                if (temp == i)
                {
                    ErrorState = GedcomErrorState.TagExpected;
                }
                else
                {
                    Tag = TagCollection[data, i, temp - i];
                    i = temp;
                }

                if (Tag != string.Empty)
                {
                    if (Tag == "TRLR" && i == len)
                    {
                        FoundTag();
                    }
                    else
                    {
                        if (i < len && IsDelim(data[i]))
                        {
                            i++;
                            State = GedcomState.LineValue;
                        }

                        // not else if so we can handle tags with a trailing space but no line value
                        if (i == len || IsTerminator(data[i]))
                        {
                            FoundTag();

                            while (i < len && IsTerminator(data[i]))
                            {
                                i++;
                            }
                        }
                        else if (State != GedcomState.LineValue && !IgnoreMissingTerms)
                        {
                            ErrorState = GedcomErrorState.TagMissingDelimOrTerm;
                        }
                    }
                }

                break;

            case GedcomState.LineValue:
                if (IsPointer(data, temp))
                {
                    // bypass first @
                    i++;
                    temp = i;

                    while (temp < len && data[temp] != '@')
                    {
                        temp++;
                    }

                    if ((temp - i) > 0)
                    {
                        LineValue = XrefCollection[data, i, temp - i];

                        // Step over the closing @, but never past the end of
                        // the data: an unterminated pointer leaves temp == len,
                        // and the terminator check below indexes data[i].
                        i = Math.Min(temp + 1, len);
                        LineValueType = GedcomLineValueType.PointerType;
                    }

                    // GEDCOM only allows a single XREF for a pointer
                    // Genopro ignores this and puts a comma separated
                    // list of XREFs in the mess it pretends is GEDCOM.
                    // This causes us to get stuck in the LineValue state
                    // (this could of course happen with anything after the
                    // pointer)
                    if (i < len)
                    {
                        // we will allow white space, but nothing else
                        while (i < len && IsDelim(data[i]))
                        {
                            i++;
                        }

                        if (i < len && !IsTerminator(data[i]))
                        {
                            ErrorState = GedcomErrorState.LineValueInvalid;
                        }
                    }
                }
                else
                {
                    while (ErrorState == GedcomErrorState.NoError && LineValue == string.Empty)
                    {
                        if (temp < len && IsAnyChar(data, temp))
                        {
                            temp++;
                        }
                        else if (temp < len && IsEscape(data, temp))
                        {
                            // bypass @#
                            temp += 2;

                            while (temp < len && data[temp] != '@')
                            {
                                temp++;
                            }

                            temp++;
                        }

                        // hack for presidents.ged, email address
                        // is used in PHON on line 13 with a single @
                        // this isn't valid GEDCOM
                        // Should be escaped as @@ but handle it anyway
                        // Same thing occurs in user supplied file TOUT200801_unicode.ged
                        // with RELA @INDI:BAPM
                        else if (temp < len && data[temp] == '@')
                        {
                            temp++;
                        }
                        else if (temp != i)
                        {
                            if ((temp < len) && !IsTerminator(data[temp]))
                            {
                                ErrorState = GedcomErrorState.LineValueInvalid;
                            }
                            else
                            {
                                // escape skipping above can overshoot the end
                                temp = Math.Min(temp, len);
                                string dup = data.Substring(i, temp - i);

                                // unescape @@
                                LineValue = dup.Replace("@@", "@");
                                LineValueType = GedcomLineValueType.DataType;
                            }

                            i = temp;
                        }

                        // TODO: no line value, but have hit the terminator
                        // what should this be allowed for?
                        // Family Tree Maker outputs empty CONT (and CONC?)
                        else if (Tag == "CONT" || Tag == "CONC")
                        {
                            LineValue = " ";
                        }
                        else
                        {
                            // hit a terminator
                            break;
                        }
                    }
                }

                if (ErrorState == GedcomErrorState.NoError)
                {
                    // can't use FoundTag here, may not want to reset
                    previousLevel = Level;
                    previousTag = Tag;

                    TagFound?.Invoke(this, EventArgs.Empty);

                    if (i >= len || IsTerminator(data[i]))
                    {
                        while (i < len && IsTerminator(data[i]))
                        {
                            i++;
                        }

                        // reset states
                        ResetParseState(false);
                    }
                    else if (!IgnoreMissingTerms)
                    {
                        ErrorState = GedcomErrorState.LineValueMissingTerm;
                    }
                }

                break;
        }

        // stop at the first error and notify listeners
        if (ErrorState != GedcomErrorState.NoError)
        {
            ParserError?.Invoke(this, EventArgs.Empty);
            break;
        }
    }

    // reset parse status for more input
    ResetParseState(false);

    return ErrorState;
}
/// <summary>
/// Raises the <c>ParserError</c> event with a
/// <c>TwitterItemParserErrorEventArgs</c> built from the given message and
/// exception. Nothing is constructed or raised when there are no subscribers.
/// </summary>
/// <param name="msg">Message passed to the event arguments.</param>
/// <param name="ex">Exception passed to the event arguments.</param>
protected virtual void RaiseParserError(string msg, Exception ex)
{
    var handler = ParserError;
    if (handler == null)
    {
        return;
    }

    handler.Invoke(this, new TwitterItemParserErrorEventArgs(msg, ex));
}