/// <summary>
/// Parses the given data, which should be 1 or more lines, multiple
/// calls can be made with multiple lines.
/// Events are triggered upon reading a line, or on an error.
/// If TagCollection and XrefTagCollection haven't been set
/// prior to calling default IndexedKeyCollection objects will be
/// used.  To support replacing XRefs you need to set XrefTagCollection
/// to an instance of XRefIndexedKeyCollection before calling.
/// </summary>
/// <remarks>
/// Each line is consumed by a small state machine
/// (Level, XrefID, Tag, LineValue).  Parsing stops at the first error;
/// the error code is stored in _Error and ParserError is raised.
/// Truncated input (data ending directly after a level, after an xref id,
/// or inside an unterminated pointer) is reported through the normal
/// error states rather than by indexing past the end of the string.
/// </remarks>
/// <param name="data">Data to parse, expected to be unicode</param>
/// <returns>
/// true if all of <paramref name="data"/> was consumed without an error,
/// false if parsing stopped on an error.
/// </returns>
public bool GedcomParse(string data)
{
    _Error = GedcomErrorState.NoError;

    int i = 0;
    int len = data.Length;

    // Tags are always the same, data.Substring was allocating lots
    // of memory, instead use a special collection which matches via
    // array index, e.g   tagCollection[str, index, length] to avoid
    // the extra allocations, and caches the resulting string for
    // use again without having to substring
    if (_tagCollection == null)
    {
        _tagCollection = new Utility.IndexedKeyCollection();
    }

    // same for Xrefs
    if (_xrefCollection == null)
    {
        _xrefCollection = new Utility.IndexedKeyCollection();
    }

    while (i < len)
    {
        // temp scans ahead of i; i is only advanced once a token is accepted
        int temp = i;

        switch (_State)
        {
            case GedcomState.Level:
                // eat up leading white space
                while (temp < len && char.IsWhiteSpace(data[temp]))
                {
                    temp++;
                }

                bool hadLevel = false;
                int lvl = 0;
                while (temp < len && IsDigit(data[temp]))
                {
                    hadLevel = true;
                    int digit = ((int)data[temp++]) - (int)'0';

                    // saturate instead of silently wrapping around on an
                    // absurdly long run of digits, so that the MaxLevel
                    // check below still rejects the value
                    if (lvl > (int.MaxValue - 9) / 10)
                    {
                        lvl = int.MaxValue;
                    }
                    else
                    {
                        lvl = (lvl * 10) + digit;
                    }
                }

                // possible we had data after eating white space
                // but that it wasn't a digit
                if (!hadLevel)
                {
                    if (ApplyConcContOnNewLineHack &&
                        (_previousTag == "CONC" || _previousTag == "CONT"))
                    {
                        _Level = _previousLevel;
                        _Tag = "CONC";
                        _State = GedcomState.LineValue;
                    }
                    else
                    {
                        _Error = GedcomErrorState.LevelExpected;
                    }
                }
                else if (temp == i)
                {
                    // NOTE: hadLevel is only set after temp has advanced,
                    // so this branch is not expected to be reached
                    if (!char.IsWhiteSpace(data[i]))
                    {
                        _Error = GedcomErrorState.LevelExpected;
                    }
                    else
                    {
                        i++;
                    }
                }
                else
                {
                    _Level = lvl;

                    if (_Level > MaxLevel)
                    {
                        _Error = GedcomErrorState.LevelInvalid;
                        _Level = -1;
                    }
                    else
                    {
                        i = temp;
                    }
                }

                // move to next state if we have a level
                // and we are still in a level state (may not be
                // if we have some hacks active)
                if (_Level != -1 && _State == GedcomState.Level)
                {
                    // i == len when the data ends directly after the level
                    // digits; that is a missing delimiter, not a crash
                    if (i < len && IsDelim(data[i]))
                    {
                        i++;
                        if (IgnoreInvalidDelim)
                        {
                            while (i < len && IsDelim(data[i]))
                            {
                                i++;
                            }
                            _State = GedcomState.XrefID;
                        }
                        else if (i < len && IsDelim(data[i]))
                        {
                            _Error = GedcomErrorState.InvalidDelim;
                        }
                        else
                        {
                            _State = GedcomState.XrefID;
                        }
                    }
                    else
                    {
                        _Error = GedcomErrorState.LevelMissingDelim;
                    }
                }
                break;

            case GedcomState.XrefID:
                // no optional xref id just move to next state
                // otherwise extract pointer
                if (IsXrefID(data, temp))
                {
                    // bypass first @
                    i++;
                    temp = i;

                    while (temp < len && data[temp] != '@')
                    {
                        temp++;
                    }

                    if ((temp - i) > MaxXRefLength)
                    {
                        _Error = GedcomErrorState.XrefIDTooLong;
                    }
                    else
                    {
                        _XrefID = _xrefCollection[data, i, temp - i];

                        // step over the closing @
                        i = temp + 1;

                        // i >= len when the data ends at (or before) the
                        // closing @, in which case no delimiter follows
                        if (i < len && IsDelim(data[i]))
                        {
                            i++;
                            if (IgnoreInvalidDelim)
                            {
                                while (i < len && IsDelim(data[i]))
                                {
                                    i++;
                                }
                                _State = GedcomState.Tag;
                            }
                            else if (i < len && IsDelim(data[i]))
                            {
                                _Error = GedcomErrorState.InvalidDelim;
                            }
                            else
                            {
                                _State = GedcomState.Tag;
                            }
                        }
                        else
                        {
                            _Error = GedcomErrorState.XrefIDMissingDelim;
                        }
                    }
                }
                else
                {
                    _State = GedcomState.Tag;
                }
                break;

            case GedcomState.Tag:
                while (temp < len &&
                       (IsAlphaNum(data[temp]) ||
                        (AllowHyphenOrUnderscoreInTag &&
                         (data[temp] == '-' || data[temp] == '_'))))
                {
                    temp++;
                }

                if (temp == i)
                {
                    _Error = GedcomErrorState.TagExpected;
                }
                else
                {
                    _Tag = _tagCollection[data, i, temp - i];

                    i = temp;
                }

                if (_Tag != string.Empty)
                {
                    if (_Tag == "TRLR" && i == len)
                    {
                        FoundTag();
                    }
                    else
                    {
                        if (i < len && IsDelim(data[i]))
                        {
                            i++;

                            _State = GedcomState.LineValue;
                        }

                        // not else if so we can handle tags with a trailing space but no line value
                        if (i == len || IsTerminator(data[i]))
                        {
                            FoundTag();

                            while (i < len && IsTerminator(data[i]))
                            {
                                i++;
                            }
                        }
                        else if (_State != GedcomState.LineValue && !IgnoreMissingTerms)
                        {
                            _Error = GedcomErrorState.TagMissingDelimOrTerm;
                        }
                    }
                }
                break;

            case GedcomState.LineValue:
                if (IsPointer(data, temp))
                {
                    // bypass first @
                    i++;
                    temp = i;

                    while (temp < len && data[temp] != '@')
                    {
                        temp++;
                    }

                    if ((temp - i) > 0)
                    {
                        if (temp < len)
                        {
                            _LineValue = _xrefCollection[data, i, temp - i];

                            // step over the closing @
                            i = temp + 1;
                            _LineValueType = GedcomLineValueType.PointerType;
                        }
                        else
                        {
                            // ran off the end of the data without finding
                            // the closing @, the pointer is malformed
                            _Error = GedcomErrorState.LineValueInvalid;
                        }
                    }

                    // GEDCOM only allows a single XREF for a pointer
                    // Genopro ignores this and puts a comma separated
                    // list of XREFs in the mess it pretends is GEDCOM.
                    // This causes us to get stuck in the LineValue state
                    // (this could of course happen with anything after the
                    // pointer)
                    if (i < len)
                    {
                        // we will allow white space, but nothing else
                        while (i < len && IsDelim(data[i]))
                        {
                            i++;
                        }

                        if (i < len && !IsTerminator(data[i]))
                        {
                            _Error = GedcomErrorState.LineValueInvalid;
                        }
                    }
                }
                else
                {
                    while (_Error == GedcomErrorState.NoError && _LineValue == string.Empty)
                    {
                        if (temp < len && IsAnyChar(data, temp))
                        {
                            temp++;
                        }
                        else if (temp < len && IsEscape(data, temp))
                        {
                            // bypass @#
                            temp += 2;

                            while (temp < len && data[temp] != '@')
                            {
                                temp++;
                            }

                            // may step past len, clamped below before use
                            temp++;
                        }
                        // hack for presidents.ged, email address
                        // is used in PHON on line 13 with a single @
                        // this isn't valid GEDCOM
                        // Should be escaped as @@ but handle it anyway
                        // Same thing occurs in user supplied file TOUT200801_unicode.ged
                        // with RELA @INDI:BAPM
                        else if (temp < len && data[temp] == '@')
                        {
                            temp++;
                        }
                        else if (temp != i)
                        {
                            if ((temp < len) && !IsTerminator(data[temp]))
                            {
                                _Error = GedcomErrorState.LineValueInvalid;
                            }
                            else
                            {
                                temp = Math.Min(temp, len);
                                string dup = data.Substring(i, temp - i);

                                // unescape @@
                                _LineValue = dup.Replace("@@", "@");

                                _LineValueType = GedcomLineValueType.DataType;
                            }

                            i = temp;
                        }
                        // FIXME: no line value, but have hit the terminator
                        // what should this be allowed for?
                        // Family Tree Maker outputs empty CONT (and CONC?)
                        else if (_Tag == "CONT" || _Tag == "CONC")
                        {
                            _LineValue = " ";
                        }
                        else
                        {
                            // hit a terminator
                            break;
                        }
                    }
                }

                if (_Error == GedcomErrorState.NoError)
                {
                    // can't use FoundTag here, may not want to reset
                    _previousLevel = _Level;
                    _previousTag = _Tag;

                    if (TagFound != null)
                    {
                        TagFound(this, EventArgs.Empty);
                    }

                    if (i >= len || IsTerminator(data[i]))
                    {
                        while (i < len && IsTerminator(data[i]))
                        {
                            i++;
                        }

                        // reset states
                        ResetParseState(false);
                    }
                    else if (!IgnoreMissingTerms)
                    {
                        _Error = GedcomErrorState.LineValueMissingTerm;
                    }
                }
                break;
        }

        if (_Error != GedcomErrorState.NoError)
        {
            if (ParserError != null)
            {
                ParserError(this, EventArgs.Empty);
            }

            // stop at the first error
            break;
        }
    }

    // reset parse status for more input
    ResetParseState(false);

    return (_Error == GedcomErrorState.NoError);
}
/// <summary>
/// Stores the charset to use and puts the parser back into its
/// start-of-line state: the current xref id, tag and line value are
/// cleared, the line value type becomes NoType and the state machine
/// returns to <see cref="GedcomState.Level"/>.
/// </summary>
/// <param name="resetLevel">
/// When true the current level and the remembered previous level / tag
/// are cleared as well; when false they are left untouched.
/// </param>
/// <param name="charset">Charset to record in _charset.</param>
private void ResetParseState(bool resetLevel, GedcomCharset charset)
{
    _charset = charset;

    // state machine position
    _State = GedcomState.Level;
    _LineValueType = GedcomLineValueType.NoType;

    // per-line tokens
    _LineValue = string.Empty;
    _Tag = string.Empty;
    _XrefID = string.Empty;

    if (!resetLevel)
    {
        return;
    }

    // full reset: also forget level tracking from earlier lines
    _previousTag = string.Empty;
    _previousLevel = -1;
    _Level = -1;
}