Example #1
0
		/// <summary>
		/// Parses the given data, which should be 1 or more lines, multiple
		/// calls can be made with multiple lines.
		/// Events are triggered upon reading a line, or on an error.
		/// If TagCollection and XrefTagCollection haven't been set
		/// prior to calling default IndexedKeyCollection objects will be
		/// used.  To support replacing XRefs you need to set XrefTagCollection
		/// to an instance of XRefIndexedKeyCollection before calling.
		/// </summary>
		/// <remarks>
		/// The method is a small state machine over <c>_State</c>
		/// (Level, XrefID, Tag, LineValue).  Parsing stops at the first
		/// error; the error is stored in <c>_Error</c> and ParserError is
		/// raised if it has subscribers.  Input that ends in the middle of
		/// a line (for example straight after the level digits) is reported
		/// through the normal error states rather than by indexing past the
		/// end of <paramref name="data"/>.
		/// </remarks>
		/// <param name="data">Data to parse, expected to be unicode</param>
		/// <returns>
		/// true if no error state was set while parsing
		/// <paramref name="data"/>, otherwise false.
		/// </returns>
		public bool GedcomParse(string data)
		{
			_Error = GedcomErrorState.NoError;
			
			// i is the committed read position, temp (per iteration) is a
			// look-ahead cursor that is only copied back to i on success
			int i = 0;
			
			int len = data.Length;
			
			// Tags are always the same, data.Substring was allocating lots
			// of memory, instead use a special collection which matches via
			// array index, e.g   tagCollection[str, index, length] to avoid
			// the extra allocations, and caches the resulting string for
			// use again without having to substring
			if (_tagCollection == null)
			{
				_tagCollection = new Utility.IndexedKeyCollection();
			}
			
			// same for Xrefs
			if (_xrefCollection == null)
			{
				_xrefCollection = new Utility.IndexedKeyCollection();
			}
						
			while (i < len)
			{
				int temp = i;
											
				switch (_State)
				{
					case GedcomState.Level:
						// eat up leading white space
						while (temp < len && char.IsWhiteSpace(data[temp]))
						{
							temp ++;
						}

						bool hadLevel = false;
						int lvl = 0;
						while (temp < len && IsDigit(data[temp]))
						{
							hadLevel = true;
							int digit = ((int)data[temp++]) - (int)'0';
							
							// stop accumulating once the value is already past
							// MaxLevel, the remaining digits are still consumed
							// but an overly long digit run can no longer wrap
							// around int and slip through the range check below
							if (lvl <= MaxLevel)
							{
								lvl = (lvl * 10) + digit;
							}
						}

						// possible we had data after eating white space
						// but that it wasn't a digit
						if (!hadLevel)
						{
							if (ApplyConcContOnNewLineHack && 
								(_previousTag == "CONC" || _previousTag == "CONT"))
							{
								// treat the line as a continuation of the
								// previous CONC / CONT at the same level
								_Level = _previousLevel;
								_Tag = "CONC";
								_State = GedcomState.LineValue;
							}
							else
							{
								_Error = GedcomErrorState.LevelExpected;
							}
						}
						else
						{
							// hadLevel implies at least one digit was consumed,
							// so temp is always past i here
							_Level = lvl;
							
							if (_Level > MaxLevel)
							{
								_Error = GedcomErrorState.LevelInvalid;
								_Level = -1;
							}
							else
							{
								i = temp;	
							}
						}
						
						// move to next state if we have a level
						// and we are still in a level state (may not be
						// if we have some hacks active).
						// Skipped when an error was just set: _Level may still
						// hold the level of a previous line, and running the
						// delimiter check would replace the real error.
						if (_Error == GedcomErrorState.NoError &&
							_Level != -1 && _State == GedcomState.Level)
						{
							// i can equal len when data ends right after the level
							if (i < len && IsDelim(data[i]))
							{
								i ++;
								if (IgnoreInvalidDelim)
								{
									while (i < len && IsDelim(data[i]))
									{
										i ++;
									}
									_State = GedcomState.XrefID;
								}
								else if (i < len && IsDelim(data[i]))
								{
									_Error = GedcomErrorState.InvalidDelim;
								}
								else
								{
									_State = GedcomState.XrefID;
								}
							}
							else
							{
								_Error = GedcomErrorState.LevelMissingDelim;
							}
						}
						
						break;
					
					case GedcomState.XrefID:
						
						// no optional xref id just move to next state
						// otherwise extract pointer
						
						if (IsXrefID(data,temp))
						{
							// bypass first @
							i ++;
							temp = i;
							
							while (temp < len && data[temp] != '@')
							{
								temp ++;	
							}
							
							if ( (temp - i) > MaxXRefLength)
							{
								_Error = GedcomErrorState.XrefIDTooLong;	
							}
							else
							{
								_XrefID = _xrefCollection[data, i, temp - i];
								
								// step over the closing @, this can move i to
								// (or one past) the end of data
								i = temp + 1;
								
								if (i < len && IsDelim(data[i]))
								{
									i ++;
									if (IgnoreInvalidDelim)
									{
										while (i < len && IsDelim(data[i]))
										{
											i ++;
										}
										_State = GedcomState.Tag;
									}
									else if (i < len && IsDelim(data[i]))
									{
										_Error = GedcomErrorState.InvalidDelim;
									}
									else
									{
										_State = GedcomState.Tag;
									}
								}
								else
								{
									_Error = GedcomErrorState.XrefIDMissingDelim;	
								}
							}
						}
						else
						{
							_State = GedcomState.Tag;	
						}
						
						break;
					
					case GedcomState.Tag:
						while (temp < len && 
							   (IsAlphaNum(data[temp]) ||
								(AllowHyphenOrUnderscoreInTag &&
								 (data[temp] == '-' || data[temp] == '_'))))
						{
							temp ++;	
						}
						
						if (temp == i)
						{
							_Error = GedcomErrorState.TagExpected;	
						}
						else
						{
							_Tag = _tagCollection[data, i, temp - i];
							
							i = temp;
						}
												
						if (_Tag != string.Empty)
						{
							if (_Tag == "TRLR" && i == len)
							{
								FoundTag();
							}
							else
							{
								if (i < len && IsDelim(data[i]))
								{
									i ++;
																
									_State = GedcomState.LineValue;
								}
								// not else if so we can handle tags with a trailing space but no line value
								if (i == len || IsTerminator(data[i]))
								{
									FoundTag();
									
									while (i < len && IsTerminator(data[i]))
									{
										i ++;	
									}
								}
								else if (_State != GedcomState.LineValue && !IgnoreMissingTerms)
								{
									_Error = GedcomErrorState.TagMissingDelimOrTerm;	
								}
							}
						}
						
						break;
					
					case GedcomState.LineValue:
						if (IsPointer(data,temp))
						{
							// bypass first @
							i ++;
							temp = i;
							
							while (temp < len && data[temp] != '@')
							{
								temp ++;	
							}
							
							if ((temp - i) > 0)
							{
								_LineValue = _xrefCollection[data, i, temp - i];							
								i = temp + 1;
								_LineValueType = GedcomLineValueType.PointerType;
							}

							// GEDCOM only allows a single XREF for a pointer
							// Genopro ignores this and puts a comma separated
							// list of XREFs in the mess it pretends is GEDCOM.
							// This causes us to get stuck in the LineValue state
							// (this could of course happen with anything after the
							//  pointer)
							if (i < len)
							{
								// we will allow white space, but nothing else
								while (i < len && IsDelim(data[i]))
								{
									i ++;
								}

								if (i < len && !IsTerminator(data[i]))
								{
									_Error = GedcomErrorState.LineValueInvalid;
								}
							}
						}
						else
						{
							// scan forward until a line value has been captured,
							// an error is raised, or a terminator is hit with
							// nothing to capture
							while (_Error == GedcomErrorState.NoError && _LineValue == string.Empty)
							{
								if (temp < len && IsAnyChar(data,temp))
								{
									temp ++;	
								}
								else if (temp < len && IsEscape(data,temp))
								{
									// bypass @#
									
									temp += 2;
									
									while (temp < len && data[temp] != '@')
									{
										temp ++;	
									}
									temp ++;
								}
								// hack for presidents.ged, email address
								// is used in PHON on line 13 with a single @
								// this isn't valid GEDCOM
								// Should be escaped as @@ but handle it anyway
								// Same thing occurs in user supplied file TOUT200801_unicode.ged
								// with RELA @INDI:BAPM
								else if (temp < len && data[temp] == '@')
								{
									temp ++;	
								}
								else if (temp != i)
								{
									if ((temp < len) && !IsTerminator(data[temp]))
									{
										_Error = GedcomErrorState.LineValueInvalid;
									}
									else
									{
										// the escape handling above can step past
										// the end of data, clamp before slicing
										temp = Math.Min(temp, len);
										string dup = data.Substring(i, temp - i);
										// unescape @@ 
										_LineValue = dup.Replace("@@", "@");
									
										_LineValueType = GedcomLineValueType.DataType;
									}
									i = temp;
								}
								// FIXME: no line value, but have hit the terminator
								// what should this be allowed for?
								// Family Tree Maker outputs empty CONT (and CONC?)
								else if (_Tag == "CONT" || _Tag == "CONC")
								{
									_LineValue = " ";
								}
								else
								{
									// hit a terminator
									break;
								}
							}
						}

						if (_Error == GedcomErrorState.NoError)
						{
							// can't use FoundTag here, may not want to reset
							_previousLevel = _Level;
							_previousTag = _Tag;
							
							if (TagFound != null)
							{	
								TagFound(this,EventArgs.Empty);	
							}	
							// >= rather than ==: stepping over the closing @ of
							// a pointer can leave i one past the end of data
							if (i >= len || IsTerminator(data[i]))
							{
								while (i < len && IsTerminator(data[i]))
								{
									i ++;	
								}
								
								// reset states
								ResetParseState(false);
							}
							else if (!IgnoreMissingTerms)
							{
								_Error = GedcomErrorState.LineValueMissingTerm;
							}
						}
						break;
				}

				if (_Error != GedcomErrorState.NoError)
				{
					if (ParserError != null)
					{
						ParserError(this,EventArgs.Empty);	
					}
					break;
				}
			}
			
			// reset parse status for more input
			ResetParseState(false);			
			
			return (_Error == GedcomErrorState.NoError);
		}
Example #2
0
		/// <summary>
		/// Stores the given charset and puts the parser back at the start
		/// of a line: the state becomes Level, the xref id, tag and line
		/// value are emptied and the line value type is set to NoType.
		/// </summary>
		/// <param name="resetLevel">
		/// When true the current level, previous level and previous tag
		/// are cleared as well (levels to -1, tag to the empty string).
		/// </param>
		/// <param name="charset">Charset to record in _charset</param>
		private void ResetParseState(bool resetLevel, GedcomCharset charset)
		{
			_charset = charset;

			// state machine restarts at the level of the next line
			_State = GedcomState.Level;
			_LineValueType = GedcomLineValueType.NoType;

			// nothing read yet for the next line
			_XrefID = string.Empty;
			_Tag = string.Empty;
			_LineValue = string.Empty;

			if (!resetLevel)
			{
				return;
			}

			// also drop what is remembered about the current and previous line
			_Level = -1;
			_previousLevel = -1;
			_previousTag = string.Empty;
		}