Пример #1
0
        /// <summary>
        /// Decodes <paramref name="data"/> as UTF-8, appends the text to the buffered
        /// remainder and hands every complete, delimiter-terminated line to <c>ParseLine</c>.
        /// </summary>
        /// <remarks>
        /// As soon as <c>ParseLine</c> returns true, <c>ParserComplete</c> is raised and
        /// no further lines are consumed; later calls only write a debug message.
        /// Any exception thrown along the way is reported through <c>ParserError</c>
        /// instead of propagating to the caller.
        /// </remarks>
        /// <param name="data">Raw bytes of the next chunk of input.</param>
        public void Parse(byte[] data)
        {
            try
            {
                if (_isParsingComplete)
                {
                    Debug.WriteLine("Parsing already complete !");
                    return;
                }

                _remainingData += Encoding.UTF8.GetString(data);

                // Keep peeling lines off the front of the buffer for as long as a
                // delimiter can be found; a trailing partial line stays buffered.
                int splitAt;
                while ((splitAt = _remainingData.IndexOf(Delimiter, StringComparison.Ordinal)) != -1)
                {
                    _isParsingComplete = ParseLine(_remainingData.Substring(0, splitAt));

                    if (_isParsingComplete)
                    {
                        // The buffer is not advanced past the line that completed parsing.
                        ParserComplete?.Invoke(this, new ParserCompleteEventArgs <AmcpParsedData>(_amcpParsedData));
                        break;
                    }

                    _remainingData = _remainingData.Substring(splitAt + Delimiter.Length);
                }
            }
            catch (Exception failure)
            {
                ParserError?.Invoke(this, new ParserErrorEventArgs(failure));
            }
        }
Пример #2
0
        /// <summary>
        /// Loads a raw html string and parses every node returned by
        /// <c>XPathHelper.GetJsStreamItemNodes</c> into a result.
        /// </summary>
        /// <param name="html">Raw html to load into an <c>HtmlDocument</c>.</param>
        /// <returns>
        /// The non-null results of parsing each node (possibly an empty list), or
        /// null when the html could not be loaded; in that case <c>ParserError</c>
        /// is raised before returning.
        /// </returns>
        public virtual List <R> Parse(string html)
        {
            HtmlDocument htmlDocument;

            try
            {
                htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);
            }
            catch (Exception loadFailure)
            {
                ParserError?.Invoke(this, new TwitterItemParserErrorEventArgs("Failed to create HtmlDocument from raw html string.", loadFailure));
                return null;
            }

            var itemNodes = XPathHelper.GetJsStreamItemNodes(htmlDocument.DocumentNode);
            var parsedItems = new List <R>();

            // Nothing to parse: hand back the empty list.
            if (itemNodes == null || itemNodes.Count == 0)
            {
                return parsedItems;
            }

            foreach (var itemNode in itemNodes)
            {
                var item = Parse(itemNode);

                // Nodes that produce no result are skipped.
                if (item == null)
                {
                    continue;
                }

                parsedItems.Add(item);
            }

            return parsedItems;
        }
Пример #3
0
 /// <summary>
 /// Raises <c>ParserError</c> with this instance as sender; does nothing when
 /// there are no subscribers.
 /// </summary>
 /// <param name="error">Value passed to the handlers as the event argument.</param>
 private void OnParserError(string error) => ParserError?.Invoke(this, error);
Пример #4
0
        /// <summary>
        /// Parses the given data, which should be 1 or more lines; multiple
        /// calls can be made with multiple lines.
        /// Events are triggered upon reading a line, or on an error.
        /// If TagCollection and XrefCollection haven't been set
        /// prior to calling, default IndexedKeyCollection objects will be
        /// used.  To support replacing XRefs you need to set XrefCollection
        /// to an instance of XRefIndexedKeyCollection before calling.
        /// </summary>
        /// <remarks>
        /// The parser is a state machine over <c>State</c>
        /// (Level, XrefID, Tag, LineValue). Input that stops right after a
        /// level, after a delimiter, or inside an unterminated @xref@ is never
        /// indexed past its end: where a delimiter was required the matching
        /// "missing delimiter" error state is set instead.
        /// </remarks>
        /// <param name="data">Data to parse, expected to be unicode</param>
        /// <returns>
        /// The value of <c>ErrorState</c> after the call. It is reset to
        /// <c>NoError</c> on entry and parsing stops at the first error.
        /// </returns>
        public GedcomErrorState GedcomParse(string data)
        {
            ErrorState = GedcomErrorState.NoError;

            int i = 0;

            int len = data.Length;

            // Tags are always the same, data.Substring was allocating lots
            // of memory, instead use a special collection which matches via
            // array index, e.g   tagCollection[str, index, length] to avoid
            // the extra allocations, and caches the resulting string for
            // use again without having to substring
            if (TagCollection == null)
            {
                TagCollection = new IndexedKeyCollection();
            }

            // same for Xrefs
            if (XrefCollection == null)
            {
                XrefCollection = new IndexedKeyCollection();
            }

            while (i < len)
            {
                int temp = i;

                switch (State)
                {
                case GedcomState.Level:
                    // eat up leading white space
                    while (temp < len && char.IsWhiteSpace(data[temp]))
                    {
                        temp++;
                    }

                    bool hadLevel = false;
                    int  lvl      = 0;
                    while (temp < len && IsDigit(data[temp]))
                    {
                        hadLevel = true;
                        lvl     *= 10;
                        lvl     += data[temp++] - '0';
                    }

                    // possible we had data after eating white space
                    // but that it wasn't a digit
                    if (!hadLevel)
                    {
                        if (ApplyConcContOnNewLineHack &&
                            (previousTag == "CONC" || previousTag == "CONT"))
                        {
                            Level = previousLevel;
                            Tag   = "CONC";
                            State = GedcomState.LineValue;
                        }
                        else
                        {
                            ErrorState = GedcomErrorState.LevelExpected;
                        }
                    }
                    else
                    {
                        // at least one digit was consumed, so temp > i here
                        Level = lvl;

                        if (Level > MaxLevel)
                        {
                            ErrorState = GedcomErrorState.LevelInvalid;
                            Level      = -1;
                        }
                        else
                        {
                            i = temp;
                        }
                    }

                    // move to next state if we have a level
                    // and we are still in a level state (may not be
                    // if we have some hacks active)
                    if (Level != -1 && State == GedcomState.Level)
                    {
                        // i == len when the data ends right after the level
                        // digits; that counts as a missing delimiter
                        if (i < len && IsDelim(data[i]))
                        {
                            i++;
                            if (IgnoreInvalidDelim)
                            {
                                while (i < len && IsDelim(data[i]))
                                {
                                    i++;
                                }

                                State = GedcomState.XrefID;
                            }
                            else if (i < len && IsDelim(data[i]))
                            {
                                ErrorState = GedcomErrorState.InvalidDelim;
                            }
                            else
                            {
                                State = GedcomState.XrefID;
                            }
                        }
                        else
                        {
                            ErrorState = GedcomErrorState.LevelMissingDelim;
                        }
                    }

                    break;

                case GedcomState.XrefID:

                    // no optional xref id just move to next state
                    // otherwise extract pointer
                    if (IsXrefID(data, temp))
                    {
                        // bypass first @
                        i++;
                        temp = i;

                        while (temp < len && data[temp] != '@')
                        {
                            temp++;
                        }

                        if ((temp - i) > MaxXRefLength)
                        {
                            ErrorState = GedcomErrorState.XrefIDTooLong;
                        }
                        else
                        {
                            XrefID = XrefCollection[data, i, temp - i];

                            // step over the closing @; this can land on or
                            // past len when the xref ends the data or was
                            // never terminated
                            i = temp + 1;

                            if (i < len && IsDelim(data[i]))
                            {
                                i++;
                                if (IgnoreInvalidDelim)
                                {
                                    while (i < len && IsDelim(data[i]))
                                    {
                                        i++;
                                    }

                                    State = GedcomState.Tag;
                                }
                                else if (i < len && IsDelim(data[i]))
                                {
                                    ErrorState = GedcomErrorState.InvalidDelim;
                                }
                                else
                                {
                                    State = GedcomState.Tag;
                                }
                            }
                            else
                            {
                                ErrorState = GedcomErrorState.XrefIDMissingDelim;
                            }
                        }
                    }
                    else
                    {
                        State = GedcomState.Tag;
                    }

                    break;

                case GedcomState.Tag:
                    while (temp < len &&
                           (IsAlphaNum(data[temp]) ||
                            (AllowHyphenOrUnderscoreInTag &&
                             (data[temp] == '-' || data[temp] == '_'))))
                    {
                        temp++;
                    }

                    if (temp == i)
                    {
                        ErrorState = GedcomErrorState.TagExpected;
                    }
                    else
                    {
                        Tag = TagCollection[data, i, temp - i];

                        i = temp;
                    }

                    if (Tag != string.Empty)
                    {
                        if (Tag == "TRLR" && i == len)
                        {
                            FoundTag();
                        }
                        else
                        {
                            if (i < len && IsDelim(data[i]))
                            {
                                i++;

                                State = GedcomState.LineValue;
                            }

                            // not else if so we can handle tags with a trailing space but no line value
                            if (i == len || IsTerminator(data[i]))
                            {
                                FoundTag();

                                while (i < len && IsTerminator(data[i]))
                                {
                                    i++;
                                }
                            }
                            else if (State != GedcomState.LineValue && !IgnoreMissingTerms)
                            {
                                ErrorState = GedcomErrorState.TagMissingDelimOrTerm;
                            }
                        }
                    }

                    break;

                case GedcomState.LineValue:
                    if (IsPointer(data, temp))
                    {
                        // bypass first @
                        i++;
                        temp = i;

                        while (temp < len && data[temp] != '@')
                        {
                            temp++;
                        }

                        if ((temp - i) > 0)
                        {
                            LineValue     = XrefCollection[data, i, temp - i];
                            i             = temp + 1;
                            LineValueType = GedcomLineValueType.PointerType;
                        }

                        // GEDCOM only allows a single XREF for a pointer
                        // Genopro ignores this and puts a comma separated
                        // list of XREFs in the mess it pretends is GEDCOM.
                        // This causes us to get stuck in the LineValue state
                        // (this could of course happen with anything after the
                        //  pointer)
                        if (i < len)
                        {
                            // we will allow white space, but nothing else
                            while (i < len && IsDelim(data[i]))
                            {
                                i++;
                            }

                            if (i < len && !IsTerminator(data[i]))
                            {
                                ErrorState = GedcomErrorState.LineValueInvalid;
                            }
                        }
                    }
                    else
                    {
                        while (ErrorState == GedcomErrorState.NoError && LineValue == string.Empty)
                        {
                            if (temp < len && IsAnyChar(data, temp))
                            {
                                temp++;
                            }
                            else if (temp < len && IsEscape(data, temp))
                            {
                                // bypass @#
                                temp += 2;

                                while (temp < len && data[temp] != '@')
                                {
                                    temp++;
                                }

                                temp++;
                            }

                            // hack for presidents.ged, email address
                            // is used in PHON on line 13 with a single @
                            // this isn't valid GEDCOM
                            // Should be escaped as @@ but handle it anyway
                            // Same thing occurs in user supplied file TOUT200801_unicode.ged
                            // with RELA @INDI:BAPM
                            else if (temp < len && data[temp] == '@')
                            {
                                temp++;
                            }
                            else if (temp != i)
                            {
                                if ((temp < len) && !IsTerminator(data[temp]))
                                {
                                    ErrorState = GedcomErrorState.LineValueInvalid;
                                }
                                else
                                {
                                    // the escape handling above may have
                                    // stepped past the end of the data
                                    temp = Math.Min(temp, len);
                                    string dup = data.Substring(i, temp - i);

                                    // unescape @@
                                    LineValue = dup.Replace("@@", "@");

                                    LineValueType = GedcomLineValueType.DataType;
                                }

                                i = temp;
                            }

                            // TODO: no line value, but have hit the terminator
                            // what should this be allowed for?
                            // Family Tree Maker outputs empty CONT (and CONC?)
                            else if (Tag == "CONT" || Tag == "CONC")
                            {
                                LineValue = " ";
                            }
                            else
                            {
                                // hit a terminator
                                break;
                            }
                        }
                    }

                    if (ErrorState == GedcomErrorState.NoError)
                    {
                        // can't use FoundTag here, may not want to reset
                        previousLevel = Level;
                        previousTag   = Tag;

                        TagFound?.Invoke(this, EventArgs.Empty);

                        // i can be len + 1 after an unterminated pointer, so
                        // compare with >= rather than == before indexing
                        if (i >= len || IsTerminator(data[i]))
                        {
                            while (i < len && IsTerminator(data[i]))
                            {
                                i++;
                            }

                            // reset states
                            ResetParseState(false);
                        }
                        else if (!IgnoreMissingTerms)
                        {
                            ErrorState = GedcomErrorState.LineValueMissingTerm;
                        }
                    }

                    break;
                }

                if (ErrorState != GedcomErrorState.NoError)
                {
                    ParserError?.Invoke(this, EventArgs.Empty);
                    break;
                }
            }

            // reset parse status for more input
            ResetParseState(false);

            return(ErrorState);
        }
Пример #5
0
 /// <summary>
 /// Raises <c>ParserError</c> with a <c>TwitterItemParserErrorEventArgs</c> built
 /// from the given message and exception; does nothing when there are no subscribers.
 /// </summary>
 /// <param name="msg">Message passed to the event args.</param>
 /// <param name="ex">Exception passed to the event args.</param>
 protected virtual void RaiseParserError(string msg, Exception ex)
 {
     ParserError?.Invoke(this, new TwitterItemParserErrorEventArgs(msg, ex));
 }