Ejemplo n.º 1
0
        /// <summary>
        /// Parses the supplied content as XML: the text is passed through <c>XmlTools.Convert</c>,
        /// loaded into an <c>XElement</c>, flattened, and then used to build the schema and tokens.
        /// </summary>
        /// <param name="content">Source text handed to <c>XmlTools.Convert</c>.</param>
        /// <returns>
        /// A result flagged as successful, carrying the XML statistics (maximum depth, node count,
        /// array count), the flattened nodes, the schema, and the tokens.
        /// </returns>
        private ParseResult ProcessSourceContent(string content)
        {
            ParseResult result = new ParseResult
            {
                Xml = new ParseResult.XmlParseResult()
            };

            XElement root = XElement.Parse(XmlTools.Convert(content));

            // Flatten reports its statistics through out parameters, so no initial values are needed.
            int depth;
            int nodeCount;
            int arrayCount;
            result.Flattened = Flatten(root, out depth, out nodeCount, out arrayCount);

            result.Xml.MaxDepth = depth;
            result.Xml.Nodes    = nodeCount;
            result.Xml.Arrays   = arrayCount;

            result.Schema = ParserCommon.BuildSchema(result.Flattened);
            result.Tokens = ParserCommon.GetTokens(result.Flattened, _TextParser);

            result.Success  = true;
            result.Time.End = DateTime.UtcNow;
            return result;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a parse result from a data table: every cell becomes a flattened data node,
        /// from which the schema and tokens are derived.
        /// </summary>
        /// <param name="dataTable">Table to process; may be null or empty.</param>
        /// <returns>
        /// For a null or row-less table, a result with only its end time stamped (the success flag
        /// is not set on that path). Otherwise a result flagged as successful with flattened nodes,
        /// schema, tokens, and the row and column counts.
        /// </returns>
        private ParseResult ProcessSourceContent(DataTable dataTable)
        {
            ParseResult result = new ParseResult
            {
                Sql = new ParseResult.SqlParseResult()
            };

            bool hasRows = dataTable != null && dataTable.Rows.Count > 0;
            if (!hasRows)
            {
                // Nothing to parse: stamp the end time and return without marking success.
                result.Time.End = DateTime.UtcNow;
                return result;
            }

            // One dictionary per row; one data node per column value.
            foreach (Dictionary<string, object> row in Common.DataTableToListDictionary(dataTable))
            {
                foreach (KeyValuePair<string, object> cell in row)
                {
                    DataNode node = new DataNode(cell.Key, cell.Value, DataNode.TypeFromValue(cell.Value));
                    result.Flattened.Add(node);
                }
            }

            result.Schema = ParserCommon.BuildSchema(result.Flattened);
            result.Tokens = ParserCommon.GetTokens(result.Flattened, _TextParser);

            result.Sql.Rows    = dataTable.Rows.Count;
            result.Sql.Columns = dataTable.Columns.Count;

            result.Success  = true;
            result.Time.End = DateTime.UtcNow;
            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Tokenizes plain text using the text options from <c>_ParseOptions</c>.
        /// </summary>
        /// <param name="data">Text passed to <c>ParserCommon.GetTokens</c>.</param>
        /// <returns>A result flagged as successful with its tokens and end time set.</returns>
        private ParseResult ProcessSourceContent(string data)
        {
            ParseResult result = new ParseResult
            {
                Tokens  = ParserCommon.GetTokens(data, _ParseOptions.Text),
                Success = true
            };

            result.Time.End = DateTime.UtcNow;
            return result;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Parses the supplied content as JSON via <c>JToken.Parse</c>, flattens the token tree,
        /// and derives the schema and tokens from the flattened nodes.
        /// </summary>
        /// <param name="content">JSON text to parse.</param>
        /// <returns>
        /// A result flagged as successful, carrying the JSON statistics (maximum depth, array count,
        /// node count), the flattened nodes, the schema, and the tokens.
        /// </returns>
        private ParseResult ProcessSourceContent(string content)
        {
            ParseResult result = new ParseResult
            {
                Json = new ParseResult.JsonParseResult()
            };

            JToken root = JToken.Parse(content);

            // Statistics come back through out parameters in the order: depth, arrays, nodes.
            int depth;
            int arrays;
            int nodes;
            result.Flattened = Flatten(root, out depth, out arrays, out nodes);

            result.Json.MaxDepth = depth;
            result.Json.Arrays   = arrays;
            result.Json.Nodes    = nodes;

            result.Schema = ParserCommon.BuildSchema(result.Flattened);
            result.Tokens = ParserCommon.GetTokens(result.Flattened, _TextParser);

            result.Success  = true;
            result.Time.End = DateTime.UtcNow;
            return result;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Parses CSV bytes. The first non-empty record is taken as the header row; every later
        /// record becomes a column-name to value dictionary whose entries are added as flattened
        /// data nodes, from which the schema and tokens are built.
        /// </summary>
        /// <param name="data">Raw CSV content, read through a <c>StreamReader</c>.</param>
        /// <returns>
        /// A result flagged as successful with flattened nodes, schema, tokens, the number of
        /// records read (header included), the widest record length seen, and the irregular flag
        /// set when any data record's length differs from the current column count.
        /// </returns>
        /// <exception cref="DuplicateNameException">
        /// The header row contains the same column name more than once.
        /// </exception>
        private ParseResult ProcessSourceContent(byte[] data)
        {
            ParseResult result = new ParseResult();
            result.Csv = new ParseResult.CsvParseResult();

            List<Dictionary<string, object>> records = new List<Dictionary<string, object>>();
            string[] header      = null;
            int      rowCount    = 0;
            int      columnCount = 0;

            // These settings are written onto the shared _CsvConfiguration instance.
            _CsvConfiguration.Delimiter   = _ParseOptions.Csv.ColumnDelimiter.ToString();
            _CsvConfiguration.TrimOptions = TrimOptions.Trim;

            using (MemoryStream buffer = new MemoryStream())
            {
                buffer.Write(data, 0, data.Length);
                buffer.Seek(0, SeekOrigin.Begin);

                using (TextReader reader = new StreamReader(buffer))
                using (CsvHelper.CsvParser parser = new CsvHelper.CsvParser(reader, _CsvConfiguration))
                {
                    while (parser.Read())
                    {
                        string[] fields = parser.Record;

                        // A null or empty record ends parsing.
                        if (fields == null || fields.Length == 0)
                        {
                            break;
                        }

                        if (rowCount == 0)
                        {
                            // Header row: names must be unique so they can serve as dictionary keys.
                            header = fields;

                            if (header.Distinct().Count() != header.Length)
                            {
                                throw new DuplicateNameException("Supplied CSV contains headers that would create duplicate columns.");
                            }

                            columnCount = header.Length;
                        }
                        else
                        {
                            Dictionary<string, object> row = new Dictionary<string, object>();

                            for (int col = 0; col < fields.Length; col++)
                            {
                                // Columns beyond the header, or with a blank header, get a generated name.
                                bool   hasName = col < header.Length && !String.IsNullOrEmpty(header[col]);
                                string key     = hasName
                                    ? header[col]
                                    : _ParseOptions.Csv.UnknownColumnPrefix + col.ToString();

                                row.Add(key, fields[col]);
                            }

                            if (fields.Length != columnCount)
                            {
                                result.Csv.Irregular = true;
                            }

                            // Track the widest record seen so far.
                            if (fields.Length > columnCount)
                            {
                                columnCount = fields.Length;
                            }

                            records.Add(row);
                        }

                        rowCount++;
                    }
                }
            }

            foreach (Dictionary<string, object> row in records)
            {
                foreach (KeyValuePair<string, object> cell in row)
                {
                    result.Flattened.Add(new DataNode(cell.Key, cell.Value, DataNode.TypeFromValue(cell.Value)));
                }
            }

            result.Schema = ParserCommon.BuildSchema(result.Flattened);
            result.Tokens = ParserCommon.GetTokens(result.Flattened, _TextParser);

            result.Csv.Rows    = rowCount;
            result.Csv.Columns = columnCount;

            result.Success  = true;
            result.Time.End = DateTime.UtcNow;
            return result;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Extracts tokens from the text content of an HTML string.
        /// </summary>
        /// <remarks>
        /// The markup is loaded with HtmlAgilityPack. For every node without child nodes, the
        /// trimmed inner text (when non-empty) is parsed by <c>_TextParser</c>, and each resulting
        /// token is accumulated through <c>ParserCommon.AddToken</c>. The final list is ordered by
        /// descending <c>Count</c>.
        /// </remarks>
        /// <param name="data">HTML markup; may be null or empty.</param>
        /// <returns>The accumulated tokens; an empty list when <paramref name="data"/> is null or empty.</returns>
        private List <Token> GetTokens(string data)
        {
            if (String.IsNullOrEmpty(data))
            {
                return(new List <Token>());
            }

            // Using HtmlAgilityPack
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(data);

            List <string> lines = new List <string>();
            HtmlNode      root  = doc.DocumentNode;

            if (root != null)
            {
                // Walk the descendant sequence exactly once; the previous Count() + ToList()
                // pair enumerated the lazily produced sequence twice.
                foreach (HtmlNode node in root.DescendantsAndSelf())
                {
                    if (node.HasChildNodes)
                    {
                        continue;
                    }

                    string text = node.InnerText;
                    if (!String.IsNullOrWhiteSpace(text))
                    {
                        lines.Add(text.Trim());
                    }
                }
            }

            List <Token> ret = new List <Token>();

            foreach (string line in lines)
            {
                ParseResult pr = _TextParser.ParseString(line);
                if (pr == null || pr.Tokens == null)
                {
                    continue;
                }

                foreach (Token currToken in pr.Tokens)
                {
                    ret = ParserCommon.AddToken(ret, currToken);
                }
            }

            // ret is whatever AddToken last returned, so keep the null guard before sorting.
            if (ret != null && ret.Count > 0)
            {
                ret = ret.OrderByDescending(u => u.Count).ToList();
            }

            return(ret);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Parses an HTML document into head metadata, body content, and a combined token list.
        /// </summary>
        /// <remarks>
        /// Head tokens are taken from a single string assembled from the title, meta description,
        /// meta keywords, OpenGraph description, and OpenGraph video tags. Body tokens are taken
        /// from the body content and have their positions shifted so that they follow the highest
        /// head token position. The combined list holds the head tokens followed by the shifted
        /// body tokens, the latter added through <c>ParserCommon.AddToken</c>.
        /// </remarks>
        /// <param name="data">HTML markup to parse.</param>
        /// <returns>
        /// A result flagged as successful with head and body details, combined tokens, schema,
        /// and end time populated.
        /// </returns>
        private ParseResult ProcessSourceContent(string data)
        {
            #region Load-Document

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(data);

            ParseResult ret = new ParseResult();
            ret.Html = new ParseResult.HtmlParseResult();

            #endregion

            #region Head

            ret.Html.Head.Title                    = GetTitle(data);
            ret.Html.Head.MetaDescription          = GetMetaDescription(doc);
            ret.Html.Head.MetaKeywords             = GetMetaKeywords(doc);
            ret.Html.Head.MetaImageOpengraph       = GetMetaImageOpengraph(doc);
            ret.Html.Head.MetaDescriptionOpengraph = GetMetaDescriptionOpengraph(doc);
            ret.Html.Head.MetaVideoTagsOpengraph   = GetMetaVideoTagsOpengraph(doc);

            // Every available head field is appended, space-separated, into one string for tokenizing.
            StringBuilder head = new StringBuilder(" ");

            if (!String.IsNullOrEmpty(ret.Html.Head.Title))
            {
                head.Append(" " + ret.Html.Head.Title);
            }

            if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescription))
            {
                head.Append(" " + ret.Html.Head.MetaDescription);
            }

            if (ret.Html.Head.MetaKeywords != null && ret.Html.Head.MetaKeywords.Count > 0)
            {
                head.Append(" " + String.Join(" ", ret.Html.Head.MetaKeywords));
            }

            if (!String.IsNullOrEmpty(ret.Html.Head.MetaDescriptionOpengraph))
            {
                head.Append(" " + ret.Html.Head.MetaDescriptionOpengraph);
            }

            if (ret.Html.Head.MetaVideoTagsOpengraph != null && ret.Html.Head.MetaVideoTagsOpengraph.Count > 0)
            {
                head.Append(" " + String.Join(" ", ret.Html.Head.MetaVideoTagsOpengraph));
            }

            ret.Html.Head.Content = head.ToString();
            ret.Html.Head.Tokens  = ParserCommon.GetTokens(ret.Html.Head.Content, _ParseOptions.Text);

            #endregion

            #region Body

            ret.Html.Body.ImageUrls = GetImageUrls(doc, data);
            ret.Html.Body.Links     = GetLinks(doc);
            ret.Html.Body.Content   = GetHtmlBody(doc);
            ret.Html.Body.Tokens    = ParserCommon.GetTokens(ret.Html.Body.Content, _ParseOptions.Text);

            #endregion

            #region Data

            ret.Tokens = new List <Token>();

            // One past the highest position used by any head token (0 when there are none).
            long bodyStartingPosition = 0;
            if (ret.Html.Head.Tokens != null && ret.Html.Head.Tokens.Count > 0)
            {
                ret.Tokens.AddRange(ret.Html.Head.Tokens);

                foreach (Token token in ret.Html.Head.Tokens)
                {
                    if (token.Positions != null && token.Positions.Count > 0)
                    {
                        long maxPos = token.Positions.Max();

                        if (maxPos >= bodyStartingPosition)
                        {
                            bodyStartingPosition = (maxPos + 1);
                        }
                    }
                }
            }

            // bodyStartingPosition + [body token position] will yield the correct position across the entire set of tokens

            if (ret.Html.Body.Tokens != null && ret.Html.Body.Tokens.Count > 0)
            {
                List <Token> updatedTokens = new List <Token>();

                foreach (Token token in ret.Html.Body.Tokens)
                {
                    Token updated = new Token();
                    updated.Value     = token.Value;
                    updated.Count     = token.Count;
                    updated.Positions = new List <long>();

                    if (token.Positions != null && token.Positions.Count > 0)
                    {
                        foreach (long tokenPos in token.Positions)
                        {
                            updated.Positions.Add(bodyStartingPosition + tokenPos);
                        }
                    }

                    updatedTokens.Add(updated);
                }

                // Swap in the shifted tokens once, after the loop; this assignment used to sit
                // inside the foreach that was enumerating the very property being replaced.
                ret.Html.Body.Tokens = updatedTokens;

                foreach (Token token in ret.Html.Body.Tokens)
                {
                    ret.Tokens = ParserCommon.AddToken(ret.Tokens, token);
                }
            }

            ret.Schema = BuildSchema();

            #endregion

            ret.Success  = true;
            ret.Time.End = DateTime.UtcNow;
            return(ret);
        }