示例#1
0
        /// <summary>
        /// Groups the containers matched by <c>ContainerXPaths.MARKET</c> under <paramref name="marketNode"/>
        /// by the inner text of each container's first child, and collects the nodes matched by
        /// <c>OddXPaths.NODE</c> inside every group.
        /// </summary>
        /// <param name="marketNode">Node the container XPath is evaluated against.</param>
        /// <returns>
        /// One entry per distinct first-child text, paired with the odd nodes found in that group's
        /// containers. Empty when no container matches; containers without any child are skipped.
        /// </returns>
        private IEnumerable <KeyValuePair <string, List <HtmlNode> > > GetNodes(HtmlNode marketNode)
        {
            // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection containers = marketNode.SelectNodes(ContainerXPaths.MARKET);
            if (containers == null)
            {
                return new Dictionary <string, List <HtmlNode> >();
            }

            return containers
                   // A container without children has no text to group by; dereferencing it would throw.
                   .Where(container => container.FirstChild != null)
                   .GroupBy(container => container.FirstChild.InnerText)
                   .ToDictionary(
                       grouping => grouping.Key,
                       grouping => grouping
                                   // Same null-on-no-match behavior here: substitute an empty sequence.
                                   .SelectMany(container => (IEnumerable <HtmlNode>)container.SelectNodes(OddXPaths.NODE)
                                                            ?? Enumerable.Empty <HtmlNode>())
                                   .ToList());
        }
示例#2
0
        /// <summary>
        /// Runs the PhantomJS script against <paramref name="url"/>, then extracts the table rows and the
        /// text-bearing elements of the captured HTML into temporary files.
        /// </summary>
        /// <param name="url">Address of the page passed to the PhantomJS script.</param>
        /// <returns>
        /// An anonymous object with the properties <c>WebPageImagePath</c>, <c>WebPageHtmlPath</c>,
        /// <c>AllTagsJsonPath</c>, <c>DataSetXmlPath</c> and <c>PhantomConsoleOutput</c>. The JSON and XML
        /// paths are <c>null</c> when no <c>&lt;tr&gt;</c> element was found after all attempts.
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// <c>phantomjs.exe</c> does not exist in the directory of the assembly containing <c>Processor</c>.
        /// </exception>
        public static object ProcessWebSite(string url)
        {
            string theScript   = PhantomConvenience.JavaScriptTemplate;
            var    cbPath      = Path.GetDirectoryName(new Uri(typeof(Processor).Assembly.CodeBase).LocalPath);
            var    phantomPath = Path.Combine(cbPath, "phantomjs.exe");

            if (!File.Exists(phantomPath))
            {
                throw new FileNotFoundException($"PhantomJS executable not found at: {phantomPath}");
            }

            // The image path must end in ".png": phantomjs (or whatever it uses internally to render the
            // image) silently fails when the path has no image extension. A GUID-based name is used instead
            // of Path.GetTempFileName() + ".png" because GetTempFileName creates a zero-byte file under the
            // extension-less name, which would be left behind unused.
            var imageOutputPath = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.png");
            var htmlOutputPath  = Path.GetTempFileName();

            var htmlDoc = new HtmlAgilityPack.HtmlDocument();
            HtmlNodeCollection allTRtags = null;
            object phantomConsoleOutput  = null;

            var jsFile = Path.GetTempFileName();
            try
            {
                File.WriteAllText(jsFile, theScript);

                // Since some web sites are apparently really sketchy, try multiple times to get some
                // valid-looking HTML output before we give up.
                for (var attempt = 1; attempt <= WebSiteRetryCount && allTRtags == null; attempt++)
                {
                    phantomConsoleOutput = PhantomConvenience.ExecutePhantomScript(phantomPath, jsFile, url, imageOutputPath, htmlOutputPath);

                    htmlDoc.Load(htmlOutputPath);

                    // Some tables have THEAD, some have TBODY, etc. But all TD and TH tags should sit directly
                    // inside a row (TR), so find every TR; they are grouped by parent further down.
                    // SelectNodes returns null when nothing matches, which is what drives the retry.
                    allTRtags = htmlDoc.DocumentNode.SelectNodes("//tr");
                    if (allTRtags == null && attempt < WebSiteRetryCount)
                    {
                        // Only pause when another attempt will actually follow.
                        System.Threading.Thread.Sleep(300);
                    }
                }
            }
            finally
            {
                // The script file is only an input to PhantomJS and is not part of the result; remove it.
                // Cleanup is best-effort so a failed delete never masks the real outcome.
                try
                {
                    File.Delete(jsFile);
                }
                catch (IOException)
                {
                }
                catch (UnauthorizedAccessException)
                {
                }
            }

            if (allTRtags == null)
            {
                return(new
                {
                    WebPageImagePath = imageOutputPath,
                    WebPageHtmlPath = htmlOutputPath,
                    AllTagsJsonPath = (string)null,
                    DataSetXmlPath = (string)null,
                    PhantomConsoleOutput = phantomConsoleOutput
                });
            }

            // Rows are grouped by the XPath of their parent (<table>, <thead>, <tbody>, ...), so each
            // group becomes one DataTable.
            var groupedByTables = allTRtags.GroupBy(t => t.ParentNode.XPath);

            var xmlDataSetPath = Path.GetTempFileName();
            using (var data = new DataSet())
            {
                foreach (var tableGroup in groupedByTables)
                {
                    var dataTable = new DataTable();
                    dataTable.Columns.Add("rowid", typeof(int));

                    var currentRowNumber = 0;
                    foreach (var row in tableGroup)
                    {
                        var cells = row.SelectNodes("td|th");
                        if (cells == null || cells.Count == 0)
                        {
                            continue;
                        }

                        // Grow the table to fit the widest row seen so far (+1 for the rowid column).
                        while (dataTable.Columns.Count < cells.Count + 1)
                        {
                            dataTable.Columns.Add($"Column_{dataTable.Columns.Count}", typeof(string));
                        }

                        var values = new List <object> { currentRowNumber++ };
                        values.AddRange(cells.Select(cell => Decrapify(cell.InnerText)));
                        dataTable.Rows.Add(values.ToArray());
                    }
                    data.Tables.Add(dataTable);
                }

                data.WriteXml(xmlDataSetPath, XmlWriteMode.WriteSchema);
            }

            ////////////////////////////////////////////////////////////////////
            // now jam all the HTML nodes into a document that
            // kinda has everything - on some sites, data exists outside
            // the <table>, <td> and <th> tags
            ////////////////////////////////////////////////////////////////////
            IEnumerable <HtmlNode> allTheNodes = htmlDoc.DocumentNode.SelectNodes("//td|//th|//span|//div|//p|//h1|//h2|//h3|//h4");
            if (allTheNodes == null)
            {
                // No matching element at all: emit an empty JSON array instead of throwing.
                allTheNodes = Enumerable.Empty <HtmlNode>();
            }

            // The indexed Select supplies the line number, and ToList pins the result so the numbering
            // cannot drift if the sequence is ever enumerated more than once.
            var dataToWrite = allTheNodes
                              .Select((node, lineNumber) => new
                              {
                                  Content    = Decrapify(node.InnerText),
                                  LineNumber = lineNumber,
                                  Tag        = node.Name
                              })
                              .ToList();

            var allTagsJsonFileName = Path.GetTempFileName();
            File.WriteAllText(allTagsJsonFileName, Newtonsoft.Json.JsonConvert.SerializeObject(dataToWrite, Newtonsoft.Json.Formatting.Indented));

            return(new
            {
                WebPageImagePath = imageOutputPath,
                WebPageHtmlPath = htmlOutputPath,
                AllTagsJsonPath = allTagsJsonFileName,
                DataSetXmlPath = xmlDataSetPath,
                PhantomConsoleOutput = phantomConsoleOutput
            });
        }