Beispiel #1
0
        /// <summary>
        /// Loads the pool page for <paramref name="poolid"/> in Chrome, extracts the worker counts and
        /// hash-rate strings described by the f2pool config and forwards them to <c>UpdateSummary</c>.
        /// </summary>
        /// <param name="poolid">Pool identifier passed to <c>GetUrl</c> and <c>UpdateSummary</c>.</param>
        /// <remarks>
        /// Extraction failures are not rethrown. They are written to the error log only while
        /// <c>Commonflag.flag1</c> is set, and the flag is cleared after the first logged failure.
        /// </remarks>
        public static void WorkerSummary1(int poolid)
        {
            // Resolve the config and URL before starting a browser, so a missing config file
            // or a failing URL lookup cannot leave a Chrome process behind.
            var    jsonConfig = File.ReadAllText(@"Json\\f2pool.json");
            var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
            string url        = GetUrl(poolid);

            var driver = new ChromeDriver();

            try
            {
                driver.Navigate().GoToUrl(url);

                // Fixed pause before reading the DOM; presumably gives client-side rendering time to finish.
                Thread.Sleep(8000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // Entry 0 is not used; entries 1 and 2 are read as the active / inactive counts.
                    var active   = (int)json[1];
                    var inactive = (int)json[2];

                    // The dead-worker count is only read when the array has exactly four entries.
                    var dead = json.Count() == 4 ? (int)json[3] : 0;

                    var currentHashText = (string)jObject["currenthash"];
                    var dailyHashText   = (string)jObject["dailyhash"];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only the first failure; the flag suppresses repeats until it is set again elsewhere.
                    if (Commonflag.flag1 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag1 = false;
                        UpdateErrorLog("f2pool", error);
                    }
                }
            }
            finally
            {
                // Quit() closes every window and ends the session. A separate Close() first could
                // throw and skip Quit(), leaving the driver process running.
                driver.Quit();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Console entry point: extracts the "File" link targets from the listing page, downloads each
        /// one into an <c>img</c> folder under the current directory, then prints the extracted URLs.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var configJson = @"
            {
                '':'//a[contains(text(), \'File\')]/@href'
            }
            ";

            var config = StructuredDataConfig.ParseJsonString(configJson);

            var     html    = @"http://rule34.paheal.net/post/list";
            HtmlWeb web     = new HtmlWeb();
            var     htmlDoc = web.Load(html);
            var     body    = htmlDoc.Text;
            var     path    = Directory.GetCurrentDirectory() + "\\img\\";

            // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
            Directory.CreateDirectory(path);

            var openScraping    = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(body);

            // Characters stripped from both ends of every entry: backslash, double quote, '{' and space.
            char[] charstotrim = { '\x5C', '\x22', '\x7B', '\x20' };
            var    output      = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented).Split(',');

            // NOTE(review): the fixed offsets below depend on the exact indented JSON layout of the
            // serialized result; they will break if the serializer output shape changes.
            output[0] = output[0].Remove(0, 10);
            output[output.Length - 1] = output[output.Length - 1].Remove(output[output.Length - 1].Length - 8, 8);

            // The client is disposable; scope it to the download loop so it is released even on failure.
            using (WebClient wc = new WebClient())
            {
                for (int i = 0; i < output.Length; i++)
                {
                    output[i] = output[i].Remove(0, 7);
                    output[i] = output[i].Trim(charstotrim);

                    // Files are named by their index; the extension is "webm" when the URL mentions it,
                    // otherwise the last three characters of the URL.
                    string fileName = output[i].Contains("webm")
                        ? path + i + ".webm"
                        : path + i + "." + output[i].Remove(0, output[i].Length - 3);

                    Console.WriteLine(fileName);
                    wc.DownloadFile(output[i], fileName);
                }
            }

            Console.WriteLine("----------------------------");
            for (int i = 0; i < output.Length; i++)
            {
                Console.WriteLine(output[i]);
            }
            Console.ReadKey();
        }
        /// <summary>
        /// Checks that chaining ExtractTextTransformation, HtmlDecodeTransformation and
        /// RemoveExtraWhitespaceTransformation yields the expected single-spaced, HTML-decoded text.
        /// </summary>
        public void RemoveExtraWhitespaceTransformationTest()
        {
            var configJson = @"
            {
                'text': {
                    '_xpath': '//div[@id=\'content\']',
                    '_transformations': [
                        'ExtractTextTransformation',
                        'HtmlDecodeTransformation',
                        'RemoveExtraWhitespaceTransformation'
                    ]
                }
            }
            ";

            // Input with a run of spaces and HTML-encoded quotes inside the targeted div.
            var markup = "<html><body><div id='content'><a href=''>A link</a>with     adjacent text. &quot;the final frontier&quot;</div></body></html>";

            var parsedConfig  = StructuredDataConfig.ParseJsonString(configJson);
            var dataExtractor = new StructuredDataExtractor(parsedConfig);
            var extracted     = dataExtractor.Extract(markup);

            // Round-trip through JSON text so the assertion works on a dynamic token tree.
            var     serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
            dynamic parsedJson = JsonConvert.DeserializeObject(serialized);

            Assert.AreEqual("A link with adjacent text. \"the final frontier\"", parsedJson["text"].Value);
        }
Beispiel #4
0
        /// <summary>
        /// Extracts the celebrity list from the downloaded page, adds each entry's birth data,
        /// downloads each entry's image, and writes the combined result to <c>DBfile</c> as JSON.
        /// </summary>
        public void Scrape()
        {
            var listConfig    = StructuredDataConfig.ParseJsonString(Top100Config);
            var listHtml      = DownloadPage(saveTo: IMDBhtml);
            var listExtractor = new StructuredDataExtractor(listConfig);
            var extracted     = listExtractor.Extract(listHtml);

            using (WebClient downloader = new WebClient())
            {
                foreach (var celebrity in extracted["celebrities"])
                {
                    // Fetch the per-celebrity details first, then the image.
                    celebrity["birth"] = ScrapeCeleb(celebrity);

                    var    imageUrl  = celebrity["image"].ToString();
                    string imageFile = Path.GetFileName(new Uri(imageUrl).LocalPath);

                    // The image keeps the file name it has in the URL.
                    downloader.DownloadFile(imageUrl, imgPath + imageFile);
                }
            }

            var serializerSettings = new JsonSerializerSettings
            {
                StringEscapeHandling = StringEscapeHandling.Default
            };

            File.WriteAllText(DBfile, JsonConvert.SerializeObject(extracted, serializerSettings));
        }
Beispiel #5
0
        /// <summary>
        /// Builds one extractor per config file found in <paramref name="configRootFolder"/> that
        /// declares at least one URL pattern, and grows the regex cache when more than 15 patterns are loaded.
        /// </summary>
        /// <param name="configRootFolder">Directory searched for config files.</param>
        /// <param name="configFilesPattern">Search pattern passed to <c>Directory.GetFiles</c>.</param>
        public MultiExtractor(string configRootFolder, string configFilesPattern)
        {
            var configFiles = Directory.GetFiles(configRootFolder, configFilesPattern);

            if (configFiles == null || configFiles.Length == 0)
            {
                return;
            }

            // Total number of URL patterns; used to size the C# regex cache.
            var urlPatternTotal = 0;

            foreach (var configFile in configFiles)
            {
                var config = StructuredDataConfig.ParseJsonFile(configFile);

                // Configs without URL patterns are not registered.
                if (config.UrlPatterns == null || config.UrlPatterns.Count <= 0)
                {
                    continue;
                }

                urlPatternTotal += config.UrlPatterns.Count;

                var extractor = new StructuredDataExtractor(config);
                var pair      = Tuple.Create <ConfigSection, StructuredDataExtractor>(config, extractor);
                this.configsToExtractors.Add(pair);
            }

            // 15 is the default cache size; only ever raise it.
            if (urlPatternTotal > 15)
            {
                Regex.CacheSize = urlPatternTotal;
            }
        }
Beispiel #6
0
        /// <summary>
        /// Runs the support.office.com config against the saved support.office.com page and checks the
        /// extracted title, abstract, versions, sections and the lists of the second section.
        /// </summary>
        public void OfficeSupportExtractionTest()
        {
            var configPath = Path.Combine("TestData", "support.office.com.json");
            var config     = StructuredDataConfig.ParseJsonFile(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "support.office.com.html")));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            // Round-trip through JSON text so the assertions work on a dynamic token tree.
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            Assert.AreEqual("Export data to Excel", parsedJson["title"].Value, "The extracted title is incorrect");
            Assert.AreEqual("You can copy data from a Microsoft Office Access 2007 database into a worksheet by exporting a database object to a Microsoft Office Excel 2007 workbook. You do this by using the Export Wizard in Office Access 2007.", parsedJson["abstract"].Value, "The extracted abstract is incorrect");
            Assert.AreEqual(9, parsedJson["versions"].Count, "The extracted versions field is incorrect");

            Assert.AreEqual(5, parsedJson["sections"].Count, "The extracted json should have 5 sections");

            var secondSection = parsedJson["sections"][1];

            Assert.AreEqual("Exporting data to Excel: the basics", secondSection["title"].Value, "The title of the second section is incorrect");
            Assert.AreEqual(9, secondSection["text"]["paragraphs"].Count, "The paragraphs count of the second section is incorrect");
            // The failure message now names the field actually being checked.
            Assert.AreEqual(2, secondSection["text"]["unorderedLists"].Count, "The unordered lists count of the second section is incorrect");

            var secondList = secondSection["text"]["unorderedLists"][1];

            Assert.AreEqual("If this is the first time you are exporting data to Excel", secondList["title"].Value, "The title of the second list in the second section is incorrect");
            Assert.AreEqual(4, secondList["items"].Count, "The second list in the second section should have 4 items");
        }
Beispiel #7
0
        /// <summary>
        /// Runs the quora.com config against the saved "withwiki" page and checks the question title,
        /// the answer count, the best answer and the textAboveLength value of every extracted list.
        /// </summary>
        public void QuoraWithWikiExtractionTest()
        {
            var configFile    = Path.Combine("TestData", "quora.com.json");
            var parsedConfig  = StructuredDataConfig.ParseJsonFile(configFile);
            var dataExtractor = new StructuredDataExtractor(parsedConfig);
            var pageHtml      = File.ReadAllText(Path.Combine("TestData", "quora.com.withwiki.html"));
            var extracted     = dataExtractor.Extract(pageHtml);

            // Round-trip through JSON text so the assertions work on a dynamic token tree.
            dynamic parsedJson = JsonConvert.DeserializeObject(JsonConvert.SerializeObject(extracted, Formatting.Indented));

            // --- Question ---
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var questionNode = parsedJson["question"];

            Assert.AreEqual("What can I learn/know right now in 10 minutes that will be useful for the rest of my life?", questionNode["title"].Value, "The extracted title is incorrect");

            // --- Answers ---
            Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

            // --- Best answer ---
            Assert.AreNotEqual(null, parsedJson["bestAnswer"], "Extractor should find the best answer in the HTML file");

            var bestAnswer = parsedJson["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The content string should not be null in the extracted answer");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
            Assert.AreEqual(9, bestAnswer["lists"].Count, "The lists array should have 9 items");
            Assert.AreEqual(25, bestAnswer["lists"][1]["items"].Count, "Second item in the lists array should have 25 items");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in parsedJson["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        var lengthToken = entry["textAboveLength"];
                        Assert.AreEqual(JTokenType.Integer, lengthToken.Type, "The extracted textAboveLength should be an integer");

                        var aboveLength = ((JValue)lengthToken).ToObject <int>();
                        Assert.IsTrue(aboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", aboveLength));
                    }
                }
            }

            // The same check for the lists of the best answer.
            var bestAnswerLists = bestAnswer["lists"];

            if (bestAnswerLists != null)
            {
                foreach (var entry in bestAnswerLists)
                {
                    var lengthToken = entry["textAboveLength"];
                    Assert.AreEqual(JTokenType.Integer, lengthToken.Type, "The extracted textAboveLength should be an integer");

                    var aboveLength = ((JValue)lengthToken).ToObject <int>();
                    Assert.IsTrue(aboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", aboveLength));
                }
            }
        }
        /// <summary>
        /// Creates a data extractor from the JSON config registered under <paramref name="name"/>.
        /// </summary>
        /// <param name="name">Name used to look up the JSON config via <c>GetByName</c>.</param>
        /// <returns>The extractor produced by <c>getDataExtractor</c> for the parsed config.</returns>
        public IDataExtractor <TRawData> CreateFromName(string name)
        {
            ConfigSection parsedConfig = StructuredDataConfig.ParseJsonString(GetByName(name));

            return getDataExtractor(parsedConfig);
        }
        /// <summary>
        /// Extracts structured data from <paramref name="content"/> using the stored JSON config
        /// and sends the result, rendered as a string, through <c>_sender</c>.
        /// </summary>
        /// <param name="content">Raw content handed to the extractor.</param>
        public async Task Transform(string content)
        {
            var parsedConfig = StructuredDataConfig.ParseJsonString(_jsonConfig);
            var extractor    = new StructuredDataExtractor(parsedConfig);
            var extracted    = extractor.Extract(content);

            string payload = extracted.ToString();
            await _sender.Send(payload);
        }
Beispiel #10
0
        /// <summary>
        /// Loads the pool page for <paramref name="poolid"/> in Chrome, extracts the hash-rate strings
        /// and worker counts described by the spiderpool config and forwards them to <c>UpdateSummary</c>.
        /// </summary>
        /// <param name="poolid">Pool identifier passed to <c>GetUrl</c> and <c>UpdateSummary</c>.</param>
        /// <remarks>
        /// Extraction failures are not rethrown. They are written to the error log only while
        /// <c>Commonflag.flag7</c> is set, and the flag is cleared after the first logged failure.
        /// </remarks>
        public static void WorkerSummary7(int poolid)
        {
            // Resolve the config and URL before starting a browser, so a missing config file
            // or a failing URL lookup cannot leave a Chrome process behind.
            var    jsonConfig = File.ReadAllText(@"Json\\spiderpool.json");
            var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
            string url        = GetUrl(poolid);

            var driver = new ChromeDriver();

            try
            {
                driver.Navigate().GoToUrl(url);

                // Fixed pause before reading the DOM; presumably gives client-side rendering time to finish.
                Thread.Sleep(1000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // Entries 8 and 10 hold the current and daily hash-rate strings.
                    var currentHashText = (string)json[8];
                    var dailyHashText   = (string)json[10];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // Entry 6 is parsed as "<active>/<total>"; inactive is derived from the two.
                    var workerText = (string)json[6];
                    var active     = Int32.Parse(workerText.Substring(0, workerText.IndexOf('/')));
                    var total      = Int32.Parse(workerText.Substring(workerText.LastIndexOf('/') + 1));
                    int inactive   = total - active;
                    int dead       = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only the first failure; the flag suppresses repeats until it is set again elsewhere.
                    if (Commonflag.flag7 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag7 = false;

                        // Log under this pool's own name; the previous "viabtc" label
                        // made spiderpool failures look like ViaBTC failures.
                        UpdateErrorLog("spiderpool", error);
                    }
                }
            }
            finally
            {
                // Quit() closes every window and ends the session. A separate Close() first could
                // throw and skip Quit(), leaving the driver process running.
                driver.Quit();
            }
        }
        /// <summary>
        /// Downloads the URL given in the request body and returns either the final response URL with an
        /// id extracted from it by regex (when <c>json.isID</c> is set), or the structured data extracted
        /// from the page HTML after every anchor href has been rewritten to an absolute URL.
        /// </summary>
        /// <param name="json">Request payload: target URL, extraction config, and the <c>isID</c> / <c>javascript</c> switches.</param>
        /// <returns>A JSON result wrapping the serialized (indented) output.</returns>
        /// <remarks>
        /// NOTE(review): the target URL comes straight from the request body and is fetched server-side;
        /// confirm that callers are trusted or that the URL is validated upstream.
        /// </remarks>
        public IActionResult JsonResult([FromBody] ObjectJson json)
        {
            var targetUrl = WebUtility.UrlDecode(json.url);

            MyWebClient webClient = new MyWebClient()
            {
                Encoding = Encoding.UTF8
            };
            webClient.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";

            if (!json.isID)
            {
                var pageBase      = new Uri(targetUrl);
                var useJavascript = json.javascript;
                var config        = StructuredDataConfig.ParseJsonString(json.data);

                // With the javascript switch set, the page is fetched through the configured rendering endpoint.
                string downloadUrl;
                if (useJavascript)
                {
                    downloadUrl = this.configuration.GetAppSetting("UrlSeleniumGetHtmlExcuteJavascript") + "?url=" + WebUtility.UrlEncode(targetUrl);
                }
                else
                {
                    downloadUrl = targetUrl;
                }

                var pageHtml = webClient.DownloadString(downloadUrl);

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(pageHtml);

                HtmlNodeCollection anchors = document.DocumentNode.SelectNodes("//a");
                if (anchors != null)
                {
                    foreach (HtmlNode anchor in anchors)
                    {
                        var href = anchor.Attributes["href"];
                        if (href == null || href.Value == "")
                        {
                            continue;
                        }

                        try
                        {
                            // Resolve the link against the page URL; links that cannot be resolved stay as they are.
                            var rawLink = href.Value.Trim();
                            href.Value = new Uri(pageBase, rawLink).AbsoluteUri;
                        }
                        catch (Exception) { }
                    }
                }

                pageHtml = document.DocumentNode.InnerHtml;

                var extractor = new StructuredDataExtractor(config);
                var extracted = extractor.Extract(pageHtml);

                return(Json(JsonConvert.SerializeObject(extracted, Formatting.Indented)));
            }

            // ID mode: the download result is discarded; only the final response URL is used.
            webClient.DownloadData(targetUrl);
            var finalUrl = webClient.ResponseUri.ToString();
            var idRule   = JsonConvert.DeserializeObject <JsonIDInput>(json.data);

            string extractedId = "";
            try
            {
                var match = Regex.Match(finalUrl, idRule._xpath);
                extractedId = match.Groups[idRule.group_number].Value;
            }
            catch (Exception) { }

            var idResult = new JsonIDresult {
                url = finalUrl, id = extractedId
            };

            return(Json(JsonConvert.SerializeObject(idResult, Formatting.Indented)));
        }
Beispiel #12
0
        /// <summary>
        /// Loads the pool page for <paramref name="poolid"/> in Chrome, extracts the hash-rate strings
        /// and worker counts described by the viabtc config and forwards them to <c>UpdateSummary</c>.
        /// </summary>
        /// <param name="poolid">Pool identifier passed to <c>GetUrl</c> and <c>UpdateSummary</c>.</param>
        /// <remarks>
        /// Extraction failures are not rethrown. They are written to the error log only while
        /// <c>Commonflag.flag6</c> is set, and the flag is cleared after the first logged failure.
        /// </remarks>
        public static void WorkerSummary6(int poolid)
        {
            // Resolve the config and URL before starting a browser, so a missing config file
            // or a failing URL lookup cannot leave a Chrome process behind.
            var    jsonConfig = File.ReadAllText(@"Json\\viabtc.json");
            var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
            string url        = GetUrl(poolid);

            var driver = new ChromeDriver();

            try
            {
                driver.Navigate().GoToUrl(url);

                // Unlike the sibling workers, the page source is read immediately after navigation, with no pause.
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // Entries 0 and 2 hold the current and daily hash-rate strings.
                    var currentHashText = (string)json[0];
                    var dailyHashText   = (string)json[2];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // Entries 3 and 4 are parsed as the active / inactive counts; no dead count is extracted.
                    var active   = Int32.Parse((string)json[3]);
                    var inactive = Int32.Parse((string)json[4]);
                    int dead     = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only the first failure; the flag suppresses repeats until it is set again elsewhere.
                    if (Commonflag.flag6 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag6 = false;
                        UpdateErrorLog("viabtc", error);
                    }
                }
            }
            finally
            {
                // Quit() closes every window and ends the session. A separate Close() first could
                // throw and skip Quit(), leaving the driver process running.
                driver.Quit();
            }
        }
        /// <summary>
        /// Extracts structured data from <paramref name="content"/> using the config produced by
        /// <c>GenerateJson</c> and sends the result, rendered as a string, through <c>_sender</c>.
        /// </summary>
        /// <param name="content">Raw content handed to the extractor.</param>
        public async Task Transform(string content)
        {
            // Transform into JSON. The object template should live in the shared project because
            // the Loader needs the same object to deserialize.
            var parsedConfig = StructuredDataConfig.ParseJsonString(GenerateJson());
            var extractor    = new StructuredDataExtractor(parsedConfig);
            var extracted    = extractor.Extract(content);

            string payload = extracted.ToString();
            await _sender.Send(payload);
        }
        /// <summary>
        /// Creates an extractor from a JSON config string and loads the available transformations.
        /// </summary>
        /// <param name="configString">JSON text of the extraction config.</param>
        /// <exception cref="ArgumentNullException"><paramref name="configString"/> is null or empty.</exception>
        public StructuredDataExtractor(string configString)
        {
            bool hasConfigText = !string.IsNullOrEmpty(configString);

            if (!hasConfigText)
            {
                throw new ArgumentNullException(nameof(configString));
            }

            config = StructuredDataConfig.ParseJsonString(configString);
            LoadTransformations();
        }
Beispiel #15
0
        /// <summary>
        /// Loads the pool page for <paramref name="poolid"/> in Chrome, extracts the hash-rate strings
        /// and worker counts described by the antpool config and forwards them to <c>UpdateSummary</c>.
        /// </summary>
        /// <param name="poolid">Pool identifier passed to <c>GetUrl</c> and <c>UpdateSummary</c>.</param>
        /// <remarks>
        /// Extraction failures are not rethrown. They are written to the error log only while
        /// <c>Commonflag.flag5</c> is set, and the flag is cleared after the first logged failure.
        /// </remarks>
        public static void WorkerSummary5(int poolid)
        {
            // Resolve the config and URL before starting a browser, so a missing config file
            // or a failing URL lookup cannot leave a Chrome process behind.
            var    jsonConfig = File.ReadAllText(@"Json\\antpool.json");
            var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
            string url        = GetUrl(poolid);

            var driver = new ChromeDriver();

            try
            {
                driver.Navigate().GoToUrl(url);

                // Fixed pause before reading the DOM; presumably gives client-side rendering time to finish.
                Thread.Sleep(1000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // Entry 0 carries the worker counts; entries 1 and 3 the current and daily hash-rate strings.
                    var workerText      = (string)json[0];
                    var currentHashText = (string)json[1];
                    var dailyHashText   = (string)json[3];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // Split on runs of non-digits: the first number is read as active, the second as total.
                    var numbers  = Regex.Split(workerText.Trim(), @"\D+");
                    var active   = Int32.Parse(numbers[0]);
                    var total    = Int32.Parse(numbers[1]);
                    var inactive = total - active;
                    int dead     = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only the first failure; the flag suppresses repeats until it is set again elsewhere.
                    if (Commonflag.flag5 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag5 = false;
                        UpdateErrorLog("antpool", error);
                    }
                }
            }
            finally
            {
                // Quit() closes every window and ends the session. A separate Close() first could
                // throw and skip Quit(), leaving the driver process running.
                driver.Quit();
            }
        }
Beispiel #16
0
        /// <summary>
        /// Runs the answers.microsoft.com config against the saved answers.microsoft.com page and checks
        /// the extracted question, its hints, the answers and every list's textAboveLength value.
        /// </summary>
        public void MicrosoftAnswersExtractionTest()
        {
            var configFile    = Path.Combine("TestData", "answers.microsoft.com.json");
            var parsedConfig  = StructuredDataConfig.ParseJsonFile(configFile);
            var dataExtractor = new StructuredDataExtractor(parsedConfig);
            var pageHtml      = File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html"));
            var extracted     = dataExtractor.Extract(pageHtml);

            // Round-trip through JSON text so the assertions work on a dynamic token tree.
            dynamic parsedJson = JsonConvert.DeserializeObject(JsonConvert.SerializeObject(extracted, Formatting.Indented));

            // --- Question ---
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var questionNode = parsedJson["question"];

            Assert.AreEqual("8acb1ac5-0acd-4c68-9eeb-e4afff5b39d8", questionNode["id"].Value, "The extracted id is incorrect");
            Assert.AreEqual("I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar", questionNode["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, questionNode["content"], "The extracted question should have a content");
            Assert.IsTrue(questionNode["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreEqual(1642653, questionNode["views"].Value, "The extracted views snippet is incorrect");

            // --- Question context ---
            Assert.AreNotEqual(null, questionNode["hints"], "The extracted question should have hints");
            Assert.AreEqual(4, questionNode["hints"].Count, "The extracted question should have 4 hints");
            Assert.AreEqual("PC", questionNode["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

            // --- Answers ---
            Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(2, parsedJson["answers"].Count, "Extractor should find two answers in the thread summary of the HTML file");

            var helpfulReply = parsedJson["answers"][1];

            Assert.AreEqual("Most Helpful Reply", helpfulReply["type"].Value, "The extracted type of the answer is incorrect");
            Assert.AreNotEqual(null, helpfulReply["content"], "The content array in the extracted answer should not be null");
            Assert.IsTrue(helpfulReply["content"].Count > 0, "The content array in the extracted answer should have one or more items");
            Assert.AreEqual(4, helpfulReply["lists"].Count, "The lists array should have 4 items");
            Assert.IsTrue(helpfulReply["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in parsedJson["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        var lengthToken = entry["textAboveLength"];
                        Assert.AreEqual(JTokenType.Integer, lengthToken.Type, "The extracted textAboveLength should be an integer");

                        var aboveLength = ((JValue)lengthToken).ToObject <int>();
                        Assert.IsTrue(aboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", aboveLength));
                    }
                }
            }
        }
Beispiel #17
0
        /// <summary>
        /// Runs the quora.com config against the saved quora.com page and checks the extracted question,
        /// its hints, the first two answers and every list's textAboveLength value.
        /// </summary>
        public void QuoraExtractionTest()
        {
            var configFile    = Path.Combine("TestData", "quora.com.json");
            var parsedConfig  = StructuredDataConfig.ParseJsonFile(configFile);
            var dataExtractor = new StructuredDataExtractor(parsedConfig);
            var pageHtml      = File.ReadAllText(Path.Combine("TestData", "quora.com.html"));
            var extracted     = dataExtractor.Extract(pageHtml);

            // Round-trip through JSON text so the assertions work on a dynamic token tree.
            dynamic parsedJson = JsonConvert.DeserializeObject(JsonConvert.SerializeObject(extracted, Formatting.Indented));

            // --- Question ---
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var questionNode = parsedJson["question"];

            Assert.AreEqual("What are some tips for creating a successful Kickstarter project?", questionNode["title"].Value, "The extracted title is incorrect");

            // --- Question context ---
            Assert.AreNotEqual(null, questionNode["hints"], "The extracted question should have hints");
            Assert.AreEqual(5, questionNode["hints"].Count, "The extracted question should have 5 hints");
            Assert.AreEqual("Kickstarter", questionNode["hints"][3].ToString(), "The 4th hint of the extracted question should be Kickstarter");

            // --- Answers ---
            Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

            var leadingAnswer = parsedJson["answers"][0];

            Assert.AreNotEqual(null, leadingAnswer["content"], "The content string should not be null in the extracted answer");
            Assert.IsTrue(leadingAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
            Assert.AreEqual(6800, leadingAnswer["views"].Value, "The extracted views count is incorrect");
            Assert.AreEqual(1, leadingAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.IsTrue(leadingAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

            var followingAnswer = parsedJson["answers"][1];

            Assert.AreEqual(2, followingAnswer["views"].Value, "The extracted views count is incorrect");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in parsedJson["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        var lengthToken = entry["textAboveLength"];
                        Assert.AreEqual(JTokenType.Integer, lengthToken.Type, "The extracted textAboveLength should be an integer");

                        var aboveLength = ((JValue)lengthToken).ToObject <int>();
                        Assert.IsTrue(aboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", aboveLength));
                    }
                }
            }
        }
Beispiel #18
0
        /// <summary>
        /// Downloads the celebrity's own page and returns the birth data extracted from it.
        /// </summary>
        /// <param name="jToken">
        /// Celebrity entry. Its <c>page</c> value is overwritten in place with the value prefixed by
        /// <c>https://www.imdb.com</c> before the page is downloaded.
        /// </param>
        /// <returns>The <c>celebrities.birth</c> token from the extraction result.</returns>
        private JToken ScrapeCeleb(JToken jToken)
        {
            var celebConfig = StructuredDataConfig.ParseJsonString(CelebConfig);

            // The stored page value is turned into a full URL; the caller sees this change.
            jToken["page"] = "https://www.imdb.com" + jToken["page"];

            var pageHtml  = DownloadPage(jToken["page"].ToString());
            var extractor = new StructuredDataExtractor(celebConfig);
            var extracted = extractor.Extract(pageHtml);

            return extracted["celebrities"]["birth"];
        }
Beispiel #19
0
        /// <summary>
        /// Extracts the question and best answer from the second Stack Overflow sample page using the
        /// stackexchange.com.json rules and verifies titles, content, votes, hints and answer lists.
        /// </summary>
        public void StackExchangeEx2ExtractionTest()
        {
            var configPath = Path.Combine("TestData", "stackexchange.com.json");
            var config     = StructuredDataConfig.ParseJsonFile(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "stackoverflow.com.example2.html")));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            // Round-trip through JSON text so the assertions below can index the result dynamically.
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Question
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var question = parsedJson["question"];

            Assert.AreEqual("How to configure Visual Studio 2008 to use IIS Express?", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
            Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
            Assert.AreEqual(9, question["votes"].Value, "The votes extracted from the question should have a value of 9");

            // Question context
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(2, question["hints"].Count, "The extracted question should have 2 hints");
            Assert.AreEqual("iis-express", question["hints"][1].ToString(), "The 2nd hint of the extracted question should be iis-express");

            // Best Answer
            var bestAnswer = parsedJson["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
            Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
            Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
            Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
            Assert.AreEqual(17, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 17");
            Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.AreEqual(7, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 7 items");

            // Check that textAboveLength exists in each list and is a positive integer
            var lists = bestAnswer["lists"];

            if (lists != null)
            {
                foreach (var list in lists)
                {
                    Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                    var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                    Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
                }
            }
        }
        /// <summary>
        /// Extracts the question and best answer from the first Stack Overflow sample page using the
        /// stackexchange.com.json rules and verifies titles, content, votes, hints and answer lists.
        /// </summary>
        public void StackExchangeEx1ExtractionTest()
        {
            var configPath = "stackexchange.com.json";
            var config     = StructuredDataConfig.Parse(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText("stackoverflow.com.example1.html"));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            // Round-trip through JSON text so the assertions below can index the result dynamically.
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Question
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var question = parsedJson["question"];

            Assert.AreEqual("Is there a way to crack the password on an Excel VBA Project?", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
            Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
            Assert.AreEqual(196, question["votes"].Value, "The votes extracted from the question should have a value of 196");

            // Question context
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
            Assert.AreEqual("passwords", question["hints"][3].ToString(), "The 4th hint of the extracted question should be passwords");

            // Best Answer
            var bestAnswer = parsedJson["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
            Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
            Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
            Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
            Assert.AreEqual(153, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 153");
            Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.AreEqual(8, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 8 items");

            // Check that textAboveLength exists in each list and is a positive integer
            var lists = bestAnswer["lists"];

            if (lists != null)
            {
                foreach (var list in lists)
                {
                    Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                    var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                    Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
                }
            }
        }
        /// <summary>
        /// Runs the extractor with the parse_date_rules.json rules over article_with_date.html and
        /// checks the three extracted date fields against fixed expected dates.
        /// </summary>
        public void ParseDateTest()
        {
            var     configPath = Path.Combine("TestData", "parse_date_rules.json");
            var     config     = StructuredDataConfig.ParseJsonFile(configPath);
            var     extractor  = new StructuredDataExtractor(config);
            var     result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
            var     json       = JsonConvert.SerializeObject(result, Formatting.Indented);
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Expected values are built with the DateTime constructor instead of DateTime.Parse(string):
            // Parse reads its input using the current thread culture, so the expectation could shift
            // with the machine's regional settings.
            Assert.AreEqual(new DateTime(2018, 11, 24), parsedJson["parsedDateNoFormat"].Value);
            Assert.AreEqual(new DateTime(2011, 12, 30), parsedJson["parsedDateWithFormat"].Value);
            Assert.AreEqual(new DateTime(2008, 6, 12), parsedJson["parsedDateNoFormatWithProviderStyle"].Value);
        }
        /// <summary>
        /// Builds an extractor from a configuration file: the file's text is parsed as JSON
        /// configuration and the transformations are loaded afterwards.
        /// </summary>
        /// <param name="configFile">File whose full text is parsed as the JSON configuration.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="configFile"/> is null.</exception>
        public StructuredDataExtractor(FileInfo configFile)
        {
            if (configFile == null)
            {
                throw new ArgumentNullException(nameof(configFile));
            }

            // Read the whole file and parse it in one step.
            config = StructuredDataConfig.ParseJsonString(File.ReadAllText(configFile.FullName));

            LoadTransformations();
        }
        /// <summary>
        /// Runs the extractor with the article_with_comments_div.json rules over the matching HTML
        /// sample and checks the extracted title and body text.
        /// </summary>
        public void RemoveXPathsExtractionTest()
        {
            var rules     = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "article_with_comments_div.json"));
            var scraper   = new StructuredDataExtractor(rules);
            var pageHtml  = File.ReadAllText(Path.Combine("TestData", "article_with_comments_div.html"));
            var extracted = scraper.Extract(pageHtml);

            // Round-trip through JSON text so the result can be indexed dynamically.
            dynamic article = JsonConvert.DeserializeObject(JsonConvert.SerializeObject(extracted, Formatting.Indented));

            Assert.AreEqual("Article  title", article["title"].Value, "The extracted title is incorrect");
            Assert.AreEqual("Para1 content Para2 content", article["body"].Value, "The extracted body is incorrect");
        }
        /// <summary>
        /// Loads a JSON configuration embedded as a manifest resource in the assembly that defines
        /// <c>WikiSearcher</c>.
        /// </summary>
        /// <param name="resourceName">
        /// Exact manifest resource name, or a fragment of one; when no resource matches exactly, the
        /// first resource whose name contains the fragment is used.
        /// </param>
        /// <returns>The configuration parsed from the resource's text.</returns>
        private static ConfigSection CreateConfig(string resourceName)
        {
            var assembly = typeof(WikiSearcher).Assembly;

            string[] names = assembly.GetManifestResourceNames();
            if (!names.Any(n => n.Equals(resourceName)))
            {
                // Fall back to a substring match so callers can pass a short name.
                resourceName = names.FirstOrDefault(n => n.Contains(resourceName));
            }

            // Dispose the resource stream and its reader once the text has been read.
            using (var stream = assembly.GetManifestResourceStream(resourceName))
            using (var reader = new System.IO.StreamReader(stream))
            {
                return StructuredDataConfig.ParseJsonString(reader.ReadToEnd());
            }
        }
        public void RegexTest()
        {
            var configPath       = Path.Combine("TestData", "regex_rules.json");
            var config           = StructuredDataConfig.ParseJsonFile(configPath);
            var extractor        = new StructuredDataExtractor(config);
            var result           = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
            var actualJson       = JsonConvert.SerializeObject(result, Formatting.Indented);
            var parsedActualJson = JObject.Parse(actualJson);

            var expectedJsonPath   = Path.Combine("TestData", "regex_expected_result.json");
            var expectedJson       = File.ReadAllText(expectedJsonPath);
            var parsedExpectedJson = JObject.Parse(expectedJson);

            Assert.IsTrue(JToken.DeepEquals(parsedActualJson, parsedExpectedJson));
        }
Beispiel #26
0
        /// <summary>
        /// Prepares the fields used later: loads the 3-class NER classifier, compiles the LOCATION
        /// regex, builds the artist-page extractor and loads the three hosted feature tables.
        /// </summary>
        private async Task Init()
        {
            // Locate the classifier model under the NER installation folder
            var nerRoot      = @"C:\stanford-ner-2018-10-16";
            var modelsFolder = nerRoot + @"\classifiers";
            var modelPath    = modelsFolder + @"\english.all.3class.distsim.crf.ser.gz";

            // Load the 3 class classifier model
            _classifier = CRFClassifier.getClassifierNoExceptions(modelPath);

            // Regular expression that captures the text inside a LOCATION element
            _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>",
                                    RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Extraction rules for the artist and listener info
            var artistRulesJson = @"
            {
                'artist': '//h1[contains(@class, \'view-header\')]',
                'about': '//div[contains(@class, \'bio-primary\')]',
                'more': '//div[contains(@class, \'bio-secondary\')]',
                'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
                'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
            }";

            ConfigSection artistRules = StructuredDataConfig.ParseJsonString(artistRulesJson);
            _artistScraping = new StructuredDataExtractor(artistRules);

            // Resolve the hosted feature layer items
            ArcGISPortal portal = await ArcGISPortal.CreateAsync();

            PortalItem hometownItem    = await PortalItem.CreateAsync(portal, _hometownLayerId);
            PortalItem otherPointsItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);
            PortalItem listenerItem    = await PortalItem.CreateAsync(portal, _listenerLayerId);

            // Create a table over index 0 of each item
            _hometownTable    = new ServiceFeatureTable(hometownItem, 0);
            _otherPointsTable = new ServiceFeatureTable(otherPointsItem, 0);
            _listenerTable    = new ServiceFeatureTable(listenerItem, 0);

            // Load the tables one after another
            await _hometownTable.LoadAsync();
            await _otherPointsTable.LoadAsync();
            await _listenerTable.LoadAsync();
        }
Beispiel #27
0
        /// <summary>
        /// Demo: assembles a hard-coded JSON-LD recipe document, deserializes it into a
        /// <c>Recipe</c> with date and ISO 8601 duration converters, prints it and waits for a key press.
        /// </summary>
        static void ExtractJsonLdRecipe()
        {
            var rulesJson = @"
            {
                'title': '//h1',
                'body': '//script[contains(@type, \'application\/ld+json\')]'
            }";

            // The rules are parsed here but the parsed result is not used further in this method.
            var rules = StructuredDataConfig.ParseJsonString(rulesJson);

            // Assemble the sample JSON-LD document piece by piece.
            var document = new StringBuilder("{")
                .Append("\"@context\":\"http://schema.org/\",")
                .Append("\"@type\":\"Recipe\",")
                .Append("\"name\":\"Keto Coffee & Chocolate Tart\",\"author\":{ \"@type\":\"Person\",\"name\":\"Sahil Makhija\"},\"datePublished\":\"2009-11-05T00:00:00+00:00\",")
                .Append("\"description\":\"A delicious layered low carb dessert with the flavours of chocolate and coffee\",\"recipeYield\":\"3 servings\",\"aggregateRating\":{ \"@type\":\"AggregateRating\",\"ratingValue\":\"5\",\"ratingCount\":\"6\"},")
                .Append("\"prepTime\":\"PT10M\",\"cookTime\":\"PT20M\",")
                .Append("\"recipeIngredient\":[\"45 grams Almond Flour ( I use this one )\",\"30 grams Salted Butter\",\"1 Tbsp Unsweetened Coco Powder ( I recommend this one )\",\"150 grams Mascarpone cheese\",\"1 Tsp Vanilla Extract\",\"2 Tbsp Water\",\"1 Tsp Instant espresso powder\",\"100 ml Heavy Cream\",\"30 grams Dark Chocolate (85% or Higher) (I use Lindt 85%)\",\"Stevia to taste\"],")
                .Append("\"recipeInstructions\":[")
                .Append("\"Microwave the butter for 30 seconds till melted\",\"Add in your stevia/sweetner to taste, vanilla essence and the coco powder and mix well together\",\"Add in the almond flour and combine till well incorporated\",")
                .Append("\"Divide the mixture in 3 tart tins or ramekins and shape the base\",\"Bake at 175 C/ 350 F for 10 minutes and then allow them to cool\",\"Heat 2 tablespoons of water and mix 1 tsp of instant espresso powder into that\",")
                .Append("\"Whip the mascarpone cheese, stevia, vanilla extract and coffee mixture together till nice and fluffy\",\"Pour the mascarpone mixture over the base and chill in the fridge for 15 minutes\",")
                .Append("\"Meanwhile warm up the cream for 30 seconds in the microwave and add the chocolate and sweetner to that and mix till fully melted and you have a creamy ganache\",\"Pour the ganache over the mascarpone mousse in the tart molds and chill in the fridge for an hour\",\"Finish with some sea salt on top of each tart.\"],")
                .Append("\"recipeCategory\":\"Dessert\",\"recipeCuisine\":\"General\",\"suitableForDiet\": \"http://schema.org/LowFatDiet\"}");

            var recipeJson = document.ToString();

            // Dates are read as DateTimeOffset; the two converters are registered in this order.
            var settings = new JsonSerializerSettings();
            settings.DateParseHandling = DateParseHandling.DateTimeOffset;
            settings.Converters.Add(new IsoDateTimeConverter());
            settings.Converters.Add(new TimeSpanToISO8601DurationValuesConverter());

            Recipe recipe = JsonConvert.DeserializeObject<Recipe>(recipeJson, settings);

            Console.WriteLine("Extracting LD+JSON Recipe.....");
            Console.Write(recipe);
            Console.ReadKey();
        }
Beispiel #28
0
        /// <summary>
        /// Downloads the page at <c>UrlConstants.BaseUrl + _relativeUrl</c>, extracts structured data
        /// with the rules in <c>ConfigurationJson</c> and converts the result to <typeparamref name="T"/>
        /// through a JSON round trip. Calls <c>WaitRandom</c> before returning.
        /// </summary>
        /// <returns>The extraction result deserialized as <typeparamref name="T"/>.</returns>
        public T Run()
        {
            using (var client = new WebClient())
            {
                var pageHtml  = client.DownloadString(UrlConstants.BaseUrl + _relativeUrl);
                var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(ConfigurationJson));

                // Serialize the extracted tokens and read them back as the requested type.
                var extractedJson = JsonConvert.SerializeObject(extractor.Extract(pageHtml), Formatting.Indented);
                var typedResult   = JsonConvert.DeserializeObject<T>(extractedJson);

                WaitRandom();

                return typedResult;
            }
        }
Beispiel #29
0
        /// <summary>
        /// Entry point: reads the extraction rules from match-result.config.json, downloads a match
        /// page as UTF-8, prints the extracted data as indented JSON and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var rules = StructuredDataConfig.ParseJsonString(File.ReadAllText(@"match-result.config.json"));

            string pageHtml;

            using (var web = new WebClient())
            {
                web.Encoding = Encoding.UTF8;
                pageHtml     = web.DownloadString("http://virtualsoccer.ru/viewmatch.php?day=12968&match_id=213340");
            }

            var extracted     = new StructuredDataExtractor(rules).Extract(pageHtml);
            var extractedJson = JsonConvert.SerializeObject(extracted, Formatting.Indented);

            Console.WriteLine(extractedJson);
            Console.ReadKey();
        }
        /// <summary>
        /// Downloads <paramref name="url"/>, looks for an application/ld+json script describing a
        /// schema.org Recipe and, when one is found, converts it to a <c>MyRecipe</c>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The built recipe, or null when the page has no JSON-LD data, the data is not a Recipe,
        /// or the data could not be deserialized or built.
        /// </returns>
        public static MyRecipe ExtractRecipe(string url)
        {
            string urlResponse;

            // 1. Get the response from the url
            using (WebClient w = new WebClient())
            {
                urlResponse = w.DownloadString(url);
            }

            // 2. Scrape any structured JSON that is present (application/ld+json)
            var configJson      = @"{                
                'data': '//script[contains(@type, \'application\/ld+json\')]'
            }";
            var config          = StructuredDataConfig.ParseJsonString(configJson);
            var openScraping    = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(urlResponse);

            if (scrapingResults == null || scrapingResults["data"] == null)
            {
                return null;
            }

            var content = scrapingResults["data"].ToString();

            // Tolerate whitespace around the colon: JSON-LD is often written as "@type": "Recipe",
            // which an exact substring comparison would miss.
            if (!System.Text.RegularExpressions.Regex.IsMatch(content, "\"@type\"\\s*:\\s*\"Recipe\""))
            {
                return null;
            }

            try
            {
                var serializerSettings = new JsonSerializerSettings()
                {
                    DateParseHandling = DateParseHandling.DateTimeOffset
                };
                Recipe        rec     = JsonConvert.DeserializeObject<Recipe>(content, serializerSettings);
                RecipeBuilder builder = new RecipeBuilder();
                return builder.Build(rec);
            }
            catch (Exception e)
            {
                // Extraction stays best-effort (callers still get null), but the failure is recorded
                // instead of being silently discarded.
                System.Diagnostics.Trace.TraceWarning("ExtractRecipe: could not build a recipe from the JSON-LD found at {0}: {1}", url, e);
                return null;
            }
        }