/// <summary>
/// Runs the extractors that match the <c>url</c> query-string parameter against the HTML
/// supplied in the request body.
/// </summary>
/// <returns>
/// The value returned by <c>MultiExtractor.ExtractAll</c>, or a bad-request result when the
/// <c>url</c> parameter is missing, the request body is empty/whitespace, or no extractor
/// matches the url.
/// </returns>
public object TransformHtml()
{
    // Match the parameter name case-insensitively, using the same comparison TransformUrl
    // applies, so both endpoints accept the same requests. Materialize the values once
    // instead of enumerating the filtered query for Any() and again for First().
    var urlValues = Request.GetQueryNameValuePairs()
        .Where(a => a.Key.Equals("url", StringComparison.InvariantCultureIgnoreCase))
        .Select(a => a.Value)
        .ToList();
    if (urlValues.Count == 0)
    {
        return BadRequest("Request parameter 'url' was missing");
    }

    // When the parameter is repeated, the first occurrence wins.
    var url = urlValues[0];
    Logger.Info($"Recieved request with querystring url: {url}");

    // The action is synchronous, so the body read blocks on the task here.
    var html = Request.Content.ReadAsStringAsync().Result;
    if (string.IsNullOrWhiteSpace(html))
    {
        return BadRequest("Request body was empty");
    }

    Logger.Info($"Recieved request with html: {html}");

    var extractor = new MultiExtractor(Settings.Default.TransformationsDirectory, "*.txt");
    var matchingExtractors = extractor.FindAllExtractors(url);
    if (!matchingExtractors.Any())
    {
        Logger.Info($"No extractors matched for url {url}");
        return BadRequest($"Could not find any extractors configured that match url: {url}");
    }

    return extractor.ExtractAll(url, html);
}
/// <summary>
/// Returns, for each entry in the extractor's <c>configsToExtractors</c> collection, an
/// object carrying that entry's <c>ConfigName</c> and <c>UrlPatterns</c>.
/// </summary>
public object ListUrlPatterns()
{
    Logger.Info($"Recieved call to /ListUrlPatterns");

    var allConfigs = new MultiExtractor(Settings.Default.TransformationsDirectory, "*.txt");

    // Project each entry down to the two configuration fields exposed by this endpoint.
    var patterns =
        from entry in allConfigs.configsToExtractors
        select new { entry.Configuration.ConfigName, entry.Configuration.UrlPatterns };

    return patterns;
}
/// <summary>
/// Downloads the page named by the <c>url</c> query-string parameter and runs the matching
/// extractors against its HTML.
/// </summary>
/// <remarks>
/// An optional <c>extractorName</c> query-string parameter restricts the configuration file
/// pattern to <c>{extractorName}.txt</c>; otherwise <c>*.txt</c> is used. Both parameter
/// names are matched case-insensitively, and the first occurrence wins when repeated.
/// </remarks>
/// <returns>
/// The value returned by <c>MultiExtractor.ExtractAll</c>, or a bad-request result when the
/// <c>url</c> parameter is missing or no extractor matches the url.
/// </returns>
public object TransformUrl()
{
    // TODO: fix why URL Querystring parameter needs to be provided double url encoded,
    // TODO: otherwise query string params in the url may break out.
    // Materialized once because the pairs are searched for two different keys below.
    var queryString = Request.GetQueryNameValuePairs().ToList();

    var urlValues = queryString
        .Where(a => a.Key.Equals("url", StringComparison.InvariantCultureIgnoreCase))
        .Select(a => a.Value)
        .ToList();
    if (urlValues.Count == 0)
    {
        return BadRequest("Request parameter 'url' was missing");
    }

    var url = urlValues[0];

    var extractorNames = queryString
        .Where(a => a.Key.Equals("extractorName", StringComparison.InvariantCultureIgnoreCase))
        .Select(a => a.Value)
        .ToList();

    // NOTE(review): extractorName comes straight from the request and is used to build a
    // file pattern without validation — confirm MultiExtractor rejects path separators and
    // wildcards, or validate the value here.
    var configFilesPattern = extractorNames.Count > 0 ? $"{extractorNames[0]}.txt" : "*.txt";
    var extractor = new MultiExtractor(Settings.Default.TransformationsDirectory, configFilesPattern);

    Logger.Info($"Recieved request with querystring url: {url}");

    var matchingExtractors = extractor.FindAllExtractors(url).ToList();
    if (matchingExtractors.Count == 0)
    {
        Logger.Info($"No extractors matched for url {url}");
        return BadRequest($"Could not find any extractors configured that match url: {url}");
    }

    // Join the names explicitly: interpolating the Select(...) result directly would log the
    // iterator object rather than the matched configuration names.
    var matchedNames = string.Join(", ", matchingExtractors.Select(x => x.Configuration?.ConfigName));
    Logger.Info($"Matched extractors {matchedNames}");

    // If any of the extractors that are matched by the url have renderJS = true, then use
    // a browser that is capable of running JavaScript to render the DOM
    bool renderJs = matchingExtractors.Any(e => e.Configuration.RequiresJavascript);
    if (renderJs)
    {
        string renderedHtml = ExtractHtmlWithChrome(url);

        // NOTE(review): the label passed here is "PhantomJS" although the HTML is obtained
        // through ExtractHtmlWithChrome — confirm whether consumers rely on this value
        // before renaming it.
        return extractor.ExtractAll(url, renderedHtml, "PhantomJS");
    }

    string downloadedHtml = ExtractHtmlWithWebClient(url);
    return extractor.ExtractAll(url, downloadedHtml, "WebClient");
}
/// <summary>
/// Parses the saved page <c>TestData/answers.microsoft.com.html</c> with the <c>*.json</c>
/// extractor configurations under <c>TestData</c> and checks the extracted question, its
/// hints, the answers and the lists attached to each answer.
/// </summary>
public void MultiWebsiteExtractionTest()
{
    var multiExtractor = new MultiExtractor(configRootFolder: "TestData", configFilesPattern: "*.json");
    var json = multiExtractor.ParsePage(
        url: "http://answers.microsoft.com/en-us/windows/forum/windows_10-win_upgrade/i-want-to-reserve-my-free-copy-of-windows-10-but-i/9c3f7f56-3da8-4b40-a30f-e33772439ee1",
        html: File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html")));

    // The result is inspected through dynamic access, so every assertion below is bound at run time.
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreEqual(1642653, question["views"].Value, "The extracted views snippet is incorrect");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
    Assert.AreEqual("PC", question["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

    // Answers
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(2, parsedJson["answers"].Count, "Extractor should find two answers in the thread summary of the HTML file");
    var secondAnswer = parsedJson["answers"][1];
    Assert.AreEqual("Most Helpful Reply", secondAnswer["type"].Value, "The extracted type of the answer is incorrect");
    Assert.AreNotEqual(null, secondAnswer["content"], "The content array in the extracted answer should not be null");
    // The message is kept on a single line: a regular string literal cannot span physical lines.
    Assert.IsTrue(secondAnswer["content"].Count > 0, "The content array in the extracted answer should have one or more items");
    Assert.AreEqual(4, secondAnswer["lists"].Count, "The lists array should have 4 items");
    Assert.IsTrue(secondAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

    // Check that textAboveLength exists in each list and is a positive integer.
    // Answers without a "lists" entry are skipped.
    foreach (var answer in parsedJson["answers"])
    {
        var lists = answer["lists"];
        if (lists != null)
        {
            foreach (var list in lists)
            {
                Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
            }
        }
    }
}