Ejemplo n.º 1
0
        /// <summary>
        /// Runs every extractor whose configuration matches the <c>url</c> query-string
        /// parameter over the HTML supplied in the request body.
        /// </summary>
        /// <returns>
        /// A bad-request result when the <c>url</c> parameter is missing, the body is empty or
        /// whitespace, or no extractor matches the url; otherwise the value returned by
        /// <c>ExtractAll</c>.
        /// </returns>
        public object TransformHtml()
        {
            // Match the parameter name case-insensitively (as TransformUrl does) and use the
            // static string.Equals so a null key cannot throw. Materialise once so the
            // query-string sequence is not enumerated twice.
            var urlValues = Request.GetQueryNameValuePairs()
                                   .Where(a => string.Equals(a.Key, "url", System.StringComparison.OrdinalIgnoreCase))
                                   .Select(a => a.Value)
                                   .ToList();

            if (urlValues.Count == 0)
            {
                return BadRequest("Request parameter 'url' was missing");
            }

            // When the parameter is repeated, the first occurrence wins.
            var url = urlValues[0];

            Logger.Info($"Recieved request with querystring url: {url}");

            // NOTE(review): .Result blocks the request thread. Making this action async would
            // change its return type, so the blocking read is kept here.
            var html = Request.Content.ReadAsStringAsync().Result;

            if (string.IsNullOrWhiteSpace(html))
            {
                return BadRequest("Request body was empty");
            }

            Logger.Info($"Recieved request with html: {html}");

            var extractor = new MultiExtractor(Settings.Default.TransformationsDirectory, "*.txt");

            var matchingExtractors = extractor.FindAllExtractors(url);

            if (!matchingExtractors.Any())
            {
                Logger.Info($"No extractors matched for url {url}");
                return BadRequest($"Could not find any extractors configured that match url: {url}");
            }

            return extractor.ExtractAll(url, html);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Lists, for every extractor configuration loaded from the transformations directory,
        /// its configuration name together with its url patterns.
        /// </summary>
        /// <returns>
        /// A lazily evaluated sequence of anonymous objects exposing <c>ConfigName</c> and
        /// <c>UrlPatterns</c>.
        /// </returns>
        public object ListUrlPatterns()
        {
            Logger.Info("Recieved call to /ListUrlPatterns");

            var multiExtractor = new MultiExtractor(Settings.Default.TransformationsDirectory, "*.txt");

            // Project each loaded extractor down to the two fields exposed to the caller.
            var patterns = multiExtractor.configsToExtractors.Select(entry => new
            {
                entry.Configuration.ConfigName,
                entry.Configuration.UrlPatterns
            });

            return patterns;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Fetches the page named by the <c>url</c> query-string parameter and runs the matching
        /// extractors over the retrieved HTML.
        /// </summary>
        /// <remarks>
        /// When an <c>extractorName</c> query-string parameter is supplied, only the
        /// configuration file <c>{extractorName}.txt</c> is loaded; otherwise every <c>*.txt</c>
        /// file in the transformations directory is loaded. If any matching extractor has
        /// <c>RequiresJavascript</c> set, the page is fetched with Chrome; otherwise it is
        /// fetched with the web client.
        /// </remarks>
        /// <returns>
        /// A bad-request result when the <c>url</c> parameter is missing or no extractor matches
        /// the url; otherwise the value returned by <c>ExtractAll</c>.
        /// </returns>
        public object TransformUrl()
        {
            // TODO: fix why URL Querystring parameter needs to be provided double url encoded,
            // TODO: otherwise query string params in the url may break out.

            // Materialise once: the pairs are searched for two different keys below.
            var queryString = Request.GetQueryNameValuePairs().ToList();

            // Static string.Equals is used so that a null key cannot throw.
            var urlValues = queryString
                            .Where(a => string.Equals(a.Key, "url", StringComparison.InvariantCultureIgnoreCase))
                            .Select(a => a.Value)
                            .ToList();

            if (urlValues.Count == 0)
            {
                return BadRequest("Request parameter 'url' was missing");
            }

            // When a parameter is repeated, the first occurrence wins.
            var url = urlValues[0];

            var extractorNames = queryString
                                 .Where(a => string.Equals(a.Key, "extractorName", StringComparison.InvariantCultureIgnoreCase))
                                 .Select(a => a.Value)
                                 .ToList();

            // NOTE(review): extractorName comes straight from the request and is placed into the
            // file pattern unvalidated - consider restricting it to known configuration names.
            var filePattern = extractorNames.Count > 0
                ? $"{extractorNames[0]}.txt"
                : "*.txt";

            var extractor = new MultiExtractor(Settings.Default.TransformationsDirectory, filePattern);

            Logger.Info($"Recieved request with querystring url: {url}");

            var matchingExtractors = extractor.FindAllExtractors(url).ToList();

            if (!matchingExtractors.Any())
            {
                Logger.Info($"No extractors matched for url {url}");
                return BadRequest($"Could not find any extractors configured that match url: {url}");
            }

            // Join the names explicitly: interpolating the Select(...) sequence itself would log
            // the iterator's type name instead of the configuration names.
            var matchedNames = string.Join(", ", matchingExtractors.Select(x => x.Configuration?.ConfigName));
            Logger.Info($"Matched extractors {matchedNames}");

            // If any of the extractors that are matched by the url have renderJS = true, then use
            // a browser that is capable of running JavaScript to render the DOM
            bool renderJs = matchingExtractors.Any(e => e.Configuration.RequiresJavascript);

            if (renderJs)
            {
                string renderedHtml = ExtractHtmlWithChrome(url);

                // NOTE(review): the label passed on is still "PhantomJS" although the page is
                // fetched with Chrome; kept unchanged in case consumers depend on the value.
                return extractor.ExtractAll(url, renderedHtml, "PhantomJS");
            }

            string rawHtml = ExtractHtmlWithWebClient(url);
            return extractor.ExtractAll(url, rawHtml, "WebClient");
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Parses a saved answers.microsoft.com thread with the JSON extractor configurations
        /// found under <c>TestData</c> and checks the extracted question, hints, answers and
        /// list metadata.
        /// </summary>
        public void MultiWebsiteExtractionTest()
        {
            const string pageUrl = "http://answers.microsoft.com/en-us/windows/forum/windows_10-win_upgrade/i-want-to-reserve-my-free-copy-of-windows-10-but-i/9c3f7f56-3da8-4b40-a30f-e33772439ee1";

            var extractor = new MultiExtractor(configRootFolder: "TestData", configFilesPattern: "*.json");
            var pageHtml  = File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html"));
            var json      = extractor.ParsePage(url: pageUrl, html: pageHtml);

            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // --- Question ---
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            dynamic question = parsedJson["question"];

            Assert.AreEqual("I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreEqual(1642653, question["views"].Value, "The extracted views snippet is incorrect");

            // --- Question context (hints) ---
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
            Assert.AreEqual("PC", question["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

            // --- Answers ---
            Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");

            dynamic answers = parsedJson["answers"];

            Assert.AreEqual(2, answers.Count, "Extractor should find two answers in the thread summary of the HTML file");

            dynamic mostHelpfulReply = answers[1];

            Assert.AreEqual("Most Helpful Reply", mostHelpfulReply["type"].Value, "The extracted type of the answer is incorrect");
            Assert.AreNotEqual(null, mostHelpfulReply["content"], "The content array in the extracted answer should not be null");
            Assert.IsTrue(mostHelpfulReply["content"].Count > 0, "The content array in the extracted answer should have one or more items");
            Assert.AreEqual(4, mostHelpfulReply["lists"].Count, "The lists array should have 4 items");
            Assert.IsTrue(mostHelpfulReply["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

            // Every list attached to an answer must carry a positive integer textAboveLength.
            foreach (var answer in answers)
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var listEntry in answerLists)
                    {
                        Assert.AreEqual(JTokenType.Integer, listEntry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");

                        var textAboveLength = ((JValue)listEntry["textAboveLength"]).ToObject<int>();
                        var failureMessage  = string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength);

                        Assert.IsTrue(textAboveLength > 0, failureMessage);
                    }
                }
            }
        }