/// <summary>
/// Looks for candidate elements that resemble the target element and ranks them
/// by a weighted difference score, lowest (most similar) first.
/// </summary>
/// <param name="targetElement">The element to find look-alikes for.</param>
/// <param name="candidateElements">The elements to search; first narrowed by GetElementsWithSimilarTag.</param>
/// <param name="maxDistance">The total against which the distance between the two element rectangles is expressed (via GetPercentageOfTotal).</param>
/// <param name="similarElements">The surviving candidates wrapped in ElementMatch objects and paired with their total score, in ascending score order.</param>
/// <returns>True when at least one candidate passed the text and attribute thresholds.</returns>
public bool HasSimilarElements(ScrapedElement targetElement, IEnumerable<ScrapedElement> candidateElements, decimal maxDistance, out Tuple<ElementMatch<ScrapedElement>, decimal>[] similarElements)
{
    var targetElementTextLength = targetElement.Text.Length;
    var candidates = GetElementsWithSimilarTag(targetElement.Tag, candidateElements);

    // Pass 1: drop candidates whose text or attribute difference exceeds the
    // configured thresholds and seed the score with those two weighted differences.
    var tuples = (from c in candidates
                  let maxLength = Math.Max(c.Text.Length, targetElementTextLength)
                  let textDiff = maxLength.GetPercentageOfTotal(TextDistance(c.Text, targetElement.Text))
                  let attrDiff = AttributesSimilarity(targetElement, c)
                  where textDiff <= _maxTextDiffThreshold && attrDiff <= _maxAttributeDiffThreshold
                  select new Tuple<ScrapedElement, decimal>(c, (textDiff * _textWeight) + (attrDiff * _attributeWeight))).ToArray();

    // Pass 2: add the positional, area and tag components, then sort ascending.
    // Every component is a difference, so a matching tag must contribute nothing
    // and a differing tag must contribute the penalty (the original had these swapped).
    // NOTE(review): assumes _tagWeight is non-negative like a normal weight - confirm.
    tuples = (from c in tuples
              let distRect = maxDistance.GetPercentageOfTotal(RectangleUtil.DistanceBetweenRectangles(c.Item1.Location, targetElement.Location))
              let areaRect = RectangleUtil.AreaChangeAsPercent(c.Item1.Location, targetElement.Location)
              let tagDist = c.Item1.Tag.Equals(targetElement.Tag) ? 0 : (100 * _tagWeight)
              let total = c.Item2 + tagDist + (distRect * _distanceWeight) + (areaRect * _areaWeight)
              orderby total
              select new Tuple<ScrapedElement, decimal>(c.Item1, total)).ToArray();

    similarElements = tuples
        .Select(t => new Tuple<ElementMatch<ScrapedElement>, decimal>(new ElementMatch<ScrapedElement> { This = t.Item1 }, t.Item2))
        .ToArray();

    return tuples.Length > 0;
}
/// <summary>
/// Decides whether two scraped elements are the same by comparing tag, location,
/// CSS, attributes and text, in that order.
/// </summary>
/// <param name="e1">The first element; may be null.</param>
/// <param name="e2">The second element; may be null.</param>
/// <returns>True when both are null, or when every compared member matches.</returns>
private static bool ElementsEqual(ScrapedElement e1, ScrapedElement e2)
{
    /* The html is not compared: once the tag and the attributes have been checked
     * it would be redundant, and the dictionary comparison is the more accurate of
     * the two because the html could list the attributes in any order. */
    var firstMissing = ReferenceEquals(null, e1);
    var secondMissing = ReferenceEquals(null, e2);

    if (firstMissing || secondMissing)
    {
        // Two nulls are equal; a null never equals a real element.
        return firstMissing && secondMissing;
    }

    if (e1.Tag != e2.Tag)
    {
        return false;
    }

    if (!e1.Location.Equals(e2.Location))
    {
        return false;
    }

    if (!e1.Css.DictionaryEqual(e2.Css))
    {
        return false;
    }

    if (!e1.Attributes.DictionaryEqual(e2.Attributes))
    {
        return false;
    }

    return e1.Text == e2.Text;
}
/// <summary>
/// HasExactMatch should report a match for the probe element built here and
/// hand back an element whose text is "foo".
/// </summary>
public void ExactMatch()
{
    // Arrange
    var probe = new ScrapedElement
    {
        Attributes = new Dictionary<string, string> { { "id", "foo" } },
        Css = new Dictionary<string, string> { { "foo", "bar" } },
        Text = "foo",
        Tag = "div",
        Location = new Rectangle(10, 10, 15, 15),
    };

    // Act
    ScrapedElement hit;
    var wasFound = _elementMapper.HasExactMatch(probe, _scrapedElements, out hit);

    // Assert
    Assert.IsTrue(wasFound);
    Assert.AreEqual("foo", hit.Text);
}
/// <summary>
/// HasSimilarElements should find matches for a long text block and return the
/// first three fixture elements in fixture order.
/// </summary>
public void HasSimilarTextMatch()
{
    // Arrange
    var probe = new ScrapedElement
    {
        Text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque justo lorem, lacinia ac accumsan ut, auctor vel ligula. Fusce ut arcu purus. Proin id metus sit amet est venenatis auctor. Curabitur nunc elit, pretium at mattis luctus, molestie quis enim. Mauris eu ipsum a ligula auctor facilisis ac sed arcu. Vestibulum tristique lobortis nibh id blandit. Suspendisse nulla elit, dictum eget lobortis et, luctus at felis. Duis aliquet, quam lobortis congue rutrum, mauris libero posuere est, eu faucibus leo risus eu arcu. Nam viverra lobortis sem egestas fringilla.",
        Tag = "div",
        Location = new Rectangle(10, 10, 15, 15),
        Attributes = new Dictionary<string, string> { { "foo", "bar" } },
    };

    // Act
    Tuple<ElementMatch<ScrapedElement>, decimal>[] ranked;
    var wasFound = _elementMapper.HasSimilarElements(probe, _scrapedElements, 50M, out ranked);

    // Assert
    Assert.IsTrue(wasFound);

    // The i-th ranked match must be the very same instance as the i-th fixture element.
    for (var i = 0; i < 3; i++)
    {
        Assert.IsTrue(ReferenceEquals(ranked[i].Item1.This, _scrapedElements[i]));
    }
}
/// <summary>
/// HasIdMatch should find a fixture element whose "id" attribute is "foo".
/// </summary>
public void IdMatch()
{
    // Arrange
    var attributes = new Dictionary<string, string> { { "id", "foo" } };
    var probe = new ScrapedElement { Attributes = attributes };

    // Act
    ScrapedElement hit;
    var wasFound = _elementMapper.HasIdMatch(probe, _scrapedElements, out hit);

    // Assert
    Assert.IsTrue(wasFound);
    Assert.IsTrue(hit.Attributes["id"] == "foo");
}
/// <summary>
/// HasIdMatch should report no match, and a null result, for the id "foobar".
/// </summary>
public void NoIdMatch()
{
    // Arrange
    var attributes = new Dictionary<string, string> { { "id", "foobar" } };
    var probe = new ScrapedElement { Attributes = attributes };

    // Act
    ScrapedElement hit;
    var wasFound = _elementMapper.HasIdMatch(probe, _scrapedElements, out hit);

    // Assert
    Assert.IsFalse(wasFound);
    Assert.IsNull(hit);
}
/// <summary>
/// HasExactMatch should report no match, and a null result, for the "span"
/// probe element built here.
/// </summary>
public void NoExactMatch()
{
    // Arrange
    var probe = new ScrapedElement
    {
        Text = "foo",
        Tag = "span",
        Location = new Rectangle(10, 10, 15, 15),
        Attributes = new Dictionary<string, string> { { "id", "foo" } },
        Css = new Dictionary<string, string> { { "foo", "bar" } },
    };

    // Act
    ScrapedElement hit;
    var wasFound = _elementMapper.HasExactMatch(probe, _scrapedElements, out hit);

    // Assert
    Assert.IsFalse(wasFound);
    Assert.IsNull(hit);
}
/// <summary>
/// Two elements that differ only in the insertion order of their attribute
/// dictionaries should still be reported as an exact match.
/// </summary>
public void MatchUnorderedHtmlAttributes()
{
    // Arrange
    const string html = @"<div id=""foo"" style=""bar"" class=""baz"">foo</div>";

    // Probe: attributes inserted as style, class, id.
    var probe = new ScrapedElement
    {
        Html = html,
        Text = "foo",
        Tag = "div",
        Location = new Rectangle(10, 10, 15, 15),
        Attributes = new Dictionary<string, string>
        {
            { "style", "bar" },
            { "class", "baz" },
            { "id", "foo" }
        },
        Css = new Dictionary<string, string> { { "foo", "bar" } },
    };

    // Candidate: same attributes inserted as id, style, class.
    var candidate = new ScrapedElement
    {
        Html = html,
        Text = "foo",
        Tag = "div",
        Location = new Rectangle(10, 10, 15, 15),
        Attributes = new Dictionary<string, string>
        {
            { "id", "foo" },
            { "style", "bar" },
            { "class", "baz" }
        },
        Css = new Dictionary<string, string> { { "foo", "bar" } },
    };
    var pool = new[] { candidate };

    // Act
    ScrapedElement hit;
    var wasFound = _elementMapper.HasExactMatch(probe, pool, out hit);

    // Assert
    Assert.IsTrue(wasFound);
}
// TODO: high value testing area
/// <summary>
/// Compares an element with its corresponding scraped element and records every
/// difference found: location, CSS, html, text and pixels.
/// </summary>
/// <param name="element">The element to inspect; its CorrespondingScrapedElement is the comparison partner.</param>
/// <param name="pageScreenshotA">Screenshot passed to GetPixelChanges as the first page image.</param>
/// <param name="pageScreenshotB">Screenshot passed to GetPixelChanges as the second page image.</param>
/// <param name="pageA">Scrape passed to GetPixelChanges as the first page.</param>
/// <param name="pageB">Scrape passed to GetPixelChanges as the second page.</param>
/// <param name="changes">Receives the collected change details and per-category percentage changes.</param>
/// <returns>True when any category differs or the pixel percentage change is above zero.</returns>
private bool HasChanges(ScrapedElement element, Image pageScreenshotA, Image pageScreenshotB, Scrape pageA, Scrape pageB, out ElementChangeResult changes)
{
    var counterpart = element.CorrespondingScrapedElement;
    changes = new ElementChangeResult();

    // Location: the percentage stays at zero unless a difference is detected.
    decimal locationPercent = 0;
    var locationDiffers = !counterpart.Location.Equals(element.Location);
    if (locationDiffers)
    {
        changes.LocationChanges = GetLocationChanges(counterpart.Location, element.Location, out locationPercent);
    }
    changes.LocationPercentageChange = locationPercent;

    // Css
    decimal cssPercent = 0;
    var cssDiffers = !counterpart.Css.DictionaryEqual(element.Css);
    if (cssDiffers)
    {
        changes.CssChanges = GetCssChanges(counterpart.Css, element.Css, out cssPercent);
    }
    changes.CssPercentageChange = cssPercent;

    // Html
    decimal htmlPercent = 0;
    var htmlDiffers = counterpart.Html != element.Html;
    if (htmlDiffers)
    {
        changes.HtmlChanges = GetStringChanges(counterpart.Html, element.Html, out htmlPercent);
    }
    changes.HtmlPercentageChange = htmlPercent;

    // Text
    decimal textPercent = 0;
    var textDiffers = counterpart.Text != element.Text;
    if (textDiffers)
    {
        changes.TextChanges = GetStringChanges(counterpart.Text, element.Text, out textPercent);
    }
    changes.TextPercentageChange = textPercent;

    // Pixels: always computed; only a positive percentage counts as a change.
    decimal pixelPercent;
    changes.PixelChanges = GetPixelChanges(pageScreenshotA, pageScreenshotB, element, counterpart, pageA, pageB, out pixelPercent);
    changes.PixelPercentageChange = pixelPercent;
    var pixelsDiffer = pixelPercent > 0M;

    // Store location on screenshot
    changes.LocationOnScreenshot = element.LocationOnScreenshot;

    return locationDiffers || cssDiffers || htmlDiffers || textDiffers || pixelsDiffer;
}
/// <summary>
/// Calculates the pixel difference between two elements.
/// </summary>
/// <param name="psA">The screenshot of the first page.</param>
/// <param name="psB">The screenshot of the second page.</param>
/// <param name="eA">The first element.</param>
/// <param name="eB">The second element.</param>
/// <param name="pA">The information about the first page.</param>
/// <param name="pB">The information about the second page.</param>
/// <param name="percentageChange">The change as a percentage, as reported by ImageUtil.BitmapDiff.</param>
/// <returns>
/// An object containing information about the pixel differences, or null when the
/// percentage change is zero and both elements have the same size on the screenshot.
/// </returns>
private PixelChange GetPixelChanges(Image psA, Image psB, ScrapedElement eA, ScrapedElement eB, Scrape pA, Scrape pB, out decimal percentageChange)
{
    var sizeA = new Size(eA.LocationOnScreenshot.Width, eA.LocationOnScreenshot.Height);
    var sizeB = new Size(eB.LocationOnScreenshot.Width, eB.LocationOnScreenshot.Height);

    Image originalA = ImageUtil.CropImage(psA, eA.LocationOnScreenshot);
    Image originalB = ImageUtil.CropImage(psB, eB.LocationOnScreenshot);

    // The regions are only needed while the images are being built. 'using'
    // releases them even when one of the ImageUtil calls throws; previously they
    // were disposed on the success path only.
    using (Region regionA = ImageUtil.GetClippedRegion(eA.LocationOnScreenshot, pA.Elements.Select(e => e.LocationOnScreenshot)))
    using (Region regionB = ImageUtil.GetClippedRegion(eB.LocationOnScreenshot, pB.Elements.Select(e => e.LocationOnScreenshot)))
    {
        Bitmap clippedA = ImageUtil.GetClippedImage(sizeA, originalA, regionA);
        Bitmap clippedB = ImageUtil.GetClippedImage(sizeB, originalB, regionB);
        Bitmap diffMask = ImageUtil.BitmapDiff(clippedA, clippedB, _ia, out percentageChange);
        Bitmap fromRegionMask = ImageUtil.DrawRegionAsMasks(sizeA, regionA, originalA, _ia);
        Bitmap toRegionMask = ImageUtil.DrawRegionAsMasks(sizeB, regionB, originalB, _ia);

        var sizeChanged = eB.LocationOnScreenshot.Width != eA.LocationOnScreenshot.Width
                          || eB.LocationOnScreenshot.Height != eA.LocationOnScreenshot.Height;

        if (percentageChange > 0 || sizeChanged)
        {
            return new PixelChange
            {
                From = originalA,
                FromClipped = clippedA,
                FromMask = fromRegionMask,
                To = originalB,
                ToClipped = clippedB,
                ToMask = toRegionMask,
                Diff = diffMask
            };
        }

        // NOTE(review): on this path the images created above are not handed to
        // anyone and are not disposed here - confirm whether ImageUtil returns
        // fresh instances that should be released.
        return null;
    }
}
/// <summary>
/// Returns information about the given element.
/// </summary>
/// <param name="pageScreenshot">The screenshot of the page containing the element.</param>
/// <param name="scrapedElement">The element.</param>
/// <param name="page">The page containing the element.</param>
/// <returns>
/// An ElementAddRemoveResult object carrying the element's attributes, html, text,
/// tag and screenshot location together with its cropped, clipped and mask images.
/// </returns>
private ElementAddRemoveResult GetElementData(Image pageScreenshot, ScrapedElement scrapedElement, Scrape page)
{
    var location = scrapedElement.LocationOnScreenshot;
    var size = new Size(location.Width, location.Height);
    var original = ImageUtil.CropImage(pageScreenshot, location);

    // The clipping region is only needed while the images are built; dispose it
    // afterwards (as GetPixelChanges does) instead of leaving it to the finalizer.
    using (var region = ImageUtil.GetClippedRegion(location, page.Elements.Select(e => e.LocationOnScreenshot)))
    {
        var clipped = ImageUtil.GetClippedImage(size, original, region);
        var imageMask = ImageUtil.DrawRegionAsMasks(size, region, original, _ia);

        return new ElementAddRemoveResult
        {
            Attributes = scrapedElement.Attributes,
            Html = scrapedElement.Html,
            Text = scrapedElement.Text,
            Location = scrapedElement.LocationOnScreenshot,
            Tag = scrapedElement.Tag,
            Image = original,
            ImageClipped = clipped,
            ImageMask = imageMask
        };
    }
}
/// <summary>
/// Builds a Scrape from the supplied parameters, filling in fake cookies,
/// resources and elements for whichever of those the caller did not provide.
/// </summary>
/// <param name="fakeScrapeParams">
/// The values to copy into the scrape. Its Cookies, Resources and Elements
/// members are populated in place when they are missing.
/// </param>
/// <returns>The populated Scrape.</returns>
public Scrape FakeScrape(FakeScrapeParams fakeScrapeParams)
{
    // Only create a cookie list when the caller supplied none. The list used to be
    // replaced unconditionally, which threw away caller-provided cookies and made
    // the emptiness check below always succeed.
    if (fakeScrapeParams.Cookies == null)
    {
        fakeScrapeParams.Cookies = new List<string>();
    }

    #region Header/Cookie Content
    if (!fakeScrapeParams.Cookies.Any())
    {
        fakeScrapeParams.Cookies.AddRange(FakeCookies());
    }

    var headerOne = new List<string>()
    {
        "Content-Length:194",
        "Cache-Control:public, must-revalidate",
        "Content-Type:application/x-javascript",
        "Date:Thu, 20 Sep 2012 17:15:03 GMT",
        "ETag:JsJt380DknGc4kAEEn76og=="
    };
    var headerTwo = new List<string>()
    {
        "Content-Length:17423",
        "Cache-Control:public, must-revalidate",
        "Content-Type:application/x-javascript",
        "Date:Thu, 20 Sep 2012 17:15:03 GMT",
        "ETag:qloGz7WY45YMKQ1Fmuuw8A=="
    };
    var headerThree = new List<string>()
    {
        "Content-Length:2552",
        "Cache-Control:public, must-revalidate",
        "Content-Type:image/gif",
        "Date:Thu, 20 Sep 2012 17:15:03 GMT",
        "ETag:UAFdRlkmdsJ1EGIoGalWng=="
    };
    #endregion

    if (fakeScrapeParams.Resources == null)
    {
        //Uri, statusCode, StatusDesc, Headers
        var first = GetSession().List<Resource>(3).First(1)
            .Impose(x => x.Uri, "http://c.mfcreativedev.com/webparts/banner/Banner.js?v=c5589edb")
            .Impose(x => x.StatusCode, HttpStatusCode.OK)
            .Impose(x => x.StatusDescription, "OK")
            .Impose(x => x.Headers, headerOne)
            .Next(1)
            .Impose(x => x.Uri, "http://c.mfcreativedev.com/webparts/header/HeaderV1_2.js?v=730f5c7b1")
            .Impose(x => x.StatusCode, HttpStatusCode.OK)
            .Impose(x => x.StatusDescription, "OK")
            .Impose(x => x.Headers, headerTwo)
            .Next(1)
            .Impose(x => x.Uri, "http://c.mfcreativedev.com/s/0/p/0/i/ances_logo.gif")
            .Impose(x => x.StatusCode, HttpStatusCode.OK)
            .Impose(x => x.StatusDescription, "OK")
            .Impose(x => x.Headers, headerThree)
            .All().Get().ToArray();
        fakeScrapeParams.Resources = first;
    }

    if (fakeScrapeParams.Elements == null)
    {
        // Default page content: a single empty 800x30 div at the top-left corner.
        var elements = new List<ScrapedElement>();
        var ele1 = new ScrapedElement()
        {
            Attributes = new Dictionary<string, string>() {{"id", "mngb"}},
            CorrespondingScrapedElement = null,
            Css = new Dictionary<string, string>(),
            Html = "<div id=\"mngb\"></div>",
            Location = new Rectangle(0, 0, 800, 30),
            LocationOnScreenshot = new Rectangle(0, 0, 800, 30),
            Tag = "div",
            Text = ""
        };
        elements.Add(ele1);
        fakeScrapeParams.Elements = new List<ScrapedElement>(elements);
    }

    var scr = new Scrape
    {
        Id = new ObjectId(fakeScrapeParams.Id),
        ExcludeJquerySelector = fakeScrapeParams.Exclude,
        IncludeJquerySelector = fakeScrapeParams.Include,
        Script = fakeScrapeParams.Script,
        BoundingRectangle = fakeScrapeParams.Bounding,
        Path = new StringAsReference {Value = fakeScrapeParams.Path},
        Elements = fakeScrapeParams.Elements,
        Resources = fakeScrapeParams.Resources,
        Html = fakeScrapeParams.Html,
        HtmlRef = new StringAsReference {Value = fakeScrapeParams.HtmlRef},
        Url = fakeScrapeParams.Url,
        Screenshot = fakeScrapeParams.ScreenShot,
        ScreenshotRef = new StringAsReference {Value = fakeScrapeParams.ScreenShotRef},
        // Fall back to an 800x600 viewport and the current time when not specified.
        ViewportSize = fakeScrapeParams.ViewportSize == null ? new Size(800 , 600) : fakeScrapeParams.ViewportSize.Value,
        Browser = fakeScrapeParams.Browser,
        BrowserVersion = fakeScrapeParams.BrowserVersion,
        TimeStamp = fakeScrapeParams.TimeStamp == null ? DateTime.Now : fakeScrapeParams.TimeStamp.Value,
        Platform = fakeScrapeParams.Platform,
        Cookies = fakeScrapeParams.Cookies
    };
    return scr;
}
/// <summary>
/// Searches for the first element whose "id" attribute equals that of the given element.
/// </summary>
/// <param name="element">The element whose id is looked up.</param>
/// <param name="elements">The elements to search.</param>
/// <param name="idMatch">The first element accepted by IdsMatch, or null when there is none.</param>
/// <returns>True when a match was found.</returns>
public bool HasIdMatch(ScrapedElement element, IEnumerable<ScrapedElement> elements, out ScrapedElement idMatch)
{
    Func<ScrapedElement, bool> sharesId = candidate => IdsMatch(candidate, element);

    idMatch = elements.Where(sharesId).FirstOrDefault();

    var found = idMatch != null;
    return found;
}
/// <summary>
/// Searches for the first element that ElementsEqual considers identical to the given element.
/// </summary>
/// <param name="element">The element to look for.</param>
/// <param name="elements">The elements to search.</param>
/// <param name="exactMatch">The first element accepted by ElementsEqual, or null when there is none.</param>
/// <returns>True when a non-null match was found.</returns>
public bool HasExactMatch(ScrapedElement element, IEnumerable<ScrapedElement> elements, out ScrapedElement exactMatch)
{
    Func<ScrapedElement, bool> isIdentical = candidate => ElementsEqual(candidate, element);

    exactMatch = elements.Where(isIdentical).FirstOrDefault();

    var found = exactMatch != null;
    return found;
}
/// <summary>
/// Checks whether both elements carry an "id" attribute with the same value.
/// </summary>
/// <param name="eA">The first element.</param>
/// <param name="eB">The second element; its attributes are consulted first.</param>
/// <returns>True only when both elements have an "id" attribute and the values are equal.</returns>
private bool IdsMatch(ScrapedElement eA, ScrapedElement eB)
{
    string idOfA;
    string idOfB;

    // Short-circuit in the same order as before: eB, then eA, then the comparison.
    return eB.Attributes.TryGetValue("id", out idOfB)
           && eA.Attributes.TryGetValue("id", out idOfA)
           && idOfA == idOfB;
}
/// <summary>
/// Determines how much the attributes of two elements differ. The result is
/// used as a difference by HasSimilarElements, which rejects candidates whose
/// value exceeds the attribute difference threshold.
/// </summary>
/// <param name="eA">The first element.</param>
/// <param name="eB">The second element.</param>
/// <returns>
/// A decimal value: the combined added/deleted/changed amount expressed through
/// GetPercentageOfTotal against 100 per distinct attribute key.
/// </returns>
private decimal AttributesSimilarity(ScrapedElement eA, ScrapedElement eB)
{
    var keysA = eA.Attributes.Select(k => k.Key).ToArray();
    var keysB = eB.Attributes.Select(k => k.Key).ToArray();

    var added = keysB.Except(keysA).ToArray();      // only in eB
    var deleted = keysA.Except(keysB).ToArray();    // only in eA
    var common = keysA.Except(deleted).ToArray();   // in both

    var changed = (from key in common
                   where eA.Attributes[key] != eB.Attributes[key]
                   select new AttributeChangeDetail
                   {
                       From = eA.Attributes[key],
                       To = eB.Attributes[key],
                       Key = key
                   }).ToArray();
    var unchanged = common.Except(changed.Select(i => i.Key)).ToArray();

    // Every added or deleted attribute counts as a full 100 difference.
    // (The original summed added.Length twice and ignored deleted attributes.)
    var addedDeleted = (added.Length + deleted.Length) * 100;

    // A changed attribute contributes its text distance relative to the longer value.
    var changedPercentage = (from detail in changed
                             let maxLength = Math.Max(detail.From.Length, detail.To.Length)
                             let distance = TextDistance(detail.From, detail.To)
                             select maxLength.GetPercentageOfTotal(distance)).Sum();

    var total = (unchanged.Length + changed.Length + deleted.Length + added.Length) * 100;
    var change = total.GetPercentageOfTotal(addedDeleted + changedPercentage);
    return change;
}