Пример #1
0
        /// <summary>
        /// Attaches this behavior to the given document: detaches from any previously attached
        /// document, subscribes to the document's Click event and tags the document body with
        /// this instance's hash code.
        /// </summary>
        /// <param name="document">Document to attach to; must be an <see cref="HtmlDocumentAdapter"/>.</param>
        public void AttachTo(IHtmlDocument document)
        {
            const string markerAttribute = "RaynMakerHtmlMarkupBehavior";

            Contract.RequiresNotNull(document, "document");

            var adapter = document as HtmlDocumentAdapter;
            Contract.Requires(adapter != null, "Only documents of type {0} supported", typeof(HtmlDocumentAdapter).FullName);

            // The WebBrowser reuses HtmlDocument instances, so being handed the very same
            // HtmlDocument again must not be short-circuited as "nothing to do".
            var currentOwner = adapter.Document.Body.GetAttribute(markerAttribute);
            var isUnclaimed  = string.IsNullOrEmpty(currentOwner);

            Contract.Invariant(isUnclaimed || currentOwner == GetHashCode().ToString(),
                               "A HtmlMarkupBehavior already attached to the given HtmlDocument. Only one attached HtmlMarkupBehavior per HtmlDocument supported");

            // Release whatever we were attached to before taking over the new document.
            Detach();

            Document = adapter;
            Document.Document.Click += HtmlDocument_Click;

            // Tag the body so the check above can recognise which behavior owns the document.
            Document.Document.Body.SetAttribute(markerAttribute, GetHashCode().ToString());

            // Detach() already reset all state, hence there is nothing to re-apply here.
        }
Пример #2
0
        /// <summary>
        /// Creates a marker for the given document, starting with an empty list of marked elements.
        /// </summary>
        /// <remarks>
        /// Default markup color: yellow.
        /// </remarks>
        /// <param name="document">Document stored by this marker.</param>
        public HtmlMarker(HtmlDocumentAdapter document)
        {
            myDocument = document;

            // Nothing is marked initially.
            MarkedElements = new List<HtmlElement>();

            DefaultColor = Color.Yellow;
        }
Пример #3
0
 /// <summary>
 /// Writes the string form of <paramref name="html"/> (its <c>ToString()</c> result, UTF-8 encoded)
 /// to the file "<c>_filesLocationPrefix</c>/<paramref name="pageNumber"/>.html".
 /// </summary>
 /// <param name="pageNumber">Number used as the file name (without extension).</param>
 /// <param name="pageUrl">Not used by this method.</param>
 /// <param name="html">Document whose string representation is written.</param>
 private async Task SavePage(long pageNumber, string pageUrl, HtmlDocumentAdapter html)
 {
     // Encode first so that no file is created or touched if serialization fails.
     var bytes = Encoding.UTF8.GetBytes(html.ToString());

     // FileMode.Create truncates an existing file. FileMode.OpenOrCreate would keep the old
     // content and leave stale trailing bytes whenever the new page is shorter than the old one.
     using (var output = new FileStream(_filesLocationPrefix + "/" + pageNumber + ".html", FileMode.Create))
     {
         await output.WriteAsync(bytes, 0, bytes.Length);
     }
 }
Пример #4
0
        public void SelectedElement_NoDocumentAttached_Throws()
        {
            // Arrange: a behavior that is never attached to a document.
            var marker             = new HtmlElementMarker(Color.Yellow);
            var unattachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);

            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            // Act: selecting an element without an attached document has to fail.
            var error = Assert.Throws<InvalidOperationException>(() =>
            {
                unattachedBehavior.SelectedElement = (HtmlElementAdapter)loadedDocument.GetElementById("x11");
            });

            // Assert
            Assert.That(error.Message, Does.Contain("Document not attached"));
        }
Пример #5
0
        public void PathToSelectedElement_NoDocumentAttached_PathSet()
        {
            // Arrange: the behavior is created but never attached to the document.
            var marker             = new HtmlElementMarker(Color.Yellow);
            var unattachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);

            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);
            var expectedPath   = loadedDocument.GetElementById("x11").GetPath().ToString();

            // Act
            unattachedBehavior.PathToSelectedElement = expectedPath;

            // Assert: the path is kept even though no document is attached.
            Assert.That(unattachedBehavior.PathToSelectedElement, Is.EqualTo(expectedPath));
        }
Пример #6
0
        public void PathToSelectedElement_SetToNull_SelectedElementAdjusted()
        {
            // Arrange
            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var marker           = new HtmlElementMarker(Color.Yellow);
            var attachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
            attachedBehavior.AttachTo(loadedDocument);

            // Act
            attachedBehavior.PathToSelectedElement = null;

            // Assert: clearing the path leaves no selected element.
            Assert.That(attachedBehavior.SelectedElement, Is.Null);
        }
Пример #7
0
        public void PathToSelectedElement_SetToNotNull_SelectedElementAdjusted()
        {
            // Arrange
            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var marker           = new HtmlElementMarker(Color.Yellow);
            var attachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
            attachedBehavior.AttachTo(loadedDocument);

            // Act
            var pathToElement = loadedDocument.GetElementById("x11").GetPath().ToString();
            attachedBehavior.PathToSelectedElement = pathToElement;

            // Assert: the element is looked up again on purpose; the selection must be the
            // same instance the document hands out for that id.
            Assert.That(attachedBehavior.SelectedElement, Is.SameAs(loadedDocument.GetElementById("x11")));
        }
Пример #8
0
        public void AttachTo_SetToDocumentToWhichAnotherBehaviorIsAlreadyAttached_Throws()
        {
            // Arrange: the document is already owned by a first behavior.
            myBrowser.LoadHtml(HtmlDocument1);
            var sharedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var owner = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Yellow));
            owner.AttachTo(sharedDocument);

            var intruder = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Red));

            // Act: a second behavior tries to attach to the same document.
            var error = Assert.Throws<InvalidOperationException>(() =>
            {
                intruder.AttachTo(sharedDocument);
            });

            // Assert
            Assert.That(error.Message, Does.Contain("Only one attached HtmlMarkupBehavior per HtmlDocument supported"));
        }
Пример #9
0
        public void Detach_WhenCalled_AnotherBehaviorCanBeAttached()
        {
            // Arrange: the first behavior attaches and then releases the document again.
            myBrowser.LoadHtml(HtmlDocument1);
            var sharedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var formerOwner = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Yellow));
            formerOwner.AttachTo(sharedDocument);
            formerOwner.Detach();

            // Act: a second behavior takes over the released document.
            var newOwner = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Red));
            newOwner.AttachTo(sharedDocument);

            // Assert
            Assert.That(formerOwner.Document, Is.Null);
            Assert.That(newOwner.Document, Is.EqualTo(sharedDocument));
        }
Пример #10
0
        public void PathToSelectedElement_WhenSet_SelectionChangedRaised()
        {
            // Arrange
            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var marker           = new HtmlElementMarker(Color.Yellow);
            var attachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
            attachedBehavior.AttachTo(loadedDocument);

            // Flipped by the handler as soon as the event fires.
            var eventSeen = false;
            attachedBehavior.SelectionChanged += (sender, args) => { eventSeen = true; };

            // Act
            attachedBehavior.PathToSelectedElement = loadedDocument.GetElementById("x11").GetPath().ToString();

            // Assert
            Assert.That(eventSeen, Is.True);
        }
Пример #11
0
        public void Detach_WhenCalled_SelectedElementAndPathAndDocumentNulled()
        {
            // Arrange: an attached behavior with an element selected.
            myBrowser.LoadHtml(HtmlDocument1);
            var loadedDocument = new HtmlDocumentAdapter(myBrowser.Document);

            var marker           = new HtmlElementMarker(Color.Yellow);
            var attachedBehavior = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
            attachedBehavior.AttachTo(loadedDocument);

            attachedBehavior.SelectedElement = (HtmlElementAdapter)loadedDocument.GetElementById("x11");

            // Act
            attachedBehavior.Detach();

            // Assert: document, selection and path are all cleared.
            Assert.That(attachedBehavior.Document, Is.Null);
            Assert.That(attachedBehavior.SelectedElement, Is.Null);
            Assert.That(attachedBehavior.PathToSelectedElement, Is.Null);
        }
Пример #12
0
        /// <summary>
        /// Processes the traversal queue until it is empty: each URL is fetched, pages whose
        /// response is unsuccessful or whose inner text length is below
        /// <c>_options.MinWordsCount</c> are dropped, all others are saved and recorded in the
        /// index. The index is saved at the end if it contains at least one entry.
        /// </summary>
        public async Task Crawl()
        {
            // Number of the next page file; only advanced when a page was actually saved.
            var pageNumber = 1;

            while (_traversalQueue.Count > 0)
            {
                // Only peek here: the URL is removed from the queue further down, once its fate is decided.
                var currentUrl = _traversalQueue.Peek();
                var reply      = await GetResponse(currentUrl);

                if (!reply.IsSuccessStatusCode)
                {
                    _traversalQueue.Dequeue();
                    continue;
                }

                var body = await reply.Content.ReadAsStreamAsync();
                var page = new HtmlDocumentAdapter(body);

                if (page.GetInnerTextLength() < _options.MinWordsCount)
                {
                    _traversalQueue.Dequeue();
                    continue;
                }

                await SavePage(pageNumber, currentUrl, page);

                _indexed.TryAdd(pageNumber + ".html", currentUrl);
                _processedPagesCount++;

                _crawledUrls.Add(_traversalQueue.Dequeue());

                // A Balance of 0 means no links of this page are queued.
                if (Balance != 0)
                {
                    UpdateQueue(page);
                }

                pageNumber++;
            }

            if (_indexed.Count > 0)
            {
                SaveIndex();
            }
        }
Пример #13
0
        /// <summary>
        /// Detaches this behavior from its current document, if any: unsubscribes from the
        /// document's Click event, clears selection and path, unmarks via the marker, removes
        /// the ownership attribute from the document body and finally drops the document reference.
        /// </summary>
        public void Detach()
        {
            // Nothing attached - nothing to release.
            if (Document == null)
            {
                return;
            }

            Document.Document.Click -= HtmlDocument_Click;

            myPath            = null;
            mySelectedElement = null;

            // With a new document the HtmlElements held by the Marker become invalid anyway,
            // so everything gets unmarked. Reset() is deliberately not called so that the
            // Marker keeps its settings (e.g. HtmlTableMarker).
            Marker.Unmark();

            // Clear the ownership attribute on the body.
            Document.Document.Body.SetAttribute("RaynMakerHtmlMarkupBehavior", null);

            Document = null;
        }
Пример #14
0
        /// <summary>
        /// Enqueues links found in <paramref name="htmlDoc"/> for traversal, never letting the
        /// traversal queue grow beyond <c>Balance</c> entries.
        /// </summary>
        /// <remarks>
        /// URLs that were already crawled or that are already waiting in the queue are skipped,
        /// so the same URL is not scheduled twice.
        /// </remarks>
        /// <param name="htmlDoc">Document whose href URLs are collected.</param>
        private void UpdateQueue(HtmlDocumentAdapter htmlDoc)
        {
            // Number of free slots left in the queue before it reaches Balance.
            var limit = _traversalQueue.Count >= Balance ? 0 : Balance - _traversalQueue.Count;

            if (limit == 0)
            {
                // Queue is full: skip link extraction altogether.
                return;
            }

            // Except() yields a set difference, so the result contains no duplicates and no
            // extra Distinct() is needed. ToArray() materializes the result before the queue
            // is modified below.
            var urls = htmlDoc
                       .GetDocumentHrefUrls(RootSiteUri.AbsoluteUri)
                       .Except(_crawledUrls)
                       .Except(_traversalQueue)
                       .Take(limit)
                       .ToArray();

            foreach (var url in urls)
            {
                _traversalQueue.Enqueue(url);
            }
        }