/// <summary>
/// Attaches this behavior to the given document: any previously attached document is detached,
/// the click handler is subscribed, and the document body is tagged with this behavior's hash code.
/// </summary>
/// <param name="document">
/// Document to attach to. It must not be null and must be an <see cref="HtmlDocumentAdapter"/>.
/// </param>
public void AttachTo(IHtmlDocument document)
{
    Contract.RequiresNotNull(document, "document");

    var adapter = document as HtmlDocumentAdapter;
    Contract.Requires(adapter != null, "Only documents of type {0} supported", typeof(HtmlDocumentAdapter).FullName);

    // The WebBrowser reuses HtmlDocument instances, so being handed the very same HtmlDocument
    // again must not be short-circuited as a no-op.
    var ownerTag = adapter.Document.Body.GetAttribute("RaynMakerHtmlMarkupBehavior");
    var isFreeOrOwnedByThis = string.IsNullOrEmpty(ownerTag) || ownerTag == GetHashCode().ToString();
    Contract.Invariant(isFreeOrOwnedByThis, "A HtmlMarkupBehavior already attached to the given HtmlDocument. Only one attached HtmlMarkupBehavior per HtmlDocument supported");

    Detach();

    Document = adapter;

    var htmlDocument = Document.Document;
    htmlDocument.Click += HtmlDocument_Click;
    htmlDocument.Body.SetAttribute("RaynMakerHtmlMarkupBehavior", GetHashCode().ToString());

    // Detach() already cleared the selection and the markup, so there is nothing to re-apply here.
}
/// <summary>
/// Creates a marker for the given document, starting with an empty list of marked elements.
/// </summary>
/// <remarks>
/// The default markup color is yellow.
/// </remarks>
/// <param name="document">Document adapter this marker keeps a reference to.</param>
public HtmlMarker(HtmlDocumentAdapter document)
{
    myDocument = document;

    // Initial state: nothing marked yet, yellow as the default color.
    MarkedElements = new List<HtmlElement>();
    DefaultColor = Color.Yellow;
}
/// <summary>
/// Writes the string representation of <paramref name="html"/> as UTF-8 to the file
/// "{_filesLocationPrefix}/{pageNumber}.html".
/// </summary>
/// <param name="pageNumber">Number used as the file name (without extension).</param>
/// <param name="pageUrl">Source URL of the page; not used by this method.</param>
/// <param name="html">Document whose <c>ToString()</c> output is written to disk.</param>
/// <returns>A task that completes when the content has been written.</returns>
private async Task SavePage(long pageNumber, string pageUrl, HtmlDocumentAdapter html)
{
    var path = _filesLocationPrefix + "/" + pageNumber + ".html";

    // FileMode.Create truncates an already existing file. With OpenOrCreate a shorter page written
    // over a longer file from a previous run would leave stale bytes at the end of the file.
    using (var stream = new FileStream(path, FileMode.Create))
    {
        var buffer = Encoding.UTF8.GetBytes(html.ToString());
        await stream.WriteAsync(buffer, 0, buffer.Length);
    }
}
public void SelectedElement_NoDocumentAttached_Throws()
{
    // Arrange: the behavior is deliberately never attached to the loaded document.
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);

    // Act
    var error = Assert.Throws<InvalidOperationException>(
        () => sut.SelectedElement = (HtmlElementAdapter)adapter.GetElementById("x11"));

    // Assert
    Assert.That(error.Message, Does.Contain("Document not attached"));
}
public void PathToSelectedElement_NoDocumentAttached_PathSet()
{
    // Arrange: the behavior is not attached; the document is only used to obtain a valid path.
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var element = adapter.GetElementById("x11");
    var expectedPath = element.GetPath().ToString();

    // Act
    sut.PathToSelectedElement = expectedPath;

    // Assert
    Assert.That(sut.PathToSelectedElement, Is.EqualTo(expectedPath));
}
public void PathToSelectedElement_SetToNull_SelectedElementAdjusted()
{
    // Arrange
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    sut.AttachTo(adapter);

    // Act
    sut.PathToSelectedElement = null;

    // Assert
    Assert.That(sut.SelectedElement, Is.Null);
}
public void PathToSelectedElement_SetToNotNull_SelectedElementAdjusted()
{
    // Arrange
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    sut.AttachTo(adapter);

    // Act
    var path = adapter.GetElementById("x11").GetPath().ToString();
    sut.PathToSelectedElement = path;

    // Assert: the element is looked up again here, exactly as the selection is expected to resolve.
    Assert.That(sut.SelectedElement, Is.SameAs(adapter.GetElementById("x11")));
}
public void AttachTo_SetToDocumentToWhichAnotherBehaviorIsAlreadyAttached_Throws()
{
    // Arrange: a first behavior already occupies the document.
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var occupant = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Yellow));
    occupant.AttachTo(adapter);
    var latecomer = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Red));

    // Act
    var error = Assert.Throws<InvalidOperationException>(() => latecomer.AttachTo(adapter));

    // Assert
    Assert.That(error.Message, Does.Contain("Only one attached HtmlMarkupBehavior per HtmlDocument supported"));
}
public void Detach_WhenCalled_AnotherBehaviorCanBeAttached()
{
    // Arrange: attach a first behavior and release the document again.
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var released = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Yellow));
    released.AttachTo(adapter);
    released.Detach();

    // Act
    var successor = new HtmlMarkupBehavior<HtmlElementMarker>(new HtmlElementMarker(Color.Red));
    successor.AttachTo(adapter);

    // Assert
    Assert.That(released.Document, Is.Null);
    Assert.That(successor.Document, Is.EqualTo(adapter));
}
public void PathToSelectedElement_WhenSet_SelectionChangedRaised()
{
    // Arrange
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    sut.AttachTo(adapter);

    var wasRaised = false;
    sut.SelectionChanged += (sender, args) => { wasRaised = true; };

    // Act
    var path = adapter.GetElementById("x11").GetPath().ToString();
    sut.PathToSelectedElement = path;

    // Assert
    Assert.That(wasRaised, Is.True);
}
public void Detach_WhenCalled_SelectedElementAndPathAndDocumentNulled()
{
    // Arrange: attached behavior with an active selection.
    myBrowser.LoadHtml(HtmlDocument1);
    var adapter = new HtmlDocumentAdapter(myBrowser.Document);
    var marker = new HtmlElementMarker(Color.Yellow);
    var sut = new HtmlMarkupBehavior<HtmlElementMarker>(marker);
    sut.AttachTo(adapter);
    sut.SelectedElement = (HtmlElementAdapter)adapter.GetElementById("x11");

    // Act
    sut.Detach();

    // Assert
    Assert.That(sut.Document, Is.Null);
    Assert.That(sut.SelectedElement, Is.Null);
    Assert.That(sut.PathToSelectedElement, Is.Null);
}
/// <summary>
/// Processes the traversal queue until it is empty. Each URL is fetched; pages that respond
/// successfully and pass the minimum-length check are saved under a running number, recorded in
/// the index and in the set of crawled URLs, and (while <c>Balance</c> is non-zero) used to
/// refill the queue. The index is saved at the end if at least one page was recorded.
/// </summary>
/// <returns>A task that completes when the queue has been drained.</returns>
public async Task Crawl()
{
    var pageNumber = 1;

    while (_traversalQueue.Count > 0)
    {
        // The URL is only peeked here; it is removed from the queue in each outcome branch below.
        var currentUrl = _traversalQueue.Peek();

        // NOTE(review): the response is never disposed. If GetResponse returns an
        // HttpResponseMessage, consider wrapping it in a using block - confirm the type first.
        var reply = await GetResponse(currentUrl);
        if (!reply.IsSuccessStatusCode)
        {
            _traversalQueue.Dequeue();
            continue;
        }

        var body = await reply.Content.ReadAsStreamAsync();
        var page = new HtmlDocumentAdapter(body);

        var isTooShort = page.GetInnerTextLength() < _options.MinWordsCount;
        if (isTooShort)
        {
            _traversalQueue.Dequeue();
            continue;
        }

        await SavePage(pageNumber, currentUrl, page);
        _indexed.TryAdd(pageNumber + ".html", currentUrl);
        _processedPagesCount++;

        var finishedUrl = _traversalQueue.Dequeue();
        _crawledUrls.Add(finishedUrl);

        if (Balance != 0)
        {
            UpdateQueue(page);
        }

        pageNumber += 1;
    }

    if (_indexed.Count > 0)
    {
        SaveIndex();
    }
}
/// <summary>
/// Detaches this behavior from its current document, if any: unsubscribes the click handler,
/// clears the selected element and its path, removes all markup, clears the
/// "RaynMakerHtmlMarkupBehavior" attribute on the document body and sets
/// <c>Document</c> to null. Does nothing when no document is attached.
/// </summary>
public void Detach()
{
    if (Document != null)
    {
        var htmlDocument = Document.Document;
        htmlDocument.Click -= HtmlDocument_Click;

        mySelectedElement = null;
        myPath = null;

        // Everything has to be unmarked: once a new document arrives, the HtmlElements held by
        // the Marker are invalid anyway. Reset() is intentionally not used so that the Marker
        // keeps its settings (e.g. HtmlTableMarker).
        Marker.Unmark();

        htmlDocument.Body.SetAttribute("RaynMakerHtmlMarkupBehavior", null);

        Document = null;
    }
}
/// <summary>
/// Refills the traversal queue with links found in <paramref name="htmlDoc"/>, adding at most as
/// many URLs as are needed to bring the queue up to <c>Balance</c> entries.
/// </summary>
/// <remarks>
/// URLs that were already crawled or that are already waiting in the queue are skipped, so a page
/// linked from several documents is not queued (and later processed) more than once.
/// </remarks>
/// <param name="htmlDoc">Document whose href URLs are candidates for the queue.</param>
private void UpdateQueue(HtmlDocumentAdapter htmlDoc)
{
    // Number of entries the queue may still take; nothing to do when it is already full.
    var freeSlots = Balance - _traversalQueue.Count;
    if (freeSlots <= 0)
    {
        return;
    }

    // Except() yields a set difference, so the result is already free of duplicates.
    // The result is materialized before enqueueing so the queue is not modified while a
    // query over it is still being evaluated.
    var urls = htmlDoc
        .GetDocumentHrefUrls(RootSiteUri.AbsoluteUri)
        .Except(_crawledUrls)
        .Except(_traversalQueue)
        .Take(freeSlots)
        .ToArray();

    foreach (var url in urls)
    {
        _traversalQueue.Enqueue(url);
    }
}