Пример #1
0
        /// <summary>
        /// Lazily pairs a crawl result with the test definitions that apply to it.
        /// </summary>
        /// <param name="result">The crawl result the produced test methods are bound to.</param>
        /// <param name="tests">Candidate test definitions; enumerated up to twice.</param>
        /// <returns>
        /// First one <see cref="TestMethod"/> per definition whose <c>ParsedObjectType</c> is null
        /// (bound to the result only); then, when the result has a content type, one per definition
        /// whose <c>ParsedObjectType</c> equals that content type (bound to the result and its content).
        /// </returns>
        public IEnumerable<TestMethod> Select(CrawlResult result, IEnumerable<TestDefinition> tests)
        {
            // Pass 1: definitions that do not ask for parsed content.
            var contentAgnostic = tests.Where(candidate => candidate.ParsedObjectType == null);
            foreach (var definition in contentAgnostic)
            {
                yield return new TestMethod(definition, result);
            }

            // Without a content type there is nothing for typed definitions to match against.
            if (result.ContentType == null)
            {
                yield break;
            }

            // Pass 2: definitions whose expected type matches the parsed content type.
            var contentSpecific = tests.Where(candidate => candidate.ParsedObjectType == result.ContentType);
            foreach (var definition in contentSpecific)
            {
                yield return new TestMethod(definition, result, result.Content);
            }
        }
Пример #2
0
        /// <summary>
        /// Lazily produces one test method per (widget, test, selector) combination that applies.
        /// </summary>
        /// <param name="result">The crawl result; only results whose content type is <c>CQ</c> yield anything.</param>
        /// <param name="tests">
        /// Test definitions; each is cast to <c>WidgetTestDefinition</c>, so any other element type
        /// makes the cast throw during enumeration. Re-enumerated once per widget.
        /// </param>
        /// <returns>
        /// A <see cref="TestMethod"/> for every widget selected by <c>WidgetSelector</c> and every
        /// selector of every test that is either null/empty or matches the widget via <c>IsMatch</c>.
        /// </returns>
        public IEnumerable<TestMethod> Select(CrawlResult result, IEnumerable<TestDefinition> tests)
        {
            if (result.ContentType != typeof(CQ))
            {
                yield break;
            }

            var dom = (CQ)result.Content;

            foreach (var element in dom[WidgetSelector])
            {
                foreach (var widgetTest in tests.Cast<WidgetTestDefinition>())
                {
                    foreach (var css in widgetTest.CssSelector)
                    {
                        // A null/empty selector applies to every widget; the short-circuit
                        // keeps IsMatch from being called with such a selector.
                        if (string.IsNullOrEmpty(css) || IsMatch(element, css))
                        {
                            yield return new TestMethod(widgetTest, result, element);
                        }
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Creates an observable that, on subscription, drains <c>Uris</c> and emits one
        /// <see cref="CrawlResult"/> per distinct requested URI, in request order.
        /// </summary>
        /// <remarks>
        /// Requests are issued one at a time; each response is awaited before the next URI is dequeued.
        /// A failed or cancelled request ends the sequence with a single <c>OnError</c> carrying the
        /// exception thrown by the awaited request (the exception escapes the async subscribe delegate
        /// and is forwarded by <c>Observable.Create</c>). Otherwise <c>OnCompleted</c> is signalled
        /// once the queue is empty.
        /// </remarks>
        /// <returns>A cold observable of crawl results.</returns>
        public IObservable<CrawlResult> Crawl()
        {
            return Observable.Create(
                async (IObserver<CrawlResult> observer) =>
                {
                    var requests = new List<CrawlRequest>();

                    // One client per subscription, released when the crawl finishes or fails.
                    using (var httpClient = new HttpClient())
                    {
                        while (Uris.Count > 0)
                        {
                            var crawlRequest = new CrawlRequest(Uris.Dequeue());
                            if (requests.Any(r => r.Uri.Equals(crawlRequest.Uri))) // TODO: Add in filter checks for Depth, Domain white list, visited list
                            {
                                continue;
                            }

                            requests.Add(crawlRequest);

                            var stopwatch = Stopwatch.StartNew();

                            // Awaiting directly (instead of ContinueWith + Result) lets a faulted or
                            // cancelled request surface exactly once, through the subscribe delegate.
                            // TODO: observer.OnNext(new CrawlResult(exception))??
                            var response = await httpClient.GetAsync(crawlRequest.Uri, Token);

                            var crawlResult = new CrawlResult(response.RequestMessage, response, stopwatch.Elapsed);

                            if (response.IsSuccessStatusCode)
                            {
                                var parsedContent = ParseContent(response.Content);
                                crawlResult.SetContent(response.Content, parsedContent);

                                // TODO: Find additional links in parsedContent and add to Uris
                            }

                            observer.OnNext(crawlResult);
                        }
                    }

                    // Every request has already been awaited above, so completion can be signalled inline.
                    observer.OnCompleted();
                    return Disposable.Empty;
                });
        }
Пример #4
0
 /// <summary>
 /// Implemented by derived types to report whether they are compatible with the supplied crawl result.
 /// </summary>
 /// <param name="result">The crawl result to evaluate.</param>
 /// <returns><c>true</c> when the implementation considers <paramref name="result"/> compatible; otherwise <c>false</c>.</returns>
 public abstract bool CompatibleWith(CrawlResult result);
Пример #5
0
 /// <summary>
 /// Reports compatibility with crawl results whose content type is <c>CQ</c>.
 /// </summary>
 /// <param name="result">The crawl result to evaluate.</param>
 /// <returns><c>true</c> when <c>result.ContentType</c> equals <c>typeof(CQ)</c>; otherwise <c>false</c>.</returns>
 public override bool CompatibleWith(CrawlResult result)
 {
     var contentType = result.ContentType;
     var expectedType = typeof(CQ);

     return contentType == expectedType;
 }
Пример #6
0
 /// <summary>
 /// Reports compatibility with crawl results whose absolute URI matches at least one entry of <c>Regex</c>.
 /// </summary>
 /// <param name="result">The crawl result whose <c>Uri.AbsoluteUri</c> is tested.</param>
 /// <returns><c>true</c> as soon as one pattern matches; <c>false</c> when none does (including when there are no patterns).</returns>
 public override bool CompatibleWith(CrawlResult result)
 {
     foreach (var pattern in Regex)
     {
         // The URI is read per pattern, so it is never touched when there are no patterns.
         if (pattern.IsMatch(result.Uri.AbsoluteUri))
         {
             return true;
         }
     }

     return false;
 }
Пример #7
0
 /// <summary>
 /// Asserts that the widget's <c>id</c> attribute equals <c>"nav"</c>.
 /// </summary>
 /// <param name="result">The crawl result the widget came from; not used by this test.</param>
 /// <param name="widget">The DOM element whose <c>id</c> attribute is checked.</param>
 public void WidgetIdTest(CrawlResult result, IDomElement widget)
 {
     // Calling Equals on the literal keeps the comparison safe if GetAttribute returns null.
     PAssert.IsTrue(() => "nav".Equals(widget.GetAttribute("id")));
 }
Пример #8
0
 /// <summary>
 /// Placeholder test taking JSON content: asserts a constant <c>true</c> and uses neither parameter.
 /// </summary>
 /// <param name="result">The crawl result; not used.</param>
 /// <param name="content">The content as a <c>JObject</c>; not used.</param>
 public void TwitterTest(CrawlResult result, JObject content)
 {
     PAssert.IsTrue(() => true);
 }
Пример #9
0
 /// <summary>
 /// Placeholder test taking HTML content: asserts a constant <c>true</c> and uses neither parameter.
 /// </summary>
 /// <param name="result">The crawl result; not used.</param>
 /// <param name="content">The content as a <c>CQ</c> document; not used.</param>
 public void HtmlTest(CrawlResult result, CQ content)
 {
     PAssert.IsTrue(() => true);
 }
Пример #10
0
 /// <summary>
 /// Placeholder test taking only the crawl result: asserts a constant <c>true</c> and does not use the parameter.
 /// </summary>
 /// <param name="result">The crawl result; not used.</param>
 public void AllPagesTest(CrawlResult result)
 {
     PAssert.IsTrue(() => true);
 }