/// <summary>
/// Writes a one-line summary of a crawler result to the console.
/// </summary>
/// <param name="result">The crawler result to print.</param>
public void parse(CrawlerResult result)
{
    var summary = $"File: {result.FileName}, " +
                  $"Content: {result.MatchContent} " +
                  $"Extension: {result.Extension} " +
                  $"Path: {result.Path}";
    Console.WriteLine(summary);
}
/// <summary>
/// Runs <c>crawler.FindAsync</c> and wraps the outcome in a result object
/// instead of letting exceptions propagate to the caller.
/// </summary>
/// <param name="crawler">The crawler to execute the request on.</param>
/// <param name="config">Callback that configures the outgoing HTTP request.</param>
/// <returns>An Ok result carrying the found elements, or an Error result carrying the thrown exception.</returns>
public static async Task<CrawlerResult<IHtmlElementCollection>> TryFindAsync(this ICrawler crawler, Action<HttpRequestMessageBuilder> config)
{
    try
    {
        // Await inside the try so a faulted task is converted into an Error result.
        return CrawlerResult.Ok(await crawler.FindAsync(config));
    }
    catch (Exception ex)
    {
        return CrawlerResult.Error(ex);
    }
}
/// <summary>
/// Creates, executes crawler requests. Forms and returns results.
/// Requests run in parallel; a failing source yields a Failure result
/// instead of aborting the whole search.
/// </summary>
/// <param name="text">Search text.</param>
/// <param name="limit">Limit of products per site.</param>
/// <param name="sources">Array of crawler sources; defaults to <c>DefaultCrawlerSources</c> when null.</param>
/// <param name="strictSearchSources">Sources that should run in strict-search mode; may be null.</param>
/// <returns>One result per requested source, in the same order as the source array.</returns>
public static CrawlerResult[] Search(string text, int limit, CrawlerSource[] sources = null, CrawlerSource[] strictSearchSources = null)
{
    CrawlerSource[] crawlerSources = sources ?? DefaultCrawlerSources;
    var crawlerRequests = crawlerSources.ToDictionary(
        crawlerSource => crawlerSource,
        crawlerSource => CrawlerRequests[crawlerSource]);
    var results = new Dictionary<CrawlerSource, CrawlerResult>();

    Parallel.ForEach(crawlerRequests, request =>
    {
        var watch = Stopwatch.StartNew();
        bool isStrictSearch = strictSearchSources != null && strictSearchSources.Contains(request.Key);
        var crawlerRequest = request.Value(text, limit, isStrictSearch);
        CrawlerResult result = null;
        try
        {
            var crawlerResult = crawlerRequest.ExecuteSearchRequest();
            result = new CrawlerResult
            {
                Products = crawlerResult,
                Count = crawlerResult.Length,
                Name = crawlerRequest.SourceName,
                State = SearchResultStatus.Success,
                Id = crawlerRequest.Id
            };
        }
        catch (Exception ex)
        {
            // A failed source is reported as an empty Failure result carrying the exception.
            result = new CrawlerResult
            {
                Count = 0,
                Name = crawlerRequest.SourceName,
                Products = new ProductInfo[0],
                State = SearchResultStatus.Failure,
                Exception = ex,
                Id = crawlerRequest.Id
            };
        }
        finally
        {
            // BUGFIX: stop the stopwatch BEFORE taking the lock so contention from
            // other parallel requests is not counted in this source's execution time.
            watch.Stop();
            result.ExecutionTime = watch.ElapsedMilliseconds;
            lock (results)
            {
                results.Add(request.Key, result);
            }
        }
    });

    // Preserve the caller-specified source order regardless of completion order.
    return crawlerSources.Select(source => results[source]).ToArray();
}
/// <summary>
/// Fetches the first (top) fiat sell offer from the Huobi OTC public trade list.
/// Query semantics: tradeType 1 = buy, 0 = sell; coinId 1 = BTC, 2 = USDT.
/// </summary>
/// <returns>
/// A result whose <c>Result</c> is the first offer's price, or a notice string
/// when no offers exist; <c>Success</c>/<c>Msg</c> reflect crawl and parse status.
/// </returns>
public async Task<CrawlerResult<string>> LegalTenderSell()
{
    var result = new CrawlerResult<string>();
    var crawler = Singleton<CrawlerManager>.Instance.GetCrawler();
    crawler.OnCompleted += (s, re) =>
    {
        result.Success = true;
        result.Msg = "爬虫抓取成功!耗时:" + re.Milliseconds;
    };
    crawler.OnError += (s, ex) =>
    {
        result.Success = false;
        result.Msg = "爬虫抓取失败:" + ex;
    };

    // Start the crawler.
    // BUGFIX: the query string previously contained "¤tPage=1" — the HTML
    // entity "&curren;" was decoded over what should have been "&currentPage=1".
    var reJson = await crawler.Start(new Uri("https://api-otc.huobi.pro/v1/otc/trade/list/public?coinId=1&tradeType=0&currentPage=1&payWay=&country=&merchant=0&online=1&range=0"), null);
    if (!result.Success)
    {
        return result;
    }

    try
    {
        // Parse the JSON payload and pick the first listed offer.
        var x = JsonConvert.DeserializeObject<LegalTenderPage>(reJson);
        // BUGFIX: x.data[0] threw an index-out-of-range exception (NOT a
        // JsonException, so it escaped the catch) when the offer list was empty.
        var buyFirst = x?.data?.FirstOrDefault();
        if (buyFirst == null)
        {
            result.Success = true;
            result.Result = "厉害了!居然没人交易!!!";
        }
        else
        {
            result.Success = true;
            result.Result = buyFirst.price.ToString();
        }
    }
    catch (JsonException ex)
    {
        result.Success = false;
        result.Msg = "Json解析失败:" + ex;
    }
    return result;
}
/// <summary>
/// Updates the latest news flash pushed by Jinse Finance (jinse.com/lives).
/// </summary>
/// <returns>
/// A result whose <c>Result</c> is the first extracted news item;
/// <c>Success</c>/<c>Msg</c> reflect crawl and parse status.
/// </returns>
public async Task<CrawlerResult<CrawlNews>> UpdatePushNewsFlash()
{
    var result = new CrawlerResult<CrawlNews>();
    var crawler = Singleton<CrawlerManager>.Instance.GetCrawler();
    crawler.OnCompleted += (s, re) =>
    {
        result.Success = true;
        result.Msg = "爬虫抓取成功!耗时:" + re.Milliseconds;
    };
    crawler.OnError += (s, ex) =>
    {
        result.Success = false;
        result.Msg = "爬虫抓取失败:" + ex;
    };

    // Start the crawler.
    var rePageStr = await crawler.Start(new Uri("http://www.jinse.com/lives"), null);
    if (!result.Success)
    {
        return result;
    }

    try
    {
        // Parse the fetched HTML page.
        var dom = new HtmlParser().Parse(rePageStr);
        // Page elements: the first ".clearfix" node holds the news list.
        var newsList = dom.QuerySelectorAll(".clearfix").FirstOrDefault();
        var first = NewsFlashItem(newsList);
        // Return the extracted item.
        result.Success = true;
        result.Result = first;
    }
    // BUGFIX: this try block parses HTML, not JSON — the original
    // catch (JsonException) let every real parsing error (e.g. a
    // NullReferenceException on a missing node) escape uncaught.
    catch (Exception ex)
    {
        result.Success = false;
        result.Msg = "页面解析失败:" + ex;
    }
    return result;
}
/// <summary>
/// Crawls the given directory for subdirectories containing files with the
/// given extension and wraps each one in a crawler result item.
/// </summary>
/// <param name="directory">Root directory to crawl.</param>
/// <param name="fileExtension">File extension to match.</param>
/// <param name="includeEmpty">Whether directories without matches are included.</param>
/// <returns>The aggregated results, or a result carrying the crawl error.</returns>
public static CrawlerResults Crawl(String directory, String fileExtension, bool includeEmpty = false)
{
    var results = new CrawlerResults();
    var crawled = CrawlDirectories(directory, fileExtension, includeEmpty);

    // Propagate a directory-walk failure instead of returning partial items.
    if (!String.IsNullOrEmpty(crawled.Error))
    {
        results.Error = crawled.Error;
        return results;
    }

    foreach (var directoryInfo in crawled.Directories)
    {
        results.Items.Add(new CrawlerResult(directoryInfo.Name, directoryInfo, fileExtension));
    }
    return results;
}
/// <summary>
/// Scans every line of the given file and forwards each line accepted by the
/// matcher to the parser as a crawler result.
/// </summary>
/// <param name="fileInfo">The file to scan.</param>
private void Parse(FileInfo fileInfo)
{
    foreach (var currentLine in File.ReadLines(fileInfo.FullName))
    {
        if (!_matcher.Match(currentLine))
        {
            continue;
        }

        _parser.parse(new CrawlerResult()
        {
            FileName = fileInfo.Name,
            Extension = fileInfo.Extension,
            MatchContent = currentLine,
            Path = fileInfo.FullName
        });
    }
}
/// <summary>
/// Normalizes the result's match content via the configured cleaner and
/// returns the same (mutated) instance.
/// </summary>
private CrawlerResult CleanResult(CrawlerResult result)
{
    var cleanedContent = _cleaner.clean(result.MatchContent);
    result.MatchContent = cleanedContent;
    return result;
}
/// <summary>
/// Cleans the result's match content and appends it to the accumulated result list.
/// </summary>
public void parse(CrawlerResult result)
{
    var cleaned = CleanResult(result);
    _resultList.Add(cleaned);
}
/// <summary>
/// Crawls the page at <c>model.Url</c>, looks for an anchor whose text is within
/// edit distance 1 of the requested location, follows it, and returns the
/// discovered data URLs; the serialized result is persisted to the database.
/// </summary>
/// <param name="model">Carries the url, location text and key to search for.</param>
/// <returns>200 with a <c>CrawlerResult</c> payload, or 500 on unexpected failure.</returns>
public HttpResponseMessage Crawl(WebPageModel model)
{
    try
    {
        var watch = System.Diagnostics.Stopwatch.StartNew();
        var result = new CrawlerResult()
        {
            IsMatch = false,
            Urls = new List<KeyValuePair<string, string>>()
        };
        var htmlWeb = new HtmlWeb()
        {
            AutoDetectEncoding = true,
            UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        };
        var htmlDocument = htmlWeb.Load(model.Url);

        // BUGFIX: SelectNodes returns null (not an empty collection) when the page
        // has no matching anchors; the original code threw a NullReferenceException
        // on .Where(...) and answered 500 instead of an empty OK result.
        var anchorNodes = htmlDocument.DocumentNode.SelectNodes("//a[@href]");
        if (anchorNodes == null)
        {
            return Request.CreateResponse(HttpStatusCode.OK, result);
        }

        // Keep anchors whose text is a near-match (edit distance < 2) for the location.
        var dataLinks = anchorNodes.Where(w => LevenshteinDistance(w.InnerText, model.Location) < 2).ToList();
        if (!dataLinks.Any())
        {
            return Request.CreateResponse(HttpStatusCode.OK, result);
        }

        // Prefer an exact text match; otherwise take the first near-match.
        var aElement = dataLinks.FirstOrDefault(w => w.InnerText == model.Location) ?? dataLinks.First();
        var hrefValue = aElement.Attributes.Single(w => w.Name == "href").Value;
        var parsedUrl = new Uri(model.Url);
        var link = BuildUrl(parsedUrl, hrefValue);

        var results = SeekPath(link);
        if (!results.Any())
        {
            return Request.CreateResponse(HttpStatusCode.OK, result);
        }

        var matchUrl = MatchKey(results, model.Key);
        if (null == matchUrl)
        {
            // No key match: report every candidate with its latest data url.
            foreach (var r in results)
            {
                result.Urls.Add(new KeyValuePair<string, string>(r, GetLatestDataUrl(BuildUrl(parsedUrl, r))));
            }
        }
        else
        {
            result.IsMatch = true;
            result.Urls.Add(new KeyValuePair<string, string>(null, GetLatestDataUrl(BuildUrl(parsedUrl, matchUrl))));
        }

        // Persist the pretty-printed JSON result for later inspection.
        var list = new JavaScriptSerializer().Serialize(result);
        var dataFormatted = JToken.Parse(list).ToString(Formatting.Indented);
        _db.Data.Add(new DataEntity()
        {
            CreatedOn = DateTime.Now,
            IdCollectionType = (int)CollectionTypeEnum.Nav,
            JsonObject = dataFormatted
        });
        _db.SaveChanges();

        watch.Stop();
        var elapsedMs = watch.ElapsedMilliseconds;
        System.Diagnostics.Debug.WriteLine("Timp crawler: " + elapsedMs);
        return Request.CreateResponse(HttpStatusCode.OK, result);
    }
    catch (Exception ex)
    {
        // BUGFIX: the caught exception was silently discarded (unused variable);
        // trace it so 500 responses are diagnosable.
        System.Diagnostics.Debug.WriteLine("Crawl failed: " + ex);
        return new HttpResponseMessage(HttpStatusCode.InternalServerError);
    }
}
//Check XmlDocument for documentation on HtmlAgilityPack
//XPath cheat sheet http://xpath.alephzarro.com/content/cheatsheet.html
/// <summary>
/// Parses one blog page: extracts each published post (title, url, content,
/// date, tags) into a CrawlerResult and queues the "Próxima" (next page)
/// navigation links as crawl candidates.
/// </summary>
/// <param name="instruction">Carries the page url to fetch and parse.</param>
/// <returns>The context holding extracted results and candidate links.</returns>
public override ContextResult ParseHtml(Instruction instruction)
{
    ContextResult context = new ContextResult();
    string url = instruction.Url;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(GetContent(url));

    HtmlNodeCollection posts = doc.DocumentNode.SelectNodes("//div[@id='content']//div[contains(@class,'post') and contains(@class,'status-publish')]");
    HtmlNodeCollection candidateLinks = doc.DocumentNode.SelectNodes("//div[@id='postnav']//a[contains(.,'Próxima')]");

    // Queue the "next page" links for the crawler frontier.
    if (candidateLinks != null)
    {
        foreach (var link in candidateLinks)
        {
            AddCandidateLink(context, link.Attributes["href"].Value);
        }
    }

    // BUGFIX: SelectNodes returns null (not an empty collection) when nothing
    // matches; the original foreach threw a NullReferenceException on pages
    // without published posts, even though candidateLinks WAS null-checked.
    if (posts == null)
    {
        return context;
    }

    foreach (HtmlNode post in posts)
    {
        CrawlerResult output = new CrawlerResult();
        HtmlNode title = post.SelectSingleNode(".//h3[@class='storytitle']");
        HtmlNode content = post.SelectSingleNode(".//div[@class='storycontent']");
        HtmlNodeCollection tags = post.SelectNodes(".//div[@class='meta']//a");

        // Only posts that have both a title and a body count as valid results.
        if (title != null && content != null)
        {
            HtmlNode postUrl = title.SelectSingleNode(".//a");
            if (postUrl != null)
                output.Url = postUrl.Attributes["href"].Value;

            output.Title = System.Web.HttpUtility.HtmlDecode(title.InnerText);
            output.Content = content.InnerHtml;

            HtmlNode date = post.SelectSingleNode(".//h3[@class='storytitle']//span[@class='date']");
            if (date != null)
            {
                output.Data = (new LeitoraCompulsivaData() { Date = date.InnerText }).ToJson();
            }

            if (tags != null)
            {
                List<string> postTags = new List<string>();
                foreach (var tag in tags)
                {
                    postTags.Add(tag.InnerText);
                }
                output.Tags = string.Join(", ", postTags.ToArray());
            }
            context.Results.Add(output);
        }
    }
    return context;
}
// Intentional no-op parser: every result is discarded.
// NOTE(review): presumably a null-object implementation used when output
// is not needed — confirm this is deliberate and not a stub awaiting code.
public void parse(CrawlerResult result) { }