Example No. 1
 public void parse(CrawlerResult result)
 {
     Console.WriteLine($"File: {result.FileName}, " +
                       $"Content: {result.MatchContent} " +
                       $"Extension: {result.Extension} " +
                       $"Path: {result.Path}");
 }
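A minimal sketch of how this console printer might be driven, assuming CrawlerResult is a plain data class with settable FileName, MatchContent, Extension, and Path properties (the property names come from the snippet; the ConsoleResultParser class name and the sample values are hypothetical):

    // hypothetical driver for the parse method above
    var result = new CrawlerResult
    {
        FileName     = "notes.txt",
        MatchContent = "TODO: fix crawler",
        Extension    = ".txt",
        Path         = @"C:\data\notes.txt"
    };
    new ConsoleResultParser().parse(result); // prints one summary line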
Example No. 2
 public static async Task<CrawlerResult<IHtmlElementCollection>> TryFindAsync(this ICrawler crawler, Action<HttpRequestMessageBuilder> config)
 {
     try
     {
         return CrawlerResult.Ok(await crawler.FindAsync(config));
     }
     catch (Exception ex)
     {
         return CrawlerResult.Error(ex);
     }
 }
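A possible call site for this extension method, shown as a sketch: the WithUrl configuration call and the IsSuccess/Exception members are assumptions (only the Ok and Error factories are confirmed by the snippet above):

    // hedged usage sketch
    var found = await crawler.TryFindAsync(b => b.WithUrl("https://example.com")); // WithUrl is hypothetical
    Console.WriteLine(found.IsSuccess          // assumed member name
        ? "elements found"
        : found.Exception.ToString());         // assumed member name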
Example No. 3
        /// <summary>
        /// Creates and executes crawler requests, then collects and returns their results.
        /// </summary>
        /// <param name="text">Search text.</param>
        /// <param name="limit">Limit of products per site.</param>
        /// <param name="sources">Array of crawler sources; DefaultCrawlerSources is used when null.</param>
        /// <param name="strictSearchSources">Sources for which a strict search is performed.</param>
        /// <returns>One result per source, in the same order as the sources.</returns>
        public static CrawlerResult[] Search(string text, int limit, CrawlerSource[] sources = null, CrawlerSource[] strictSearchSources = null)
        {
            CrawlerSource[] crawlerSources  = sources ?? DefaultCrawlerSources;
            var             crawlerRequests = crawlerSources.ToDictionary(crawlerSource => crawlerSource,
                                                                          crawlerSource => CrawlerRequests[crawlerSource]);
            var results = new Dictionary<CrawlerSource, CrawlerResult>();

            Parallel.ForEach(crawlerRequests, request =>
            {
                var watch = new Stopwatch();
                watch.Start();
                bool isStrictSearch  = strictSearchSources != null && strictSearchSources.Contains(request.Key);
                var crawlerRequest   = request.Value(text, limit, isStrictSearch);
                CrawlerResult result = null;
                try
                {
                    var crawlerResult = crawlerRequest.ExecuteSearchRequest();
                    result            = new CrawlerResult
                    {
                        Products = crawlerResult,
                        Count    = crawlerResult.Length,
                        Name     = crawlerRequest.SourceName,
                        State    = SearchResultStatus.Success,
                        Id       = crawlerRequest.Id
                    };
                }
                catch (Exception ex)
                {
                    result = new CrawlerResult
                    {
                        Count     = 0,
                        Name      = crawlerRequest.SourceName,
                        Products  = new ProductInfo[0],
                        State     = SearchResultStatus.Failure,
                        Exception = ex,
                        Id        = crawlerRequest.Id
                    };
                }
                finally
                {
                    lock (results)
                    {
                        watch.Stop();
                        result.ExecutionTime = watch.ElapsedMilliseconds;
                        results.Add(request.Key, result);
                    }
                }
            });
            var crawlerResults = new List<CrawlerResult>();

            crawlerResults.AddRange(crawlerSources.Select(source => results[source]));
            return crawlerResults.ToArray();
        }
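A note on the synchronization above: the shared results dictionary is guarded with lock(results) inside finally. A sketch of the same collection step using ConcurrentDictionary instead, which drops the lock (RunSingleRequest is a hypothetical helper standing in for the stopwatch and try/catch body; the other names reuse the example's):

    // lock-free variant; requires using System.Collections.Concurrent
    var results = new ConcurrentDictionary<CrawlerSource, CrawlerResult>();

    Parallel.ForEach(crawlerRequests, request =>
    {
        // hypothetical helper wrapping the timing and try/catch shown above
        var result = RunSingleRequest(request, text, limit);
        results.TryAdd(request.Key, result);
    });

    return crawlerSources.Select(source => results[source]).ToArray();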
Example No. 4
        /// <summary>
        /// Fiat sell side, first entry (best ask).
        /// tradeType: 1 = buy, 0 = sell
        /// coin: 1 = btc, 2 = usdt
        /// </summary>
        /// <returns></returns>
        public async Task<CrawlerResult<string>> LegalTenderSell()
        {
            var result = new CrawlerResult<string>();

            var crawler = Singleton<CrawlerManager>.Instance.GetCrawler();

            crawler.OnCompleted += (s, re) =>
            {
                result.Success = true;
                result.Msg     = "Crawler fetch succeeded! Time elapsed: " + re.Milliseconds;
            };
            crawler.OnError += (s, ex) =>
            {
                result.Success = false;
                result.Msg     = "Crawler fetch failed: " + ex;
            };

            // start the crawler
            var reJson = await crawler.Start(new Uri("https://api-otc.huobi.pro/v1/otc/trade/list/public?coinId=1&tradeType=0&currentPage=1&payWay=&country=&merchant=0&online=1&range=0"), null);

            if (!result.Success)
            {
                return result;
            }

            try
            {
                // parse the JSON response
                var x        = JsonConvert.DeserializeObject<LegalTenderPage>(reJson);
                var buyFirst = x.data?.FirstOrDefault(); // x.data[0] would throw when the list is empty

                if (buyFirst == null)
                {
                    result.Success = true;
                    result.Result  = "Incredible! Nobody is trading at all!!!";
                }
                else
                {
                    result.Success = true;
                    result.Result  = buyFirst.price.ToString();
                }
            }
            catch (JsonException ex)
            {
                result.Success = false;
                result.Msg     = "JSON parsing failed: " + ex;
            }

            return result;
        }
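A possible await site for LegalTenderSell, assuming an instance of the containing class is available as api (hypothetical) and using the Success, Msg, and Result members that appear in the body:

    // hedged usage sketch
    var sell = await api.LegalTenderSell();
    Console.WriteLine(sell.Success ? "best ask: " + sell.Result : sell.Msg);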
Example No. 5
        /// <summary>
        /// Updates the news flashes pushed by Jinse Finance (jinse.com).
        /// </summary>
        /// <returns></returns>
        public async Task<CrawlerResult<CrawlNews>> UpdatePushNewsFlash()
        {
            var result = new CrawlerResult<CrawlNews>();

            var crawler = Singleton<CrawlerManager>.Instance.GetCrawler();

            crawler.OnCompleted += (s, re) =>
            {
                result.Success = true;
                result.Msg     = "Crawler fetch succeeded! Time elapsed: " + re.Milliseconds;
            };
            crawler.OnError += (s, ex) =>
            {
                result.Success = false;
                result.Msg     = "Crawler fetch failed: " + ex;
            };

            // start the crawler
            var rePageStr = await crawler.Start(new Uri("http://www.jinse.com/lives"), null);

            if (!result.Success)
            {
                return result;
            }

            try
            {
                var dom = new HtmlParser().Parse(rePageStr);

                // page elements
                var newsList = dom.QuerySelectorAll(".clearfix").FirstOrDefault();
                var first    = NewsFlashItem(newsList);

                // return the first news flash
                result.Success = true;
                result.Result  = first;
            }
            catch (Exception ex) // the try block parses HTML, not JSON, so JsonException would never be thrown here
            {
                result.Success = false;
                result.Msg     = "HTML parsing failed: " + ex;
            }

            return result;
        }
Example No. 6
        public static CrawlerResults Crawl(string directory, string fileExtension, bool includeEmpty = false)
        {
            var result = new CrawlerResults();
            var dirs   = CrawlDirectories(directory, fileExtension, includeEmpty);

            if (!string.IsNullOrEmpty(dirs.Error))
            {
                result.Error = dirs.Error;
                return result;
            }

            foreach (var dir in dirs.Directories)
            {
                var crawlResult = new CrawlerResult(dir.Name, dir, fileExtension);
                result.Items.Add(crawlResult);
            }

            return result;
        }
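A possible call for the directory crawler, written as if in scope of the containing class; Error and Items are the members used in the body, while the path and extension are sample values:

    // hedged usage sketch
    var crawl = Crawl(@"C:\projects", ".cs");
    if (!string.IsNullOrEmpty(crawl.Error))
        Console.WriteLine(crawl.Error);                       // propagate the crawl error
    else
        Console.WriteLine(crawl.Items.Count + " directories matched");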
Example No. 7
        private void Parse(FileInfo fileInfo)
        {
            string line;

            using (var streamReader = new StreamReader(fileInfo.FullName))
            {
                while ((line = streamReader.ReadLine()) != null)
                {
                    if (_matcher.Match(line))
                    {
                        var result = new CrawlerResult()
                        {
                            FileName     = fileInfo.Name,
                            Extension    = fileInfo.Extension,
                            MatchContent = line,
                            Path         = fileInfo.FullName
                        };

                        _parser.parse(result);
                    }
                }
            }
        }
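Parse above depends on two injected collaborators the snippet doesn't show. A minimal sketch of the contracts they would need to satisfy, inferred from the two call sites (the interface names IMatcher and IParser are assumptions):

    // hypothetical contracts inferred from _matcher.Match(...) and _parser.parse(...)
    public interface IMatcher
    {
        bool Match(string line);          // true when a line should become a CrawlerResult
    }

    public interface IParser
    {
        void parse(CrawlerResult result); // consumes each match; see Examples 1, 9 and 12
    }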
Example No. 8
 private CrawlerResult CleanResult(CrawlerResult result)
 {
     result.MatchContent = _cleaner.clean(result.MatchContent);
     return result;
 }
Example No. 9
 public void parse(CrawlerResult result)
 {
     _resultList.Add(CleanResult(result));
 }
Example No. 10
        public HttpResponseMessage Crawl(WebPageModel model)
        {
            try
            {
                var watch = System.Diagnostics.Stopwatch.StartNew();


                var result = new CrawlerResult()
                {
                    IsMatch = false,
                    Urls    = new List<KeyValuePair<string, string>>()
                };

                var htmlWeb = new HtmlWeb()
                {
                    AutoDetectEncoding = true,
                    UserAgent          = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
                };
                var htmlDocument = htmlWeb.Load(model.Url);
                var dataLinks =
                    htmlDocument.DocumentNode.SelectNodes("//a[@href]")
                    ?.Where(w => LevenshteinDistance(w.InnerText, model.Location) < 2)
                    .ToList() ?? new List<HtmlNode>(); // SelectNodes returns null when no anchor matches

                if (!dataLinks.Any())
                {
                    return Request.CreateResponse(HttpStatusCode.OK, result);
                }

                var aElement = dataLinks.FirstOrDefault(w => w.InnerText == model.Location) ?? dataLinks.First();

                var hrefValue = aElement.Attributes.Single(w => w.Name == "href").Value;
                var parsedUrl = new Uri(model.Url);

                var link    = BuildUrl(parsedUrl, hrefValue);
                var results = SeekPath(link);
                if (!results.Any())
                {
                    return Request.CreateResponse(HttpStatusCode.OK, result);
                }

                var matchUrl = MatchKey(results, model.Key);
                if (null == matchUrl)
                {
                    foreach (var r in results)
                    {
                        result.Urls.Add(new KeyValuePair<string, string>(r, GetLatestDataUrl(BuildUrl(parsedUrl, r))));
                    }
                }
                else
                {
                    result.IsMatch = true;
                    result.Urls.Add(new KeyValuePair<string, string>(null, GetLatestDataUrl(BuildUrl(parsedUrl, matchUrl))));
                }

                var list          = new JavaScriptSerializer().Serialize(result);
                var dataFormatted = JToken.Parse(list).ToString(Formatting.Indented);

                _db.Data.Add(new DataEntity()
                {
                    CreatedOn        = DateTime.Now,
                    IdCollectionType = (int)CollectionTypeEnum.Nav,
                    JsonObject       = dataFormatted
                });
                _db.SaveChanges();

                watch.Stop();
                var elapsedMs = watch.ElapsedMilliseconds;
                System.Diagnostics.Debug.WriteLine("Crawler time: " + elapsedMs);
                return Request.CreateResponse(HttpStatusCode.OK, result);
            }
            catch (Exception) // the exception variable was unused; dropping it avoids a compiler warning
            {
                return new HttpResponseMessage(HttpStatusCode.InternalServerError);
            }
        }
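The handler above calls a LevenshteinDistance helper that isn't shown; the < 2 threshold accepts exact matches and single-edit near-misses. A sketch of the standard dynamic-programming implementation it presumably wraps (only the name and call shape appear in the snippet, so the body here is an assumption):

    // classic Levenshtein edit distance, O(n*m) time and space
    private static int LevenshteinDistance(string a, string b)
    {
        a = a ?? string.Empty;
        b = b ?? string.Empty;
        var d = new int[a.Length + 1, b.Length + 1];
        for (int i = 0; i <= a.Length; i++) d[i, 0] = i;
        for (int j = 0; j <= b.Length; j++) d[0, j] = j;
        for (int i = 1; i <= a.Length; i++)
            for (int j = 1; j <= b.Length; j++)
            {
                int cost = a[i - 1] == b[j - 1] ? 0 : 1;
                d[i, j] = Math.Min(Math.Min(d[i - 1, j] + 1,    // deletion
                                            d[i, j - 1] + 1),   // insertion
                                   d[i - 1, j - 1] + cost);     // substitution
            }
        return d[a.Length, b.Length];
    }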
Example No. 11
        //Check XmlDocument for documentation on HtmlAgilityPack
        //XPath cheat sheet http://xpath.alephzarro.com/content/cheatsheet.html
        public override ContextResult ParseHtml(Instruction instruction)
        {
            ContextResult context = new ContextResult();
            CrawlerResult output = null;

            string url = instruction.Url;

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(GetContent(url));

            HtmlNodeCollection posts = doc.DocumentNode.SelectNodes("//div[@id='content']//div[contains(@class,'post') and contains(@class,'status-publish')]");

            HtmlNodeCollection candidateLinks = doc.DocumentNode.SelectNodes("//div[@id='postnav']//a[contains(.,'Próxima')]");

            if (candidateLinks != null)
            {
                foreach (var link in candidateLinks)
                {
                    AddCandidateLink(context, link.Attributes["href"].Value);
                }
            }

            foreach (HtmlNode post in posts ?? Enumerable.Empty<HtmlNode>()) // SelectNodes returns null when no post matches
            {
                output = new CrawlerResult();
                HtmlNode title = post.SelectSingleNode(".//h3[@class='storytitle']");
                HtmlNode content = post.SelectSingleNode(".//div[@class='storycontent']");
                HtmlNodeCollection tags = post.SelectNodes(".//div[@class='meta']//a");

                if (title != null && content != null)
                {
                    HtmlNode postUrl = title.SelectSingleNode(".//a");
                    //if (title.ChildNodes.Count > 0 && title.ChildNodes[0].Attributes["href"] != null)
                        //output.Url = title.ChildNodes[0].Attributes["href"].Value;
                    if (postUrl != null)
                        output.Url = postUrl.Attributes["href"].Value;

                    output.Title = System.Web.HttpUtility.HtmlDecode(title.InnerText);
                    output.Content = content.InnerHtml;
                    //var aux = Sanitize.Strip(output.Content);

                    HtmlNode date = post.SelectSingleNode(".//h3[@class='storytitle']//span[@class='date']");

                    if (date != null)
                    {
                        output.Data = (new LeitoraCompulsivaData() { Date = date.InnerText }).ToJson();
                    }

                    if (tags != null)
                    {
                        List<string> postTags = new List<string>();
                        foreach (var tag in tags)
                        {
                            postTags.Add(tag.InnerText);
                        }
                        output.Tags = string.Join(", ", postTags.ToArray());
                    }

                    context.Results.Add(output);
                }
            }

            return context;
        }
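To try the post selector above outside the crawler, the document can be loaded from a string with the same HtmlAgilityPack calls the snippet already uses; the markup below is a made-up minimal fixture:

    // standalone smoke test for the post selector
    var doc = new HtmlDocument();
    doc.LoadHtml("<div id='content'><div class='post status-publish'>" +
                 "<h3 class='storytitle'><a href='/p/1'>Title</a></h3>" +
                 "<div class='storycontent'>Body</div></div></div>");
    var posts = doc.DocumentNode.SelectNodes(
        "//div[@id='content']//div[contains(@class,'post') and contains(@class,'status-publish')]");
    Console.WriteLine(posts?.Count); // prints 1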
Example No. 12
 public void parse(CrawlerResult result)
 {
     // no-op implementation: matched results are discarded
 }