/// <summary>
        /// Flags a base content document as crawled: runs an <c>update</c> command against the
        /// <c>basecontent</c> collection that sets <c>iscrawled</c> to <c>true</c> on the document whose
        /// <c>_id</c> equals the id of <paramref name="model"/> (no upsert).
        /// </summary>
        /// <param name="model">Base content whose <c>_id</c> identifies the document to update.</param>
        /// <returns>
        /// <c>true</c> when the update command completed without throwing; <c>false</c> when
        /// <paramref name="model"/> is null, its id is not a 24-character hex ObjectId string, or the
        /// command threw.
        /// </returns>
        public async Task<bool> ChangeIsCrawled(Models.BaseContent.BaseContent model)
        {
            if (model == null)
            {
                return false;
            }

            // Same text the string concatenation would produce (a null id renders as "").
            string idText = "" + model._id;

            // The id is spliced into the command text, so only accept a well-formed ObjectId
            // (24 hex characters). Anything else could never match a document and might
            // alter the structure of the command.
            bool isObjectIdHex = idText.Length == 24;
            for (int i = 0; isObjectIdHex && i < idText.Length; i++)
            {
                isObjectIdHex = Uri.IsHexDigit(idText[i]);
            }

            if (!isObjectIdHex)
            {
                System.Diagnostics.Trace.TraceWarning("ChangeIsCrawled skipped: '" + idText + "' is not a valid ObjectId.");
                return false;
            }

            try
            {
                // NOTE(review): the command reply is not inspected, so 'true' only means the command
                // did not throw - it does not confirm that a document was matched or modified.
                await NoSql.Instance.RunCommandAsync<BsonDocument>("{update:'basecontent',updates:[{q:{_id:ObjectId('" + idText + "')},u:{$set:{iscrawled:true}},upsert:false}]}");

                return true;
            }
            catch (Exception ex)
            {
                // Keep the boolean contract for callers, but leave a trace of the failure.
                System.Diagnostics.Trace.TraceError("ChangeIsCrawled failed for id '" + idText + "': " + ex);
                return false;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Downloads the page at <c>model.url</c>, extracts fields from it using the first crawl template
        /// of the site that owns the model's RSS feed, stores the result as news content and, when that
        /// succeeds, marks the base content as crawled.
        /// </summary>
        /// <param name="model">
        /// Base content to crawl; its <c>url</c>, <c>rssid</c>, <c>userid</c> and <c>_id</c> are read.
        /// </param>
        /// <returns>
        /// The result of <c>ContentRepository.AddContent</c> when a template was applied; <c>false</c> when
        /// the model is unusable, the page could not be downloaded successfully, no RSS/site/template
        /// information is available, or any step threw.
        /// </returns>
        public async Task<bool> Crawler(Models.BaseContent.BaseContent model)
        {
            try
            {
                if (model == null || string.IsNullOrWhiteSpace(model.url))
                {
                    return false;
                }

                string htmlContent;
                var    uri = new Uri(model.url);

                // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a
                // shared instance owned by the enclosing class.
                using (var client = new HttpClient())
                {
                    client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
                    client.DefaultRequestHeaders.Add("Host", uri.Authority);
                    using (var response = await client.GetAsync(uri))
                    {
                        // Do not parse and store error pages (404, 500, ...) as news content.
                        if (!response.IsSuccessStatusCode)
                        {
                            System.Diagnostics.Trace.TraceWarning("Crawler: '" + uri + "' answered with status " + (int)response.StatusCode + ".");
                            return false;
                        }

                        htmlContent = await response.Content.ReadAsStringAsync();
                    }
                }

                var document = new HtmlParser().Parse(htmlContent);

                var rssModel = await ContentManagerRepository.RssInfo(model.rssid);
                if (rssModel == null)
                {
                    return false;
                }

                var SiteModel = await ContentManagerRepository.SiteInfo(rssModel.siteid);
                if (SiteModel == null || SiteModel.template == null)
                {
                    return false;
                }

                // Only the first template configured for the site is used.
                var template = SiteModel.template.FirstOrDefault();
                if (template == null || template.structure == null)
                {
                    return false;
                }

                var content = new Models.Content.NewsContent();
                content.rssid         = model.rssid;
                content.userid        = model.userid;
                content.contenttype   = (int)Util.Configuration.ContentType.News;
                content.basecontentid = model._id;
                content.url           = model.url;

                var newsType = content.GetType();

                foreach (var item in template.structure)
                {
                    // Validate the selector BEFORE handing it to the selector engine, so a blank
                    // query skips this entry instead of reaching QuerySelector.
                    if (string.IsNullOrWhiteSpace(item.query))
                    {
                        continue;
                    }

                    var element = document.QuerySelector(item.query);
                    if (element == null)
                    {
                        continue;
                    }

                    // Unknown extraction types deliberately leave the value empty.
                    var elementcontent = "";
                    switch (item.type)
                    {
                    case "innerhtml":
                        elementcontent = Util.Common.CleanHtmlContent(element.InnerHtml);
                        break;

                    case "src":
                    {
                        // Resolve the attribute against the page URL; a missing or malformed
                        // attribute leaves the value empty rather than failing the crawl.
                        var rawSrc = element.GetAttribute(item.type);
                        Uri resolved;
                        if (rawSrc != null && Uri.TryCreate(uri, rawSrc, out resolved))
                        {
                            elementcontent = resolved.ToString();
                        }
                        break;
                    }

                    default:
                        break;
                    }

                    // Template-to-property mapping is best effort: a bad field name must not
                    // abort the crawl, but it should not vanish without a trace either.
                    try
                    {
                        var property = string.IsNullOrEmpty(item.field) ? null : newsType.GetProperty(item.field);
                        if (property != null && property.CanWrite)
                        {
                            property.SetValue(content, elementcontent, null);
                        }
                        else
                        {
                            System.Diagnostics.Trace.TraceWarning("Crawler: template field '" + item.field + "' is not a writable property of " + newsType.Name + ".");
                        }
                    }
                    catch (Exception ex)
                    {
                        System.Diagnostics.Trace.TraceWarning("Crawler: could not set template field '" + item.field + "': " + ex.Message);
                    }
                }

                var result = await ContentRepository.AddContent(content);

                if (result)
                {
                    var marked = await baseContentRepository.ChangeIsCrawled(model);
                    if (!marked)
                    {
                        // The content was stored; only the crawled flag is missing.
                        System.Diagnostics.Trace.TraceWarning("Crawler: content for '" + model.url + "' was stored but could not be marked as crawled.");
                    }
                }

                return result;
            }
            catch (Exception e)
            {
                // Callers only get a boolean, so record why the crawl failed.
                var failedUrl = model != null ? model.url : null;
                System.Diagnostics.Trace.TraceError("Crawler failed for '" + failedUrl + "': " + e);
                return false;
            }
        }