/// <summary>
/// Fetches one fixed Facebook page-ads URL through <c>getDataPinterest</c>.
/// The search arguments are accepted for signature compatibility but are not read.
/// </summary>
/// <param name="model">Crawler model handed to <c>getDataPinterest</c>.</param>
/// <param name="search_str">Not used by this implementation.</param>
/// <param name="limit">Not used by this implementation.</param>
/// <param name="bookmarks_str">Passed by reference to <c>getDataPinterest</c>.</param>
/// <param name="page">Not used by this implementation.</param>
/// <returns>Always <c>false</c>.</returns>
public static bool Get_Tagged_Pins(ref CMS_CrawlerModels model, string search_str, int limit = 1, string bookmarks_str = null, int page = 1)
{
    // Alternative target kept from the original code (disabled):
    // "https://www.facebook.com/lifewithsunshine/photos/a.1797478757133845.1073741829.1757631344451920/2017087941839591/?type=3"
    const string fixedAdsUrl = "https://www.facebook.com/lifewithsunshine/ads/?ref=page_internal&dpr=1&ajaxpipe=1&ajaxpipe_token=AXh9WxHKM0f06Rq9&country=1&path=%2Flifewithsunshine%2Fads%2F&__user=100003324695675&__a=1&__dyn=5V4cjLx2ByK5A9UkKHqAyqomzFE9XG8GAdyeGDirWqF1G7UnGdwIhEnUF7yWCHxCEjCyEnyo88ObGubyRUC48G5WAxamjDK7GgPwzxuFS58-ER2KdyU8p94jUXVoS48nVV8Gicx2q5o4OmayrBy8GudAx6cw_xle9xmjx2Qm3GE-qp3FK4bUCaxKh1e5pVkdxCi78SaCCy89ooKHVohxyhu9K9BmFpEBq8IHGfio8l8imEggmKbKqify4cXJ2oS3m6ogUK8GE_WUWiUd9azEKiEDyp8ymaVeaDU8fiAx2miQhxdyopBAyEN4yprypVUV1bCxe9yEgy8LzU9FWDz8a8Z112HJ7VVHAyEsyUlzF8WEKU&__req=jsonp_3&__be=1&__pc=PHASED%3ADEFAULT&__rev=4066324&__spin_r=4066324&__spin_b=trunk&__spin_t=1530514908&__adt=3";

    getDataPinterest(fixedAdsUrl, model, "", ref bookmarks_str);

    return false;
}
/// <summary>
/// Initialises the model with empty selection lists, a fresh crawler model,
/// empty keyword/group lists and a date range from 1990-01-01 up to the current time.
/// </summary>
public CMS_ProductsModels()
{
    // Drop-down sources start empty.
    this.ListTime = new List<SelectListItem>();
    this.ListQuantity = new List<SelectListItem>();

    this.Crawler = new CMS_CrawlerModels();

    // Date filter: fixed lower bound, upper bound is the moment of construction.
    this.FromDate = new DateTime(1990, 1, 1);
    this.ToDate = DateTime.Now;

    this.listKeywords = new List<string>();
    this.listGroups = new List<string>();
}
/// <summary>
/// Loads the next batch of pins for scrolling and renders them with the "_ListItem" partial view.
/// </summary>
/// <param name="pinFilter">
/// Filter posted by the client. When <c>Url</c> is set, its <c>BoardID</c> and <c>GroupID</c>
/// query values are merged into the filter's board and group lists.
/// </param>
/// <returns>
/// The "_ListItem" partial view (with an empty pin list when the lookup reports failure),
/// or HTTP 400 when the filter is missing or any step throws.
/// </returns>
public ActionResult LoadScroll(PinFilterDTO pinFilter)
{
    // A missing filter previously surfaced as a swallowed NullReferenceException; answer 400 directly.
    if (pinFilter == null)
    {
        return new HttpStatusCodeResult(HttpStatusCode.BadRequest);
    }

    try
    {
        if (!string.IsNullOrEmpty(pinFilter.Url))
        {
            NameValueCollection queryValues = CommonHelper.GetQueryParameters(pinFilter.Url);
            var boardId = queryValues["BoardID"] ?? "";
            var groupId = queryValues["GroupID"] ?? "";
            var hasBoard = !string.IsNullOrEmpty(boardId);
            var hasGroup = !string.IsNullOrEmpty(groupId);

            // The code below treats LstBoardID as possibly null, so make sure it exists
            // before anything is added to it.
            if ((hasBoard || hasGroup) && pinFilter.LstBoardID == null)
            {
                pinFilter.LstBoardID = new List<string>();
            }

            if (hasBoard)
            {
                pinFilter.LstBoardID.Add(boardId);
            }

            if (hasGroup)
            {
                pinFilter.LstGroupBoardID.Add(groupId);
                var boardsOfGroup = getListBoardByGroud(groupId);
                pinFilter.LstBoardID.AddRange(boardsOfGroup);
            }
        }

        // A list whose first entry is blank is treated as "no board selected".
        if (pinFilter.LstBoardID != null && pinFilter.LstBoardID.Count > 0
            && string.IsNullOrEmpty(pinFilter.LstBoardID[0]))
        {
            pinFilter.LstBoardID = null;
        }

        // With no explicit selection, fall back to every known board.
        if (pinFilter.LstBoardID == null || pinFilter.LstBoardID.Count == 0)
        {
            pinFilter.LstBoardID = getListBoard().Select(o => o.Value).ToList();
        }

        var modelCrawler = new CMS_CrawlerModels();
        if (pinFilter.LstBoardID != null && pinFilter.LstBoardID.Count > 0)
        {
            var _pinModels = new List<PinsModels>();
            var msg = "";
            int totalPin = 0;
            pinFilter.PageSize = Commons.PageSize;

            var result = _fac.GetPin(ref _pinModels, ref totalPin, pinFilter, ref msg);
            if (result)
            {
                modelCrawler.Pins = _pinModels;
            }
        }

        return PartialView("_ListItem", modelCrawler);
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; keep the 400 response but leave a trace for diagnosis.
        System.Diagnostics.Trace.TraceError("LoadScroll failed: " + ex);
    }

    return new HttpStatusCodeResult(HttpStatusCode.BadRequest);
}
/// <summary>
/// Crawls a Facebook page's ads listing: the first page through <c>CrawlerFb</c>, then, when both a
/// page id and a user id could be extracted, further pages through <c>CrawlerNextPage</c> starting at cursor 8.
/// </summary>
/// <param name="url">Page URL; "ads/?country=1&amp;ref=page_internal" is appended to it before use.</param>
/// <param name="pins">Crawler model passed by reference to <c>CrawlerFb</c> and <c>CrawlerNextPage</c>.</param>
/// <remarks>
/// The user id is cut out of the static <c>Cookies</c> string, ending at the next ';' after the start offset.
/// The page id is the text between the last '-' found scanning backwards from "/ads/" and that slash.
/// Every exception is caught and discarded, so callers get no failure signal.
/// NOTE(review): the span <c>"c_user="******"=", start</c> looks like a redaction artifact; the declaration of
/// <c>end</c> and a second <c>IndexOf</c> call appear to be missing. Restore it from version control before building.
/// NOTE(review): if "/ads/" or ';' is not found, the index arithmetic can produce invalid Substring arguments;
/// that failure would be hidden by the empty catch. Confirm intended behaviour.
/// </remarks>
public static void CrawlerAllFb(string url, ref CMS_CrawlerModels pins) { try { var page_Id = ""; var user_Id = ""; url = url + "ads/?country=1&ref=page_internal"; if (!string.IsNullOrEmpty(Cookies)) { var start = Cookies.IndexOf("c_user="******"=", start) + 1; for (int i = start; i < Cookies.Length; i++) { char key = Cookies[i]; if (key == ';') { end = i; break; } } user_Id = Cookies.Substring(start, (end - start)); } if (!string.IsNullOrEmpty(url)) { var end = url.IndexOf("/ads/"); end = url.IndexOf("/", end); var start = 0; for (int i = end; i > 0; i--) { char key = url[i]; if (key == '-') { start = i; break; } } page_Id = url.Substring(start + 1, (end - start - 1)); } /* crawl first page */ CrawlerFb(url, ref pins); /* check next page */ if (!string.IsNullOrEmpty(page_Id) && !string.IsNullOrEmpty(user_Id)) { CrawlerNextPage(page_Id, user_Id, 8, url, ref pins); } } catch (Exception ex) { } }
/// <summary>
/// Renders the detail page for one pin together with its related pins.
/// </summary>
/// <param name="id">Identifier of the pin whose detail is fetched.</param>
/// <param name="Key">Search key forwarded to the related-pin lookup.</param>
/// <returns>
/// The default view bound to a crawler model. When a crawl step throws, the view is still
/// rendered with whatever was collected before the failure.
/// </returns>
public ActionResult ProductDetail(string id, string Key)
{
    var modelCrawler = new CMS_CrawlerModels();
    try
    {
        var model = new PinsModels();
        CrawlerHelper.Get_Tagged_PinsDetail(ref model, id);
        CrawlerHelper.Get_Tagged_OrtherPins(ref modelCrawler, Key, Commons.PinOrtherDefault, "", 1, id);
        modelCrawler.Pin = model;
    }
    catch (Exception ex)
    {
        // Best effort by design (the page is still shown), but the failure is no longer hidden.
        System.Diagnostics.Trace.TraceError("ProductDetail failed for pin '" + id + "': " + ex);
    }

    return View(modelCrawler);
}
/// <summary>
/// Crawls a Facebook page with the supplied cookie: first page, per-pin detail, then follow-up pages.
/// </summary>
/// <param name="url">Page URL; normalised through <c>CheckUrl</c> before use.</param>
/// <param name="cookie">Cookie header value sent with every request; also the source of the user id.</param>
/// <param name="pins">Crawler model that receives the collected pins.</param>
/// <remarks>
/// Failures are logged and not rethrown, so callers must inspect <paramref name="pins"/> to judge success.
/// </remarks>
public static void CrawlerAllFb(string url, string cookie, ref CMS_CrawlerModels pins)
{
    try
    {
        /* pre-processing */
        var user_Id = GetUserIDFromCookies(cookie);
        url = CheckUrl(url);

        /* crawl first page */
        string _pageId = "";
        NSLog.Logger.Info("Start Craw :" + url);
        // NOTE(review): this writes the full session cookie to the log; consider masking it.
        NSLog.Logger.Info("Cookie : " + cookie);
        int countExp = 0;
        CrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);

        /* crawl detail */
        if (pins != null && pins.Pins != null && pins.Pins.Any())
        {
            NSLog.Logger.Info("Total Pin master :" + pins.Pins.Count);

            // The shared retry counter is no longer reset from worker threads: that write was racy
            // and had no effect, because the counter is reset again before its next use below.
            Parallel.ForEach(pins.Pins, (item) =>
            {
                if (!item.IsDynamic)
                {
                    Thread.Sleep(5000);
                    CrawlerFBDetail(item.LinkApi, item.FbIds, cookie, ref item);
                }
            });
        }

        /* check next page ID */
        _pageId = string.IsNullOrEmpty(_pageId) ? GetNextPageID(url) : _pageId;

        /* crawl next page */
        if (!string.IsNullOrEmpty(_pageId) && !string.IsNullOrEmpty(user_Id))
        {
            countExp = 0;
            CrawlerNextPage(_pageId, user_Id, 8, url, cookie, ref countExp, ref pins);
            NSLog.Logger.Info("Total Pin master :" + CountPins(pins));
        }

        NSLog.Logger.Info("End Craw :" + url + " :" + CountPins(pins));
    }
    catch (Exception ex)
    {
        // Previously swallowed without a trace.
        NSLog.Logger.Error("Crawler all Fb error : " + url, ex);
    }
}

/// <summary>Returns the number of collected pins, or 0 when the model or its pin list is null.</summary>
private static int CountPins(CMS_CrawlerModels pins)
{
    return (pins != null && pins.Pins != null) ? pins.Pins.Count : 0;
}
/// <summary>
/// Crawls Facebook data for one keyword and stores the resulting pins.
/// </summary>
/// <param name="Id">Identifier of the keyword row to crawl.</param>
/// <param name="createdBy">User recorded as the updater and passed to the pin factory.</param>
/// <param name="msg">Receives an error message when the crawl throws, or whatever the pin factory reports.</param>
/// <returns>
/// <c>false</c> when a crawl ran but produced or stored nothing, or when an exception occurred;
/// <c>true</c> otherwise (including when the keyword is unknown or was crawled within the last 5 minutes).
/// </returns>
public bool CrawlData(string Id, string createdBy, ref string msg)
{
    NSLog.Logger.Info("CrawlData: " + Id);
    var model = new CMS_CrawlerModels();
    var sequence = 0;
    var key = "";
    var result = true;
    try
    {
        using (var _db = new CMS_Context())
        {
            /* get key by ID */
            var keyWord = _db.CMS_KeyWord.Where(o => o.ID == Id).FirstOrDefault();
            if (keyWord != null)
            {
                sequence = keyWord.Sequence;
                key = keyWord.KeyWord;

                /* check time span crawl: at most one crawl every 5 minutes */
                var timeSpanCrawl = DateTime.Now - keyWord.UpdatedDate;

                // A keyword without an UpdatedDate has never been crawled. Reading .Value on the
                // empty nullable used to throw, so such a keyword could never be crawled.
                var crawlAllowed = !timeSpanCrawl.HasValue
                                   || timeSpanCrawl.Value.TotalMinutes > 5
                                   || keyWord.UpdatedDate == keyWord.CreatedDate;
                if (crawlAllowed)
                {
                    /* stamp the crawl start so parallel requests are throttled */
                    keyWord.UpdatedDate = DateTime.Now;
                    keyWord.UpdatedBy = createdBy;
                    _db.SaveChanges();

                    /* call crawler api with a randomly chosen active account cookie */
                    CMSPinFactory _fac = new CMSPinFactory();
                    var listAcc = _db.CMS_Account.Where(o => o.Status == (byte)Commons.EStatus.Active && o.IsActive).ToList();
                    var listCookie = listAcc.Select(x => x.Cookies).ToList();
                    var _cookie = CommonHelper.RamdomCookie(listCookie);
                    CrawlerFbHelpers_v2.CrawlerAllFb(keyWord.KeyWord, _cookie, ref model);

                    var res = false;
                    if (model.Pins.Count > 0)
                    {
                        res = _fac.CreateOrUpdate(model.Pins, keyWord.ID, createdBy, ref msg);
                    }

                    if (res == false)
                    {
                        // The crawl timestamp is deliberately not rolled back (the rollback was
                        // disabled in the original code), so a failed crawl still throttles.
                        result = false;
                    }
                    else
                    {
                        keyWord.UpdatedDate = DateTime.Now;
                        _db.SaveChanges();
                    }
                }
            }
        }

        LogHelper.WriteLogs(sequence.ToString() + " " + key, "Num post: " + model.Pins.Count().ToString());
        NSLog.Logger.Info("ResponseCrawlData", result.ToString());
    }
    catch (Exception ex)
    {
        msg = "Crawl data is unsuccessfully.";
        result = false;
        LogHelper.WriteLogs("ErrorCrawlData: " + Id, JsonConvert.SerializeObject(ex));
        NSLog.Logger.Error("ErrorCrawlData: " + Id, ex);
    }

    return result;
}
/// <summary>
/// Downloads one Facebook page with the static <c>Cookies</c> header, parses the HTML with HtmlAgilityPack
/// and appends one <c>PinsModels</c> per found image to <c>pins.Pins</c>.
/// </summary>
/// <param name="url">Address requested; also used as part of the log keys.</param>
/// <param name="pins">Crawler model that receives the pins; its ErrorStatus is set to AccBlocked when no owner-name node is found.</param>
/// <remarks>
/// Steps, in order:
/// 1. Request with a fixed Firefox user agent and a 100000 ms timeout. A proxy is picked via RamdomProxy,
///    but the line that would assign it is commented out.
/// 2. Descriptions: inner text of the first p element of each div whose class contains "_5pbx userContent _3576".
/// 3. Owner names: inner text of the first a element of each div whose class contains "_6a _5u5j _6b".
/// 4. Post ids: the id attribute of each div whose class contains "_5pcp _5lel _2jyu _232_", run through
///    findFbId_v2 (an empty string is stored when nothing is extracted); the list is written to the log.
/// 5. For each div whose class contains "mtm" (position tracked by <c>index</c>):
///    - "normal" posts: every a element with non-empty data-ploi and href is completed via CrawlerFBDetail,
///      then given ImageURL, Description and OwnerName taken at <c>index</c>;
///    - "dynamic" posts: each li with class containing "_5ya" inside the first ul becomes a pin whose ID is
///      findFbOh(image) + "_" + fb_ids[index]; it is added only when that ID is non-empty.
/// Any exception is written to LogHelper and NSLog and not rethrown.
/// NOTE(review): <c>Replace(""", "")</c> is not valid C# as shown and looks like an extraction artifact
/// (possibly an HTML-entity literal); confirm against version control.
/// NOTE(review): ToList() does not return null, so the <c>NodeDescription != null</c> and <c>NodeName != null</c>
/// checks do not protect the following [0] access when the list is empty.
/// NOTE(review): the guards of the form <c>Count >= index</c> are followed by [index]; index == Count would
/// pass the guard and then be out of range. Confirm whether <c>Count > index</c> was intended.
/// NOTE(review): the HttpWebResponse itself is not wrapped in a using block; only the reader is.
/// </remarks>
public static void CrawlerFb(string url, ref CMS_CrawlerModels pins) { try { int _port = 0; string _proxy = CommonHelper.RamdomProxy(ref _port); Uri uri = new Uri(url); var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri); //httpWebRequest.Proxy = new WebProxy(_proxy,_port); /* request need cookie & user agent */ httpWebRequest.Headers["Cookie"] = Cookies; httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"; httpWebRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; httpWebRequest.Timeout = 100000; var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse(); using (var streamReader = new StreamReader(httpResponse.GetResponseStream())) { var html = streamReader.ReadToEnd(); HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(html); List <HtmlNode> nodeHtml = doc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pbx userContent _3576"))).ToList(); var ListDescription = new List <string>(); if (nodeHtml != null && nodeHtml.Count > 0) { foreach (var item in nodeHtml) { var NodeDescription = item.Descendants("p").ToList(); if (NodeDescription != null) { var description = NodeDescription[0].InnerText; if (!string.IsNullOrEmpty(description)) { description = description.Replace(""", ""); } ListDescription.Add(description); } else { ListDescription.Add(""); } } } //Name List <HtmlNode> nodehtmlName = doc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_6a _5u5j _6b"))).ToList(); var ListName = new List <string>(); if (nodehtmlName != null && nodehtmlName.Count > 0) { foreach (var item in nodehtmlName) { var NodeName = item.Descendants("a").ToList(); if (NodeName != null) { var name = NodeName[0].InnerText; if (!string.IsNullOrEmpty(name)) { name = name.Replace(""", ""); } 
ListName.Add(name); } else { ListName.Add(""); } } } else { /* */ pins.ErrorStatus = (byte)Commons.EErrorStatus.AccBlocked; } // fb_id var nodeFb_Id = doc.DocumentNode.Descendants().Where ( x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pcp _5lel _2jyu _232_")) ).ToList(); List <string> fb_ids = new List <string>(); if (nodeFb_Id != null && nodeFb_Id.Count > 0) { foreach (var item in nodeFb_Id) { var strfb_id = item.GetAttributeValue("id", ""); if (!string.IsNullOrEmpty(strfb_id)) { //var split = strfb_id.Split(';').ToList(); //if (split != null && split.Count > 1) //{ // var fb_id = split[1]; // fb_ids.Add(fb_id); //} //else //{ // fb_ids.Add(""); //} var fb_id = findFbId_v2(strfb_id); if (!string.IsNullOrEmpty(fb_id)) { fb_ids.Add(fb_id); } else { fb_ids.Add(""); } } } } LogHelper.WriteLogs("fb_ids: " + url, JsonConvert.SerializeObject(fb_ids)); // node html image List <HtmlNode> nodeHtmlImage = doc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("mtm"))).ToList(); if (nodeHtmlImage != null && nodeHtmlImage.Count > 0) { var index = 0; foreach (var item in nodeHtmlImage) { List <string> fb_id = new List <string>(); // post normal var nodeChildImage = item.Descendants("a").ToList(); if (nodeChildImage != null && nodeChildImage.Count > 0) { foreach (var itemImage in nodeChildImage) { var _image = itemImage.GetAttributeValue("data-ploi", ""); var _apiDetail = itemImage.GetAttributeValue("href", ""); if (!string.IsNullOrEmpty(_image)) { _image = _image.Replace("amp;", ""); } var Pin = new PinsModels(); if (!string.IsNullOrEmpty(_image) && !string.IsNullOrEmpty(_apiDetail)) { var Splits = _apiDetail.Split('/').ToList(); if (Splits != null && Splits.Count >= 5) { fb_id.Add(Splits[4]); } if (fb_ids != null && fb_ids.Count >= index /*&& nodeChildImage.Count == 1*/) { if (!string.IsNullOrEmpty(fb_ids[index])) { fb_id.Add(fb_ids[index]); } } 
CrawlerFBDetail(_apiDetail, fb_id, ref Pin); Pin.ImageURL = _image; if (ListDescription != null && ListDescription.Count >= index) { Pin.Description = ListDescription[index]; } if (ListName != null && ListName.Count >= index) { Pin.OwnerName = ListName[index]; } pins.Pins.Add(Pin); } } } //post dynamic var nodeChildDynamic = item.Descendants("ul").ToList(); if (nodeChildDynamic != null && nodeChildDynamic.Count > 0) { var _doc = new HtmlDocument(); _doc.LoadHtml(nodeChildDynamic[0].InnerHtml); var nodeLI = _doc.DocumentNode.Descendants().Where( x => (x.Name == "li" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5ya"))).ToList(); if (nodeLI != null && nodeLI.Count > 0) { foreach (var itemLI in nodeLI) { var Pin = new PinsModels(); var nodeLIImage = itemLI.Descendants("img").ToList(); if (nodeLIImage != null && nodeLIImage.Count > 0) { var _image = nodeLIImage[0].GetAttributeValue("src", ""); if (!string.IsNullOrEmpty(_image)) { _image = _image.Replace("amp;", ""); Pin.ImageURL = _image; var PinId = findFbOh(_image); Pin.ID = PinId + "_" + fb_ids[index]; } } var nodeLink = itemLI.Descendants("a").ToList(); if (nodeLink != null && nodeLink.Count > 0) { var _link = nodeLink[0].GetAttributeValue("href", ""); Pin.Link = _link; } //description var nodeLIDescription = itemLI.Descendants("div").ToList(); if (nodeLIDescription != null && nodeLIDescription.Count > 0) { var _description = nodeLIDescription.Where(x => x.LastChild.Name.Equals("#text")).FirstOrDefault(); if (_description != null) { Pin.Description = _description.InnerText; } } if (ListName != null && ListName.Count >= index) { Pin.OwnerName = ListName[index]; } if (!string.IsNullOrEmpty(Pin.ID)) { pins.Pins.Add(Pin); } } } } index++; } } } } catch (Exception ex) { LogHelper.WriteLogs("ErrorCrawlerFB: " + url, JsonConvert.SerializeObject(ex)); NSLog.Logger.Error("Crawler Fb: ", ex); } }
/// <summary>
/// Requests one "pages/ads/more" batch for a Facebook page, extracts pins from the HTML fragment found at
/// <c>domops[0][3]["__html"]</c> of the JSON reply, then calls itself with <c>cursor + 8</c>.
/// </summary>
/// <param name="pageId">Value appended as the page_id query parameter.</param>
/// <param name="userId">User id for the request URL (its use is inside the redacted span below; assumed, confirm).</param>
/// <param name="cursor">Paging offset; used for both the cursor and unit_count query parameters.</param>
/// <param name="referer">Value of the Referer header; forwarded unchanged to the recursive call.</param>
/// <param name="pins">Crawler model that receives the pins.</param>
/// <remarks>
/// The request uses the static <c>Cookies</c> header, a fixed Firefox user agent and a 100000 ms timeout.
/// The "for (;;);" prefix is removed from the reply before it is deserialised with JavaScriptSerializer.
/// The HTML is parsed the same way as in CrawlerFb: descriptions, owner names and post ids are collected
/// into parallel lists, then every div whose class contains "mtm" yields "normal" pins (a elements with
/// data-ploi and href) and "dynamic" pins (li elements with class containing "_5ya").
/// The only explicit stop is the <c>return</c> taken when "__html" is empty. There is no try/catch here, so
/// any request or parsing exception propagates to the caller.
/// NOTE(review): when the reply is empty or "domops" is null the method still recurses with the next cursor;
/// confirm that this cannot loop indefinitely.
/// NOTE(review): the span <c>"&amp;__user="******"&amp;__a=1...</c> looks like a redaction artifact where the user id
/// was concatenated; restore it from version control.
/// NOTE(review): <c>Replace(""", "")</c> is not valid C# as shown and looks like an extraction artifact.
/// NOTE(review): guards of the form <c>Count >= index</c> precede [index] and admit index == Count; the
/// <c>!= null</c> checks after ToList() do not protect the [0] access on an empty list.
/// </remarks>
public static void CrawlerNextPage(string pageId, string userId, int cursor, string referer, ref CMS_CrawlerModels pins) { int _port = 0; string _proxy = CommonHelper.RamdomProxy(ref _port); var url = "https://www.facebook.com/pages/ads/more/?cursor=" + cursor + "&surface=www_page_ads&unit_count=" + cursor + "&country=1&dpr=1&__user="******"&__a=1&__req=v&__be=1&__pc=PHASED%3ADEFAULT&__rev=4075583&__spin_r=4075583&__spin_b=trunk&__spin_t=1530846023&page_id=" + pageId + ""; Uri uri = new Uri(url); var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri); //httpWebRequest.Proxy = new WebProxy(_proxy, _port); /* request need cookie & user agent */ httpWebRequest.Headers["Cookie"] = Cookies; httpWebRequest.Referer = referer; httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"; httpWebRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; httpWebRequest.Timeout = 100000; var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse(); using (var streamReader = new StreamReader(httpResponse.GetResponseStream())) { var html = streamReader.ReadToEnd(); if (!string.IsNullOrEmpty(html)) { html = html.Replace("for (;;);", ""); JavaScriptSerializer jsonSerializer = new JavaScriptSerializer(); dynamic dobj = jsonSerializer.Deserialize <dynamic>(html); var domops = dobj["domops"]; if (domops != null) { var _objhtmt = domops[0][3]; if (_objhtmt != null) { var _html = _objhtmt["__html"]; if (!string.IsNullOrEmpty(_html)) { var htmlDoc = new HtmlDocument(); htmlDoc.LoadHtml(_html); List <HtmlNode> nodeHtml = htmlDoc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pbx userContent _3576"))).ToList(); var ListDescription = new List <string>(); if (nodeHtml != null && nodeHtml.Count > 0) { foreach (var item in nodeHtml) { var NodeDescription = item.Descendants("p").ToList(); if (NodeDescription != null) { var 
description = NodeDescription[0].InnerText; if (!string.IsNullOrEmpty(description)) { description = description.Replace(""", ""); } ListDescription.Add(description); } else { ListDescription.Add(""); } } } //Name List <HtmlNode> nodehtmlName = htmlDoc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_6a _5u5j _6b"))).ToList(); var ListName = new List <string>(); if (nodehtmlName != null && nodehtmlName.Count > 0) { foreach (var item in nodehtmlName) { var NodeName = item.Descendants("a").ToList(); if (NodeName != null) { var name = NodeName[0].InnerText; if (!string.IsNullOrEmpty(name)) { name = name.Replace(""", ""); } ListName.Add(name); } else { ListName.Add(""); } } } // fb_id var nodeFb_Id = htmlDoc.DocumentNode.Descendants().Where ( x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pcp _5lel _2jyu _232_")) ).ToList(); List <string> fb_ids = new List <string>(); if (nodeFb_Id != null && nodeFb_Id.Count > 0) { foreach (var item in nodeFb_Id) { var strfb_id = item.GetAttributeValue("id", ""); if (!string.IsNullOrEmpty(strfb_id)) { //var split = strfb_id.Split(';').ToList(); //if (split != null && split.Count > 1) //{ // var fb_id = split[1]; // fb_ids.Add(fb_id); //} //else //{ // fb_ids.Add(""); //} var fb_id = findFbId_v2(strfb_id); if (!string.IsNullOrEmpty(fb_id)) { fb_ids.Add(fb_id); } else { fb_ids.Add(""); } } } } List <HtmlNode> nodeHtmlImage = htmlDoc.DocumentNode.Descendants().Where (x => (x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("mtm"))).ToList(); if (nodeHtmlImage != null && nodeHtmlImage.Count > 0) { var index = 0; foreach (var item in nodeHtmlImage) { List <string> fb_id = new List <string>(); var nodeChildImage = item.Descendants("a").ToList(); if (nodeChildImage != null && nodeChildImage.Count > 0) { foreach (var itemImage in nodeChildImage) { var _image = 
itemImage.GetAttributeValue("data-ploi", ""); var _apiDetail = itemImage.GetAttributeValue("href", ""); if (!string.IsNullOrEmpty(_image)) { _image = _image.Replace("amp;", ""); } var Pin = new PinsModels(); if (!string.IsNullOrEmpty(_image) && !string.IsNullOrEmpty(_apiDetail)) { var Splits = _apiDetail.Split('/').ToList(); if (Splits != null && Splits.Count >= 5) { fb_id.Add(Splits[4]); } if (fb_ids != null && fb_ids.Count >= index /*&& nodeChildImage.Count == 1*/) { if (!string.IsNullOrEmpty(fb_ids[index])) { fb_id.Add(fb_ids[index]); } } CrawlerFBDetail(_apiDetail, fb_id, ref Pin); Pin.ImageURL = _image; if (ListDescription != null && ListDescription.Count >= index) { Pin.Description = ListDescription[index]; } if (ListName != null && ListName.Count >= index) { Pin.OwnerName = ListName[index]; } pins.Pins.Add(Pin); } } } //post dynamic var nodeChildDynamic = item.Descendants("ul").ToList(); if (nodeChildDynamic != null && nodeChildDynamic.Count > 0) { var _doc = new HtmlDocument(); _doc.LoadHtml(nodeChildDynamic[0].InnerHtml); var nodeLI = _doc.DocumentNode.Descendants().Where( x => (x.Name == "li" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5ya"))).ToList(); if (nodeLI != null && nodeLI.Count > 0) { foreach (var itemLI in nodeLI) { var Pin = new PinsModels(); var nodeLIImage = itemLI.Descendants("img").ToList(); if (nodeLIImage != null && nodeLIImage.Count > 0) { var _image = nodeLIImage[0].GetAttributeValue("src", ""); if (!string.IsNullOrEmpty(_image)) { _image = _image.Replace("amp;", ""); Pin.ImageURL = _image; var PinId = findFbOh(_image); Pin.ID = PinId + "_" + fb_ids[index]; } } var nodeLink = itemLI.Descendants("a").ToList(); if (nodeLink != null && nodeLink.Count > 0) { var _link = nodeLink[0].GetAttributeValue("href", ""); Pin.Link = _link; } //description var nodeLIDescription = itemLI.Descendants("div").ToList(); if (nodeLIDescription != null && nodeLIDescription.Count > 0) { var _description = 
nodeLIDescription.Where(x => x.LastChild.Name.Equals("#text")).FirstOrDefault(); if (_description != null) { Pin.Description = _description.InnerText; } } if (ListName != null && ListName.Count >= index) { Pin.OwnerName = ListName[index]; } if (!string.IsNullOrEmpty(Pin.ID)) { pins.Pins.Add(Pin); } } } } index++; } } } else { return; } } } } } // recursively crawl the next page cursor = cursor + 8; CrawlerNextPage(pageId, userId, cursor, referer, ref pins); }
/// <summary>
/// Requests one "pages/ads/more" batch with the supplied cookie, hands the HTML fragment found at
/// <c>domops[0][3]["__html"]</c> to <c>CrawlerDataFacebook</c>, fetches details for pins that still lack an ID,
/// then calls itself with <c>cursor + 8</c>.
/// </summary>
/// <param name="pageId">Value appended as the page_id query parameter; also passed by reference to CrawlerDataFacebook.</param>
/// <param name="userId">User id for the request URL (its use is inside the redacted span below; assumed, confirm).</param>
/// <param name="cursor">Paging offset sent as the cursor query parameter; unit_count is fixed at 8.</param>
/// <param name="referer">Value of the Referer header; forwarded unchanged to recursive calls.</param>
/// <param name="cookie">Cookie header value; also forwarded to CrawlerFBDetail.</param>
/// <param name="countExp">Retry counter shared across recursive calls; a retry is attempted only while it is at most 5.</param>
/// <param name="pins">Crawler model that receives the pins.</param>
/// <remarks>
/// The request disables keep-alive and uses a 9000000 ms timeout. The "for (;;);" prefix is removed from
/// the reply before deserialisation. Paging stops when "__html" is empty (explicit return) or when the
/// reply, "domops" or its entry is missing (no further call is made).
/// Retries (same cursor, after a 500 ms sleep, counter incremented) happen on: IOException; an exception
/// while the response status is NotFound; a WebException that is either a protocol error with status
/// NotFound or any non-protocol error. Other exceptions are logged through NSLog and end the paging.
/// Detail crawling runs in Parallel.ForEach over all of pins.Pins with a 5000 ms sleep per processed item,
/// and only touches items that are not dynamic and have an empty ID.
/// NOTE(review): the recursive call is made inside the using blocks, so each level appears to keep its
/// response open until the deeper levels return; confirm this is acceptable for long listings.
/// NOTE(review): the span <c>"&amp;__user="******"&amp;__a=1...</c> looks like a redaction artifact where the user id
/// was concatenated; restore it from version control.
/// </remarks>
public static void CrawlerNextPage(string pageId, string userId, int cursor, string referer, string cookie, ref int countExp, ref CMS_CrawlerModels pins) { int _port = 0; string _proxy = CommonHelper.RamdomProxy(ref _port); var url = "https://www.facebook.com/pages/ads/more/?cursor=" + cursor + "&surface=www_page_ads&unit_count=8&country=1&dpr=1&__user="******"&__a=1&__req=v&__be=1&__pc=PHASED%3ADEFAULT&__rev=4075583&__spin_r=4075583&__spin_b=trunk&__spin_t=1530846023&page_id=" + pageId + ""; Uri uri = new Uri(url); var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri); //httpWebRequest.Proxy = new WebProxy(_proxy, _port); httpWebRequest.KeepAlive = false; /* request need cookie & user agent */ httpWebRequest.Headers["Cookie"] = cookie; httpWebRequest.Referer = referer; httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"; httpWebRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; httpWebRequest.Timeout = 9000000; try { using (HttpWebResponse httpResponse = (HttpWebResponse)httpWebRequest.GetResponse()) { try { if (httpResponse.StatusCode == HttpStatusCode.OK) { using (var streamReader = new StreamReader(httpResponse.GetResponseStream())) { var html = streamReader.ReadToEnd(); if (!string.IsNullOrEmpty(html)) { html = html.Replace("for (;;);", ""); JavaScriptSerializer jsonSerializer = new JavaScriptSerializer(); dynamic dobj = jsonSerializer.Deserialize <dynamic>(html); var domops = dobj["domops"]; if (domops != null) { var _objhtmt = domops[0][3]; if (_objhtmt != null) { var _html = _objhtmt["__html"]; if (!string.IsNullOrEmpty(_html)) { CrawlerDataFacebook(_html, true, ref pins, ref pageId); streamReader.Close(); streamReader.Dispose(); Thread.Sleep(500); /* crawl detail */ if (pins != null && pins.Pins != null && pins.Pins.Any()) { var totalPin = pins.Pins.Count; NSLog.Logger.Info("Total Pin master :" + totalPin); Parallel.ForEach(pins.Pins, (item) => { if 
(!item.IsDynamic && string.IsNullOrEmpty(item.ID)) { Thread.Sleep(5000); CrawlerFBDetail(item.LinkApi, item.FbIds, cookie, ref item); } }); } // recursively crawl the next page cursor = cursor + 8; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } else { return; } } } } } } } catch (IOException exIO) { NSLog.Logger.Info("rawl next page error io exception" + url + " ", exIO.Message); Thread.Sleep(500); if (countExp <= 5) { countExp = countExp + 1; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } } catch (Exception ex) { if (httpResponse.StatusCode == HttpStatusCode.NotFound) { Thread.Sleep(500); if (countExp <= 5) { countExp = countExp + 1; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } } } } } catch (WebException ex) { NSLog.Logger.Info("Crawl next page error : " + url + " " + ex.Message); if (ex.Status == WebExceptionStatus.ProtocolError && ex.Response != null) { var resp = (HttpWebResponse)ex.Response; if (resp.StatusCode == HttpStatusCode.NotFound) { Thread.Sleep(500); if (countExp <= 5) { countExp = countExp + 1; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } } } else { Thread.Sleep(500); if (countExp <= 5) { countExp = countExp + 1; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } } } catch (IOException exIO) { NSLog.Logger.Info("rawl next page error io exception" + url + " ", exIO.Message); Thread.Sleep(500); if (countExp <= 5) { countExp = countExp + 1; CrawlerNextPage(pageId, userId, cursor, referer, cookie, ref countExp, ref pins); } } catch (Exception ex) { NSLog.Logger.Error("rawl next page error :", ex); } // httpWebRequest.Abort();//cancel request }
/// <summary>
/// Fetches pins related to <paramref name="pinId"/> page by page through <c>getDataPinterest</c>,
/// following the returned bookmark until <paramref name="limit"/> pages were requested or no bookmark comes back.
/// </summary>
/// <param name="model">Crawler model handed to <c>getDataPinterest</c>.</param>
/// <param name="search_str">Search query placed into the request payload.</param>
/// <param name="limit">Highest page number that is still requested.</param>
/// <param name="bookmarks_str">Bookmark of the page to fetch; null or empty requests the first page.</param>
/// <param name="page">Number of the page being requested (1-based against <paramref name="limit"/>).</param>
/// <param name="pinId">Pin whose related pins are requested; part of both URL and payload.</param>
/// <returns>Always <c>false</c>.</returns>
public static bool Get_Tagged_OrtherPins(ref CMS_CrawlerModels model, string search_str, int limit = 1, string bookmarks_str = null, int page = 1, string pinId = "")
{
    if (page > limit)
    {
        return false;
    }

    // The payload differs only by the "bookmarks" entry that continuation pages carry.
    string payloadJson;
    if (string.IsNullOrEmpty(bookmarks_str))
    {
        payloadJson = JsonConvert.SerializeObject(new
        {
            options = new
            {
                field_set_key = "base_grid",
                pin = pinId,
                prepend = false,
                search_query = search_str,
                source = "search",
                top_level_source = "search",
                top_level_source_depth = 1,
                context_pin_ids = new string[] { }
            },
            context = new { },
        });
    }
    else
    {
        payloadJson = JsonConvert.SerializeObject(new
        {
            options = new
            {
                field_set_key = "base_grid",
                pin = pinId,
                prepend = false,
                search_query = search_str,
                source = "search",
                top_level_source = "search",
                top_level_source_depth = 1,
                bookmarks = new string[] { bookmarks_str },
                context_pin_ids = new string[] { }
            },
            context = new { },
        });
    }

    // Strip line breaks and tabs from the serialised payload.
    string[] stripTokens = { "\n", "\r", "\t" };
    string[] stripWith = { "", "", "" };
    string data = Preg_replace(payloadJson, stripTokens, stripWith);

    // The payload is appended as-is (URL encoding is disabled in the original code as well).
    var endpoint = Commons.HostApiOrtherPin + "/pin/" + pinId + "/";
    var timestamp = GetTimestamp(DateTime.Now);
    var requestUrl = endpoint + "&data=" + data + "&_=" + timestamp;

    var nextBookmark = "";
    getDataPinterest(requestUrl, model, pinId, ref nextBookmark);

    if (!string.IsNullOrEmpty(nextBookmark))
    {
        Get_Tagged_OrtherPins(ref model, search_str, limit, nextBookmark, page + 1, pinId);
    }

    return false;
}
/// <summary>
/// Runs a pin search for <paramref name="search_str"/> page by page through <c>getDataPinterest</c>,
/// following the returned bookmark until <paramref name="limit"/> pages were requested or no bookmark comes back.
/// </summary>
/// <param name="model">Crawler model handed to <c>getDataPinterest</c>.</param>
/// <param name="search_str">Search term; used in the URL and in the request payload.</param>
/// <param name="limit">Highest page number that is still requested.</param>
/// <param name="bookmarks_str">Bookmark of the page to fetch; null or empty requests the first page.</param>
/// <param name="page">Number of the page being requested (1-based against <paramref name="limit"/>).</param>
/// <returns>Always <c>false</c>.</returns>
public static bool Get_Tagged_Pins(ref CMS_CrawlerModels model, string search_str, int limit = 1, string bookmarks_str = null, int page = 1)
{
    if (page > limit)
    {
        return false;
    }

    string payloadJson;
    string endpoint;

    if (string.IsNullOrEmpty(bookmarks_str))
    {
        // First page: "SearchPage" module.
        payloadJson = JsonConvert.SerializeObject(new
        {
            options = new { scope = "pins", show_scope_selector = true, query = search_str },
            context = new { app_version = "aad9791" },
            module = new
            {
                name = "SearchPage",
                options = new { scope = "pins", query = search_str }
            },
            append = false,
            error_strategy = 0
        });

        // NOTE(review): this URL has a space after "term_meta[]=" that the continuation URL
        // below does not have; kept as-is, confirm whether it is intentional.
        endpoint = Commons.HostApi + search_str + "&rs=typed&term_meta[]= " + search_str + "|typed";
    }
    else
    {
        // Continuation page: "GridItems" module carrying the bookmark.
        payloadJson = JsonConvert.SerializeObject(new
        {
            options = new
            {
                scope = "pins",
                show_scope_selector = "null",
                query = search_str,
                bookmarks = new string[] { bookmarks_str },
            },
            context = new { app_version = "2f83a7e" },
            module = new
            {
                name = "GridItems",
                options = new
                {
                    scope = "pins",
                    scrollable = true,
                    show_grid_footer = true,
                    centered = true,
                    reflow_all = true,
                    virtualize = true,
                    item_options = new { show_pinner = true, show_pinned_from = false, show_board = true },
                    layout = "variable_height",
                }
            },
            append = true,
            error_strategy = 2
        });

        endpoint = Commons.HostApi + search_str + "&rs=typed&term_meta[]=" + search_str + "|typed";
    }

    // Strip line breaks and tabs from the serialised payload.
    string[] stripTokens = { "\n", "\r", "\t" };
    string[] stripWith = { "", "", "" };
    string data = Preg_replace(payloadJson, stripTokens, stripWith);

    // The payload is appended as-is (URL encoding is disabled in the original code as well).
    var timestamp = GetTimestamp(DateTime.Now);
    var requestUrl = endpoint + "&data=" + data + "&_=" + timestamp;

    var nextBookmark = "";
    getDataPinterest(requestUrl, model, "", ref nextBookmark);

    if (!string.IsNullOrEmpty(nextBookmark))
    {
        Get_Tagged_Pins(ref model, search_str, limit, nextBookmark, page + 1);
    }

    return false;
}
/// <summary>
/// Builds a pin filter from the current request values and renders the matching pins
/// with the "_ListItem" partial view.
/// </summary>
/// <remarks>
/// Request keys read: TypeTime (default "2"), Sort2 (default "2"), TypeQuantity, listKeywords
/// (comma separated), FromDate, ToDate and GroupID. An explicit keyword list replaces the
/// keywords resolved from GroupID.
/// </remarks>
/// <returns>
/// The "_ListItem" partial view (with an empty pin list when the lookup reports failure),
/// or HTTP 400 when a request value cannot be converted or any step throws.
/// </returns>
public ActionResult Search()
{
    try
    {
        var FilterModel = new PinFilterDTO();
        FilterModel.PageIndex = Commons.PageIndex;
        FilterModel.PageSize = Commons.PageSize;
        FilterModel.CreatedDateFrom = null;
        FilterModel.CreatedDateTo = null;

        var TypeTime = Request["TypeTime"] ?? "2";
        var Sort2 = Request["Sort2"] ?? "2";

        // -1 means "no quantity bucket selected".
        int TypeQuantity = -1;
        var _TypeQuantity = Request["TypeQuantity"];
        if (!string.IsNullOrEmpty(_TypeQuantity))
        {
            TypeQuantity = Convert.ToInt16(_TypeQuantity);
        }

        var Keywords = Request["listKeywords"];
        var ListKeyword = CommonHelper.ParseStringToList(Keywords, new char[] { ',' });

        FilterModel.CreatedAtFrom = Convert.ToDateTime(Request["FromDate"]);
        FilterModel.CreatedAtTo = Convert.ToDateTime(Request["ToDate"]);

        var _Group = Request["GroupID"] ?? "";
        if (!string.IsNullOrEmpty(_Group))
        {
            FilterModel.LstGroupID.Add(_Group);
            var _lstKeywords = getListKeyWordByGroup(_Group);
            FilterModel.LstKeyWordID.AddRange(_lstKeywords);
        }

        // Map the selected quantity bucket to a pin-count range (the last bucket is open-ended).
        var quantityCode = TypeQuantity.ToString();
        Func<Commons.EQuantityType, bool> isSelected = bucket => quantityCode == bucket.ToString("d");

        if (isSelected(Commons.EQuantityType.ZeroToOne))
        {
            FilterModel.PinCountFrom = 0;
            FilterModel.PinCountTo = 100;
        }
        if (isSelected(Commons.EQuantityType.OneToTwo))
        {
            FilterModel.PinCountFrom = 100;
            FilterModel.PinCountTo = 200;
        }
        if (isSelected(Commons.EQuantityType.TwoToThree))
        {
            FilterModel.PinCountFrom = 200;
            FilterModel.PinCountTo = 300;
        }
        if (isSelected(Commons.EQuantityType.ThreeToFour))
        {
            FilterModel.PinCountFrom = 300;
            FilterModel.PinCountTo = 400;
        }
        if (isSelected(Commons.EQuantityType.FourToFive))
        {
            FilterModel.PinCountFrom = 400;
            FilterModel.PinCountTo = 500;
        }
        if (isSelected(Commons.EQuantityType.MoreFive))
        {
            FilterModel.PinCountFrom = 500;
        }

        // An explicit keyword selection overrides the keywords derived from the group.
        if (ListKeyword != null && ListKeyword.Count > 0)
        {
            FilterModel.LstKeyWordID = ListKeyword;
        }

        FilterModel.TypeTime = TypeTime;

        // NOTE(review): Sort1 is derived from TypeTime (the "Sort1" request key is disabled in
        // the original code); kept unchanged, confirm this is intended.
        int parsed;
        int.TryParse(TypeTime, out parsed);
        FilterModel.Sort1 = parsed;
        int.TryParse(Sort2, out parsed);
        FilterModel.Sort2 = parsed;

        var modelCrawler = new CMS_CrawlerModels();
        var _pinModels = new List<PinsModels>();
        var msg = "";
        int totalPin = 0;
        var result = _fac.GetPin(ref _pinModels, ref totalPin, FilterModel, ref msg);
        if (result)
        {
            modelCrawler.Pins = _pinModels;
        }

        return PartialView("_ListItem", modelCrawler);
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; keep the 400 response but leave a trace for diagnosis.
        System.Diagnostics.Trace.TraceError("Search failed: " + ex);
    }

    return new HttpStatusCodeResult(HttpStatusCode.BadRequest);
}
/// <summary>
/// Crawls pins for one active keyword and stores them through <c>CMSPinFactory.CreateOrUpdate</c>.
/// The crawl only runs when the keyword was last updated more than five minutes ago, when its
/// updated and created dates are equal, or when it has no updated date at all.
/// </summary>
/// <param name="Id">ID of the keyword row to crawl.</param>
/// <param name="createdBy">User recorded as the updater of the keyword and passed on to the pin factory.</param>
/// <param name="msg">Receives "Crawl data is unsuccessfully." on an exception; otherwise whatever the pin factory writes.</param>
/// <returns>False when storing the pins fails or an exception is thrown; true otherwise (including when no crawl was due).</returns>
public bool CrawlData(string Id, string createdBy, ref string msg)
{
    // Fallback session cookie used when the keyword has no crawl account cookie of its own.
    // NOTE(review): a session cookie embedded in source is a secret; consider moving it to configuration.
    const string defaultCookies =
        "_b = \"AS+B1gn0GdpGgLQl83JubKX1bG19kiuUUvX8lnvITKDHNq2tJcgqXNIQ0cLN+kjq4KM=\"; _pinterest_pfob = enabled; _ga = GA1.2.229901352.1528170174; pnodepath = \"/pin4\"; fba = True; G_ENABLED_IDPS = google; bei = false; logged_out = True; sessionFunnelEventLogged = 1; cm_sub = denied; _auth = 1; csrftoken = fkrSitmDb4vW2kT1G3GfOkcC8mPvl0kV; _pinterest_sess = " +
        "\"TWc9PSZWaE0xeDZOVm4yL3Yva0VSazRkRjlHR013bk9mdVJBcU9zVEtEOUhXVjhKSFZmZUEreWJiNDYrV3FubVRoVzdqdDF0dmtDcXErcFF6MmlXQUQ3RDVzWERCWTZYZUt4eXMzemkvOGlXRFZQT1J6MjkwampOZlVJUFEvTnNkTUZYMkJ3dGxPTTRKaVIwdGNJY2h1MUhaSHlFT3djd0huNHE0YmtiTTBZR3dVTVB3d0RyYVE4UC8rMjZCYWo2eTJLNGJVSHR6KzRENjlWVE0rNFMxNWdGMUtVL0VtL2RDZktiUFg3M1Y2Z2dEbllPeUxFR3FOdEd6SUJSRTlBMWs1YkJnbTBlWHhwcC9pMmlqRmoydlh0V2VQSGYvYk1zeXlSM2dIU1dmUXIyRWVxWVBPdTYzbHFjcVhYRWRBT0FTQ3VBNmdWMm5QUlREZDdSY2ZQeE1NWklqSUZxNDllVHF1WUVzRFRrRjBXQnZCMVBGTlYxT0UzM1daeHFOUnBBTzliMzFJdmovQ1hQR2Vvc1pkTHNxL1FjT3FrWllTR1d6VHFrd2g5cFBFMmswM3dIa0dOOHVCbGd6aVlKUkJlZlZNeWVyRTBYREcrQVFlUTdRc1NqMlFlQ3RvaWlZMjJXZ1RURmIxNDA2d2JTODRGNk9BYWpoRzVJTUhLMkJ4UDJGb0NmN0NOQXpmZ0FoR08xcElmWmh5S29OeGRadFpDVWR1RGw3ZzZGRS81SlU4UlhSUVlIWm4wRzRJMGFVaTQzdGI3T2ovSCtHR2ZSWlk0M1RCN2JXSmZJRFdQUUpZWVpRMW5ta0pMbXgwT2NZckZJcHg0RTJrTjJlZWJIdXFSdkdJTWNXc2d3NHpXdzFTRGhKVkN4YmY4SCtJaTdSQSt0K2dhc1VDc0tkNnJIeVFhb3BHeDd6OUwvamZsanRKV0ZYNGFmZWFQNGlqNFVqekVFcGUreHU4UGVqZXRuMFVDNE1QbkFuWnJ6YzNjMTF3dVNZUHJ2MjBwMi8xeXNwbnczMlpSa3cvbzVPQUhQSyswNlU4Y2JQaThxNWN1NWtHVm83SWc0YjJVVW1tUWZYcHpWR2RCYS8wRE0yb2RtNUs0NzRteFp4JjVhOXZDbjB5RGtxL1lROE5WOVNDMjB4c1dMND0=\"";

    NSLog.Logger.Info("CrawlData: " + Id);
    var result = true;
    try
    {
        using (var _db = new CMS_Context())
        {
            /* get key by ID */
            var keyWord = _db.CMS_KeyWord
                .Where(o => o.ID == Id && o.Status == (byte)Commons.EStatus.Active)
                .FirstOrDefault();
            if (keyWord != null)
            {
                /* check time span crawl: at most one crawl every 5 minutes */
                var timeSpanCrawl = DateTime.Now - keyWord.UpdatedDate;
                // A missing UpdatedDate is treated as "due" instead of throwing on .Value.
                bool crawlIsDue = !timeSpanCrawl.HasValue
                                  || timeSpanCrawl.Value.TotalMinutes > 5
                                  || keyWord.UpdatedDate == keyWord.CreatedDate;
                if (crawlIsDue)
                {
                    /* stamp the crawl start before doing the slow work */
                    keyWord.UpdatedDate = DateTime.Now;
                    keyWord.UpdatedBy = createdBy;
                    keyWord.KeyWord = keyWord.KeyWord.Trim();
                    _db.SaveChanges();

                    /* pick the cookie: the crawl account's cookie when one is configured, else the fallback */
                    string accountCookie = null;
                    if (!string.IsNullOrEmpty(keyWord.CrawlAccountID) && keyWord.CMS_Account != null)
                    {
                        accountCookie = keyWord.CMS_Account.Cookies;
                    }

                    if (!string.IsNullOrEmpty(accountCookie))
                    {
                        // NOTE(review): this writes CrawlerBoardHelper._Cookies while the crawl below goes
                        // through CrawlerHelper - confirm whether CrawlerHelper._Cookies was intended.
                        CrawlerBoardHelper._Cookies = accountCookie;
                    }
                    else
                    {
                        CrawlerHelper._Cookies = defaultCookies;
                    }

                    var searchStr = HttpUtility.UrlEncode(keyWord.KeyWord);

                    /* get first class result */
                    var model = new CMS_CrawlerModels();
                    CMSPinFactory _fac = new CMSPinFactory();
                    CrawlerHelper.Get_Tagged_Pins(ref model, searchStr, Commons.PinDefault);
                    if (model != null && model.Pins != null && model.Pins.Any())
                    {
                        /* get second class result: one extra lookup per pin found above.
                           The ids are snapshotted first because the lookups receive the same model. */
                        var listPinID = model.Pins.Select(o => o.ID).ToList();
                        Parallel.ForEach(listPinID, pinID =>
                        {
                            CrawlerHelper.Get_Tagged_OrtherPins(ref model, searchStr, Commons.PinOrtherDefault, "", 1, pinID);
                        });
                    }

                    /* create or update pin */
                    var res = _fac.CreateOrUpdate(model.Pins, keyWord.ID, createdBy, ref msg);
                    if (res == false)
                    {
                        result = false;
                    }
                    else
                    {
                        keyWord.UpdatedDate = DateTime.Now;
                        _db.SaveChanges();
                    }
                }
            }
        }

        NSLog.Logger.Info("ResponseCrawlData: " + Id, result);
    }
    catch (Exception ex)
    {
        msg = "Crawl data is unsuccessfully.";
        result = false;
        LogHelper.WriteLogs("ErrorCrawlData: " + Id, JsonConvert.SerializeObject(ex));
        NSLog.Logger.Error("ErrorCrawlData: " + Id, ex);
    }

    return result;
}
/// <summary>
/// Downloads <paramref name="url"/>, extracts the "payload" JSON from the first two script tags of
/// the response and adds one pin to <paramref name="model"/> for every anchor with rel="theater"
/// found in the payload's HTML content.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="model">Model whose <c>Pins</c> list receives the pins; also the return value.</param>
/// <param name="pinId">Not used by this method.</param>
/// <param name="bookmarks">Only read, for the error log message.</param>
/// <returns>The same <paramref name="model"/> instance. Exceptions are logged, not rethrown.</returns>
public static CMS_CrawlerModels getDataPinterest(string url, CMS_CrawlerModels model, string pinId, ref string bookmarks)
{
    try
    {
        Uri uri = new Uri(url);
        var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri);

        /* request need cookie & user agent */
        // NOTE(review): a session cookie embedded in source is a secret; consider moving it to configuration.
        httpWebRequest.Headers["Cookie"] = "fr=0HZLfh0cIOmtmNqCq.AWXR_MW9yNog0CyLSTuvJhhdnGM.BajqQf.RE.AAA.0.0.BbOi7n.AWUh9bO8; sb=Cs05WwnlYymkzEg6Xn32mzc8; wd=1366x654; datr=Js05W_jbAaa1Ij5CurtBJmwC; locale=en_GB; c_user=100003324695675; xs=23%3AVia9gvMSQtiufw%3A2%3A1530514908%3A467%3A6165; pl=n; spin=r.4066324_b.trunk_t.1530514908_s.1_v.2_; act=1530541156947%2F6; presence=EDvF3EtimeF1530541148EuserFA21B03324695675A2EstateFDutF1530541148851CEchFDp_5f1B03324695675F4CC; x-src=%2Fpg%2Flifewithsunshine%2Fads%2F%7Ccontent_container; pnl_data2=eyJhIjoib25hZnRlcmxvYWQiLCJjIjoiWFBhZ2VzUHJvZmlsZUhvbWVDb250cm9sbGVyIiwiYiI6ZmFsc2UsImQiOiIvbGlmZXdpdGhzdW5zaGluZS9hZHMvIiwiZSI6W119";
        httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0";
        httpWebRequest.Timeout = 100000;

        // Both the response and the reader are disposed, so the connection is released.
        string answer;
        using (var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse())
        using (var streamReader = new StreamReader(httpResponse.GetResponseStream()))
        {
            answer = streamReader.ReadToEnd();
        }

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(answer);

        /* get list scripts - only the first two are inspected (the original stops before the 3rd, marked as erroring) */
        var scripts = htmlDoc.DocumentNode.Descendants("script").Take(2).ToList();
        foreach (var script in scripts)
        {
            /* find pay load element */
            var res = findElement(script.InnerHtml, "payload", 0);
            if (string.IsNullOrEmpty(res))
            {
                continue;
            }

            JavaScriptSerializer jsonSerializer = new JavaScriptSerializer();
            dynamic dobj = jsonSerializer.Deserialize<dynamic>(res);
            var htmlData = dobj["content"];
            if (htmlData == null)
            {
                continue;
            }

            var xmlData = htmlData["content"];
            if (xmlData == null)
            {
                continue;
            }

            /* get list tag a - parsed into its own document so that htmlDoc, whose script
               nodes are still being enumerated, is not reloaded underneath them */
            var payloadDoc = new HtmlDocument();
            payloadDoc.LoadHtml(xmlData);
            var lstA = payloadDoc.DocumentNode.Descendants("a")
                .Where(n => n.GetAttributeValue("rel", "") == "theater")
                .ToList();
            foreach (var tagA in lstA)
            {
                /* GET DATA MODEL */
                var href = tagA.GetAttributeValue("href", "");
                var ajaxify = tagA.GetAttributeValue("ajaxify", "");
                var fbID = findID(ajaxify);
                var pin = new PinsModels()
                {
                    ID = fbID,
                    Link = href,
                };
                model.Pins.Add(pin);
                CrawlerFBDetail(href, fbID, ref pin);
            }
        }
    }
    catch (Exception ex)
    {
        NSLog.Logger.Error("ErrorgetDataPinterest" + "\n url: " + url + "\nBookmarks:" + bookmarks, ex);
    }

    return model;
}
/// <summary>
/// Parses Facebook page HTML and appends the pins found in its post containers (div elements whose
/// class contains "_643h") to <paramref name="pins1"/>.
/// </summary>
/// <param name="strHtml">HTML to parse; the method does nothing when it is null or empty.</param>
/// <param name="IsNextPage">When false, the page id is also read from the first post container.</param>
/// <param name="pins1">Model whose <c>Pins</c> list receives the parsed pins.</param>
/// <param name="_pageId">Set to the "landing_page_id" value of the first container's "data-report-meta" attribute when present.</param>
/// <remarks>
/// Parsing is best-effort: an exception aborts the whole page (no pins are appended) and is
/// written to the trace output instead of being rethrown.
/// </remarks>
public static void CrawlerDataFacebook(string strHtml, bool IsNextPage, ref CMS_CrawlerModels pins1, ref string _pageId)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return;
    }

    try
    {
        CMS_CrawlerModels pins = new CMS_CrawlerModels();
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(strHtml);

        // Post containers; queried once and used for both the page id and the posts.
        List<HtmlNode> postNodes = htmlDoc.DocumentNode.Descendants()
            .Where(x => x.Name == "div" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_643h"))
            .ToList();

        // find page id of fan page
        if (!IsNextPage && postNodes.Count > 0)
        {
            var reportMeta = postNodes[0].GetAttributeValue("data-report-meta", "");
            if (!string.IsNullOrEmpty(reportMeta))
            {
                JObject meta = JObject.Parse(System.Web.HttpUtility.HtmlDecode(reportMeta));
                var landingPageId = meta.SelectToken("landing_page_id");
                if (landingPageId != null)
                {
                    _pageId = landingPageId.ToString();
                }
            }
        }

        // Declared outside the loop on purpose: as in the original, a post without its own
        // description/owner keeps the values of the previous post.
        var description = "";
        var OwnerName = "";
        // Guards pins.Pins, which is appended to from Parallel.ForEach below.
        var pinsGate = new object();

        foreach (var postNode in postNodes)
        {
            // First div that contains all four markers (text, owner, subtitle id, media).
            var contentNodes = postNode.Descendants("div")
                .Where(x => !x.InnerText.Equals("report")
                    && x.InnerHtml.Contains("_5pbx userContent _3576")
                    && x.InnerHtml.Contains("_6a _5u5j _6b")
                    && x.InnerHtml.Contains("_5pcp _5lel _2jyu _232_")
                    && x.InnerHtml.Contains("mtm"))
                .ToList();
            if (contentNodes.Count == 0)
            {
                continue;
            }

            var fb_ids = new List<string>();
            var item = contentNodes[0];
            var itemInnerHtml = item.InnerHtml;
            if (string.IsNullOrEmpty(itemInnerHtml))
            {
                continue;
            }

            var _Doc = new HtmlDocument();
            _Doc.LoadHtml(itemInnerHtml);

            // Description
            var descriptionNodes = _Doc.DocumentNode.Descendants()
                .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pbx userContent _3576"))
                .ToList();
            if (descriptionNodes.Count > 0)
            {
                description = descriptionNodes[0].InnerText;
                if (!string.IsNullOrEmpty(description))
                {
                    // InnerText keeps HTML entities; strip the quote entity.
                    description = description.Replace("&quot;", "");
                }
            }

            // Owner name: taken from the first owner block that actually contains a link.
            var ownerNodes = _Doc.DocumentNode.Descendants()
                .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_6a _5u5j _6b"))
                .ToList();
            foreach (var ownerNode in ownerNodes)
            {
                var ownerLinks = ownerNode.Descendants("a").ToList();
                if (ownerLinks.Count == 0)
                {
                    continue;
                }

                OwnerName = ownerLinks[0].InnerText;
                if (!string.IsNullOrEmpty(OwnerName))
                {
                    OwnerName = OwnerName.Replace("&quot;", "");
                }
                else
                {
                    // NOTE(review): this status is set on the local model and only Pins are copied
                    // to pins1 at the end - confirm whether it should be propagated to the caller.
                    pins.ErrorStatus = (byte)Commons.EErrorStatus.AccBlocked;
                }
                break;
            }

            // fb_id: parsed from the element id of the first subtitle node that has one.
            var fbIdNodes = _Doc.DocumentNode.Descendants()
                .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5pcp _5lel _2jyu _232_"))
                .ToList();
            foreach (var fbIdNode in fbIdNodes)
            {
                var elementId = fbIdNode.Id;
                if (string.IsNullOrEmpty(elementId))
                {
                    continue;
                }

                var charecter = "";
                var parsedId = findFbId_v3(elementId, "subtitle_", "_", ref charecter);
                if (!string.IsNullOrEmpty(parsedId))
                {
                    fb_ids.Add(parsedId);
                }
                if (!string.IsNullOrEmpty(charecter) && charecter.Equals(";"))
                {
                    parsedId = findFbId_v3(elementId, ";", ";", ref charecter);
                    if (!string.IsNullOrEmpty(parsedId))
                    {
                        fb_ids.Add(parsedId);
                    }
                }
                break;
            }

            // Image
            var mediaNodes = _Doc.DocumentNode.Descendants()
                .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("mtm"))
                .ToList();
            foreach (var itemImg in mediaNodes)
            {
                // post normal
                // NOTE(review): anchors are taken from the whole post (item), not from itemImg, so
                // they are re-read for every media node - confirm this is intended.
                foreach (var anchor in item.Descendants("a").ToList())
                {
                    var imageUrl = anchor.GetAttributeValue("data-ploi", "");
                    var apiDetail = anchor.GetAttributeValue("href", "");
                    if (!string.IsNullOrEmpty(imageUrl))
                    {
                        imageUrl = imageUrl.Replace("amp;", "");
                    }
                    if (string.IsNullOrEmpty(imageUrl) || string.IsNullOrEmpty(apiDetail))
                    {
                        continue;
                    }

                    // Ids for this pin: 5th path segment of the link first, then the post's ids.
                    var pinFbIds = new List<string>();
                    var segments = apiDetail.Split('/');
                    if (segments.Length >= 5)
                    {
                        pinFbIds.Add(segments[4]);
                    }
                    if (fb_ids.Count > 0)
                    {
                        pinFbIds.AddRange(fb_ids);
                    }

                    var Pin = new PinsModels();
                    Pin.LinkApi = "https://www.facebook.com" + apiDetail;
                    Pin.ImageURL = imageUrl;
                    Pin.OwnerName = OwnerName;
                    Pin.Description = description;
                    Pin.FbIds = pinFbIds;
                    pins.Pins.Add(Pin);
                }

                // post dynamic
                var dynamicLists = itemImg.Descendants("ul").ToList();
                if (dynamicLists.Count == 0)
                {
                    continue;
                }

                var listDoc = new HtmlDocument();
                listDoc.LoadHtml(dynamicLists[0].InnerHtml);
                var nodeLI = listDoc.DocumentNode.Descendants()
                    .Where(x => x.Name == "li" && x.Attributes["class"] != null && x.Attributes["class"].Value.Contains("_5ya"))
                    .ToList();
                if (nodeLI.Count == 0)
                {
                    continue;
                }

                Parallel.ForEach(nodeLI, (itemLI) =>
                {
                    var Pin = new PinsModels();

                    var liImages = itemLI.Descendants("img").ToList();
                    if (liImages.Count > 0)
                    {
                        var liImageUrl = liImages[0].GetAttributeValue("src", "");
                        if (!string.IsNullOrEmpty(liImageUrl))
                        {
                            liImageUrl = liImageUrl.Replace("amp;", "");
                            Pin.ImageURL = liImageUrl;

                            // Id derived from the image URL (two lookups), suffixed with the post id;
                            // a random id is used when the URL yields nothing.
                            var imageKey = findFbOh(liImageUrl);
                            if (string.IsNullOrEmpty(imageKey))
                            {
                                imageKey = findFbHash(liImageUrl);
                            }
                            if (!string.IsNullOrEmpty(imageKey))
                            {
                                // NOTE(review): throws when no post id was parsed above (fb_ids empty),
                                // which aborts the whole page - confirm the desired fallback.
                                Pin.ID = imageKey + "_" + fb_ids[0];
                            }
                            else
                            {
                                Pin.ID = Guid.NewGuid().ToString();
                            }
                        }
                    }

                    var liLinks = itemLI.Descendants("a").ToList();
                    if (liLinks.Count > 0)
                    {
                        Pin.Link = liLinks[0].GetAttributeValue("href", "");
                    }

                    // description: first div whose last child is a text node
                    var liDescription = itemLI.Descendants("div")
                        .Where(x => x.LastChild != null && x.LastChild.Name.Equals("#text"))
                        .FirstOrDefault();
                    if (liDescription != null)
                    {
                        Pin.Description = liDescription.InnerText;
                    }

                    Pin.OwnerName = OwnerName;
                    Pin.IsDynamic = true;
                    if (!string.IsNullOrEmpty(Pin.ID))
                    {
                        // List<T> is not safe for concurrent writers.
                        lock (pinsGate)
                        {
                            pins.Pins.Add(Pin);
                        }
                    }
                });
            }
        }

        if (pins.Pins != null && pins.Pins.Any())
        {
            pins1.Pins.AddRange(pins.Pins);
        }
    }
    catch (Exception ex)
    {
        // Best-effort parsing: keep the caller running, but leave a trace of what went wrong.
        System.Diagnostics.Trace.TraceError("CrawlerDataFacebook failed: " + ex);
    }
}
/// <summary>
/// Downloads <paramref name="url"/> with the given cookie and, on HTTP 200, hands the HTML to
/// <c>CrawlerDataFacebook</c>. I/O errors, HTTP 404 and non-protocol web errors are retried
/// through <c>RetryCrawlerFb</c>; other errors are logged and dropped.
/// </summary>
/// <param name="url">Page address to request.</param>
/// <param name="cookie">Value sent as the Cookie header.</param>
/// <param name="pins">Model that receives the parsed pins.</param>
/// <param name="countExp">Retry counter shared across the recursive retries.</param>
/// <param name="_pageId">Receives the page id parsed from the HTML, when found.</param>
public static void CrawlerFb(string url, string cookie, ref CMS_CrawlerModels pins, ref int countExp, ref string _pageId)
{
    Uri uri = new Uri(url);
    var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri);
    httpWebRequest.KeepAlive = false;

    /* request need cookie & user agent */
    httpWebRequest.Headers["Cookie"] = cookie;
    httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0";
    httpWebRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    // NOTE(review): 9,000,000 ms is 150 minutes - confirm this timeout is intended.
    httpWebRequest.Timeout = 9000000;

    try
    {
        using (HttpWebResponse httpResponse = (HttpWebResponse)httpWebRequest.GetResponse())
        {
            try
            {
                if (httpResponse.StatusCode == HttpStatusCode.OK)
                {
                    using (var streamReader = new StreamReader(httpResponse.GetResponseStream()))
                    {
                        var html = streamReader.ReadToEnd();
                        CrawlerDataFacebook(html, false, ref pins, ref _pageId);
                    }
                }
            }
            catch (IOException exIO)
            {
                NSLog.Logger.Info("crawl error io exception" + url + " ", exIO.Message);
                RetryCrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);
            }
            catch (Exception ex)
            {
                // The body above only runs for a 200 response, so there is nothing to retry
                // on a status code here; just record the failure.
                LogHelper.WriteLogs("ErrorCrawlerFB: " + url, JsonConvert.SerializeObject(ex));
                NSLog.Logger.Error("Crawler Fb: " + url, ex);
            }
        }
    }
    catch (WebException ex)
    {
        NSLog.Logger.Info("Crawl error : " + url + ": ", ex.Message);
        if (ex.Status == WebExceptionStatus.ProtocolError && ex.Response != null)
        {
            // An HTTP error status came back: only 404 is retried.
            var resp = (HttpWebResponse)ex.Response;
            if (resp.StatusCode == HttpStatusCode.NotFound)
            {
                RetryCrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);
            }
        }
        else
        {
            // No HTTP response at all (timeout, connection failure, ...): retry.
            RetryCrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);
        }
    }
    catch (IOException exIO)
    {
        NSLog.Logger.Info("crawl error io exception" + url + " ", exIO.Message);
        RetryCrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);
    }
    catch (Exception ex)
    {
        NSLog.Logger.Error("crawl error :", ex);
    }
}

/// <summary>
/// Waits 500 ms and then re-runs <c>CrawlerFb</c> for the same URL, as long as the shared
/// counter has not exceeded 5. The wait happens even when no retry follows.
/// </summary>
private static void RetryCrawlerFb(string url, string cookie, ref CMS_CrawlerModels pins, ref int countExp, ref string _pageId)
{
    Thread.Sleep(500);
    if (countExp <= 5)
    {
        countExp = countExp + 1;
        CrawlerFb(url, cookie, ref pins, ref countExp, ref _pageId);
    }
}
/// <summary>
/// Requests one page of the home feed through <c>getDataPinterestHome</c> and recurses with the
/// returned bookmark until no bookmark comes back or <paramref name="page"/> exceeds
/// <paramref name="limit"/>.
/// </summary>
/// <param name="model">Model handed to <c>getDataPinterestHome</c> to collect pins.</param>
/// <param name="limit">Highest page number that is still requested.</param>
/// <param name="bookmarks_str">Bookmark of the page to request; null or empty requests the first page.</param>
/// <param name="page">Number of the page being requested.</param>
/// <returns>Always false.</returns>
public static bool Get_Tagged_HomePins(ref CMS_CrawlerModels model, int limit = 1, string bookmarks_str = null, int page = 1)
{
    if (page > limit)
    {
        return false;
    }

    // The two request shapes differ only in the leading "bookmarks" option.
    object requestPayload;
    if (string.IsNullOrEmpty(bookmarks_str))
    {
        requestPayload = new
        {
            options = new
            {
                field_set_key = "hf_grid",
                in_nux = false,
                is_react = true,
                prependPartner = false,
                prependUserNews = false,
                repeatRequestBookmark = "",
                static_feed = false
            },
            context = new { },
        };
    }
    else
    {
        requestPayload = new
        {
            options = new
            {
                bookmarks = new string[] { bookmarks_str },
                field_set_key = "hf_grid",
                in_nux = false,
                is_react = true,
                prependPartner = false,
                prependUserNews = false,
                repeatRequestBookmark = "",
                static_feed = false
            },
            context = new { },
        };
    }

    var endpoint = Commons.HostApiHomePin;

    // Serialize and strip line breaks and tabs from the JSON.
    string json = JsonConvert.SerializeObject(requestPayload);
    string[] stripPatterns = new string[] { "\n", "\r", "\t" };
    string[] stripReplacements = new string[] { "", "", "" };
    string data = Preg_replace(json, stripPatterns, stripReplacements);

    var timestamp = GetTimestamp(DateTime.Now);
    var url = endpoint + "&data=" + data + "&_=" + timestamp;

    var nextBookmark = "";
    getDataPinterestHome(url, model, "", ref nextBookmark);
    if (!string.IsNullOrEmpty(nextBookmark))
    {
        Get_Tagged_HomePins(ref model, limit, nextBookmark, page + 1);
    }

    return false;
}
/// <summary>
/// Requests <paramref name="url"/>, reads the JSON answer and adds to <paramref name="model"/> every
/// item of "resource_response.data" that has all of "domain", "id", "link", "created_at" and
/// "images". The first entry of "resource.options.bookmarks" is returned through
/// <paramref name="bookmarks"/>.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="model">Model whose <c>Pins</c> list receives the pins; also the return value.</param>
/// <param name="pinId">Not used by this method.</param>
/// <param name="bookmarks">Receives the bookmark for the next page when the answer contains one.</param>
/// <returns>The same <paramref name="model"/> instance. Exceptions are traced, not rethrown.</returns>
public static CMS_CrawlerModels getDataPinterestHome(string url, CMS_CrawlerModels model, string pinId, ref string bookmarks)
{
    try
    {
        Uri uri = new Uri(url);
        var httpWebRequest = (HttpWebRequest)WebRequest.Create(uri);
        httpWebRequest.Headers["X-Requested-With"] = "XMLHttpRequest";
        // NOTE(review): a session cookie embedded in source is a secret; the spaces around '=', '+'
        // and '/' inside it also look like formatter damage - confirm against a real cookie.
        httpWebRequest.Headers["Cookie"] = "_auth=1; csrftoken=dMWi2a6L1DTFUHmyqem0oGrDmteiaETw; _pinterest_sess=\"TWc9PSZsSlA1dUF4QWlRWGRYVGR6Qm9mN3pwczUyUDk4ZDYvckduSjl4N3ZSRHlsU1VmWkhBTUsrMU9KNkxjS3pyUk1zREdDL2Rmb2VuT1dwRDhSTmxTOE1Ja0FjOUtreTJVc0o0SmthQ2xhN3lRa3BQVnRMcUF5dlN0Z255Syt4am56VnQvYVQwT0JyejBCSlk4YzFyQ0pEekZwNSs0YjZnMTBseEIvRkU0Um1XeWthZ1cvNGxpdDVyTEdrSHRzWFVLN244T25TaGVoYy93TGVSRjVxNzl5dnlZV1A5L3NlNnc5MWE4djl0ZjNoeEhqTTNuaGduRnZ2VkF1RTd6V1V3VnBCT3cyMksxMHJIdVE0TVVjc3FmWVozVllzekhpNFRGNDFBTERIVzdkcUNUS3NlWEJFdE1mSXJBbnNPVStHQXJiUWJRSENyVVVKTVJYNit5MkZTMFVNN3ptY09FNmFoaHk3Nk9MdUtuRmdDSWRWRVhPTWYrSXA4dFhlRU1hYW5paFNQMU5OcFNwY2xSZlJHZVlWWU03eHFsNWVmSWRHL0ZtN3NhdU9ubzhpUjZqMzNTTUxwMTlOQWRGa29zVUc1UXFqZ1BUYzhHL3M0YndDY2ZBN2ZMZnJQZTlGbXdPWjg5SXJVOEpUMEtPVnMzcjZPcytOVHRFUnlRUnoyNmJZdjl0YXJlOVp1WGQvM29SSi9xWUwvYmFPcDl5VFl1aEw2ZFBtMHlhZ0g4MXlIMXp1dnFXWWY1VytmY0ZPc0FSMzhqYXdhNTBqQjlYRHJ6OE9CY1ViMmljZkFhQkVydGxyVUtlNis4cnh3R3NPbXVTVjZCZUNTR1NKQ3JpWFJsajBsSEFGcytOMnptN2R2S1BXN1NocTFtZVlKMzF0Y1hyQXNseG9DdzdrQklxNnZXMkk2dXQ4azJJOTR4YWlIUDMvVzAwcmQ0SDVqNnhYc3NlTTNpK0ZHUU9xaUpCOER0N1pQaWFFTUhLRGxpdk1EVDlOYi9DdmRLcTQvdUROekpjRXNJSjVtcEl1bWVLUHhRdTVQQk91L1RWS0w0YkkzZDNwaW5mRnJFakRsck9aNTRBUXVsVFdFWVlTRHJ5OUxBWHdMa0V4Jk1FSHZIUWlQUlE2Q05OZWJydEZrV25SQ2tmND0 = \"; G_ENABLED_IDPS=google; _b=\"ATWTNNfXaINNj5j6VvA6 + rquchpAz7VF + IS8VabE7fJo7ragqOV82ASwCOgxcnxHC5k = \"; pnodepath=\" / 4\"; _ga=GA1.2.1908176321.1528170001; fba=True; cm_sub=none; sessionFunnelEventLogged=1; bei=false";
        httpWebRequest.Timeout = 100000;

        // Both the response and the reader are disposed, so the connection is released.
        string answer;
        using (var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse())
        using (var streamReader = new StreamReader(httpResponse.GetResponseStream()))
        {
            answer = streamReader.ReadToEnd();
        }

        JavaScriptSerializer jsonSerializer = new JavaScriptSerializer();
        dynamic dobj = jsonSerializer.Deserialize<dynamic>(answer);
        if (dobj == null)
        {
            return model;
        }

        var resourceResponse = dobj["resource_response"];
        if (resourceResponse == null)
        {
            return model;
        }

        var data = resourceResponse["data"];
        if (data != null)
        {
            var createdAtCulture = new CultureInfo("en-US", true);
            foreach (var item in data)
            {
                var itemPin = item as Dictionary<string, dynamic>;
                if (itemPin == null)
                {
                    // Not a JSON object: skip it instead of failing the whole page.
                    continue;
                }

                var pin = new PinsModels();
                // A pin is only kept when every expected field is present. All fields are
                // still read even after one is found missing, as in the original.
                bool complete = true;
                dynamic value;

                if (itemPin.TryGetValue("domain", out value)) { pin.Domain = value; } else { complete = false; }
                if (itemPin.TryGetValue("id", out value)) { pin.ID = value; } else { complete = false; }
                if (itemPin.TryGetValue("link", out value)) { pin.Link = value; } else { complete = false; }

                if (itemPin.TryGetValue("created_at", out value))
                {
                    pin.Created_At = DateTime.Parse(value, createdAtCulture);
                }
                else
                {
                    complete = false;
                }

                if (itemPin.TryGetValue("images", out value))
                {
                    var images = value as Dictionary<string, dynamic>;
                    if (images != null)
                    {
                        foreach (var imageEntry in images)
                        {
                            var image = imageEntry.Value;
                            var imageModel = new ImageModels()
                            {
                                url = image["url"],
                                height = Convert.ToInt16(image["height"]),
                                width = Convert.ToInt16(image["width"])
                            };
                            pin.Images.Add(imageModel);
                        }
                    }
                }
                else
                {
                    complete = false;
                }

                if (complete)
                {
                    model.Pins.Add(pin);
                }
            }
        }

        // Bookmark of the next page, echoed back in the request options.
        var dataBookmark = dobj["resource"]["options"];
        if (dataBookmark != null)
        {
            bookmarks = dataBookmark["bookmarks"][0];
        }
    }
    catch (Exception ex)
    {
        // Best-effort fetch: keep the caller running, but leave a trace of what went wrong.
        System.Diagnostics.Trace.TraceError("getDataPinterestHome failed for " + url + ": " + ex);
    }

    return model;
}
/// <summary>
/// Crawls Facebook photo search results for one keyword, keeps the pins created within the last
/// seven days and stores them through <c>CMSPinFactory.CreateOrUpdate</c>. The crawl only runs
/// when the keyword was last updated more than five minutes ago, when its updated and created
/// dates are equal, or when it has no updated date at all.
/// </summary>
/// <param name="Id">ID of the keyword row to crawl.</param>
/// <param name="createdBy">User recorded as the updater of the keyword and passed on to the pin factory.</param>
/// <param name="msg">Receives "Crawl data is unsuccessfully." on an exception; otherwise whatever the pin factory writes.</param>
/// <returns>
/// False when a crawl ran but nothing was stored (no pins, or the factory reported failure) or an
/// exception was thrown; true otherwise (including when no crawl was due).
/// </returns>
public bool CrawlData(string Id, string createdBy, ref string msg)
{
    NSLog.Logger.Info("CrawlData: " + Id);
    var model = new CMS_CrawlerModels();
    var sequence = 0;
    var key = "";
    var _cookie = "";
    // Window of creation dates that are kept: the last 7 days, fixed at method entry.
    DateTime lastdate = DateTime.Now.AddDays(-7);
    DateTime datenow = DateTime.Now;
    var result = true;
    try
    {
        using (var _db = new CMS_Context())
        {
            /* get key by ID */
            var keyWord = _db.CMS_KeyWord.Where(o => o.ID == Id).FirstOrDefault();
            if (keyWord != null)
            {
                sequence = keyWord.Sequence;
                key = keyWord.KeyWord;

                /* check time span crawl: at most one crawl every 5 minutes */
                var timeSpanCrawl = DateTime.Now - keyWord.UpdatedDate;
                // A missing UpdatedDate is treated as "due" instead of throwing on .Value.
                bool crawlIsDue = !timeSpanCrawl.HasValue
                                  || timeSpanCrawl.Value.TotalMinutes > 5
                                  || keyWord.UpdatedDate == keyWord.CreatedDate;
                if (crawlIsDue)
                {
                    /* stamp the crawl start before doing the slow work */
                    keyWord.UpdatedDate = DateTime.Now;
                    keyWord.UpdatedBy = createdBy;
                    _db.SaveChanges();

                    /* call crawler api to crawl data, using a cookie picked from the active accounts */
                    CMSPinFactory _fac = new CMSPinFactory();
                    var listAcc = _db.CMS_Account
                        .Where(o => o.Status == (byte)Commons.EStatus.Active && o.IsActive && !string.IsNullOrEmpty(o.Cookies))
                        .ToList();
                    var listCookie = listAcc.Select(x => x.Cookies).ToList();
                    _cookie = CommonHelper.RamdomCookie(listCookie);

                    /* crawler tab post - the crawl call is currently disabled, so this model stays as constructed */
                    var modelPost = new CMS_CrawlerModels();
                    NSLog.Logger.Info("done crawler tab post : ", modelPost.Pins.Count);
                    if (modelPost.Pins != null && modelPost.Pins.Any())
                    {
                        model.Pins.AddRange(modelPost.Pins);
                    }

                    /* crawler tab people - the crawl call is currently disabled, so this model stays as constructed */
                    var modelPeople = new CMS_CrawlerModels();
                    NSLog.Logger.Info("done crawler tab people : ", modelPeople.Pins.Count);
                    if (modelPeople.Pins != null && modelPeople.Pins.Any())
                    {
                        model.Pins.AddRange(modelPeople.Pins);
                    }

                    /* crawler tab photo */
                    var modelPhoto = new CMS_CrawlerModels();
                    string q = "photos-keyword(" + keyWord.KeyWord.Replace(" ", "+") + ")";
                    string ref_path = "/search/str/" + keyWord.KeyWord.Replace(" ", "+") + "/photos-keyword";
                    CrawlerFBToolHelpers.CrawlerNow(q, ref_path, "grid", (byte)Commons.EType.Photo, _cookie, 70, ref modelPhoto);

                    /* crawler detail tab photo, at most 10 lookups at a time */
                    var options = new ParallelOptions { MaxDegreeOfParallelism = 10 };
                    Parallel.ForEach(modelPhoto.Pins, options, pin =>
                    {
                        CrawlerFBToolHelpers.CrawlerDetail(pin.PhotoID, _cookie, (byte)Commons.EType.Photo, ref pin);
                    });
                    NSLog.Logger.Info("done crawler tab photo : ", modelPhoto.Pins.Count);
                    if (modelPhoto.Pins != null && modelPhoto.Pins.Any())
                    {
                        model.Pins.AddRange(modelPhoto.Pins);
                    }

                    var res = false;
                    if (model.Pins.Count > 0)
                    {
                        NSLog.Logger.Info("done crawler before 7 days ago : ", model.Pins.Count);
                        /* check 7 days ago */
                        model.Pins = model.Pins.Where(o => o.Created_At >= lastdate && o.Created_At <= datenow).ToList();
                        NSLog.Logger.Info("done crawler after 7 days ago : ", model.Pins.Count);

                        // Photo pins already had their detail fetched above; fetch it for the rest.
                        Parallel.ForEach(model.Pins, options, pin =>
                        {
                            if (pin.Type != (byte)Commons.EType.Photo)
                            {
                                CrawlerFBToolHelpers.CrawlerDetail(pin.PhotoID, _cookie, (byte)Commons.EType.Post, ref pin);
                            }
                        });
                        res = _fac.CreateOrUpdate(model.Pins, keyWord.ID, createdBy, keyWord.KeyWord, ref msg);
                    }

                    if (res == false)
                    {
                        result = false;
                    }
                    else
                    {
                        keyWord.UpdatedDate = DateTime.Now;
                        _db.SaveChanges();
                    }
                }
            }
        }

        LogHelper.WriteLogs(sequence.ToString() + " " + key, "Num post: " + model.Pins.Count().ToString());
        NSLog.Logger.Info("ResponseCrawlData", result.ToString());
    }
    catch (Exception ex)
    {
        msg = "Crawl data is unsuccessfully.";
        result = false;
        LogHelper.WriteLogs("ErrorCrawlData: " + Id, JsonConvert.SerializeObject(ex));
        NSLog.Logger.Error("ErrorCrawlData: " + Id, ex);
    }

    return result;
}