/// <summary>
/// Creates an image-search task for a project and records the operation in the operate log.
/// If a non-deleted task with the same project and image source already exists, nothing is
/// inserted and the result carries an "already uploaded" message with IsSuccess left unset.
/// </summary>
/// <param name="data">
/// Task payload. <c>ProjectId</c> and <c>UsrId</c> are passed to the <c>ObjectId</c> string
/// constructor; <c>Src</c> is the image source stored on the task.
/// </param>
/// <returns>A <c>ResultDto</c> whose <c>IsSuccess</c> is set to true only after both inserts ran.</returns>
public ResultDto InsertImgSearchTask(IW2S_ImgSearchTaskDto data)
{
    var response = new ResultDto();
    var taskFilter = Builders<IW2S_ImgSearchTask>.Filter;
    var tasks = MongoDBHelper.Instance.GetIW2S_ImgSearchTasks();
    var projects = MongoDBHelper.Instance.GetIW2S_Projects();

    // Parsed once and reused for the project lookup, the duplicate check and both documents.
    var projectId = new ObjectId(data.ProjectId);

    // The task is attributed to the UsrId stored on the project, not to data.UsrId.
    var ownerId = projects
        .Find(Builders<IW2S_Project>.Filter.Eq(x => x._id, projectId))
        .Project(x => x.UsrId)
        .FirstOrDefault();

    var sameProject = taskFilter.Eq(x => x.ProjectId, projectId);
    var sameSource = taskFilter.Eq(x => x.Src, data.Src);
    var notDeleted = taskFilter.Eq(x => x.IsDel, false);
    var existing = tasks.Find(sameProject & sameSource & notDeleted).FirstOrDefault();

    if (existing == null)
    {
        var newTask = new IW2S_ImgSearchTask
        {
            _id = ObjectId.GenerateNewId(),
            CreatedAt = DateTime.Now.AddHours(8),
            ProjectId = projectId,
            UsrId = ownerId,
            BotStatus = 0,
            Src = data.Src,
            IsDel = false
        };
        tasks.InsertOne(newTask);

        // The log entry is attributed to the caller-supplied user id.
        var entry = new IW2S_OperateLog
        {
            CreatedAt = DateTime.Now.AddHours(8),
            ProjectId = projectId,
            ShareOperateType = (int)ShareOperateType.AddKeyword,
            UserId = new ObjectId(data.UsrId),
            SiteSource = (int)SiteSource.BaiduImg
        };
        MongoDBHelper.Instance.GetIW2S_OperateLogs().InsertOne(entry);

        response.IsSuccess = true;
    }
    else
    {
        response.Message = "已经上传成功了";
    }

    return response;
}
/// <summary>
/// Saves first-level image search links in batches, skipping links whose <c>BizId</c> is
/// repeated within a batch or already present in the links collection.
/// </summary>
/// <param name="links">Links to persist; null or empty is logged as zero saved and ignored.</param>
/// <param name="tsk">Task the links belong to; only its <c>Src</c> is read, for the log line.</param>
public void save_level1_links(List<IW2S_ImgSearchLink> links, IW2S_ImgSearchTask tsk)
{
    // tsk is used for logging only, so a missing task must not abort the save.
    string src = tsk != null ? tsk.Src : null;

    if (links == null || links.Count == 0)
    {
        Console.WriteLine("SUCCESS saving 0 Level 1 Links for " + src);
        return;
    }

    const int pagesize = 100;
    int count = 0;
    var col = MongoDBHelper.Instance.GetIW2S_ImgSearchLinks();
    var builder = Builders<IW2S_ImgSearchLink>.Filter;

    for (int offset = 0; offset < links.Count; offset += pagesize)
    {
        var list = links.Skip(offset).Take(pagesize).ToList();
        list = ListDistinctBy(list, x => x.BizId);
        if (list == null || list.Count == 0)
        {
            continue;
        }

        // Look up which BizIds of this batch are already stored.
        List<ObjectId> bizIds = list.Select(x => x.BizId).ToList();
        var existingIds = new HashSet<ObjectId>(
            col.Find(builder.In(x => x.BizId, bizIds)).Project(x => x.BizId).ToList());

        if (existingIds.Count > 0)
        {
            list = list.Where(x => !existingIds.Contains(x.BizId)).ToList();
        }

        if (list.Count == 0)
        {
            continue;
        }

        // Insert only the filtered batch. Inserting the full input list here would bypass
        // the de-duplication above and write every link again for each page.
        col.InsertMany(list);
        count += list.Count;
    }

    Console.WriteLine("SUCCESS saving " + count + " Level 1 Links for " + src);
}
/// <summary>
/// Runs a Baidu image "more sources" lookup for the task's image and hands every entry of
/// the page's <c>'sameList'</c> block to <c>HanleTagData</c> with a 1-based rank.
/// When no page content is obtained, the task's <c>BotStatus</c> is set back to 0 and the
/// method returns. Any exception is caught and its message written to the console.
/// </summary>
/// <param name="p">Task to process; <c>Src</c> is placed in the request URL and <c>_id</c> identifies the task document.</param>
private void query(IW2S_ImgSearchTask p)
{
    // Example of the older endpoint:
    // http://image.baidu.com/n/pc_search?queryImageUrl=http://a.hiphotos.baidu.com/image/pic/item/f9dcd100baa1cd1162eeea1ab112c8fcc3ce2dab.jpg
    try
    {
        string link = "http://image.baidu.com/n/pc_list?queryImageUrl={0}&pos=moresource#activeTab=1".FormatStr(p.Src);
        Console.WriteLine(link);

        // Check the raw response before normalising it: RemoveSpace is defined elsewhere,
        // and calling it on a null response must not prevent the status reset below.
        var raw = proxy.GetFastHtmlWithProxyIpAndARE(link, "utf-8");
        var html = raw == null ? null : raw.RemoveSpace();
        if (html == null)
        {
            var update = new UpdateDocument { { "$set", new QueryDocument { { "BotStatus", 0 } } } };
            var commendCol = MongoDBHelper.Instance.GetIW2S_ImgSearchTasks();
            commendCol.UpdateOne(new QueryDocument { { "_id", p._id } }, update);
            return;
        }

        // Cut the array that follows 'sameList': out of the page and re-wrap it as "[{...}]".
        var json = "[{" + html.SubAfter("'sameList':").SubBefore("'sameSizeNum':").SubAfter("[{").SubBefore("}]") + "}]";
        var objImgs = JsonToObject(json);

        int rank = 1;
        foreach (var objImg in objImgs)
        {
            objImg.fromPageTitle = RemoveInivalidChar(objImg.fromPageTitle.GetTxtFromHtml2().RemoveSpace().GetLower());
            objImg.textHost = RemoveInivalidChar(objImg.textHost.GetTxtFromHtml2().RemoveSpace().GetLower());
            HanleTagData(p, objImg.fromPageTitle, objImg.fromURL, objImg.textHost, objImg.fromURLHost, objImg.objURL, rank);
            rank++;
        }
    }
    catch (Exception ex)
    {
        // Best-effort crawl step: failures are reported and the task is otherwise left as is.
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Builds one <c>IW2S_ImgSearchLink</c> for a search hit and saves it through
/// <c>save_level1_links</c>. The linked page is fetched first (following one script-driven
/// redirect page if present); when no page content is obtained nothing is saved.
/// </summary>
/// <param name="tsk">Task the hit belongs to; supplies user, project and task ids.</param>
/// <param name="title">Stored as the link's <c>Title</c>.</param>
/// <param name="href">URL of the page hosting the image; stored as <c>LinkUrl</c> and used in the <c>BizId</c>.</param>
/// <param name="abs">Stored as the link's <c>Description</c>.</param>
/// <param name="domain">Host of the page; when empty it is derived from the fetched page's URL.</param>
/// <param name="src">Stored as the link's <c>Src</c>.</param>
/// <param name="rank">Stored as the link's <c>Rank</c>.</param>
private void HanleTagData(IW2S_ImgSearchTask tsk, string title, string href, string abs, string domain, string src, int rank)
{
    string realUrl = null, detailHtml = null;

    if (!string.IsNullOrWhiteSpace(href))
    {
        ApplyFetchedPage(href, ref realUrl, ref detailHtml, ref domain);
    }

    // Pages containing this script snippet redirect via their first link, so fetch that target too.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            ApplyFetchedPage(gourl, ref realUrl, ref detailHtml, ref domain);
        }
    }

    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    // First date-like token in the page (2000-2099, "-" or "/" separated, or 年/月/日 form).
    // The static Regex.Match reuses the framework's pattern cache instead of rebuilding the regex per call.
    Match m = Regex.Match(detailHtml, "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
    string time = m.Success ? m.Value : "";

    IW2S_ImgSearchLink l1 = new IW2S_ImgSearchLink
    {
        UsrId = tsk.UsrId,
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Src = src,
        LinkUrl = href,
        BizId = "{0}{1}".FormatStr(href, tsk._id.ToString()).ToObjectId(),
        IW2S_ImgSearchTaskId = tsk._id,
        CreatedAt = DateTime.UtcNow.AddHours(8),
        Description = abs,
        Title = title,
        ProjectId = tsk.ProjectId,
        PublishTime = time,
        Rank = rank
    };
    save_level1_links(new List<IW2S_ImgSearchLink> { l1 }, tsk);
}

/// <summary>
/// Fetches <paramref name="url"/> via <c>get_htmlUrl</c> and merges the outcome into the
/// running state: non-empty Item1 replaces <paramref name="realUrl"/>, non-empty Item2
/// replaces <paramref name="detailHtml"/>, and an empty <paramref name="domain"/> is
/// filled from the current <paramref name="realUrl"/>.
/// </summary>
private void ApplyFetchedPage(string url, ref string realUrl, ref string detailHtml, ref string domain)
{
    var fetched = get_htmlUrl(url);
    if (fetched != null)
    {
        if (!string.IsNullOrEmpty(fetched.Item1))
        {
            realUrl = fetched.Item1;
        }
        if (!string.IsNullOrEmpty(fetched.Item2))
        {
            detailHtml = fetched.Item2;
        }
    }

    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
    {
        domain = GetDomain(realUrl);
    }
}