/// <summary>
/// Queues a download of the anonymity-checker page for the given proxy;
/// <c>EndCheckAnonimous</c> is registered as the completion callback.
/// </summary>
/// <param name="proxy">Proxy handed to the download object.</param>
void BeginCheckAnonymous(RatedProxy proxy)
{
    Downloader.Queue(
        new DownloaderObj(
            new Uri("http://checker.samair.ru/"),
            EndCheckAnonimous,
            true,
            proxy,
            CookieOptions.NoCookies,
            3));
}
/// <summary>
/// Stores the outcome of the Yandex page check on the proxy: <c>YaRate</c> becomes 1 when
/// <c>PageIsOk</c> accepts the downloaded text against the Yandex page pattern and 0 otherwise;
/// <c>YaChecked</c> is incremented in both cases.
/// </summary>
/// <param name="obj">Finished download whose <c>DataStr</c> is inspected.</param>
void CheckYaPage(DownloaderObj obj)
{
    if (PageIsOk(obj.DataStr, GlobalResourceCache.YaPagePattern))
    {
        obj.Proxy.YaRate = 1;
    }
    else
    {
        obj.Proxy.YaRate = 0;
    }

    obj.Proxy.YaChecked++;
}
/// <summary>
/// Callback of the anonymity check: classifies the proxy from the downloaded checker page.
/// The "anonymous" regex is tried first, then the "high anonymous" one; if neither matches
/// the proxy is marked as not anonymous. When no page text is available the failure is
/// logged (only if <c>_logging</c> is set) and the proxy is left untouched.
/// </summary>
/// <param name="obj">Finished download carrying the page text and the proxy.</param>
void EndCheckAnonimous(DownloaderObj obj)
{
    AnonymousRegexes checks = GlobalResourceCache.AnonymCheck;

    if (obj.DataStr == null)
    {
        if (_logging)
        {
            GlobalLog.Err("Cant download page from anon checking site, p:" + obj.Proxy.Address.Host);
        }
        return;
    }

    AnonymousLevel level;
    if (checks.Anonymous.IsMatch(obj.DataStr))
    {
        level = AnonymousLevel.Anonymous;
    }
    else if (checks.HightAnonymous.IsMatch(obj.DataStr))
    {
        level = AnonymousLevel.HightAnonymous;
    }
    else
    {
        level = AnonymousLevel.NotAnonymous;
    }

    obj.Proxy.AnonymousLevel = level;
}
/// <summary>
/// Issues the request described by <paramref name="obj"/> until a response with cookies is
/// obtained or <c>obj.Attempts</c> is exhausted.
/// </summary>
/// <param name="obj">Download descriptor; its <c>Attempts</c>, <c>Request</c> and
/// <c>Response</c> members are updated on every try.</param>
/// <returns>The cookies of the first usable response, or <c>null</c> when every attempt failed.</returns>
/// <remarks>
/// Only <see cref="WebException"/> is handled (logged and passed to <c>HandleWebState</c>).
/// Any other exception propagates to the caller with its original stack trace; the former
/// <c>catch (Exception e) { throw e; }</c> reset that trace and has been removed.
/// </remarks>
public CookieCollection GetCookies(DownloaderObj obj)
{
    while (obj.Attempts > 0)
    {
        obj.Attempts--;
        try
        {
            obj.Request = CreateRequest(obj);
            obj.Response = obj.Request.GetResponse() as HttpWebResponse;

            // The "as" cast yields null for a non-HTTP response; treat that as "no cookies"
            // and retry instead of failing with a NullReferenceException.
            if (obj.Request.HaveResponse && obj.Response != null && obj.Response.Cookies != null)
            {
                return obj.Response.Cookies;
            }
        }
        catch (WebException e)
        {
            GlobalLog.Err(e, "Host: " + obj.Uri.Host);
            HandleWebState(e, obj);
        }
        finally
        {
            // Runs on success, on failure and on the early return above.
            if (obj.Request != null)
            {
                obj.Request.Abort();
            }
            if (obj.Response != null)
            {
                obj.Response.Close();
            }
        }
    }

    return null;
}
/// <summary>
/// Callback for one site download of the multi-download check. <c>obj.Arg</c> is read as an
/// <c>object[]</c> holding the site's patterns, the shared wait object and the number of
/// sites being checked. A page that passes validation adds
/// <c>obj.Attempts / (DownloadsAttempts - 1)</c> to the shared rate; the callback that
/// brings the shared counter up to the site count stores the averaged rate on the proxy
/// and calls <c>EndTest</c>.
/// </summary>
/// <param name="obj">Finished download.</param>
void EndDownload(DownloaderObj obj)
{
    object[] state = obj.Arg as object[];
    PatternsContainer patterns = state[0] as PatternsContainer;
    SyncWaitObj waiter = state[1] as SyncWaitObj;
    int expectedChecks = (int)state[2];

    if (obj.DataStr != null && patterns.Validation.Validate(obj.DataStr))
    {
        waiter.MultidownloadRate += (double)obj.Attempts / (DownloadsAttempts - 1);
    }

    if (Interlocked.Increment(ref waiter.Count) != expectedChecks)
    {
        return;
    }

    obj.Proxy.MultidownloadRate = waiter.MultidownloadRate / expectedChecks;
    EndTest(obj.Proxy);
    ////DEBUG
    //Console.WriteLine("Downloads END {0}, rate ", DateTime.Now.ToShortTimeString(), obj.Proxy.MultidownloadRate);
}
/// <summary>
/// Times <c>Downloader.HaveResponce</c> for <paramref name="attempts"/> single-attempt
/// requests and averages the elapsed milliseconds of the ones that reported a response.
/// </summary>
/// <param name="uri">Address to request.</param>
/// <param name="proxy">Proxy handed to each download object.</param>
/// <param name="attempts">Number of timed requests to make.</param>
/// <returns>Integer average in milliseconds, or <c>null</c> when no request reported a response.</returns>
int? GetAvgLatency(Uri uri, RatedProxy proxy, int attempts)
{
    Stopwatch watch = new Stopwatch();
    int latencySum = 0;
    int hits = 0;

    for (int attempt = 0; attempt < attempts; attempt++)
    {
        DownloaderObj probe = new DownloaderObj(uri, null, false, proxy, CookieOptions.Empty, 1);

        watch.Restart();
        bool responded = Downloader.HaveResponce(probe);
        watch.Stop();

        if (!responded)
        {
            continue;
        }

        latencySum += (int)watch.ElapsedMilliseconds;
        hits++;
    }

    return hits == 0 ? (int?)null : latencySum / hits;
}
/// <summary>
/// Builds a request from <paramref name="obj"/> via <c>CreateRequest</c> and returns its
/// response as an <see cref="HttpWebResponse"/> (<c>null</c> if the response is of another type).
/// </summary>
/// <param name="obj">Download descriptor the request is created from.</param>
private HttpWebResponse GetResponse(DownloaderObj obj)
{
    //ADD POST DATA HANDLING ??
    WebResponse raw = CreateRequest(obj).GetResponse();
    return raw as HttpWebResponse;
}
/// <summary>
/// Builds a multipart form from the image and parameters and queues it as a POST to
/// <c>_postUri</c>; <c>EndPostImage</c> is registered as the callback.
/// </summary>
/// <param name="image">Image data; must be non-null and non-empty.</param>
/// <param name="type">Image extension type.</param>
/// <param name="param">Optional recognize params appended to the form fields.</param>
/// <exception cref="ArgumentException">The image is null or empty.</exception>
public void PostImage(byte[] image, ImgType type, RecognizeParams param = null)
{
    bool hasImage = image != null && image.Length != 0;
    if (!hasImage)
    {
        throw new ArgumentException("Bad image data");
    }

    List<KeyValuePair<string, string>> fields = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>("method", "post"),
        new KeyValuePair<string, string>("key", ACCOUNT_ID)
    };

    if (param != null)
    {
        foreach (var extra in param.Params)
        {
            fields.Add(extra);
        }
    }

    // The form body is built before _boundary is read for the header, as in the original order.
    byte[] formBody = BuldFormbasedData(fields, StreamEncoding.GetString(image), type);
    RequestParams requestSettings = new RequestParams(
        null, null, "multipart/form-data; boundary=" + _boundary, null, true, "POST", StreamEncoding, false);

    DownloaderObj upload = new DownloaderObj(
        _postUri, EndPostImage, true, null, CookieOptions.Empty, 4, null, null, false, 1000, null, requestSettings);
    upload.PostData = formBody;
    Downloader.Queue(upload);
}
/// <summary>
/// Expands a starting key into a list of keys by downloading the Google search page for each
/// collected key in turn and harvesting the <c>q=</c> values of the links matched by the
/// pattern. Stops once the list holds at least <paramref name="count"/> keys or every
/// collected key has been processed. <c>OnKeyPageParsed</c> is raised after each page.
/// </summary>
/// <param name="count">Number of keys to aim for.</param>
/// <param name="startKey">First key; always part of the result.</param>
/// <returns>The distinct keys collected, or <c>null</c> as soon as one page download yields no text.</returns>
public static List<string> ParseGoogleKeys(int count, string startKey)
{
    List<string> keys = new List<string> { startKey };
    Regex linkRx = new Regex("<p[^>]*><a href=\".*?q=([^&]*)&[^>]*?>.*?</a></p>", RegexOptions.Compiled);

    for (int i = 0; count > keys.Count && i < keys.Count; i++)
    {
        string query = Uri.EscapeUriString(keys[i].Replace(' ', '+'));
        Uri searchUri = new Uri("http://www.google.ru/search?q=" + query + "&sourceid=opera&num=0&ie=utf-8&oe=utf-8&start=0");

        DownloaderObj page = new DownloaderObj(searchUri, null, true, null, CookieOptions.NoCookies, 3);
        Downloader.DownloadSync(page);
        if (page.DataStr == null)
        {
            return null;
        }

        foreach (Match hit in linkRx.Matches(page.DataStr))
        {
            keys.Add(hit.Groups[1].Value);
        }
        keys = keys.Distinct().ToList<string>();

        if (OnKeyPageParsed != null)
        {
            OnKeyPageParsed(null, new KeyEventArgs(keys.Count, i, count));
        }
    }

    return keys;
}
/// <summary>
/// Stores the outcome of the Google page check on the proxy: <c>GoogleRate</c> becomes 1 when
/// <c>PageIsOk</c> accepts the downloaded text against the Google page pattern and 0 otherwise;
/// <c>GoogleChecked</c> is incremented in both cases.
/// </summary>
/// <param name="obj">Finished download whose <c>DataStr</c> is inspected.</param>
void CheckGooglePage(DownloaderObj obj)
{
    if (PageIsOk(obj.DataStr, GlobalResourceCache.GooglePagePattern))
    {
        obj.Proxy.GoogleRate = 1;
    }
    else
    {
        obj.Proxy.GoogleRate = 0;
    }

    obj.Proxy.GoogleChecked++;
}
/// <summary>
/// Starts one attempt of the download: consumes an attempt, creates the request, writes
/// <c>PostData</c> when present and begins the response wait with <c>EndReceive</c> as the
/// completion callback. Does nothing when no attempts are left. Any exception thrown while
/// starting is logged and control goes to <c>RetryOrCallback</c>; a
/// <see cref="WebException"/> is additionally passed to <c>HandleWebState</c> first.
/// </summary>
/// <param name="obj">Download descriptor.</param>
internal void BeginReceive(DownloaderObj obj)
{
    if (obj.Attempts <= 0)
    {
        return;
    }

    obj.Attempts--;
    try
    {
        obj.Request = CreateRequest(obj);
        if (obj.PostData != null)
        {
            PostRequestData(obj.Request, obj.PostData);
        }
        //Console.WriteLine("Send request to -- {0}", obj.Uri.OriginalString);
        obj.Request.BeginGetResponse(EndReceive, obj);
    }
    catch (WebException webError)
    {
        GlobalLog.Err(webError, "Host: " + obj.Uri.Host);
        HandleWebState(webError, obj);
        RetryOrCallback(obj);
    }
    catch (Exception error)
    {
        GlobalLog.Err(error, "Host: " + obj.Uri.Host);
        RetryOrCallback(obj);
    }
}
//public void QueueGooglePages(int numPage, string key, Regex rx, WaitObj waiter)
//{
//    Uri uri = new Uri("http://www.google.ru/search?q=" + key + "&sourceid=opera&num=0&ie=utf-8&oe=utf-8&start=" + numPage);
//    DownloaderObj obj = new DownloaderObj(uri, EndGetPage, true, null, false, 10, rx);
//    Downloader.Queue(obj);
//}

/// <summary>
/// Page-download callback. With page text available, collects group 1 of every match of the
/// regex found in <c>obj.Arg[0]</c>, raises <c>OnParsed</c> with the list, then decrements
/// the wait counter from <c>obj.Arg[1]</c> and raises <c>OnCompleted</c> when it reaches zero.
/// Without page text it only resets <c>Attempts</c> to 10 and switches the callback to
/// <c>EndGetPageWithProxy</c>.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is read as an <c>object[]</c>.</param>
void EndGetPage(DownloaderObj obj)
{
    object[] args = obj.Arg as object[];

    if (obj.DataStr == null)
    {
        //ProxyRotator proxyGiver = args[2] as ProxyRotator;
        //proxyGiver.TryGetProxy(ref obj.PrxContainer);
        //obj.Proxies = proxyGiver;
        obj.Attempts = 10;
        obj.CallBack = EndGetPageWithProxy;
        return;
    }

    List<string> foundUrls = new List<string>();
    Regex urlRx = args[0] as Regex;
    WaitObj waiter = args[1] as WaitObj;

    foreach (Match hit in urlRx.Matches(obj.DataStr))
    {
        foundUrls.Add(hit.Groups[1].Value);
    }

    if (OnParsed != null)
    {
        OnParsed(foundUrls);
    }

    if (Interlocked.Decrement(ref waiter.Count) == 0 && OnCompleted != null)
    {
        OnCompleted(this, EventArgs.Empty);
    }
}
/// <summary>
/// Decides who is to blame for a failed download and updates the target and proxy pools.
/// Bad address / unavailable document: the target is recorded as faulty and the proxy released.
/// Proxy error: the proxy is fired and the target is queued again.
/// Anything else: the target loses one life (and is recorded as faulty when none remain)
/// and the proxy is released.
/// </summary>
/// <param name="obj">Failed download; <c>Arg</c> is read as the <c>DevourTarget</c>.</param>
private void HandleBadDownload(DownloaderObj obj)
{
    DevourTarget target = obj.Arg as DevourTarget;

    bool documentAtFault = obj.State == HttpDownloadResult.BadAddress
                           || obj.State == HttpDownloadResult.DocumentUnavailable;

    if (documentAtFault)
    {
        AddFaultTarget(target);                                        //Document extremely guilty
        _proxies.Release(obj.PrxContainer as ProxyContainer, false);   //Proxy simply had a bad day
    }
    else if (obj.State == HttpDownloadResult.ProxyError)
    {
        _proxies.Fire(obj.PrxContainer as ProxyContainer);             //Proxy extremely guilty
        _targets.Enqueue(target);                                      //Document not in the business
    }
    else
    {
        if (--target.Lifes <= 0)
        {
            AddFaultTarget(target);                                    //Sth going wrong
        }
        _proxies.Release(obj.PrxContainer as ProxyContainer, false);   //Proxy simply had a bad day
    }
}
/// <summary>
/// Download callback: a download counts as successful when it produced text and, if a
/// validator is configured, the validator accepts that text. Dispatches to the success or
/// failure handler accordingly and then calls <c>MoveObjComplete</c>.
/// </summary>
/// <param name="obj">Finished download.</param>
private void DevourCallback(DownloaderObj obj)
{
    bool accepted = obj.DataStr != null
                    && (_validator == null || _validator.Validate(obj.DataStr));

    if (accepted)
    {
        HandleSuccessDownload(obj);
    }
    else
    {
        HandleBadDownload(obj);
    }

    MoveObjComplete();
}
/// <summary>
/// Downloads the page at <paramref name="uri"/> synchronously and runs <c>ParseProxy</c> on its text.
/// </summary>
/// <param name="uri">Page address.</param>
/// <returns>The result of <c>ParseProxy</c>, or <c>null</c> when the download produced no text.</returns>
public static List<RatedProxy> ParseProxyFromPage(Uri uri)
{
    DownloaderObj page = new DownloaderObj(uri, null, true);
    Downloader.DownloadSync(page);

    if (page.DataStr != null)
    {
        return ParseProxy(page.DataStr);
    }

    return null;
}
/// <summary>
/// Extracts proxies from a page where the IP is plain text and the port is shown as an image.
/// Each port image is downloaded and its MD5 hash is looked up in the table returned by
/// <c>LoadPortHashes</c>; a known hash gives the port, an unknown one is collected and
/// passed to <c>AddUnknownPortImage</c> at the end.
/// </summary>
/// <param name="data">Page text.</param>
/// <returns>Proxies with a valid IP and port, or <c>null</c> when <paramref name="data"/> is null.</returns>
public List<RatedProxy> ParsePage(string data)
{
    if (data == null)
    {
        return null;
    }

    List<RatedProxy> found = new List<RatedProxy>();
    Dictionary<string, string> unknownImages = new Dictionary<string, string>();
    Regex rowRx = new Regex(@"<td>(?<ip>[^<]*)</td><td><img src=""(?<image>/images/proxylist_port_\d*.gif)""></td>");
    MatchCollection rows = rowRx.Matches(data);
    Hashtable knownPorts = LoadPortHashes();

    foreach (Match row in rows)
    {
        string imageUrl = "http://hideme.ru" + row.Groups["image"].Value;
        DownloaderObj imageObj = new DownloaderObj(new Uri(imageUrl), null, false, null);
        Downloader.DownloadSync(imageObj);
        if (imageObj.Data == null)
        {
            continue;
        }

        string hash = GetMd5HashString(imageObj.Data);
        if (hash == null)
        {
            continue;
        }

        if (!knownPorts.Contains(hash))
        {
            // Remember the first image seen for every hash that has no port assigned yet.
            if (!unknownImages.ContainsKey(hash))
            {
                unknownImages.Add(hash, imageUrl);
            }
            continue;
        }

        string port = knownPorts[hash] as string;
        string ip = row.Groups["ip"].Value;
        if (ip.IsValidIP() && port.IsValidPort())
        {
            found.Add(new RatedProxy(ip + ":" + port));
        }
    }

    AddUnknownPortImage(unknownImages);
    return found;
}
/// <summary>
/// Queues the download of one target with the given proxy container, using this instance's
/// cookie, timing and request settings and <c>DevourCallback</c> as the callback, then calls
/// <c>MoveObjQueued</c>. DEBUG builds also count and log the queued objects.
/// </summary>
/// <param name="proxyCont">Proxy container passed to the download object.</param>
/// <param name="target">Target supplying the URI and attempt count; also passed as the download's argument.</param>
private void DevourOne(ProxyContainer proxyCont, DevourTarget target)
{
    Downloader.Queue(
        new DownloaderObj(
            target.Uri,
            DevourCallback,
            true,
            proxyCont,
            this.CookieOptions,
            target.Attempts,
            target,
            null,
            false,
            1000,
            this.TimingParams,
            this.RequestParams));
#if DEBUG
    Interlocked.Increment(ref __queuedObjects);
    GlobalLog.Write("__queuedObjects: {0}", __queuedObjects);
#endif
    MoveObjQueued();
}
/// <summary>
/// Maps a <see cref="WebException"/> to a <c>ResponseState</c>: from the HTTP status code
/// when the exception carries a response, otherwise from the exception's status.
/// </summary>
/// <param name="e">Exception to classify.</param>
/// <param name="obj">Not used by this method.</param>
public ResponseState GetWebState(WebException e, DownloaderObj obj)
{
    if (e.Response != null)
    {
        return HandleHttpCode((e.Response as HttpWebResponse).StatusCode);
    }

    return HandleWebExcStatus(e.Status);
}
/// <summary>
/// Fetches cookies for the download through a new <c>MagicClient</c>, unless the download's
/// cookie options carry the <c>NoCookies</c> flag.
/// </summary>
/// <param name="obj">Download descriptor.</param>
/// <returns>The client's cookies, or <c>null</c> when <c>NoCookies</c> is set.</returns>
public static CookieCollection GetCookies(DownloaderObj obj)
{
    if (obj.CookieOptions.HasFlag(CookieOptions.NoCookies))
    {
        return null;
    }

    return new MagicClient().GetCookies(obj);
}
/// <summary>
/// Downloads the page at <paramref name="address"/> synchronously and returns capture
/// group 1 of the first match of <paramref name="pattern"/> in its text.
/// </summary>
/// <param name="address">Page address, converted with <c>UriHandler.CreateUri</c>.</param>
/// <param name="pattern">Regular expression with at least one capture group.</param>
/// <returns>
/// The captured text (an empty string when the pattern does not match), or <c>null</c> when
/// the download produced no text.
/// </returns>
private static string Parse(string address, string pattern)
{
    Uri uri = UriHandler.CreateUri(address);

    // Flags are combined with OR. The previous "UseShared & SaveShared" intersected the two
    // values and so requested neither option (assuming they are distinct flag bits, as the
    // HasFlag usage elsewhere suggests).
    DownloaderObj obj = new DownloaderObj(uri, null, true, null, CookieOptions.UseShared | CookieOptions.SaveShared, 5);
    Downloader.DownloadSync(obj);

    if (obj.DataStr == null)
    {
        return null;
    }

    return new Regex(pattern).Match(obj.DataStr).Groups[1].ToString();
}
/// <summary>
/// Downloads the page at <paramref name="uri"/> synchronously and runs <c>ParseProxy</c> on its text.
/// </summary>
/// <param name="uri">Page address.</param>
/// <returns>The result of <c>ParseProxy</c>, or <c>null</c> when the download produced no text.</returns>
public static List<RatedProxy> ParseProxyFromPage(Uri uri)
{
    DownloaderObj page = new DownloaderObj(uri, null, true);
    Downloader.DownloadSync(page);

    if (page.DataStr != null)
    {
        return ParseProxy(page.DataStr);
    }

    return null;
}
/// <summary>
/// Parses proxies out of the downloaded text, appends them to <c>SerpProxy</c> under its
/// lock and reports the number found together with the source URI through
/// <c>OnUrlsPrsProgrChanged</c>. Does nothing when the parser returns <c>null</c>.
/// </summary>
/// <param name="obj">Finished download.</param>
private void ParseSerpData(DownloaderObj obj)
{
    List<RatedProxy> parsed = ProxyParser.ParseProxy(obj.DataStr);
    if (parsed == null)
    {
        return;
    }

    lock (SerpProxy)
    {
        SerpProxy.AddRange(parsed);
    }

    if (OnUrlsPrsProgrChanged != null)
    {
        OnUrlsPrsProgrChanged(parsed.Count, obj.Uri.OriginalString);
    }
}
/// <summary>
/// Continues a download that has not succeeded yet: with attempts left, sleeps for a random
/// time between 0.5x and 1.5x of <c>AttemptPause</c> and starts another attempt; otherwise
/// hands the object to <c>CallbackAndContinue</c>.
/// </summary>
/// <param name="obj">Download descriptor.</param>
void RetryOrCallback(DownloaderObj obj)
{
    if (obj.Attempts <= 0)
    {
        CallbackAndContinue(obj);
        return;
    }

    int shortestPause = obj.AttemptPause / 2;
    int longestPause = (int)(obj.AttemptPause * 1.5);
    Thread.Sleep(Rnd.Next(shortestPause, longestPause));
    BeginReceive(obj);
}
/// <summary>
/// Download callback: parses the page text with the site provider found in
/// <c>obj.Arg[0]</c> and reports the result, together with the wait object from
/// <c>obj.Arg[1]</c>, to <c>NotifyAboutProgress</c>.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is read as an <c>object[]</c>.</param>
void EndDownloadAndParse(DownloaderObj obj)
{
    object[] state = obj.Arg as object[];
    IProxySiteProvider provider = state[0] as IProxySiteProvider;
    WaitObj waiter = state[1] as WaitObj;

    List<RatedProxy> parsed = provider.ParsePage(obj.DataStr);
    NotifyAboutProgress(waiter, parsed);
}
/// <summary>
/// Download callback: parses the page text with the site provider found in
/// <c>obj.Arg[0]</c> and reports the result, together with the wait object from
/// <c>obj.Arg[1]</c>, to <c>NotifyAboutProgress</c>.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is read as an <c>object[]</c>.</param>
void EndDownloadAndParse(DownloaderObj obj)
{
    object[] state = obj.Arg as object[];
    IProxySiteProvider provider = state[0] as IProxySiteProvider;
    WaitObj waiter = state[1] as WaitObj;

    List<RatedProxy> parsed = provider.ParsePage(obj.DataStr);
    NotifyAboutProgress(waiter, parsed);
}
/// <summary>
/// Category-page callback: extracts product URLs from the page text, appends and saves them
/// under <c>allProductsUrlsSync</c>, then prints the running total and the category index
/// carried in <c>obj.Arg</c>. Does nothing when the download produced no text.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is unboxed as <c>int</c>.</param>
void GetProductCallback(DownloaderObj obj)
{
    if (obj.DataStr == null)
    {
        return;
    }

    List<Uri> found = GetProductsUrls(obj.DataStr);
    lock (allProductsUrlsSync)
    {
        allProductsUrls.AddRange(found);
        SaveProducts(found);
    }

    Console.WriteLine("Found {0} products on categoryIndx {1}", allProductsUrls.Count, (int)obj.Arg);
}
//ASYNC(NOT TESTED)
#region Async RBL
//void BeginRBLCheck(RatedProxy proxy)
//{
//    List<string[]> rblList = GlobalResourceCache.RBLList;
//    int indx = 0;
//    double rblBanRate = 0;
//    int rblChecks = rblList.Count;
//    BeginRBLCheck(indx, rblBanRate, rblChecks, proxy);
//}
//void EndRBLCheck(DownloaderObj obj)
//{
//    object[] args = obj.Arg as object[];
//    int indx = (int)args[0];
//    double rblBanRate = (double)args[1];
//    int rblChecks = (int)args[2];
//    List<string[]> rblList = GlobalResourceCache.RBLList;
//    string[] rblSet = rblList[indx];
//    Regex positiveRx = new Regex(rblSet[1]);
//    Regex negativeRx = new Regex(rblSet[2]);
//    double rate = GetRblBanRate(obj.DataStr, positiveRx, negativeRx);
//    if (rate > -1)
//    {
//        rblBanRate += rate;
//    }
//    else
//        rblChecks--;
//    indx++;
//    if (indx < rblList.Count)
//    {
//        BeginRBLCheck(indx, rblBanRate, rblChecks, obj.PrxContainer);
//    }
//    else if (rblChecks > 0)
//    {
//        obj.PrxContainer.RBLBanRate = rblBanRate / rblChecks;
//    }
//}
//double GetRblBanRate(string data, Regex positiveRx, Regex negativeRx)
//{
//    if (data == null)
//        return -1;
//    double blocked = negativeRx.Matches(data).Count;
//    double ok = positiveRx.Matches(data).Count;
//    bool noResults = blocked == 0 && ok == 0;
//    if (noResults)
//        return -1;
//    if (blocked == 0)
//        return 0;
//    if (ok == 0)
//        return 1;
//    return blocked / ok;
//}
//void BeginRBLCheck(int indx, double rblBanRate, int rblChecks, RatedProxy proxy)
//{
//    List<string[]> rblList = GlobalResourceCache.RBLList;
//    if (indx < rblList.Count)
//    {
//        string[] rblSet = rblList[indx];
//        DownloaderObj obj = new DownloaderObj(new Uri(rblSet[0]), EndRBLCheck, true, proxy, CookieOptions.NoCookies, 3, new object[] { indx, rblBanRate, rblChecks });
//        Downloader.Queue(obj);
//    }
//}
#endregion
#endregion
#region DownloadsCheck
/// <summary>
/// Starts the multi-download check for a proxy: loads the site patterns and queues one
/// download per site, each with <c>EndDownload</c> as callback and an argument array of
/// { pattern, shared wait object, number of sites }.
/// </summary>
/// <param name="proxy">Proxy handed to every download object.</param>
void BeginDownloads(RatedProxy proxy)
{
    //Console.WriteLine("Downloads START {0}", DateTime.Now.ToShortTimeString());
    List<PatternsContainer> patterns = PagePatternGrabber.LoadPatterns(PATH.TagClassPatterns);
    SyncWaitObj waiter = new SyncWaitObj(0);

    foreach (PatternsContainer site in patterns)
    {
        object[] state = new object[] { site, waiter, patterns.Count };
        Downloader.Queue(
            new DownloaderObj(site.Uri, EndDownload, true, proxy, CookieOptions.NoCookies, DownloadsAttempts, state));
    }
}
/// <summary>
/// Queues <paramref name="count"/> page downloads. The address of page <c>i</c> (0-based) is
/// <paramref name="uriStr"/> with every occurrence of <paramref name="replaseSubstr"/>
/// replaced by <c>i</c>. Each download uses <c>EndDownloadAndParse</c> as callback and
/// carries the provider plus a wait object created with <paramref name="count"/>.
/// </summary>
/// <param name="count">Number of pages; must not be 0.</param>
/// <param name="uriStr">Address template; must not be null or empty.</param>
/// <param name="replaseSubstr">Placeholder inside the template; must not be null or empty.</param>
/// <param name="proxySiteProvider">Parser passed along to the callback; must not be null.</param>
/// <exception cref="ArgumentException">Any argument violates the rules above.</exception>
public void BeginDownloadPages(int count, string uriStr, string replaseSubstr, IProxySiteProvider proxySiteProvider)
{
    bool badArguments = count == 0
                        || string.IsNullOrEmpty(uriStr)
                        || string.IsNullOrEmpty(replaseSubstr)
                        || proxySiteProvider == null;
    if (badArguments)
    {
        throw new ArgumentException("Bad argumenst");
    }

    WaitObj waiter = new WaitObj(count);
    for (int page = 0; page < count; page++)
    {
        string pageAddress = uriStr.Replace(replaseSubstr, page.ToString());
        object[] state = new object[] { proxySiteProvider, waiter };
        Downloader.Queue(
            new DownloaderObj(new Uri(pageAddress), EndDownloadAndParse, true, null, CookieOptions.NoCookies, 10, state));
    }
}
/// <summary>
/// Post data callback: forwards the response text to <c>HandlePostState</c>, or reports
/// <c>ErrorState.ConnectProblem</c> when there is no response text (DEBUG builds log the
/// post data first).
/// </summary>
/// <param name="obj">Finished upload.</param>
private void EndPostImage(DownloaderObj obj)
{
    if (obj.DataStr != null)
    {
        HandlePostState(obj.DataStr);
        return;
    }

#if DEBUG
    GlobalLog.Err("null data in EndPostImage, POST_DATA:\n{0}", obj.PostData);
#endif
    Err(ErrorState.ConnectProblem);
}
/// <summary>
/// Downloads the page at <paramref name="address"/> synchronously and returns capture
/// group 1 of the first match of <paramref name="pattern"/> in its text.
/// </summary>
/// <param name="address">Page address, converted with <c>UriHandler.CreateUri</c>.</param>
/// <param name="pattern">Regular expression with at least one capture group.</param>
/// <returns>
/// The captured text (an empty string when the pattern does not match), or <c>null</c> when
/// the download produced no text.
/// </returns>
private static string Parse(string address, string pattern)
{
    Uri uri = UriHandler.CreateUri(address);

    // Flags are combined with OR. The previous "UseShared & SaveShared" intersected the two
    // values and so requested neither option (assuming they are distinct flag bits, as the
    // HasFlag usage elsewhere suggests).
    DownloaderObj obj = new DownloaderObj(uri, null, true, null, CookieOptions.UseShared | CookieOptions.SaveShared, 5);
    Downloader.DownloadSync(obj);

    if (obj.DataStr == null)
    {
        return null;
    }

    return new Regex(pattern).Match(obj.DataStr).Groups[1].ToString();
}
/// <summary>
/// Finishes a download: runs the object's callback when one is set, logging (not
/// propagating) any exception it throws, and then asks the downloader to process the next
/// item for this URI.
/// </summary>
/// <param name="obj">Download descriptor.</param>
void CallbackAndContinue(DownloaderObj obj)
{
    if (obj.CallBack != null)
    {
        try
        {
            obj.CallBack(obj);
        }
        catch (Exception callbackError)
        {
            GlobalLog.Err(callbackError, "CallBack err");
        }
    }

    Downloader.ProcessNext(obj.Uri);
}
/// <summary>
/// Refreshes <c>obj.State</c> from the current request/response and reacts to it:
/// bad address, unavailable document and proxy error cancel all remaining attempts;
/// "service unavailable" with attempts left sleeps for a random time between 0.5x and
/// 1.5x of <c>AttemptPause</c>.
/// </summary>
/// <param name="e">Exception that triggered the call; not read by this method.</param>
/// <param name="obj">Download descriptor.</param>
void HandleWebState(WebException e, DownloaderObj obj)
{
    obj.State = new DownloadStateProvider().GetWebState(obj.Request, obj.Response);

    bool retryIsPointless = obj.State == DownloadState.BadAddress
                            || obj.State == DownloadState.DocumentUnavailable
                            || obj.State == DownloadState.ProxyError;

    if (retryIsPointless)
    {
        obj.Attempts = 0;
    }
    else if (obj.State == DownloadState.ServiceUnavailable && obj.Attempts > 0)
    {
        int shortestPause = obj.AttemptPause / 2;
        int longestPause = (int)(obj.AttemptPause * 1.5);
        Thread.Sleep(Rnd.Next(shortestPause, longestPause));
    }
}
/// <summary>
/// Parses proxies out of the downloaded text, appends them to <c>SerpProxy</c> under its
/// lock and reports the number found together with the source URI through
/// <c>OnUrlsPrsProgrChanged</c>. Does nothing when the parser returns <c>null</c>.
/// </summary>
/// <param name="obj">Finished download.</param>
private void ParseSerpData(DownloaderObj obj)
{
    List<RatedProxy> parsed = ProxyParser.ParseProxy(obj.DataStr);
    if (parsed == null)
    {
        return;
    }

    lock (SerpProxy)
    {
        SerpProxy.AddRange(parsed);
    }

    if (OnUrlsPrsProgrChanged != null)
    {
        OnUrlsPrsProgrChanged(parsed.Count, obj.Uri.OriginalString);
    }
}
/// <summary>
/// Queues <paramref name="count"/> page downloads. The address of page <c>i</c> (0-based) is
/// <paramref name="uriStr"/> with every occurrence of <paramref name="replaseSubstr"/>
/// replaced by <c>i</c>. Each download uses <c>EndDownloadAndParse</c> as callback and
/// carries the provider plus a wait object created with <paramref name="count"/>.
/// </summary>
/// <param name="count">Number of pages; must not be 0.</param>
/// <param name="uriStr">Address template; must not be null or empty.</param>
/// <param name="replaseSubstr">Placeholder inside the template; must not be null or empty.</param>
/// <param name="proxySiteProvider">Parser passed along to the callback; must not be null.</param>
/// <exception cref="ArgumentException">Any argument violates the rules above.</exception>
public void BeginDownloadPages(int count, string uriStr, string replaseSubstr, IProxySiteProvider proxySiteProvider)
{
    bool badArguments = count == 0
                        || string.IsNullOrEmpty(uriStr)
                        || string.IsNullOrEmpty(replaseSubstr)
                        || proxySiteProvider == null;
    if (badArguments)
    {
        throw new ArgumentException("Bad argumenst");
    }

    WaitObj waiter = new WaitObj(count);
    for (int page = 0; page < count; page++)
    {
        string pageAddress = uriStr.Replace(replaseSubstr, page.ToString());
        object[] state = new object[] { proxySiteProvider, waiter };
        Downloader.Queue(
            new DownloaderObj(new Uri(pageAddress), EndDownloadAndParse, true, null, CookieOptions.NoCookies, 10, state));
    }
}
/// <summary>
/// Sets the downloader to 20 parallel requests and queues pages 1 through 5
/// (<c>?pg=N</c>) of every category, with <c>GetProductCallback</c> as callback and the
/// category's position in the list as argument.
/// </summary>
/// <param name="categories">Category base addresses.</param>
/// <returns>
/// The shared <c>allProductsUrls</c> list. The downloads are only queued here, so the list
/// is returned without waiting for the callbacks to fill it.
/// </returns>
public List<Uri> GetAllCategoriesProducts(List<string> categories)
{
    Downloader.MaxParallelRequests = 20;

    int categoryIndex = 0;
    foreach (string category in categories)
    {
        for (int page = 1; page <= 5; page++)
        {
            Uri pageUri = new Uri(category + "?pg=" + page);
            Downloader.Queue(
                new DownloaderObj(pageUri, GetProductCallback, true, null, CookieOptions.NoCookies, 100, categoryIndex));
        }
        categoryIndex++;
    }

    return allProductsUrls;
}
/// <summary>
/// For every theme in <c>allThemes</c>, downloads the Yandex Wordstat page for
/// <c>theme[0]</c>, extracts the number shown in the row for that phrase and adds it to the
/// number stored in <c>theme[2]</c>. Sleeps 4-6 seconds between themes.
/// </summary>
/// <remarks>
/// A theme is left unchanged (instead of throwing, as before) when the page could not be
/// downloaded, the row is not found, or either number cannot be parsed.
/// </remarks>
private void SetYandexWordsWeight()
{
    foreach (string[] theme in allThemes)
    {
        string escape = Uri.EscapeUriString(theme[0]);
        Uri pageUri = new Uri("http://wordstat.yandex.ru/?cmd=words&page=1&t=" + escape + "&geo=&text_geo=");
        DownloaderObj obj = new DownloaderObj(pageUri, null);
        Downloader.DownloadSync(obj);

        if (obj.DataStr != null)
        {
            // Two pattern fixes: the '?' after href=" is a literal character of the link and
            // must be escaped (unescaped it made the quote optional and the pattern could not
            // match href="?cmd=..."), and the phrase is passed through Regex.Escape because
            // EscapeUriString leaves regex metacharacters such as '+', '(' or '*' in place.
            // NOTE(review): the ';' separators are kept from the original pattern; confirm
            // they match the page markup.
            string pattern = "<a href=\"\\?cmd=words&page=1&ts=[^;]*;key=[^;]*;t=" + Regex.Escape(escape) + @""">[^<]*</a>\s*</td>\s*"
                             + @"<td><[^>]*></div>[^<]*</td>\s*"
                             + "<td[^>]*>([^<]*)</td>";
            Match row = new Regex(pattern, RegexOptions.Compiled).Match(obj.DataStr);

            int pageWeight;
            int currentWeight;
            if (row.Success
                && Int32.TryParse(row.Groups[1].Value, out pageWeight)
                && Int32.TryParse(theme[2], out currentWeight))
            {
                theme[2] = (pageWeight + currentWeight).ToString();
            }
        }

        // The pause between requests is applied whether or not the theme was updated.
        Thread.Sleep(Rnd.Next(4000, 6000));
    }
}
/// <summary>
/// Completion callback of <c>BeginGetResponse</c>. Suppresses execution-context flow if it
/// is not suppressed yet, finishes the response, processes redirects/cookies and tries to
/// read the data. On success the download is completed through <c>CallbackAndContinue</c>;
/// in every other case (no data, or any exception, which is logged) control goes to
/// <c>RetryOrCallback</c>. The response is closed in all cases.
/// </summary>
/// <param name="ar">Async result whose <c>AsyncState</c> is the <c>DownloaderObj</c>.</param>
internal void EndReceive(IAsyncResult ar)
{
    if (!ExecutionContext.IsFlowSuppressed())
    {
        ExecutionContext.SuppressFlow();
    }

    DownloaderObj obj = ar.AsyncState as DownloaderObj;
    bool completed = false;

    try
    {
#if DEBUG
        GlobalLog.Write("Get responce from " + obj.Uri.Host);
#endif
        obj.Response = obj.Request.EndGetResponse(ar) as HttpWebResponse;
        HandleRedirectAndCookies(obj);

        if (TryReceiveData(obj))
        {
            CallbackAndContinue(obj);
            // Set only after the call returned: if it throws, the retry path below still runs.
            completed = true;
        }
        else
        {
#if DEBUG
            GlobalLog.Write("CANT Downloaded data from " + obj.Uri.Host);
#endif
        }
    }
    catch (WebException webError)
    {
        GlobalLog.Err(webError, "Host: " + obj.Uri.Host);
        HandleWebState(webError, obj);
    }
    catch (Exception error)
    {
        GlobalLog.Err(error, "Host: " + obj.Uri.Host);
    }
    finally
    {
        if (obj.Response != null)
        {
            obj.Response.Close();
        }
    }

    if (!completed)
    {
        RetryOrCallback(obj);
    }
}
/// <summary>
/// Extracts proxies from a page where the IP is plain text and the port is shown as an image.
/// Each port image is downloaded and its MD5 hash is looked up in the table returned by
/// <c>LoadPortHashes</c>; a known hash gives the port, an unknown one is collected and
/// passed to <c>AddUnknownPortImage</c> at the end.
/// </summary>
/// <param name="data">Page text.</param>
/// <returns>Proxies with a valid IP and port, or <c>null</c> when <paramref name="data"/> is null.</returns>
public List<RatedProxy> ParsePage(string data)
{
    if (data == null)
    {
        return null;
    }

    List<RatedProxy> found = new List<RatedProxy>();
    Dictionary<string, string> unknownImages = new Dictionary<string, string>();
    Regex rowRx = new Regex(@"<td>(?<ip>[^<]*)</td><td><img src=""(?<image>/images/proxylist_port_\d*.gif)""></td>");
    MatchCollection rows = rowRx.Matches(data);
    Hashtable knownPorts = LoadPortHashes();

    foreach (Match row in rows)
    {
        string imageUrl = "http://hideme.ru" + row.Groups["image"].Value;
        DownloaderObj imageObj = new DownloaderObj(new Uri(imageUrl), null, false, null);
        Downloader.DownloadSync(imageObj);
        if (imageObj.Data == null)
        {
            continue;
        }

        string hash = GetMd5HashString(imageObj.Data);
        if (hash == null)
        {
            continue;
        }

        if (!knownPorts.Contains(hash))
        {
            // Remember the first image seen for every hash that has no port assigned yet.
            if (!unknownImages.ContainsKey(hash))
            {
                unknownImages.Add(hash, imageUrl);
            }
            continue;
        }

        string port = knownPorts[hash] as string;
        string ip = row.Groups["ip"].Value;
        if (ip.IsValidIP() && port.IsValidPort())
        {
            found.Add(new RatedProxy(ip + ":" + port));
        }
    }

    AddUnknownPortImage(unknownImages);
    return found;
}
/// <summary>
/// Creates the request for <paramref name="obj"/> and reports its <c>HaveResponse</c> flag,
/// retrying on <see cref="WebException"/> while <c>obj.Attempts</c> allows.
/// </summary>
/// <param name="obj">Download descriptor; <c>Attempts</c> and <c>Request</c> are updated.</param>
/// <returns>
/// The <c>HaveResponse</c> flag of the first request created without a
/// <see cref="WebException"/>, or <c>false</c> when every attempt failed.
/// </returns>
/// <remarks>
/// Exceptions other than <see cref="WebException"/> propagate with their original stack
/// trace; the former <c>catch (Exception e) { throw e; }</c> reset that trace and has been removed.
/// </remarks>
public bool HaveResponce(DownloaderObj obj)
{
    while (obj.Attempts > 0)
    {
        obj.Attempts--;
        try
        {
            obj.Request = CreateRequest(obj);
            // NOTE(review): HaveResponse is read right after CreateRequest with no visible
            // GetResponse call - confirm CreateRequest actually sends the request.
            bool haveResp = obj.Request.HaveResponse;
            obj.Request.Abort();
            return haveResp;
        }
        catch (WebException e)
        {
            GlobalLog.Err(e, "Host: " + obj.Uri.Host);
            HandleWebState(e, obj);
        }
    }

    return false;
}
//Method need some refactoring
/// <summary>
/// Reads the response body and updates <c>obj.State</c>. When bytes were read they are
/// stored either decoded into <c>DataStr</c> (if <c>NeedString</c> is set) or raw into
/// <c>Data</c>.
/// </summary>
/// <param name="obj">Download descriptor with an open response.</param>
/// <returns>
/// <c>true</c> when data was read; without data, <c>true</c> only if the state is
/// <c>Success_2xx</c> or <c>Info_1xx</c>.
/// </returns>
private bool TryReceiveData(DownloaderObj obj)
{
    byte[] payload = ReadResponseStream(obj.Response, obj.TimingParams);
    obj.State = new DownloadStateProvider().GetWebState(obj.Request, obj.Response);

    if (payload == null)
    {
        return obj.State == DownloadState.Success_2xx || obj.State == DownloadState.Info_1xx;
    }

    if (obj.NeedString)
    {
        obj.DataStr = obj.RequestParam.Encoding.GetString(payload);
    }
    else
    {
        obj.Data = payload;
    }

    return true;
}
/// <summary>
/// Handles a successful download: releases the proxy as "good", lets the target's reader
/// consume the page text (a reader exception is logged, not propagated) and records the
/// URI as a success.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is read as the <c>DevourTarget</c>.</param>
private void HandleSuccessDownload(DownloaderObj obj)
{
    _proxies.Release(obj.PrxContainer as ProxyContainer, true);
    DevourTarget job = obj.Arg as DevourTarget;

    //MoveReadQueue();
    try
    {
        MoveReadQueue();
        job.Reader.ReadData(obj.DataStr, job);
        MoveReadComplete(job);
    }
    catch (Exception readError)
    {
        GlobalLog.Err(readError, "error while reading data in devourer");
    }

    AddSuccessTarget(obj.Uri);
}
/// <summary>
/// Synchronously downloads one Yandex search result page for the given key.
/// The downloaded object is not used afterwards.
/// </summary>
/// <param name="numPage">Value of the <c>p</c> query parameter.</param>
/// <param name="key">Search text; data-escaped before being placed in the URI.</param>
/// <param name="rx">Not used by this method.</param>
/// <param name="waiter">Not used by this method.</param>
void GetYandexPage(int numPage, string key, Regex rx, WaitObj waiter)
{
    string address = "http://yandex.ru/yandsearch?p=" + numPage + "&text=" + Uri.EscapeDataString(key);
    Downloader.DownloadSync(new DownloaderObj(new Uri(address), null));
}
/// <summary>
/// Expands a starting key into a list of keys by downloading the Yandex Wordstat page for
/// each collected key in turn and harvesting the phrases linked after the table header.
/// Stops once the list holds at least <paramref name="count"/> keys, every collected key
/// has been processed, or too many downloads/captcha pages have failed.
/// </summary>
/// <param name="count">Number of keys to aim for.</param>
/// <param name="startKey">First key; always part of the result.</param>
/// <returns>The distinct keys collected so far.</returns>
/// <remarks>
/// Fixes over the previous version: the page text is now actually taken from the download
/// (it used to stay an empty string, so the split below always threw); a page without the
/// table header is skipped instead of throwing; cookie flags are combined with OR instead
/// of AND; captcha retries share the failure budget so the loop cannot spin forever.
/// </remarks>
public static List<string> ParseYandexKeys(int count, string startKey)
{
    const int maxFailedTries = 5;

    List<string> keys = new List<string>();
    keys.Add(startKey);

    string pattern = @"<td>\s*<a href=""\?cmd=words&page=1&ts=[^&]*&key=[^&]*&t=([^""]*)"">[^<]*</a>\s*</td>";
    string splitPattern = "<tr class=\"thead\" valign=\"bottom\">";
    string capchaPattern = @"<input type=""hidden"" name=""captcha_id"" value=""([^""]*)""[^>]*>";
    Regex rx = new Regex(pattern, RegexOptions.Compiled);
    Regex splitRx = new Regex(splitPattern, RegexOptions.Compiled);
    Regex capchaRx = new Regex(capchaPattern, RegexOptions.Compiled);

    int failTryCount = 0;
    int i = 0;
    CookieCollection cookies = new CookieCollection();

    while (count > keys.Count && i <= keys.Count - 1)
    {
        string key = Uri.EscapeUriString(keys[i]);
        Uri keyUri = new Uri("http://wordstat.yandex.ru/?cmd=words&page=1&t=" + key + "&geo=&text_geo=");
        DownloaderObj obj = new DownloaderObj(keyUri, null, true, null, CookieOptions.UseShared | CookieOptions.Take, 5, null, cookies);
        Downloader.DownloadSync(obj);

        string content = obj.DataStr;
        if (content == null)
        {
            // Retry the same key a limited number of times, then give up.
            if (failTryCount < maxFailedTries)
            {
                failTryCount++;
                continue;
            }
            break;
        }

        Match capchaResult = capchaRx.Match(content);
        if (capchaResult.Success)
        {
            if (failTryCount >= maxFailedTries)
            {
                break;
            }
            failTryCount++;

            // Captcha page: request the cookie endpoint and retry the same key with the new cookies.
            obj.Attempts = 3;
            obj.Uri = new Uri("http://kiks.yandex.ru/su/");
            Downloader.HaveResponce(obj);
            cookies = obj.Cookie;
            continue;
        }

        string[] parts = splitRx.Split(content);
        if (parts.Length > 1)
        {
            MatchCollection results = rx.Matches(parts[1]);
            if (results.Count <= 14)
            {
                foreach (Match m in results)
                {
                    keys.Add(Uri.UnescapeDataString(m.Groups[1].Value));
                }
            }
            else
            {
                // Kept from the original: with more than 14 results only indexes 1..13 are taken.
                for (int j = 1; j < 14; j++)
                {
                    keys.Add(Uri.UnescapeDataString(results[j].Groups[1].Value));
                }
            }
        }

        keys = keys.Distinct().ToList<string>();
        i++;
        Console.WriteLine("ParseYandexKeys collect {0} keys and index is on {1} posotion...", keys.Count, i);
    }

    return keys;
}
/// <summary>
/// Sets the downloader to 20 parallel requests and queues pages 1 through 5
/// (<c>?pg=N</c>) of every category, with <c>GetProductCallback</c> as callback and the
/// category's position in the list as argument.
/// </summary>
/// <param name="categories">Category base addresses.</param>
/// <returns>
/// The shared <c>allProductsUrls</c> list. The downloads are only queued here, so the list
/// is returned without waiting for the callbacks to fill it.
/// </returns>
public List<Uri> GetAllCategoriesProducts(List<string> categories)
{
    Downloader.MaxParallelRequests = 20;

    int categoryIndex = 0;
    foreach (string category in categories)
    {
        for (int page = 1; page <= 5; page++)
        {
            Uri pageUri = new Uri(category + "?pg=" + page);
            Downloader.Queue(
                new DownloaderObj(pageUri, GetProductCallback, true, null, CookieOptions.NoCookies, 100, categoryIndex));
        }
        categoryIndex++;
    }

    return allProductsUrls;
}
/// <summary>
/// Downloads every product page synchronously and collects the ASIN that
/// <c>GetProductASIN</c> extracts from it. Pages that fail to download or yield a null/empty
/// ASIN are skipped.
/// </summary>
/// <param name="productsUrls">Product page addresses.</param>
/// <returns>The ASINs found, in input order.</returns>
List<string> CollectAllASINs(List<Uri> productsUrls)
{
    List<string> collected = new List<string>();

    foreach (Uri productUrl in productsUrls)
    {
        DownloaderObj page = new DownloaderObj(productUrl, null, true, null, CookieOptions.NoCookies, 100);
        Downloader.DownloadSync(page);
        if (page.DataStr == null)
        {
            continue;
        }

        string asin = GetProductASIN(page.DataStr);
        if (string.IsNullOrEmpty(asin))
        {
            continue;
        }

        collected.Add(asin);
    }

    return collected;
}
/// <summary>
/// Synchronously requests the recognition result for <paramref name="id"/> and forwards the
/// response text to <c>HandleRecognizeState</c>; without response text it reports
/// <c>ErrorState.ConnectProblem</c> (DEBUG builds log the id first).
/// </summary>
/// <param name="id">Identifier appended to <c>_resolveStr</c> to form the request address.</param>
private void AskRecognize(string id)
{
    DownloaderObj query = new DownloaderObj(new Uri(_resolveStr + id), null, true, null, CookieOptions.Empty, 5);
    Downloader.DownloadSync(query);

    if (query.DataStr != null)
    {
        HandleRecognizeState(query.DataStr, id);
        return;
    }

#if DEBUG
    GlobalLog.Err("can't ASK data, id:{0}", id);
#endif
    Err(ErrorState.ConnectProblem);
}
/// <summary>
/// Builds a multipart form from the image and parameters and queues it as a POST to
/// <c>_postUri</c>; <c>EndPostImage</c> is registered as the callback.
/// </summary>
/// <param name="image">Image data; must be non-null and non-empty.</param>
/// <param name="type">Image extension type.</param>
/// <param name="param">Optional recognize params appended to the form fields.</param>
/// <exception cref="ArgumentException">The image is null or empty.</exception>
public void PostImage(byte[] image, ImgType type, RecognizeParams param = null)
{
    bool hasImage = image != null && image.Length != 0;
    if (!hasImage)
    {
        throw new ArgumentException("Bad image data");
    }

    List<KeyValuePair<string, string>> fields = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>("method", "post"),
        new KeyValuePair<string, string>("key", ACCOUNT_ID)
    };

    if (param != null)
    {
        foreach (var extra in param.Params)
        {
            fields.Add(extra);
        }
    }

    // The form body is built before _boundary is read for the header, as in the original order.
    byte[] formBody = BuldFormbasedData(fields, StreamEncoding.GetString(image), type);
    RequestParams requestSettings = new RequestParams(
        null, null, "multipart/form-data; boundary=" + _boundary, null, true, "POST", StreamEncoding, false);

    DownloaderObj upload = new DownloaderObj(
        _postUri, EndPostImage, true, null, CookieOptions.Empty, 4, null, null, false, 1000, null, requestSettings);
    upload.PostData = formBody;
    Downloader.Queue(upload);
}
/// <summary>
/// Starts one attempt of the download: consumes an attempt, creates the request, writes
/// <c>PostData</c> when present and begins the response wait with <c>EndReceive</c> as the
/// completion callback. Does nothing when no attempts are left. Any exception thrown while
/// starting is logged and control goes to <c>RetryOrCallback</c>; a
/// <see cref="WebException"/> is additionally passed to <c>HandleWebState</c> first.
/// </summary>
/// <param name="obj">Download descriptor.</param>
internal void BeginReceive(DownloaderObj obj)
{
    if (obj.Attempts <= 0)
    {
        return;
    }

    obj.Attempts--;
    try
    {
        obj.Request = CreateRequest(obj);
        if (obj.PostData != null)
        {
            PostRequestData(obj.Request, obj.PostData);
        }
        //Console.WriteLine("Send request to -- {0}", obj.Uri.OriginalString);
        obj.Request.BeginGetResponse(EndReceive, obj);
    }
    catch (WebException webError)
    {
        GlobalLog.Err(webError, "Host: " + obj.Uri.Host);
        HandleWebState(webError, obj);
        RetryOrCallback(obj);
    }
    catch (Exception error)
    {
        GlobalLog.Err(error, "Host: " + obj.Uri.Host);
        RetryOrCallback(obj);
    }
}
/// <summary>
/// Fetches cookies for the download through a new <c>MagicClient</c>, unless the download's
/// cookie options carry the <c>NoCookies</c> flag.
/// </summary>
/// <param name="obj">Download descriptor.</param>
/// <returns>The client's cookies, or <c>null</c> when <c>NoCookies</c> is set.</returns>
public static CookieCollection GetCookies(DownloaderObj obj)
{
    if (obj.CookieOptions.HasFlag(CookieOptions.NoCookies))
    {
        return null;
    }

    return new MagicClient().GetCookies(obj);
}
/// <summary>
/// Schedules a thread-pool work item that downloads the page at <c>tempList[curIndx]</c>,
/// extracts its menu links and, for every link not yet known (after stripping via
/// <c>_linkRefRx</c>), records and saves it as a category and appends it to
/// <c>tempList</c>. Afterwards the processed slot is cleared, the wait counter is
/// decremented and the wait event is set when the counter reaches zero.
/// </summary>
/// <param name="curIndx">Index into <c>tempList</c> of the page to process.</param>
void AsyncDownloadCategoryPage(int curIndx)
{
    ThreadPool.QueueUserWorkItem(delegate(object unused)
    {
        HashSet<string> pageCategories = new HashSet<string>();

        DownloaderObj page = new DownloaderObj(tempList[curIndx], null, true, null, CookieOptions.NoCookies, 1000);
        Downloader.DownloadSync(page);
        if (page.DataStr != null)
        {
            string menuPiece = GetMenuDataPiece(page.DataStr, tempList[curIndx]);
            pageCategories = GetAllMenuLinks(menuPiece, tempList[curIndx]);
        }

        foreach (string rawCategory in pageCategories)
        {
            string cleaned = _linkRefRx.Replace(rawCategory, "");

            // Both locks are taken in this fixed order for every candidate.
            lock (allCategoriesSync)
            {
                lock (tempListSync)
                {
                    if (!allCategories.Contains(cleaned))
                    {
                        allCategories.Add(cleaned);
                        SaveCategory(cleaned);
                        Console.WriteLine("Total collected categoryes - {0}", allCategories.Count);
                        tempList.Add(new Uri(cleaned));
                    }
                }
            }
        }

        tempList[curIndx] = null;
        Console.WriteLine("waiter.Count - " + waiter.Count);

        if (Interlocked.Decrement(ref waiter.Count) == 0)
        {
            waiter.WaitEvent.Set();
        }
    });
}
/// <summary>
/// Timer-style entry point: <paramref name="arg"/> is read as an <c>object[]</c> holding the
/// timer at index 0 and the id at index 1. Synchronously requests the recognition result
/// for the id, forwards the response text to <c>HandleRecognizeState</c> (or reports
/// <c>ErrorState.ConnectProblem</c> when there is none) and finally disposes the timer.
/// </summary>
/// <param name="arg">State array: { Timer, string id }.</param>
private void AskRecognize(object arg)
{
    object[] state = arg as object[];
    string id = state[1] as string;
    Timer owner = state[0] as Timer;

    DownloaderObj query = new DownloaderObj(new Uri(_resolveStr + id), null, true, null, CookieOptions.Empty, 5);
    Downloader.DownloadSync(query);

    if (query.DataStr == null)
    {
        Err(ErrorState.ConnectProblem);
    }
    else
    {
        HandleRecognizeState(query.DataStr, id);
    }

    owner.Dispose();
}
/// <summary>
/// Category-page callback: extracts product URLs from the page text, appends and saves them
/// under <c>allProductsUrlsSync</c>, then prints the running total and the category index
/// carried in <c>obj.Arg</c>. Does nothing when the download produced no text.
/// </summary>
/// <param name="obj">Finished download; <c>Arg</c> is unboxed as <c>int</c>.</param>
void GetProductCallback(DownloaderObj obj)
{
    if (obj.DataStr == null)
    {
        return;
    }

    List<Uri> found = GetProductsUrls(obj.DataStr);
    lock (allProductsUrlsSync)
    {
        allProductsUrls.AddRange(found);
        SaveProducts(found);
    }

    Console.WriteLine("Found {0} products on categoryIndx {1}", allProductsUrls.Count, (int)obj.Arg);
}
/// <summary>
/// Download callback: a download counts as successful when it produced text and, if a
/// validator is configured, the validator accepts that text. Dispatches to the success or
/// failure handler accordingly and then calls <c>MoveObjComplete</c>.
/// </summary>
/// <param name="obj">Finished download.</param>
private void DevourCallback(DownloaderObj obj)
{
    bool accepted = obj.DataStr != null
                    && (_validator == null || _validator.Validate(obj.DataStr));

    if (accepted)
    {
        HandleSuccessDownload(obj);
    }
    else
    {
        HandleBadDownload(obj);
    }

    MoveObjComplete();
}
/// <summary>
/// Callback that <c>EndGetPage</c> assigns to a download that returned no page text.
/// The body is empty: nothing is done with the download here.
/// </summary>
/// <param name="obj">Download descriptor; not used.</param>
void EndGetPageWithProxy(DownloaderObj obj) { }