public IHttpActionResult GeoYellowpageName(string fName = "", string lName = "", int houseNumber = -1, string street = "", string town = "", string state = "", string zip = "")
{
    /* For a given name and location, find all matching phone numbers and names */
    string URL = "http://people.yellowpages.com/whitepages?first=" + fName +
                 "&last=" + lName + "&zip=" + zip + "&state=" + state;
    try
    {
        string responseFromServer = ScraperHelper.GetURL(URL);
        List<WhitePageHit> myPhoneNums = ParseWpList(responseFromServer);

        // Drop hits whose address does not contain the requested street
        List<WhitePageHit> toRemove = new List<WhitePageHit>();
        foreach (WhitePageHit oneHit in myPhoneNums)
        {
            if (!oneHit.address.Contains(street))
            {
                toRemove.Add(oneHit);
            }
        }
        myPhoneNums.RemoveAll(x => toRemove.Contains(x));

        return Ok(new PhoneLookupResult { result = myPhoneNums.Count.ToString(), phoneList = myPhoneNums });
    }
    catch (Exception ex)
    {
        return Ok(new GenericResult { result = "0", payload = ex.Message });
    }
}
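ScraperHelper.GetURL is not defined in these snippets. A minimal sketch, assuming it performs a blocking GET and returns the response body as a string (the real helper may add headers, retries, or proxy support):

using System.Net;
using System.Net.Http;

public static partial class ScraperHelper
{
    // Hypothetical reconstruction of GetURL: a blocking GET returning the body.
    public static string GetURL(string url)
    {
        var handler = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
        };
        using (var client = new HttpClient(handler))
        {
            // Block on the async call; fine for a simple helper, though the
            // async jobs below avoid this pattern.
            return client.GetStringAsync(url).GetAwaiter().GetResult();
        }
    }
}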
public virtual async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
{
    if (!this.CanDownload())
    {
        return;
    }

    ScraperHelper.SetOrigenToClient(this.Url, client);
    Console.WriteLine("Downloading: " + this.Url);

    var result = await client.GetAsync(Url, cancelToken).ConfigureAwait(false);
    if (result.IsSuccessStatusCode)
    {
        var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);
        if (!this.ValidateHtml(html))
        {
            throw new HttpRequestException("HTML returned was not verified.");
        }
        this.Html = html;
    }
    else
    {
        throw new HttpRequestException(string.Format("HTML request was not successful. Status code: {0}", result.StatusCode));
    }
}
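The WithCancellation extension called on ReadAsStringAsync is not part of the BCL. A minimal sketch, assuming the project defines the common task-cancellation pattern (the project's actual extension may differ):

using System;
using System.Threading;
using System.Threading.Tasks;

public static class TaskExtensions
{
    // Completes when either the task finishes or the token fires. Note the
    // underlying task is only abandoned on cancellation, not stopped.
    public static async Task<T> WithCancellation<T>(this Task<T> task, CancellationToken cancellationToken)
    {
        var tcs = new TaskCompletionSource<bool>();
        using (cancellationToken.Register(s => ((TaskCompletionSource<bool>)s).TrySetResult(true), tcs))
        {
            if (task != await Task.WhenAny(task, tcs.Task).ConfigureAwait(false))
            {
                throw new OperationCanceledException(cancellationToken);
            }
        }
        return await task.ConfigureAwait(false);
    }
}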
public async Task TestJob(IThreadedWebClientJob initialJob)
{
    // Depth-first job stack: each executed job may push follow-up jobs
    Stack<IThreadedWebClientJob> jobStack = new Stack<IThreadedWebClientJob>();
    jobStack.Push(initialJob);

    while (jobStack.Count > 0)
    {
        IThreadedWebClientJob job = jobStack.Pop();
        using (HttpClient client = new HttpClient(new HttpClientHandler()
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
        }))
        {
            // Add default headers to the client to simulate a real browser
            ScraperHelper.AddHeadersToClient(client);
            try
            {
                CancellationTokenSource cancelSource = new CancellationTokenSource();
                cancelSource.CancelAfter(TimeSpan.FromHours(1));
                await job.ExecuteDownload(client, cancelSource.Token);

                List<IThreadedWebClientJob> newJobs;
                try
                {
                    newJobs = job.Execute();
                    foreach (var t in newJobs)
                    {
                        jobStack.Push(t);
                    }
                }
                catch (Exception exp)
                {
                    job.FailedExecute(exp);
                }
            }
            // A WebException may be a proxy error
            catch (WebException)
            {
                throw;
            }
            // Uncaught error
            catch (Exception exp)
            {
                job.FailedDownload(exp);
            }
        }
    }
}
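The IThreadedWebClientJob contract is only visible through its call sites. A hypothetical reconstruction inferred from how the jobs are used in these snippets; the real interface may well declare more members:

using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

public interface IThreadedWebClientJob
{
    // Fetch the resource; implementations appear to store the result in an Html property.
    Task ExecuteDownload(HttpClient client, CancellationToken cancelToken);

    // Parse the downloaded HTML and return any follow-up jobs to enqueue.
    List<IThreadedWebClientJob> Execute();

    // Error callbacks for the parse and download phases, respectively.
    void FailedExecute(Exception exp);
    void FailedDownload(Exception exp);
}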
public IHttpActionResult GeoWhoOpenStreet(double Lat, double Lon, int zoom = 18)
{
    string URL = "http://nominatim.openstreetmap.org/reverse?format=json&addressdetails=1";
    URL = URL + "&zoom=" + zoom + "&lat=" + Lat + "&lon=" + Lon;

    string responseFromServer = "";
    try
    {
        responseFromServer = ScraperHelper.GetURL(URL);
    }
    catch (Exception ex)
    {
        return Ok(new GenericResult { result = "0", payload = ex.Message });
    }
    return Ok(new GenericResult { result = "1", payload = responseFromServer });
}
public IHttpActionResult GeoYellowpage(int houseNumber = 1, string road = "", string town = "", string state = "", string zip = "")
{
    string URL = "http://people.yellowpages.com/whitepages/address?street=" + houseNumber + ' ' + road +
                 "&qloc=" + town + "+" + state + "+" + zip;
    try
    {
        string responseFromServer = ScraperHelper.GetURL(URL);
        List<WhitePageHit> mynums = ParseWpList(responseFromServer);
        return Ok(new PhoneLookupResult { result = mynums.Count.ToString(), phoneList = mynums });
    }
    catch (Exception ex)
    {
        return Ok(new GenericResult { result = "0", payload = ex.Message });
    }
}
public virtual async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
{
    if (!this.CanDownload())
    {
        logger.Debug("CanDownload returned false, stopping the download job.");
        return;
    }

    ScraperHelper.SetOrigenToClient(this.Url, client);
    if (this.headers != null)
    {
        foreach (KeyValuePair<string, string> keyValuePair in headers)
        {
            client.DefaultRequestHeaders.Add(keyValuePair.Key, keyValuePair.Value);
        }
    }

    logger.Trace("Downloading: " + this.Url);
    var result = await client.GetAsync(Url, cancelToken).ConfigureAwait(false);
    if (result.IsSuccessStatusCode)
    {
        var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);
        if (!this.ValidateHtml(html))
        {
            logger.Warn("Failed HTML verification");
            throw new HttpRequestException("HTML returned was not verified.");
        }
        this.Html = html;
    }
    else
    {
        throw new HttpRequestException(string.Format("HTML request was not successful. Status code: {0}", result.StatusCode));
    }
}
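SetOrigenToClient ("Origen" is the helper's actual spelling in these snippets) is also undefined here. A plausible sketch, assuming it derives browser-like Origin/Referer headers from the target URL:

using System;
using System.Net.Http;

public static partial class ScraperHelper
{
    // Assumption: mimic a browser by sending an Origin and Referer that
    // match the page being requested.
    public static void SetOrigenToClient(string url, HttpClient client)
    {
        var uri = new Uri(url);
        client.DefaultRequestHeaders.Remove("Origin");
        client.DefaultRequestHeaders.Add("Origin", uri.Scheme + "://" + uri.Host);
        client.DefaultRequestHeaders.Referrer = uri;
    }
}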
public IActionResult Index()
{
    var a = new ScraperHelper();
    return View();
}
public override async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
{
    var uri = new Uri(this.GetUrl());
    client.BaseAddress = this.GetUrl().Contains("https:")
        ? new Uri("https://" + uri.Host)
        : new Uri("http://" + uri.Host);
    ScraperHelper.SetOrigenToClient(this.GetUrl(), client);

    var message = new HttpRequestMessage(HttpMethod.Post, uri.LocalPath);
    if (this.headers != null)
    {
        foreach (var header in this.headers)
        {
            message.Headers.Add(header.Key, header.Value);
        }
    }

    var keyValue = new List<KeyValuePair<string, string>>();
    if (this.formData != null)
    {
        keyValue.AddRange(this.formData);
    }

    if (this.formString != null)
    {
        // A raw JSON body takes precedence over form fields
        message.Content = new StringContent(this.formString, Encoding.UTF8, "application/json");
    }
    else
    {
        // Build the form body by hand to control the encoding
        StringBuilder stringBuilder = new StringBuilder();
        foreach (KeyValuePair<string, string> current in keyValue)
        {
            if (stringBuilder.Length > 0)
            {
                stringBuilder.Append('&');
            }
            stringBuilder.Append(Encode(current.Key));
            stringBuilder.Append('=');
            stringBuilder.Append(Encode(current.Value));
        }
        message.Content = new StringContent(stringBuilder.ToString(), Encoding.UTF8, "application/x-www-form-urlencoded");
        //message.Content = new MyFormUrlEncodedContent(keyValue);
    }

    var result = await client.SendAsync(message, cancelToken).ConfigureAwait(false);
    this.responseHeaders = result.Headers;
    if (result.IsSuccessStatusCode)
    {
        var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);
        if (!this.ValidateHtml(html))
        {
            throw new HttpRequestException("HTML returned was not verified.");
        }
        this.SetHtml(html);
    }
    else
    {
        throw new HttpRequestException(string.Format("HTML request was not successful. Status code: {0}", result.StatusCode));
    }
}
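The Encode helper used to build the form body above is not shown. A common implementation, assuming standard application/x-www-form-urlencoded semantics:

using System;

// Hypothetical reconstruction: percent-encode, then map spaces to '+'
// as form encoding expects.
private static string Encode(string data)
{
    return string.IsNullOrEmpty(data)
        ? string.Empty
        : Uri.EscapeDataString(data).Replace("%20", "+");
}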
private async void RunWebDownload(object threadIndexObject)
{
    int threadIndex = (int)threadIndexObject;
    logger.Trace("Download Thread {0} started", threadIndex);

    // Run until asked to shut down
    while (!this.stopThread)
    {
        if (this.DownloaderThreadStatus != null)
        {
            this.DownloaderThreadStatus(this, threadIndex, true);
        }

        // Done-jobs queue is full, halt download work
        if (this.doneJobQueue.Count >= this.maxDoneQueue)
        {
            this.DownloaderThreadJobChanged(this, threadIndex, null);
            this.waitingForEmpty = true;
            await Task.Delay(50);
            continue;
        }
        if (this.waitingForEmpty)
        {
            this.DownloaderThreadJobChanged(this, threadIndex, null);
            // Once the done-jobs queue has drained, resume work
            if (this.doneJobQueue.Count == 0)
            {
                this.waitingForEmpty = false;
            }
            await Task.Delay(50);
            continue;
        }

        // Find a proxy to use, if required. If proxies are mandatory but none
        // are left to work with, we may have to stop work entirely.
        WebProxyHolder proxy;
        IThreadedWebClientJob job;
        try
        {
            // Dequeue a job.
            //logger.Trace("Downloader {0} Trying to get a new job.", threadIndex);
            job = this.Dequeue();
            if (job != null)
            {
                logger.Trace("Downloader {0} got a new job {1}", threadIndex, job);
                lock (this.jobsInProcessLocker)
                {
                    this.jobsInProcess++;
                    if (this.JobProcessingChanged != null)
                    {
                        this.JobProcessingChanged(this, this.jobsInProcess);
                    }
                }
                if (this.DownloaderThreadJobChanged != null)
                {
                    this.DownloaderThreadJobChanged(this, threadIndex, job);
                }

                WebProxy webProxy = this.HandleAddProxy(out proxy);
                if (webProxy == null && this.useProxies)
                {
                    if (this.dieOnProxiesLeft)
                    {
                        logger.Error("Downloader {0} was unable to find any proxies, shutting down.", threadIndex);
                        // No proxies to use, and we have to use proxies, so kill all downloads.
                        this.stopThread = true;
                        this.jobsInProcess = 0; // Ensure the worker is not stuck
                        return;
                    }
                    else
                    {
                        logger.Trace("Downloader {0} was unable to find any proxies, requeueing the job.", threadIndex);
                        // We just keep running until we get proxies again.
                        // Requeue the job
                        this.Enqueue(job);
                        // Count down the working jobs
                        lock (this.jobsInProcessLocker)
                        {
                            this.jobsInProcess--;
                            if (this.JobProcessingChanged != null)
                            {
                                this.JobProcessingChanged(this, this.jobsInProcess);
                            }
                        }
                        await Task.Delay(50);
                        continue;
                    }
                }
                if (webProxy != null && this.useProxies)
                {
                    logger.Trace("Downloader {0} got proxy {1}:{2}", threadIndex, webProxy.Address.Host, webProxy.Address.Port);
                }

                try
                {
                    bool run = true;
                    if (job is ITypedTask)
                    {
                        logger.Trace("Downloader {0}'s job {1} requires verification.", threadIndex, job);
                        run = ((ITypedTask)job).Verify();
                    }
                    if (run)
                    {
                        var httpClientHandler = new HttpClientHandler()
                        {
                            UseCookies = false,
                            Proxy = webProxy,
                            UseProxy = this.useProxies,
                            // For Fiddler debugging
                            //Proxy = new WebProxy("http://127.0.0.1:8888"),
                            //UseProxy = true,
                            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
                        };
                        if (job is IHttpClientHandlerTask)
                        {
                            logger.Trace("Downloader {0}'s job {1} has a custom HttpClient handler.", threadIndex, job);
                            httpClientHandler = ((IHttpClientHandlerTask)job).GetHttpClient(webProxy);
                        }
                        using (HttpClient client = new HttpClient(httpClientHandler))
                        {
                            // Add default headers to the client to simulate a real browser
                            ScraperHelper.AddHeadersToClient(client);
                            logger.Trace("Downloader {0} is running job {1}.", threadIndex, job);
                            CancellationTokenSource cancelToken = new CancellationTokenSource();
                            var timelimit = new TimeSpan(0, 0, 30);
                            // Some jobs might require a bigger time limit
                            if (job is IThreadedWebClientLongJob)
                            {
                                logger.Debug("Fetching time limit for downloader {0}'s job {1}.", threadIndex, job);
                                timelimit = ((IThreadedWebClientLongJob)job).GetTimeOut();
                                logger.Debug("Downloader {0}'s job {1} has set a custom time limit of {2}.", threadIndex, job, timelimit);
                            }
                            cancelToken.CancelAfter(timelimit);
                            logger.Trace("Downloader {0} is executing job {1}.", threadIndex, job);
                            await job.ExecuteDownload(client, cancelToken.Token);
                            logger.Trace("Downloader {0} is done executing job {1}.", threadIndex, job);
                            doneJobQueue.Enqueue(job);
                            if (this.JobDoneInQueueChanged != null)
                            {
                                this.JobDoneInQueueChanged(this, this.doneJobQueue.Count);
                            }
                            // Vote the proxy back up if it previously collected bad votes
                            if (proxy != null && this.badProxy.ContainsKey(proxy))
                            {
                                logger.Trace("Proxy {0} was good, up-voting it.", proxy);
                                if (this.badProxy[proxy] > 0)
                                {
                                    this.badProxy[proxy]--;
                                }
                            }
                        }
                    }
                    else
                    {
                        logger.Trace("Downloader {0}'s job {1} did not verify, re-enqueueing the job.", threadIndex, job);
                        // Requeue the job
                        this.Enqueue(job);
                        await Task.Delay(10);
                    }
                }
                // An HttpRequestException may be a proxy error
                catch (HttpRequestException exp)
                {
                    logger.Warn(exp, "Got web exception while executing downloader {0}'s job {1}.", threadIndex, job);
                    // Handle bad-proxy voting
                    this.HandleBadProxy(threadIndex, proxy);
                    // Requeue the job
                    this.HandleBadJob(job, threadIndex);
                }
                // Uncaught error
                catch (Exception exp)
                {
                    logger.Error(exp, "Got unknown exception while executing downloader {0}'s job {1}.", threadIndex, job);
                    try
                    {
                        logger.Trace(exp, "Downloader {0} is running failed download for job {1}: {2}", threadIndex, job, exp.Message);
                        job.FailedDownload(exp);
                    }
                    catch (Exception)
                    {
                        // Error here, nothing we can do
                    }
                    this.HandleBadJob(job, threadIndex);
                }

                lock (this.jobsInProcessLocker)
                {
                    this.jobsInProcess--;
                    if (this.JobProcessingChanged != null)
                    {
                        this.JobProcessingChanged(this, this.jobsInProcess);
                    }
                }
            }
            else
            {
                // Currently no jobs to do
                if (this.DownloaderThreadJobChanged != null)
                {
                    this.DownloaderThreadJobChanged(this, threadIndex, null);
                }
                await Task.Delay(100);
            }
        }
        catch (HttpRequestException)
        {
            break; // No good proxies left
        }
        catch (Exception)
        {
            // Something bad; maybe no more proxies?
        }
    }

    // Download thread shutdown
    if (this.DownloaderThreadStatus != null)
    {
        this.DownloaderThreadStatus(this, threadIndex, false);
    }
}
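AddHeadersToClient appears throughout as the "simulate a real browser" step, but its body is never shown. A minimal sketch of what such a helper typically adds; the real header set is unknown:

using System.Net.Http;

public static partial class ScraperHelper
{
    // Hypothetical browser-like defaults; the actual values are assumptions.
    public static void AddHeadersToClient(HttpClient client)
    {
        client.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36");
        client.DefaultRequestHeaders.Add("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        client.DefaultRequestHeaders.Add("Accept-Language", "en-US,en;q=0.9");
    }
}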