Example No. 1
        public IHttpActionResult GeoYellowpageName(string fName = "", string lName = "", int houseNumber = -1, string street = "", string town = "",
                                                   string state = "", string zip   = "")
        {
            /* For a given name location find all phone numbers and names */

            string URL = "http://people.yellowpages.com/whitepages?first=" + fName
                         + "&last=" + lName + "&zip=" + town + "&state=" + state;

            try
            {
                string responseFromServer = ScraperHelper.GetURL(URL);

                // Parse the hits, then drop any whose address does not contain the requested street
                List<WhitePageHit> myPhoneNums = ParseWpList(responseFromServer);
                myPhoneNums.RemoveAll(x => !x.address.Contains(street));

                return(Ok(new PhoneLookupResult {
                    result = myPhoneNums.Count.ToString(), phoneList = myPhoneNums
                }));
            }
            catch (Exception ex)
            {
                return(Ok(new GenericResult {
                    result = "0", payload = ex.Message
                }));
            }
        }
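The action above shapes its responses with WhitePageHit, PhoneLookupResult and GenericResult objects whose definitions are not part of these examples. A minimal sketch, assuming only the members actually referenced in the snippets (address, result, phoneList, payload) and treating everything else as a guess:

        // Hypothetical DTO sketches; only the members used in these examples are
        // known, the rest are assumptions.
        public class WhitePageHit
        {
            public string address { get; set; }   // matched against the street filter above
            public string name    { get; set; }   // assumed
            public string phone   { get; set; }   // assumed
        }

        public class PhoneLookupResult
        {
            public string result { get; set; }                 // hit count as a string
            public List<WhitePageHit> phoneList { get; set; }
        }

        public class GenericResult
        {
            public string result  { get; set; }   // "1" on success, "0" on error
            public string payload { get; set; }   // response body or exception message
        }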
Example No. 2
        public virtual async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
        {
            if (!this.CanDownload())
            {
                return;
            }

            ScraperHelper.SetOrigenToClient(this.Url, client);
            Console.WriteLine("Downloading: " + this.Url);
            var result = await client.GetAsync(Url, cancelToken).ConfigureAwait(false);

            if (result.IsSuccessStatusCode)
            {
                var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);

                if (!this.ValidateHtml(html))
                {
                    throw new HttpRequestException("HTML returned was not verified.");
                }
                this.Html = html;
            }
            else
            {
                throw new HttpRequestException("HTTP request was not successful. Status code: " + result.StatusCode);
            }
        }
        public async Task TestJob(IThreadedWebClientJob initialJob)
        {
            // LIFO work list: follow-up jobs returned by Execute() are processed depth-first
            Stack<IThreadedWebClientJob> jobStack = new Stack<IThreadedWebClientJob>();

            jobStack.Push(initialJob);

            while (jobStack.Count > 0)
            {
                IThreadedWebClientJob job = jobStack.Pop();

                using (HttpClient client = new HttpClient(new HttpClientHandler()
                {
                    AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
                }))
                {
                    // Add default headers to the client to simulate
                    // a real browser
                    ScraperHelper.AddHeadersToClient(client);

                    try
                    {
                        CancellationTokenSource cancelToken = new CancellationTokenSource();
                        cancelToken.CancelAfter(TimeSpan.FromHours(1)); // one-hour download timeout
                        await job.ExecuteDownload(client, cancelToken.Token);

                        List <IThreadedWebClientJob> newJobs;
                        try
                        {
                            newJobs = job.Execute();
                            foreach (var t in newJobs)
                            {
                                jobStack.Push(t);
                            }
                        }
                        catch (Exception exp)
                        {
                            job.FailedExecute(exp);
                        }
                    }
                    // WebException may be a proxy error; rethrow so the caller can decide
                    catch (WebException)
                    {
                        throw;
                    }
                    // Uncaught error
                    catch (Exception exp)
                    {
                        job.FailedDownload(exp);
                    }
                }
            }
        }
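TestJob above will drive any IThreadedWebClientJob, but the interface itself is not shown in these examples. The sketch below is an assumption about that contract: it implements only the members the examples actually call (ExecuteDownload, Execute, FailedDownload, FailedExecute) and nothing else.

        // Hypothetical single-page job that could be passed to TestJob.
        public class SinglePageJob : IThreadedWebClientJob
        {
            private readonly string url;

            public string Html { get; private set; }

            public SinglePageJob(string url)
            {
                this.url = url;
            }

            public async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
            {
                // Plain GET; validation and retry policy are left to the real implementation
                var result = await client.GetAsync(this.url, cancelToken).ConfigureAwait(false);
                result.EnsureSuccessStatusCode();
                this.Html = await result.Content.ReadAsStringAsync().ConfigureAwait(false);
            }

            public List<IThreadedWebClientJob> Execute()
            {
                // Parse this.Html and return follow-up jobs; this sketch produces none
                return new List<IThreadedWebClientJob>();
            }

            public void FailedDownload(Exception exp)
            {
                Console.WriteLine("Download failed for " + this.url + ": " + exp.Message);
            }

            public void FailedExecute(Exception exp)
            {
                Console.WriteLine("Parsing failed for " + this.url + ": " + exp.Message);
            }
        }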
Example No. 4
        public IHttpActionResult GeoWhoOpenStreet(double Lat, double Lon, int zoom = 18)
        {
            string URL = "http://nominatim.openstreetmap.org/reverse?format=json&addressdetails=1";

            URL = URL + "&zoom=" + zoom + "&lat=" + Lat + "&lon=" + Lon;
            string responseFromServer = "";

            try
            {
                responseFromServer = ScraperHelper.GetURL(URL);
            }
            catch (Exception ex)
            {
                return(Ok(new GenericResult {
                    result = "0", payload = ex.Message
                }));
            }
            return(Ok(new GenericResult {
                result = "1", payload = responseFromServer
            }));
        }
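ScraperHelper.GetURL is used by this action and the other controller examples but its body is not shown. A minimal sketch, assuming it is no more than a blocking GET that returns the response body as a string:

        // Hypothetical stand-in for ScraperHelper.GetURL; the real helper may set
        // extra headers, a proxy or timeouts.
        public static string GetURL(string url)
        {
            using (var client = new System.Net.WebClient())
            {
                client.Headers[System.Net.HttpRequestHeader.UserAgent] =
                    "Mozilla/5.0 (compatible; ScraperHelper)";
                return client.DownloadString(url);
            }
        }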
Example No. 5
        public IHttpActionResult GeoYellowpage(int houseNumber = 1, string road = "", string town = "",
                                               string state    = "", string zip = "")
        {
            string URL = "http://people.yellowpages.com/whitepages/address?street=" +
                         houseNumber + ' ' + road + "&qloc=" + town + "+" + state + "+" + zip;

            try
            {
                string responseFromServer = ScraperHelper.GetURL(URL);

                List <WhitePageHit> mynums = ParseWpList(responseFromServer);

                return(Ok(new PhoneLookupResult {
                    result = mynums.Count.ToString(), phoneList = mynums
                }));
            }
            catch (Exception ex)
            {
                return(Ok(new GenericResult {
                    result = "0", payload = ex.Message
                }));
            }
        }
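Concatenating houseNumber, road, town, state and zip straight into the query string leaves their values unescaped. A safer variant (a sketch, not part of the original code) escapes each value with Uri.EscapeDataString:

            // Hypothetical escaped version of the URL built above
            string safeUrl = "http://people.yellowpages.com/whitepages/address?street=" +
                             Uri.EscapeDataString(houseNumber + " " + road) +
                             "&qloc=" + Uri.EscapeDataString(town + " " + state + " " + zip);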
Example No. 6
        public virtual async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
        {
            if (!this.CanDownload())
            {
                logger.Debug("Can download returned false, stopping the download job.");
                return;
            }

            ScraperHelper.SetOrigenToClient(this.Url, client);
            if (this.headers != null)
            {
                foreach (KeyValuePair <string, string> keyValuePair in headers)
                {
                    client.DefaultRequestHeaders.Add(keyValuePair.Key, keyValuePair.Value);
                }
            }

            logger.Trace("Downloading: " + this.Url);
            var result = await client.GetAsync(Url, cancelToken).ConfigureAwait(false);

            if (result.IsSuccessStatusCode)
            {
                var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);

                if (!this.ValidateHtml(html))
                {
                    logger.Warn("Failed HTML verification");
                    throw new HttpRequestException("HTML returned was not verified.");
                }
                this.Html = html;
            }
            else
            {
                throw new HttpRequestException(string.Format("HTML request was not successfull. Status Code: {0} ", result.StatusCode));
            }
        }
Example No. 7
        public IActionResult Index()
        {
            // Instantiated only to show construction; the instance is not used further in this action
            var a = new ScraperHelper();

            return(View());
        }
Example No. 8
        public override async Task ExecuteDownload(HttpClient client, CancellationToken cancelToken)
        {
            var uri = new Uri(this.GetUrl());

            client.BaseAddress = this.GetUrl().Contains("https:") ? new Uri("https://" + uri.Host) : new Uri("http://" + uri.Host);
            ScraperHelper.SetOrigenToClient(this.GetUrl(), client);

            var message = new HttpRequestMessage(HttpMethod.Post, uri.LocalPath);

            if (this.headers != null)
            {
                foreach (var header in this.headers)
                {
                    message.Headers.Add(header.Key, header.Value);
                }
            }

            var keyValue = new List <KeyValuePair <string, string> >();

            if (this.formData != null)
            {
                keyValue.AddRange(this.formData);
            }
            if (this.formString != null)
            {
                message.Content = new StringContent(this.formString, Encoding.UTF8, "application/json");
            }
            else
            {
                StringBuilder stringBuilder = new StringBuilder();
                foreach (KeyValuePair <string, string> current in keyValue)
                {
                    if (stringBuilder.Length > 0)
                    {
                        stringBuilder.Append('&');
                    }

                    stringBuilder.Append(Encode(current.Key));
                    stringBuilder.Append('=');
                    stringBuilder.Append(Encode(current.Value));
                }

                message.Content = new StringContent(stringBuilder.ToString(), Encoding.UTF8, "application/x-www-form-urlencoded");
                //message.Content = new MyFormUrlEncodedContent(keyValue);
            }

            var result = await client.SendAsync(message, cancelToken).ConfigureAwait(false);

            this.responseHeaders = result.Headers;
            if (result.IsSuccessStatusCode)
            {
                var html = await result.Content.ReadAsStringAsync().WithCancellation(cancelToken).ConfigureAwait(false);

                if (!this.ValidateHtml(html))
                {
                    throw new HttpRequestException("HTML returned was not verified.");
                }
                this.SetHtml(html);
            }
            else
            {
                throw new HttpRequestException("HTTP request was not successful. Status code: " + result.StatusCode);
            }
        }
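The Encode helper used to build the form body above is not shown. A minimal sketch, assuming standard application/x-www-form-urlencoded escaping:

        // Hypothetical Encode implementation; the original may differ, e.g. by
        // encoding spaces as '+' instead of %20.
        private static string Encode(string value)
        {
            return Uri.EscapeDataString(value ?? string.Empty);
        }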
Example No. 9
        private async void RunWebDownload(object threadIndexObject)
        {
            int threadIndex = (int)threadIndexObject;

            logger.Trace("Download Thread {0} started", threadIndex);

            // Run until asked to shut down
            while (!this.stopThread)
            {
                if (this.DownloaderThreadStatus != null)
                {
                    this.DownloaderThreadStatus(this, threadIndex, true);
                }

                // Done jobs queue full, halt download work
                if (this.doneJobQueue.Count >= this.maxDoneQueue)
                {
                    this.DownloaderThreadJobChanged(this, threadIndex, null);
                    this.waitingForEmpty = true;
                    await Task.Delay(50);

                    continue;
                }
                if (this.waitingForEmpty)
                {
                    this.DownloaderThreadJobChanged(this, threadIndex, null);
                    // If the jobs queue is empty, resume work
                    if (this.doneJobQueue.Count == 0)
                    {
                        this.waitingForEmpty = false;
                    }
                    await Task.Delay(50);

                    continue;
                }

                // Find a proxy to use, if required. If the lookup returns null
                // and we must use proxies, there are no proxies left to work
                // with and we may have to stop.
                WebProxyHolder        proxy;
                IThreadedWebClientJob job;
                try
                {
                    // Dequeue a job.
                    //logger.Trace("Downloader {0} Trying to get a new job.", threadIndex);
                    job = this.Dequeue();
                    if (job != null)
                    {
                        logger.Trace("Downloader {0} got a new job {1}", threadIndex, job);
                        lock (this.jobsInProcessLocker)
                        {
                            this.jobsInProcess++;
                            if (this.JobProcessingChanged != null)
                            {
                                this.JobProcessingChanged(this, this.jobsInProcess);
                            }
                        }

                        if (this.DownloaderThreadJobChanged != null)
                        {
                            this.DownloaderThreadJobChanged(this, threadIndex, job);
                        }

                        WebProxy webProxy = this.HandleAddProxy(out proxy);
                        if (webProxy == null && this.useProxies)
                        {
                            if (this.dieOnProxiesLeft)
                            {
                                logger.Error("Download {0} was unable to find any proxies, shutting down.", threadIndex);
                                // No proxies to use, and we have to use proxies, kill all downloads.
                                this.stopThread    = true;
                                this.jobsInProcess = 0; // Ensure worker is not stuck
                                return;
                            }
                            else
                            {
                                logger.Trace("Download {0} was unable to find any proxies, requeueing the job.", threadIndex);
                                // We just keep running untill we get proxies again.
                                // Requeue the job
                                this.Enqueue(job);
                                // Count down the working jobs
                                lock (this.jobsInProcessLocker)
                                {
                                    this.jobsInProcess--;
                                    if (this.JobProcessingChanged != null)
                                    {
                                        this.JobProcessingChanged(this, this.jobsInProcess);
                                    }
                                }
                                await Task.Delay(50);

                                continue;
                            }
                        }

                        if (webProxy != null && this.useProxies)
                        {
                            logger.Trace("Downloader {0}, got proxy {1}:{2}", threadIndex, webProxy.Address.Host, webProxy.Address.Port);
                        }

                        try
                        {
                            bool run = true;
                            if (job is ITypedTask)
                            {
                                logger.Trace("Downloader {0}'s job {1} requires verification.", threadIndex, job);
                                run = ((ITypedTask)job).Verify();
                            }

                            if (run)
                            {
                                var httpClientHandler = new HttpClientHandler()
                                {
                                    UseCookies = false,
                                    Proxy      = webProxy,
                                    UseProxy   = this.useProxies,
                                    // For Fiddler debugging
                                    //Proxy = new WebProxy("http://127.0.0.1:8888"),
                                    //UseProxy = true,
                                    AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
                                };
                                if (job is IHttpClientHandlerTask)
                                {
                                    logger.Trace("Downloader {0}'s job {1} have custom HttpClient.", threadIndex, job);
                                    HttpClientHandler = ((IHttpClientHandlerTask)job).GetHttpClient(webProxy);
                                }
                                using (HttpClient client = new HttpClient(httpClientHandler))
                                {
                                    // Add default headers to the client to simulate
                                    // a real browser
                                    ScraperHelper.AddHeadersToClient(client);

                                    logger.Trace("Downloader {0} is running job {1}.", threadIndex, job);
                                    CancellationTokenSource cancelToken = new CancellationTokenSource();
                                    var timelimit = new TimeSpan(0, 0, 30);
                                    // Some jobs might require a bigger time limit
                                    if (job is IThreadedWebClientLongJob)
                                    {
                                        logger.Debug("Fetching timelimit for downloader {0}'s job {1}.", threadIndex,
                                                     job);
                                        timelimit = ((IThreadedWebClientLongJob)job).GetTimeOut();
                                        logger.Debug("Downloader {0}'s job {1} have set a custome time limit to {2}.",
                                                     threadIndex, job, timelimit);
                                    }

                                    cancelToken.CancelAfter(timelimit);
                                    logger.Trace("Downloader {0} is executing job {1}.", threadIndex, job);
                                    await job.ExecuteDownload(client, cancelToken.Token);

                                    logger.Trace("Downloader {0} is done executing job {1}.", threadIndex, job);

                                    doneJobQueue.Enqueue(job);
                                    if (this.JobDoneInQueueChanged != null)
                                    {
                                        this.JobDoneInQueueChanged(this, this.doneJobQueue.Count);
                                    }

                                    // Vote the proxy back up if it has accumulated bad votes
                                    if (proxy != null && this.badProxy.ContainsKey(proxy))
                                    {
                                        logger.Trace("Proxy {0} was good, up voting it.", proxy);
                                        if (this.badProxy[proxy] > 0)
                                        {
                                            this.badProxy[proxy]--;
                                        }
                                    }
                                }
                            }
                            else
                            {
                                logger.Trace("Downloader {0}'s job {1} did not verify, reenqueueing the job.", threadIndex, job);
                                // Requery the job
                                this.Enqueue(job);
                                await Task.Delay(10);
                            }
                        }
                        // HttpRequestException may indicate a proxy error
                        catch (HttpRequestException exp)
                        {
                            logger.Warn(exp, "Got web exception while executing downloader {0}'s job {1}.", threadIndex, job);
                            // Handle bad proxy voting
                            this.HandleBadProxy(threadIndex, proxy);
                            // Requeue the job
                            this.HandleBadJob(job, threadIndex);
                        }
                        // Uncaught error
                        catch (Exception exp)
                        {
                            logger.Error(exp, "Got unknown exception while executing downloader {0}'s job {1}.", threadIndex, job);
                            try
                            {
                                logger.Trace(exp, "Downloader {0} is running failed download for job {1}: {2}", threadIndex, job, exp.Message);
                                job.FailedDownload(exp);
                            }
                            catch (Exception)
                            {
                                // FailedDownload itself threw; nothing more we can do here
                            }
                            this.HandleBadJob(job, threadIndex);
                        }

                        lock (this.jobsInProcessLocker)
                        {
                            this.jobsInProcess--;
                            if (this.JobProcessingChanged != null)
                            {
                                this.JobProcessingChanged(this, this.jobsInProcess);
                            }
                        }
                    }
                    else
                    {
                        // Currently no jobs to do
                        if (this.DownloaderThreadJobChanged != null)
                        {
                            this.DownloaderThreadJobChanged(this, threadIndex, null);
                        }
                        await Task.Delay(100);
                    }
                }
                catch (HttpRequestException)
                {
                    break; // no good proxies left; stop this worker
                }
                catch (Exception)
                {
                    // Something unexpected happened, possibly no more proxies; keep the worker running
                }
            }
            // Download thread shutdown
            if (this.DownloaderThreadStatus != null)
            {
                this.DownloaderThreadStatus(this, threadIndex, false);
            }
        }
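How the worker loops are started is not part of this example. One plausible sketch, assuming a configurable worker count and that the boxed thread index is the only argument RunWebDownload needs:

        // Hypothetical startup helper; stopThread follows the usage inside
        // RunWebDownload above, everything else here is illustrative only.
        private void StartDownloadWorkers(int workerCount)
        {
            this.stopThread = false;
            for (int i = 0; i < workerCount; i++)
            {
                int index = i; // per-iteration copy for the closure
                Task.Run(() => this.RunWebDownload(index));
            }
        }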