Esempio n. 1
0
        /// <summary>
        /// Get the contents of a url, consulting the file-system web object storage first and
        /// only going out to the web when nothing is stored for that url.
        /// </summary>
        /// <param name="url">The url to look up and, on a miss, to request.</param>
        /// <param name="header">Extra header value handed to WebProcessor.GetUrlContents on a miss.</param>
        /// <param name="userAgent">User agent handed to WebProcessor.GetUrlContents on a miss.</param>
        /// <param name="LogMessage">Logging callback handed to WebProcessor.GetUrlContents on a miss.</param>
        /// <returns>
        /// The stored content when the url is already in storage; otherwise whatever
        /// WebProcessor.GetUrlContents returned.
        /// </returns>
        public string GetUrlContents(string url, string header, string userAgent, Functions.LogMessageDelegate LogMessage)
        {
            IWebObjectStorage store = new WebObjectStorageFileSystem(_webObjectRoot);
            WebObject stored = store.GetUrlObject(url);

            // Hit: serve straight from storage, no web request at all.
            if (stored != null)
            {
                return stored.Content;
            }

            string fetched = WebProcessor.GetUrlContents(url, header, userAgent, LogMessage);

            // Nothing usable came back, so there is nothing worth storing.
            if (Functions.IsEmptyString(fetched))
            {
                return fetched;
            }

            // Miss with real content: remember it for next time.
            WebObject entry = new WebObject();
            entry.Url      = url;
            entry.Content  = fetched;
            entry.MimeType = "text/html";

            store.SetUrlObject(entry);

            return fetched;
        }
        /// <summary>
        /// Get the search results JSON data from Google
        /// </summary>
        /// <param name="term">The plain search term.  Some keywords are added to get better search results.</param>
        /// <param name="clientIp">The client IP address, sent as the <c>userip</c> query parameter</param>
        /// <param name="resultsPageSize">The number of results per page (8 recommended)</param>
        /// <param name="pageToGet">The zero-based index of the page to get</param>
        /// <param name="LogMessage">
        /// Logging callback.  Checked for null before this method logs the search url, and also
        /// passed through to WebProcessor.GetUrlContents.
        /// </param>
        /// <returns>Whatever WebProcessor.GetUrlContents returns for the search url (the raw JSON from Google on success).</returns>
        public string GetSearchResults(string term, string clientIp, int resultsPageSize, int pageToGet, Functions.LogMessageDelegate LogMessage)
        {
            //
            // Docs for google search:
            //
            // https://developers.google.com/web-search/docs/reference#_intro_fonje
            //
            const string baseAddr = @"https://ajax.googleapis.com/ajax/services/search/web";

            //
            // Extra keywords appended to the user's term to bias results toward sound files.
            //
            const string searchModifierKeywords = "sound clips wav mp3";

            // The API's "start" parameter is a result offset, not a page number.
            int startIndex = pageToGet * resultsPageSize;

            string googleSearchPhrase = string.Format("{0} {1}", term, searchModifierKeywords);

            //
            // The phrase contains user input, so it must be escaped before it goes into the query
            // string.  Otherwise a term containing '&', '#', '+' or '=' would be parsed as extra
            // query parameters (or a fragment) instead of as part of the search phrase.
            //
            string escapedSearchPhrase = System.Uri.EscapeDataString(googleSearchPhrase);

            string searchUrl = string.Format(@"{0}?q={1}&v=1.0&userip={2}&start={3}&rsz={4}", baseAddr, escapedSearchPhrase, clientIp, startIndex, resultsPageSize);

            if (LogMessage != null)
            {
                LogMessage(string.Format("Searching google for: \"{0}\"", searchUrl));
            }

            return WebProcessor.GetUrlContents(searchUrl, @"referrer:http://www.otamata.com", @"OtamataSoundSearchService", LogMessage);
        }
 /// <summary>
 /// Image search entry point that is not implemented here: every call throws.
 /// </summary>
 /// <param name="term">Unused.</param>
 /// <param name="clientIp">Unused.</param>
 /// <param name="resultsPageSize">Unused.</param>
 /// <param name="pageToGet">Unused.</param>
 /// <param name="LogMessage">Unused.</param>
 /// <returns>Never returns normally.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public string GetImageSearchResults(string term, string clientIp, int resultsPageSize, int pageToGet, Functions.LogMessageDelegate LogMessage)
 {
     throw new NotImplementedException();
 }
        /// <summary>
        /// Get the web search results for a term from the Yahoo search endpoint
        /// (http://yboss.yahooapis.com/ysearch/web).
        /// </summary>
        /// <param name="term">The plain search term.  Some keywords are added to get better search results.</param>
        /// <param name="clientIp">Not used by this method.</param>
        /// <param name="resultsPageSize">The number of results to ask for, sent as the <c>count</c> query parameter.</param>
        /// <param name="pageToGet">Not currently sent with the request, so the same results come back whichever page is asked for.</param>
        /// <param name="LogMessage">
        /// Logging callback.  Checked for null before this method logs the search url, and also
        /// passed through to WebProcessor.GetUrlContents.
        /// </param>
        /// <returns>Whatever WebProcessor.GetUrlContents returns for the authenticated search url.</returns>
        public string GetSearchResults(string term, string clientIp, int resultsPageSize, int pageToGet, Functions.LogMessageDelegate LogMessage)
        {
            //
            // Yahoo web search endpoint.  Example request:
            //
            // http://yboss.yahooapis.com/ysearch/web?q=ipod
            //
            const string baseAddr = @"http://yboss.yahooapis.com/ysearch/web";

            //
            // Extra keywords appended to the user's term to bias results toward sound files.
            //
            // Note: Paging is supported in the request, but is not wired up here yet (pageToGet is ignored).
            //
            const string searchModifierKeywords = "sound clips wav mp3";

            string yahooSearchPhrase = string.Format("{0} {1}", term, searchModifierKeywords);

            //
            // The phrase contains user input, so it must be escaped before it goes into the query
            // string.  Otherwise a term containing '&', '#', '+' or '=' would be parsed as extra
            // query parameters (or a fragment) instead of as part of the search phrase.
            //
            string escapedSearchPhrase = Uri.EscapeDataString(yahooSearchPhrase);

            string searchUrl = string.Format(@"{0}?q={1}&count={2}", baseAddr, escapedSearchPhrase, resultsPageSize);

            if (LogMessage != null)
            {
                LogMessage(string.Format("Searching yahoo for: \"{0}\"", searchUrl));
            }

            return WebProcessor.GetUrlContents(BuildAuthenticatedUrl(new Uri(searchUrl)), @"referrer:http://www.otamata.com", @"OtamataSoundSearchService", LogMessage);
        }
        /// <summary>
        /// Run an image search for the passed term against the yboss.yahooapis.com images endpoint,
        /// always asking for "medium" sized images.
        /// </summary>
        /// <param name="term">The search term; url-encoded before it is placed in the query string.</param>
        /// <param name="clientIp">Not used by this method.</param>
        /// <param name="resultsPageSize">Not used by this method.</param>
        /// <param name="pageToGet">Not used by this method.</param>
        /// <param name="LogMessage">Logging callback passed through to WebProcessor.GetUrlContents.</param>
        /// <returns>Whatever WebProcessor.GetUrlContents returns for the authenticated image search url.</returns>
        public string GetImageSearchResults(string term, string clientIp, int resultsPageSize, int pageToGet, Functions.LogMessageDelegate LogMessage)
        {
            const string imageSize = "medium";

            // Build the plain request address first, then let BuildAuthenticatedUrl turn it into the url we actually call.
            string encodedTerm    = HttpUtility.UrlEncode(term);
            string requestAddress = string.Format("http://yboss.yahooapis.com/ysearch/images?dimensions={0}&q={1}", imageSize, encodedTerm);
            string signedUrl      = BuildAuthenticatedUrl(new Uri(requestAddress));

            return WebProcessor.GetUrlContents(signedUrl, @"referrer:http://www.otamata.com", @"OtamataSoundSearchService", LogMessage);
        }
Esempio n. 6
0
        /// <summary>
        /// For the passed url, grab the contents.  Should only be used for text content types.
        /// </summary>
        /// <param name="url">The url</param>
        /// <param name="header">Extra header values to include, if any.</param>
        /// <param name="userAgent">The user agent to use, if any.</param>
        /// <param name="LogMessage">Logging callback for redirects and failures.  May be null, in which case nothing is logged.</param>
        /// <returns>The url contents, or string.Empty if something goes wrong.</returns>
        /// <remarks>
        /// Using a 5 second timeout.  Any exception raised while building the request, sending it or
        /// reading the response is logged (when a callback was supplied) and swallowed.
        /// </remarks>
        public static string GetUrlContents(string url, string header, string userAgent, Functions.LogMessageDelegate LogMessage)
        {
            // Start from string.Empty so every failure path honours the documented return value.
            string result = string.Empty;

            HttpWebResponse response = null;

            try
            {
                //
                // Request creation and header parsing are inside the try on purpose: a malformed url or
                // header line throws here, and that counts as "something went wrong" too.
                //
                var request = (HttpWebRequest)HttpWebRequest.Create(url);

                request.Timeout = 5000;     // Only wait 5 seconds for some of these lame-ass servers

                if (!Functions.IsEmptyString(header))
                {
                    request.Headers.Add(header);
                }

                if (!Functions.IsEmptyString(userAgent))
                {
                    request.UserAgent = userAgent;
                }

                response = (HttpWebResponse)request.GetResponse();

                //
                // todo: handle things like:         <meta HTTP-EQUIV="refresh" CONTENT="0; URL=http://www.thesoundarchive.com/beavis-and-butthead.asp">
                //

                if (response.StatusCode == HttpStatusCode.Redirect || response.StatusCode == HttpStatusCode.MovedPermanently)
                {
                    //
                    // According to the .NET docs, the request should automatically handle this?  See mouse-over tool tip for "HttpStatusCode.Redirect" above
                    //
                    if (LogMessage != null)
                    {
                        LogMessage(string.Format("Received redirect - prolly not gonna find anything on this page... {0}", response.StatusCode));
                    }
                }

                // Dispose the stream and reader deterministically instead of leaving them to the finalizer.
                using (Stream responseStream = response.GetResponseStream())
                using (var reader = new StreamReader(responseStream))
                {
                    result = reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Crud.  Guard the callback: a null delegate here would turn a handled failure into a crash.
                if (LogMessage != null)
                {
                    LogMessage(string.Format("GetUrlContents() Exception! = \"{0}\" at url: \"{1}\"", ex.Message, url));
                }
            }
            finally
            {
                if (response != null)
                {
                    response.Close();
                }
            }

            return result;
        }
        /// <summary>
        /// Get all the sounds in the list of urls from the passed IDataSource and add the sounds to the passed websearch
        /// </summary>
        /// <param name="urls">The list of urls</param>
        /// <param name="dataSource">The datasource to use to look for sounds</param>
        /// <param name="currentSearch">The current search</param>
        private static void GetSoundsOnPages(IList <dynamic> urls, IDataSource dataSource, websearch currentSearch, IList <websearchsound> searchResultList, Functions.LogMessageDelegate LogMessage, int maxDepthToFollow)
        {
            //const int MAX_URLS_TO_SEARCH = 20;
            const int MAX_SOUNDS_PER_URL = 150;
            //int urlsProcessed = 0;
            HashSet <string> urlsOfObjectsSearched = new HashSet <string>();

            //
            // Multithreading here for requesting the pages works pretty well speed-wise.  Unfortunately, the regexes bog down the
            // server so badly that it becomes unresponsive for other users.  So, don't do parallel on this outside loop.
            //
            // However, once the first page is processed, the sounds are webrequested asynchronously.  So, the next page will
            // start being processed while the first page's sounds are still being downloaded.  This works quite well, and
            // the performance is just about the same.  So, let's stick with that.
            //

            foreach (dynamic url in urls)
            {
                string theUrl = url.Url;
                string domain = WebProcessor.GetDomainOfUrl(theUrl);

                if (unprocessableDomains.Contains(domain))
                {
                    LogMessage(string.Format("Skipping crappy domain: {0}", domain));
                }
                else
                {
                    LogMessage(string.Format("About to search for sounds on page: \"{0}\"", theUrl));

                    // string pageContent = WebProcessor.GetUrlContents(theUrl, null, null, LogMessage);

                    //
                    // todo: test this, make sure it works
                    //
                    string pageContent = dataSource.GetUrlContents(theUrl, null, GetUserAgent(), LogMessage);

                    bool wasAborted = false;

                    //
                    // todo: combine sound links func with above function
                    //
                    IList <string> linksOnPage = GetSoundLinksOnPage(pageContent, ref wasAborted);

                    //
                    // For generating test case files, set breakpoint on if (wasAborted) below with condition:
                    //
                    // maxDepthToFollow == 1
                    //

                    if (wasAborted)
                    {
                        LogMessage(string.Format("Had to abort link search on domain: {0}", domain));

                        lock (unprocessableDomains)
                        {
                            unprocessableDomains.Add(domain);
                        }
                    }

                    LogMessage(string.Format("Found {0} links on \"{1}\"", linksOnPage.Count, theUrl));


#if MULTITHREADED
                    Parallel.ForEach <string>(linksOnPage.Take(MAX_SOUNDS_PER_URL), partialLink => // <=-- normal operation - multithreaded
#else
                    foreach (string partialLink in linksOnPage.Take(MAX_SOUNDS_PER_URL))           // <=-- for debugging stuff, it's easier when not multithreaded
#endif
                    {
                        string soundLink = WebProcessor.GetUrlForObject(theUrl, partialLink);

                        LogMessage(string.Format("About to grab a potential sound here: \"{0}\"", soundLink));

                        if (!unprocessableDomains.Contains(domain) && IsNewSoundToGrab(urlsOfObjectsSearched, soundLink))
                        {
                            websearchsound receivedObject = GetWebObjectAtUrl(soundLink, null, null);

                            //
                            // enhanced search: if not a sound and is text/html and response code is 200, search for sounds on THAT page
                            //

                            if (receivedObject.issound)
                            {
                                receivedObject.sourceurl         = theUrl;
                                receivedObject.sourceDomain      = domain;
                                receivedObject.searchResultOrder = url.Index;

                                //
                                // Check for dups
                                //
                                string md5Hash = Functions.GetMd5Hash(receivedObject.soundbytes);

                                if (!HaveMd5ForSound(dataSource.CurrentSoundMd5s, md5Hash))
                                {
                                    dataSource.SetSoundInSearch(currentSearch, receivedObject);

                                    //
                                    // Performance optimization: we're not going to return the sound data itself with the search
                                    // so let's free up the mem here
                                    //
                                    receivedObject.soundbytes = null;

                                    searchResultList.Add(receivedObject);
                                }
                                else
                                {
                                    LogMessage("Not adding sound - already in collection");
                                }
                            }
                            else if (receivedObject.contenttype.ToLower().StartsWith("text/html"))
                            {
                                //
                                // We have another HTML page.  Check that too?
                                //
                                if (maxDepthToFollow > 0)
                                {
                                    LogMessage(string.Format("Going to drill down in this page - we're at max level: {0}", maxDepthToFollow));
                                    GetSoundsOnPages(new List <dynamic>()
                                    {
                                        new { Url = soundLink, Index = url.Index }
                                    }, dataSource, currentSearch, searchResultList, LogMessage, maxDepthToFollow - 1);
                                }
                                else
                                {
                                    LogMessage(string.Format("No more drilling down, we're as low as we can go"));
                                }
                            }
                        }
                        else
                        {
                            LogMessage("Won't process: already had sound from that url, or the domain is unprocessable!");
                        }
#if MULTITHREADED
                    });
 /// <summary>
 /// Convenience overload: look for sounds on the passed urls, using _maxDepthToFollow as the
 /// depth limit handed to the six-argument overload.
 /// </summary>
 /// <param name="urls">The list of urls</param>
 /// <param name="dataSource">The datasource to use to look for sounds</param>
 /// <param name="currentSearch">The current search</param>
 /// <param name="searchResultList">Passed through unchanged to the six-argument overload.</param>
 /// <param name="LogMessage">Logging callback, passed through unchanged to the six-argument overload.</param>
 private static void GetSoundsOnPages(IList <dynamic> urls, IDataSource dataSource, websearch currentSearch, IList <websearchsound> searchResultList, Functions.LogMessageDelegate LogMessage)
 {
     int depthLimit = _maxDepthToFollow;

     GetSoundsOnPages(urls, dataSource, currentSearch, searchResultList, LogMessage, depthLimit);
 }