Exemplo n.º 1
0
        /// <summary>
        /// Serializes a sound into the cache directory of the search it belongs to.
        /// </summary>
        /// <param name="search">The search whose <c>status.searchterm</c> selects the cache directory.</param>
        /// <param name="sound">The sound to store; it is assigned a new GUID as id when its id is empty.</param>
        /// <returns>The id under which the sound was stored.</returns>
        public string SetSoundInSearch(websearch search, websearchsound sound)
        {
            // The file name is derived from the id, so make sure there is one first.
            bool needsId = Functions.IsEmptyString(sound.soundid);

            if (needsId)
            {
                sound.soundid = Guid.NewGuid().ToString();
            }

            string cacheDirectory = PhysicalCacheDir(search.status.searchterm);
            string soundFileName  = string.Format("{0}.{1}", sound.soundid, DEFAULT_EXT);
            string targetPath     = Functions.CombineElementsWithDelimiter(@"\", cacheDirectory, soundFileName);

            Functions.SerializeObjectToFile <websearchsound>(targetPath, sound);

            return sound.soundid;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Looks up the cached state of a search and, once the search is marked as done, loads its cached sounds.
        /// </summary>
        /// <param name="term">The search term; it selects the cache directory via <c>PhysicalCacheDir</c>.</param>
        /// <param name="searchResultSounds">
        /// Receives every cached sound of a finished search, with its <c>soundbytes</c> cleared.
        /// </param>
        /// <returns>
        /// <c>null</c> when no cache directory exists for the term; otherwise a websearch whose <c>status</c> is
        /// whatever was deserialized from the status file (possibly <c>null</c>).
        /// </returns>
        public websearch GetWebsearch(string term, IList <websearchsound> searchResultSounds)
        {
            // Resolve the cache directory once; it is needed for the existence check, the status file and the listing.
            string cacheDir = PhysicalCacheDir(term);

            //
            // The directory existence is how different threads communicate.  If the dir exists, this thread believes that
            // another thread is currently conducting the search.  It's important that the searching thread complete
            // w/o errors and set the status appropriately.  Otherwise, we're kind of in trouble.
            //
            if (!Directory.Exists(cacheDir))
            {
                return null;
            }

            //
            // If we have a directory, then we've started/done a search.  New-up the object and grab the status.
            //
            websearch result = new websearch();
            string statusFilename = Functions.CombineElementsWithDelimiter(@"\", cacheDir, STATUS_FNAME);
            result.status = Functions.DeserializeObjectFromFile <websearchstatus>(statusFilename);

            //
            // Only return files if we're done with the search.  The GetFiles() operation is a shared resource - read: not
            // thread-friendly.  The calling function doesn't need the results if the search isn't finished, and we want
            // to return the status pretty quickly.
            //
            if (result.status == null || result.status.isdone != 1)
            {
                return result;
            }

            string[] soundFiles = Directory.GetFiles(cacheDir, string.Format("*.{0}", DEFAULT_EXT));

            foreach (string soundFile in soundFiles)
            {
                websearchsound loadedSound = Functions.DeserializeObjectFromFile <websearchsound>(soundFile);

                //
                // Mirror the null check applied to the status above: a file that could not be turned into a sound
                // is skipped instead of being dereferenced.
                //
                if (loadedSound == null)
                {
                    continue;
                }

                //
                // Performance optimization: we're not going to return the sound data itself with the search
                // so let's free up the mem here
                //
                loadedSound.soundbytes = null;

                searchResultSounds.Add(loadedSound);
            }

            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Loads one cached sound that belongs to a search.
        /// </summary>
        /// <param name="searchTerm">The search term; it selects the cache directory via <c>PhysicalCacheDir</c>.</param>
        /// <param name="soundId">The id of the sound; together with the default extension it forms the file name.</param>
        /// <returns>
        /// The result of deserializing the sound's cache file, or <c>null</c> when that file does not exist.
        /// </returns>
        public websearchsound GetSoundInSearch(string searchTerm, string soundId)
        {
            string cacheDirectory = PhysicalCacheDir(searchTerm);
            string soundFileName  = string.Format("{0}.{1}", soundId, DEFAULT_EXT);
            string soundPath      = Functions.CombineElementsWithDelimiter(@"\", cacheDirectory, soundFileName);

            // A missing file is reported on the debug output only; the caller just sees null.
            if (!File.Exists(soundPath))
            {
                Debug.WriteLine(string.Format("Crap!  Can't find file: \"{0}\"", soundPath));
                return null;
            }

            return Functions.DeserializeObjectFromFile <websearchsound>(soundPath);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Fetches each page in the list of urls through the passed IDataSource, collects the candidate sound links on it,
        /// downloads them and stores every sound whose MD5 is not already known in the passed websearch.  Links that turn
        /// out to be HTML pages are searched recursively while <paramref name="maxDepthToFollow"/> is greater than zero.
        /// </summary>
        /// <param name="urls">The list of urls; each element is read through its <c>Url</c> and <c>Index</c> members</param>
        /// <param name="dataSource">The datasource used to fetch page contents, check known MD5 hashes and store sounds</param>
        /// <param name="currentSearch">The current search, passed through to <c>dataSource.SetSoundInSearch</c></param>
        /// <param name="searchResultList">Receives each stored sound, with its <c>soundbytes</c> cleared</param>
        /// <param name="LogMessage">Callback used for progress and diagnostic messages</param>
        /// <param name="maxDepthToFollow">How many more levels of linked HTML pages may be descended into</param>
        private static void GetSoundsOnPages(IList <dynamic> urls, IDataSource dataSource, websearch currentSearch, IList <websearchsound> searchResultList, Functions.LogMessageDelegate LogMessage, int maxDepthToFollow)
        {
            //const int MAX_URLS_TO_SEARCH = 20;
            // Upper bound on how many links of a single page are followed.
            const int MAX_SOUNDS_PER_URL = 150;
            //int urlsProcessed = 0;
            // Handed to IsNewSoundToGrab for every link; one set per call of this method (recursive calls get their own).
            HashSet <string> urlsOfObjectsSearched = new HashSet <string>();

            //
            // Multithreading here for requesting the pages works pretty well speed-wise.  Unfortunately, the regexes bog down the
            // server so badly that it becomes unresponsive for other users.  So, don't do parallel on this outside loop.
            //
            // However, once the first page is processed, the sounds are webrequested asynchronously.  So, the next page will
            // start being processed while the first page's sounds are still being downloaded.  This works quite well, and
            // the performance is just about the same.  So, let's stick with that.
            //

            foreach (dynamic url in urls)
            {
                string theUrl = url.Url;
                string domain = WebProcessor.GetDomainOfUrl(theUrl);

                // NOTE(review): this read is not under the lock that guards the Add further down - confirm that
                // unprocessableDomains tolerates concurrent reads while another thread adds to it.
                if (unprocessableDomains.Contains(domain))
                {
                    LogMessage(string.Format("Skipping crappy domain: {0}", domain));
                }
                else
                {
                    LogMessage(string.Format("About to search for sounds on page: \"{0}\"", theUrl));

                    // string pageContent = WebProcessor.GetUrlContents(theUrl, null, null, LogMessage);

                    //
                    // todo: test this, make sure it works
                    //
                    string pageContent = dataSource.GetUrlContents(theUrl, null, GetUserAgent(), LogMessage);

                    bool wasAborted = false;

                    //
                    // todo: combine sound links func with above function
                    //
                    IList <string> linksOnPage = GetSoundLinksOnPage(pageContent, ref wasAborted);

                    //
                    // For generating test case files, set breakpoint on if (wasAborted) below with condition:
                    //
                    // maxDepthToFollow == 1
                    //

                    // An aborted link search blacklists the whole domain; the links found so far are still processed
                    // below, but each one is skipped by the unprocessableDomains check inside the loop.
                    if (wasAborted)
                    {
                        LogMessage(string.Format("Had to abort link search on domain: {0}", domain));

                        lock (unprocessableDomains)
                        {
                            unprocessableDomains.Add(domain);
                        }
                    }

                    LogMessage(string.Format("Found {0} links on \"{1}\"", linksOnPage.Count, theUrl));


                    // NOTE(review): when MULTITHREADED is defined the body below runs in parallel.  urlsOfObjectsSearched is a
                    // plain HashSet and searchResultList.Add is called without any synchronization visible here - confirm that
                    // IsNewSoundToGrab and the list passed in by the caller are safe for concurrent use.
#if MULTITHREADED
                    Parallel.ForEach <string>(linksOnPage.Take(MAX_SOUNDS_PER_URL), partialLink => // <=-- normal operation - multithreaded
#else
                    foreach (string partialLink in linksOnPage.Take(MAX_SOUNDS_PER_URL))           // <=-- for debugging stuff, it's easier when not multithreaded
#endif
                    {
                        // Turn the (possibly relative) link into a url based on the page it was found on.
                        string soundLink = WebProcessor.GetUrlForObject(theUrl, partialLink);

                        LogMessage(string.Format("About to grab a potential sound here: \"{0}\"", soundLink));

                        if (!unprocessableDomains.Contains(domain) && IsNewSoundToGrab(urlsOfObjectsSearched, soundLink))
                        {
                            websearchsound receivedObject = GetWebObjectAtUrl(soundLink, null, null);

                            //
                            // enhanced search: if not a sound and is text/html and response code is 200, search for sounds on THAT page
                            //

                            if (receivedObject.issound)
                            {
                                // Remember where the sound came from and at which position its page was in the url list.
                                receivedObject.sourceurl         = theUrl;
                                receivedObject.sourceDomain      = domain;
                                receivedObject.searchResultOrder = url.Index;

                                //
                                // Check for dups
                                //
                                string md5Hash = Functions.GetMd5Hash(receivedObject.soundbytes);

                                if (!HaveMd5ForSound(dataSource.CurrentSoundMd5s, md5Hash))
                                {
                                    dataSource.SetSoundInSearch(currentSearch, receivedObject);

                                    //
                                    // Performance optimization: we're not going to return the sound data itself with the search
                                    // so let's free up the mem here
                                    //
                                    receivedObject.soundbytes = null;

                                    searchResultList.Add(receivedObject);
                                }
                                else
                                {
                                    LogMessage("Not adding sound - already in collection");
                                }
                            }
                            // NOTE(review): contenttype is dereferenced here without a null check.
                            else if (receivedObject.contenttype.ToLower().StartsWith("text/html"))
                            {
                                //
                                // We have another HTML page.  Check that too?
                                //
                                if (maxDepthToFollow > 0)
                                {
                                    LogMessage(string.Format("Going to drill down in this page - we're at max level: {0}", maxDepthToFollow));
                                    // Recurse on just this page with one level less; it keeps the Index of the page that linked to it.
                                    GetSoundsOnPages(new List <dynamic>()
                                    {
                                        new { Url = soundLink, Index = url.Index }
                                    }, dataSource, currentSearch, searchResultList, LogMessage, maxDepthToFollow - 1);
                                }
                                else
                                {
                                    LogMessage(string.Format("No more drilling down, we're as low as we can go"));
                                }
                            }
                        }
                        else
                        {
                            LogMessage("Won't process: already had sound from that url, or the domain is unprocessable!");
                        }
#if MULTITHREADED
                    });
Exemplo n.º 5
0
        /// <summary>
        /// Grab a potential websearchsound object at a url
        /// </summary>
        /// <param name="url">The url of the object</param>
        /// <param name="header">Additional header to include, if any</param>
        /// <param name="userAgent">The user agent to use, if any</param>
        /// <returns>
        /// A websearchsound object, never null.  When the object is a sound within the size limit, <c>issound</c> is
        /// true and <c>contenttype</c>, <c>filename</c>, <c>extension</c>, <c>soundbytes</c> and <c>size</c> are
        /// populated.  When the request fails, the url is not an HTTP(S) url, or the announced size is over the limit,
        /// the object is returned untouched apart from <c>contenttype</c>, which is an empty string rather than null.
        /// A sound whose bytes could not be downloaded (error, or body larger than the limit) is reported with
        /// <c>issound</c> set to false.
        /// </returns>
        private static websearchsound GetWebObjectAtUrl(string url, string header, string userAgent)
        {
            const long MAX_SOUND_SIZE_BYTES = 1024 * 1000;  // Let's cap at 1 MB
            const int  REQUEST_TIMEOUT_MS   = 5000;         // If it takes longer than 5 seconds to respond, we're in trouble.  Let's bail

            websearchsound result = new websearchsound();

            //
            // Callers look at the content type even when nothing could be downloaded, so never hand it back as null.
            //
            result.contenttype = string.Empty;

            HttpWebResponse response = null;

            try
            {
                //
                // The request is created inside the try: links scraped from pages can be malformed or use a scheme
                // other than http(s) (mailto:, javascript:, ftp:, ...), which makes Create throw or return a request
                // that is not an HttpWebRequest.  Such links are logged and skipped like any other failed download.
                //
                var request = WebRequest.Create(url) as HttpWebRequest;

                if (request == null)
                {
                    LogMessage(string.Format("Object at \"{0}\" is not an HTTP resource, skipping", url));
                    return(result);
                }

                request.Timeout = REQUEST_TIMEOUT_MS;

                if (!Functions.IsEmptyString(header))
                {
                    request.Headers.Add(header);
                }

                if (!Functions.IsEmptyString(userAgent))
                {
                    request.UserAgent = userAgent;
                }

                response = (HttpWebResponse)request.GetResponse();

                // Mime types are protocol tokens, not text for humans: lower-case them culture-independently.
                string foundContentType = response.ContentType.ToLowerInvariant();
                long   responseSize     = response.ContentLength;

                //
                // First line of defence only: ContentLength is -1 when the server does not announce a length, so the
                // download below enforces the limit again while reading.
                //
                if (responseSize > MAX_SOUND_SIZE_BYTES)
                {
                    LogMessage(string.Format("Won't download, too big: {0} (limit is {1})", responseSize, MAX_SOUND_SIZE_BYTES));
                    return(result);
                }

                result.contenttype = foundContentType;
                string outputExt = GetExtensionFromMimeType(foundContentType);
                string fileName  = WebProcessor.GetFileNameFromUrl(url);

                result.issound   = outputExt != string.Empty;
                result.filename  = fileName;
                result.extension = outputExt;

                if (!result.issound)
                {
                    LogMessage(string.Format("Object at \"{0}\" not a sound, has mime type of \"{1}\"", url, foundContentType));
                    return(result);
                }

                try
                {
                    byte[] downloaded;

                    using (Stream responseStream = response.GetResponseStream())
                    {
                        downloaded = ReadStreamUpTo(responseStream, MAX_SOUND_SIZE_BYTES);
                    }

                    if (downloaded == null)
                    {
                        LogMessage(string.Format("Stopped downloading \"{0}\", body is bigger than the limit of {1} bytes", url, MAX_SOUND_SIZE_BYTES));

                        // Without its bytes the object is of no use as a sound.
                        result.issound = false;
                    }
                    else
                    {
                        result.soundbytes = downloaded;
                        result.size       = downloaded.LongLength;
                    }
                }
                catch (Exception ex)
                {
                    LogMessage(string.Format("Exception getting sound bytes for file \"{0}\", was: {1}", result.filename, ex.Message));

                    // Without its bytes the object is of no use as a sound.
                    result.issound = false;
                }

                //
                // Don't clog log up with successes, we're worried about the errors
                //
            }
            catch (Exception ex)
            {
                // Crud.
                LogMessage(string.Format("Error doing stuff with file \"{0}\", was: {1}", url, ex.Message));
            }
            finally
            {
                if (response != null)
                {
                    response.Close();
                }
            }

            return(result);
        }

        /// <summary>
        /// Reads a stream to its end, giving up as soon as it turns out to hold more than the allowed number of bytes.
        /// </summary>
        /// <param name="source">The stream to read from its current position</param>
        /// <param name="maxBytes">The largest number of bytes that is still accepted</param>
        /// <returns>All bytes of the stream, or null when the stream holds more than <paramref name="maxBytes"/> bytes</returns>
        private static byte[] ReadStreamUpTo(Stream source, long maxBytes)
        {
            byte[] buffer = new byte[8192];

            using (var collected = new MemoryStream())
            {
                int bytesRead;

                while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
                {
                    if (collected.Length + bytesRead > maxBytes)
                    {
                        return(null);
                    }

                    collected.Write(buffer, 0, bytesRead);
                }

                return(collected.ToArray());
            }
        }