/// <summary>
/// Store a sound in the cache directory of the search it belongs to.
/// </summary>
/// <param name="search">The owning search; its status.searchterm selects the cache directory.</param>
/// <param name="sound">The sound to serialize. When its soundid is empty, a new GUID string is assigned to it first.</param>
/// <returns>The soundid the sound was stored under.</returns>
public string SetSoundInSearch(websearch search, websearchsound sound)
{
    // A sound that arrives without an id gets a fresh one; the id is also used as the file name.
    bool needsNewId = Functions.IsEmptyString(sound.soundid);
    if (needsNewId)
    {
        sound.soundid = Guid.NewGuid().ToString();
    }

    // Target file is "<cache dir>\<soundid>.<DEFAULT_EXT>".
    string cacheDir = PhysicalCacheDir(search.status.searchterm);
    string soundFileName = string.Format("{0}.{1}", sound.soundid, DEFAULT_EXT);
    string targetPath = Functions.CombineElementsWithDelimiter(@"\", cacheDir, soundFileName);

    Functions.SerializeObjectToFile<websearchsound>(targetPath, sound);

    return sound.soundid;
}
/// <summary>
/// Look up the cached state of a search and, once that search is finished, load its cached sounds.
/// </summary>
/// <param name="term">The search term whose cache directory is inspected.</param>
/// <param name="searchResultSounds">
/// Receives one entry per cached sound file (with soundbytes cleared) when the search is done.
/// Left untouched when there is no cache directory or the search has not finished.
/// </param>
/// <returns>
/// null when no cache directory exists for the term; otherwise a new websearch whose status is
/// whatever was deserialized from the status file (which may itself be null).
/// </returns>
public websearch GetWebsearch(string term, IList<websearchsound> searchResultSounds)
{
    // Resolve the directory once; assumes PhysicalCacheDir(term) yields the same path on every call.
    string cacheDir = PhysicalCacheDir(term);

    //
    // The directory's existence is how different threads communicate. If the dir exists, this thread
    // believes that another thread is conducting (or has conducted) the search. It's important that the
    // searching thread completes w/o errors and sets the status appropriately.
    //
    if (!Directory.Exists(cacheDir))
    {
        return null;
    }

    //
    // We have a directory, so a search was started/done. New-up the object and grab the status.
    //
    websearch result = new websearch();
    string statusFilename = Functions.CombineElementsWithDelimiter(@"\", cacheDir, STATUS_FNAME);
    result.status = Functions.DeserializeObjectFromFile<websearchstatus>(statusFilename);

    //
    // Only return files once the search is done: the caller doesn't need partial results, and we want
    // to hand back the status quickly without listing a directory another thread is still filling.
    //
    if (result.status == null || result.status.isdone != 1)
    {
        return result;
    }

    string[] soundFiles = Directory.GetFiles(cacheDir, string.Format("*.{0}", DEFAULT_EXT));

    foreach (string soundFile in soundFiles)
    {
        websearchsound loadedSound = Functions.DeserializeObjectFromFile<websearchsound>(soundFile);

        // The status lookup above already treats a null deserialization result as possible, so do the
        // same here: one unreadable cache file must not take down the whole result set.
        if (loadedSound == null)
        {
            Debug.WriteLine(string.Format("Skipping cached sound that could not be loaded: \"{0}\"", soundFile));
            continue;
        }

        //
        // Performance optimization: the sound data itself is not returned with the search,
        // so free up the memory here.
        //
        loadedSound.soundbytes = null;
        searchResultSounds.Add(loadedSound);
    }

    return result;
}
/// <summary>
/// Load one cached sound of a search from its file in the search's cache directory.
/// </summary>
/// <param name="searchTerm">The search term whose cache directory holds the sound.</param>
/// <param name="soundId">The id of the sound; the file looked up is "&lt;soundId&gt;.&lt;DEFAULT_EXT&gt;".</param>
/// <returns>The result of deserializing the sound file, or null when that file does not exist.</returns>
public websearchsound GetSoundInSearch(string searchTerm, string soundId)
{
    string cacheDir = PhysicalCacheDir(searchTerm);
    string soundFileName = string.Format("{0}.{1}", soundId, DEFAULT_EXT);
    string soundPath = Functions.CombineElementsWithDelimiter(@"\", cacheDir, soundFileName);

    // A missing file is only reported on the debug output; the caller just sees null.
    if (!File.Exists(soundPath))
    {
        Debug.WriteLine(string.Format("Crap! Can't find file: \"{0}\"", soundPath));
        return null;
    }

    return Functions.DeserializeObjectFromFile<websearchsound>(soundPath);
}
/// <summary> /// Get all the sounds in the list of urls from the passed IDataSource and add the sounds to the passed websearch /// </summary> /// <param name="urls">The list of urls</param> /// <param name="dataSource">The datasource to use to look for sounds</param> /// <param name="currentSearch">The current search</param> private static void GetSoundsOnPages(IList <dynamic> urls, IDataSource dataSource, websearch currentSearch, IList <websearchsound> searchResultList, Functions.LogMessageDelegate LogMessage, int maxDepthToFollow) { //const int MAX_URLS_TO_SEARCH = 20; const int MAX_SOUNDS_PER_URL = 150; //int urlsProcessed = 0; HashSet <string> urlsOfObjectsSearched = new HashSet <string>(); // // Multithreading here for requesting the pages works pretty well speed-wise. Unfortunately, the regexes bog down the // server so badly that it becomes unresponsive for other users. So, don't do parallel on this outside loop. // // However, once the first page is processed, the sounds are webrequested asynchronously. So, the next page will // start being processed while the first page's sounds are still being downloaded. This works quite well, and // the performance is just about the same. So, let's stick with that. 
// foreach (dynamic url in urls) { string theUrl = url.Url; string domain = WebProcessor.GetDomainOfUrl(theUrl); if (unprocessableDomains.Contains(domain)) { LogMessage(string.Format("Skipping crappy domain: {0}", domain)); } else { LogMessage(string.Format("About to search for sounds on page: \"{0}\"", theUrl)); // string pageContent = WebProcessor.GetUrlContents(theUrl, null, null, LogMessage); // // todo: test this, make sure it works // string pageContent = dataSource.GetUrlContents(theUrl, null, GetUserAgent(), LogMessage); bool wasAborted = false; // // todo: combine sound links func with above function // IList <string> linksOnPage = GetSoundLinksOnPage(pageContent, ref wasAborted); // // For generating test case files, set breakpoint on if (wasAborted) below with condition: // // maxDepthToFollow == 1 // if (wasAborted) { LogMessage(string.Format("Had to abort link search on domain: {0}", domain)); lock (unprocessableDomains) { unprocessableDomains.Add(domain); } } LogMessage(string.Format("Found {0} links on \"{1}\"", linksOnPage.Count, theUrl)); #if MULTITHREADED Parallel.ForEach <string>(linksOnPage.Take(MAX_SOUNDS_PER_URL), partialLink => // <=-- normal operation - multithreaded #else foreach (string partialLink in linksOnPage.Take(MAX_SOUNDS_PER_URL)) // <=-- for debugging stuff, it's easier when not multithreaded #endif { string soundLink = WebProcessor.GetUrlForObject(theUrl, partialLink); LogMessage(string.Format("About to grab a potential sound here: \"{0}\"", soundLink)); if (!unprocessableDomains.Contains(domain) && IsNewSoundToGrab(urlsOfObjectsSearched, soundLink)) { websearchsound receivedObject = GetWebObjectAtUrl(soundLink, null, null); // // enhanced search: if not a sound and is text/html and response code is 200, search for sounds on THAT page // if (receivedObject.issound) { receivedObject.sourceurl = theUrl; receivedObject.sourceDomain = domain; receivedObject.searchResultOrder = url.Index; // // Check for dups // string md5Hash = 
Functions.GetMd5Hash(receivedObject.soundbytes); if (!HaveMd5ForSound(dataSource.CurrentSoundMd5s, md5Hash)) { dataSource.SetSoundInSearch(currentSearch, receivedObject); // // Performance optimization: we're not going to return the sound data itself with the search // so let's free up the mem here // receivedObject.soundbytes = null; searchResultList.Add(receivedObject); } else { LogMessage("Not adding sound - already in collection"); } } else if (receivedObject.contenttype.ToLower().StartsWith("text/html")) { // // We have another HTML page. Check that too? // if (maxDepthToFollow > 0) { LogMessage(string.Format("Going to drill down in this page - we're at max level: {0}", maxDepthToFollow)); GetSoundsOnPages(new List <dynamic>() { new { Url = soundLink, Index = url.Index } }, dataSource, currentSearch, searchResultList, LogMessage, maxDepthToFollow - 1); } else { LogMessage(string.Format("No more drilling down, we're as low as we can go")); } } } else { LogMessage("Won't process: already had sound from that url, or the domain is unprocessable!"); } #if MULTITHREADED });
/// <summary>
/// Grab a potential websearchsound object at a url.
/// </summary>
/// <param name="url">The url of the object</param>
/// <param name="header">Additional header to include, if any</param>
/// <param name="userAgent">The user agent to use, if any</param>
/// <returns>
/// A websearchsound object. issound is true only when the response had a recognized sound mime type AND
/// its bytes were downloaded within the size cap; in that case soundbytes and size are populated.
/// contenttype is never null: it is the lower-cased response content type when the response was
/// inspected, otherwise an empty string (unless the websearchsound constructor supplied a value).
/// Failures are logged through LogMessage rather than thrown.
/// </returns>
private static websearchsound GetWebObjectAtUrl(string url, string header, string userAgent)
{
    const long MAX_SOUND_SIZE_BYTES = 1024 * 1000;  // Cap at roughly 1 MB (1,024,000 bytes)
    const int REQUEST_TIMEOUT_MS = 5000;            // If it takes longer than 5 seconds to respond, bail

    websearchsound result = new websearchsound();

    // Callers inspect contenttype even when the object is not a sound, so never hand back null.
    if (result.contenttype == null)
    {
        result.contenttype = string.Empty;
    }

    HttpWebResponse response = null;

    try
    {
        // Request creation lives inside the try: urls are scraped from arbitrary pages, so a malformed
        // one (UriFormatException) or an unsupported scheme (NotSupportedException) is an expected
        // failure that should be logged like any other, not thrown at the caller.
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;

        if (request == null)
        {
            // e.g. ftp:// or file:// - WebRequest.Create succeeded but did not produce an http request.
            LogMessage(string.Format("Won't download \"{0}\": not an http(s) url", url));
            return result;
        }

        request.Timeout = REQUEST_TIMEOUT_MS;

        if (!Functions.IsEmptyString(header))
        {
            request.Headers.Add(header);
        }

        if (!Functions.IsEmptyString(userAgent))
        {
            request.UserAgent = userAgent;
        }

        response = (HttpWebResponse)request.GetResponse();

        // Invariant casing: mime types are protocol tokens, not text in the current culture.
        string foundContentType = (response.ContentType ?? string.Empty).ToLowerInvariant();
        long responseSize = response.ContentLength;

        // Cheap early-out when the server announces the size. ContentLength is -1 when the server
        // does not announce one, so the cap is enforced again while reading.
        if (responseSize > MAX_SOUND_SIZE_BYTES)
        {
            LogMessage(string.Format("Won't download, too big: {0} (limit is {1})", responseSize, MAX_SOUND_SIZE_BYTES));
            return result;
        }

        result.contenttype = foundContentType;

        string outputExt = GetExtensionFromMimeType(foundContentType);
        string fileName = WebProcessor.GetFileNameFromUrl(url);

        result.issound = outputExt != string.Empty;
        result.filename = fileName;
        result.extension = outputExt;

        if (!result.issound)
        {
            LogMessage(string.Format("Object at \"{0}\" not a sound, has mime type of \"{1}\"", url, foundContentType));
            return result;
        }

        try
        {
            byte[] soundBytes;

            using (Stream responseStream = response.GetResponseStream())
            {
                soundBytes = ReadBytesUpToLimit(responseStream, MAX_SOUND_SIZE_BYTES);
            }

            if (soundBytes == null)
            {
                LogMessage(string.Format("Stopped downloading \"{0}\": body is bigger than the limit of {1} bytes", url, MAX_SOUND_SIZE_BYTES));

                // No bytes were kept, so don't let the caller treat this as a usable sound.
                result.issound = false;
            }
            else
            {
                result.soundbytes = soundBytes;
                result.size = soundBytes.LongLength;

                //
                // Don't clog log up with successes, we're worried about the errors
                //
            }
        }
        catch (Exception ex)
        {
            LogMessage(string.Format("Exception getting sound bytes for file \"{0}\", was: {1}", result.filename, ex.Message));

            // The download failed part-way; there is no sound data to hash or store.
            result.issound = false;
        }
    }
    catch (Exception ex)
    {
        // Crud. Deliberately best-effort: one bad link must not abort the surrounding search.
        LogMessage(string.Format("Error doing stuff with file \"{0}\", was: {1}", url, ex.Message));
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }

    return result;
}

/// <summary>
/// Read a stream to its end into memory, giving up as soon as more than maxBytes have been read.
/// </summary>
/// <param name="source">The stream to read; it is not closed by this method.</param>
/// <param name="maxBytes">The largest number of bytes that may be returned.</param>
/// <returns>All bytes of the stream, or null when the stream holds more than maxBytes.</returns>
private static byte[] ReadBytesUpToLimit(Stream source, long maxBytes)
{
    const int CHUNK_SIZE_BYTES = 8192;

    using (MemoryStream collected = new MemoryStream())
    {
        byte[] chunk = new byte[CHUNK_SIZE_BYTES];
        int bytesRead;

        while ((bytesRead = source.Read(chunk, 0, chunk.Length)) > 0)
        {
            if (collected.Length + bytesRead > maxBytes)
            {
                return null;
            }

            collected.Write(chunk, 0, bytesRead);
        }

        return collected.ToArray();
    }
}