/// <summary>
/// From a JSON results string (must be from GetSearchResults()), grab all the urls.
/// </summary>
/// <param name="rawJsonResults">The JSON string</param>
/// <returns>
/// A list of dynamic objects, one per usable entry under responseData.results, in document order.
/// The list is empty (never null) when the input is empty or the payload carries no results.
/// </returns>
/// <remarks>
/// Each returned object exposes:
///   Url    = the entry's "unescapedUrl" value as a string,
///   Index  = the zero-based position of the entry within responseData.results,
///   Domain = WebProcessor.GetDomainOfUrl(Url)
///
/// Entries that are not JSON objects, or that have no "unescapedUrl" member, are skipped; the
/// remaining entries keep their original position in Index.
///
/// Malformed JSON is not handled here: whatever JObject.Parse throws propagates to the caller.
/// </remarks>
public IList<dynamic> GetResultUrls(string rawJsonResults)
{
    IList<dynamic> result = new List<dynamic>();

    if (Functions.IsEmptyString(rawJsonResults))
    {
        return result;
    }

    JObject jsonResults = JObject.Parse(rawJsonResults);

    //
    // Walk down to responseData.results one step at a time. Chaining the indexers directly blows up
    // when "responseData" is absent (null reference) or is not an object, e.g. JSON null (indexing a
    // plain value throws). Treat both as "no results".
    // NOTE(review): confirm what GetSearchResults() hands back on a failed search.
    //
    JObject responseData = jsonResults["responseData"] as JObject;
    var searchResults = (responseData == null) ? null : responseData["results"];
    if (searchResults == null)
    {
        return result;
    }

    int position = 0;
    foreach (var entry in searchResults.Children())
    {
        // Capture the position before any skip so Index always reflects the search result order.
        int index = position++;

        JObject item = entry as JObject;
        var urlToken = (item == null) ? null : item["unescapedUrl"];
        if (urlToken == null)
        {
            continue;
        }

        // Read the url once and reuse it for both members.
        string unescapedUrl = urlToken.ToString();

        result.Add(new
        {
            Url = unescapedUrl,
            Index = index,
            Domain = WebProcessor.GetDomainOfUrl(unescapedUrl)
        });
    }

    return result;
}
/// <summary> /// Get all the sounds in the list of urls from the passed IDataSource and add the sounds to the passed websearch /// </summary> /// <param name="urls">The list of urls</param> /// <param name="dataSource">The datasource to use to look for sounds</param> /// <param name="currentSearch">The current search</param> private static void GetSoundsOnPages(IList <dynamic> urls, IDataSource dataSource, websearch currentSearch, IList <websearchsound> searchResultList, Functions.LogMessageDelegate LogMessage, int maxDepthToFollow) { //const int MAX_URLS_TO_SEARCH = 20; const int MAX_SOUNDS_PER_URL = 150; //int urlsProcessed = 0; HashSet <string> urlsOfObjectsSearched = new HashSet <string>(); // // Multithreading here for requesting the pages works pretty well speed-wise. Unfortunately, the regexes bog down the // server so badly that it becomes unresponsive for other users. So, don't do parallel on this outside loop. // // However, once the first page is processed, the sounds are webrequested asynchronously. So, the next page will // start being processed while the first page's sounds are still being downloaded. This works quite well, and // the performance is just about the same. So, let's stick with that. 
// foreach (dynamic url in urls) { string theUrl = url.Url; string domain = WebProcessor.GetDomainOfUrl(theUrl); if (unprocessableDomains.Contains(domain)) { LogMessage(string.Format("Skipping crappy domain: {0}", domain)); } else { LogMessage(string.Format("About to search for sounds on page: \"{0}\"", theUrl)); // string pageContent = WebProcessor.GetUrlContents(theUrl, null, null, LogMessage); // // todo: test this, make sure it works // string pageContent = dataSource.GetUrlContents(theUrl, null, GetUserAgent(), LogMessage); bool wasAborted = false; // // todo: combine sound links func with above function // IList <string> linksOnPage = GetSoundLinksOnPage(pageContent, ref wasAborted); // // For generating test case files, set breakpoint on if (wasAborted) below with condition: // // maxDepthToFollow == 1 // if (wasAborted) { LogMessage(string.Format("Had to abort link search on domain: {0}", domain)); lock (unprocessableDomains) { unprocessableDomains.Add(domain); } } LogMessage(string.Format("Found {0} links on \"{1}\"", linksOnPage.Count, theUrl)); #if MULTITHREADED Parallel.ForEach <string>(linksOnPage.Take(MAX_SOUNDS_PER_URL), partialLink => // <=-- normal operation - multithreaded #else foreach (string partialLink in linksOnPage.Take(MAX_SOUNDS_PER_URL)) // <=-- for debugging stuff, it's easier when not multithreaded #endif { string soundLink = WebProcessor.GetUrlForObject(theUrl, partialLink); LogMessage(string.Format("About to grab a potential sound here: \"{0}\"", soundLink)); if (!unprocessableDomains.Contains(domain) && IsNewSoundToGrab(urlsOfObjectsSearched, soundLink)) { websearchsound receivedObject = GetWebObjectAtUrl(soundLink, null, null); // // enhanced search: if not a sound and is text/html and response code is 200, search for sounds on THAT page // if (receivedObject.issound) { receivedObject.sourceurl = theUrl; receivedObject.sourceDomain = domain; receivedObject.searchResultOrder = url.Index; // // Check for dups // string md5Hash = 
Functions.GetMd5Hash(receivedObject.soundbytes); if (!HaveMd5ForSound(dataSource.CurrentSoundMd5s, md5Hash)) { dataSource.SetSoundInSearch(currentSearch, receivedObject); // // Performance optimization: we're not going to return the sound data itself with the search // so let's free up the mem here // receivedObject.soundbytes = null; searchResultList.Add(receivedObject); } else { LogMessage("Not adding sound - already in collection"); } } else if (receivedObject.contenttype.ToLower().StartsWith("text/html")) { // // We have another HTML page. Check that too? // if (maxDepthToFollow > 0) { LogMessage(string.Format("Going to drill down in this page - we're at max level: {0}", maxDepthToFollow)); GetSoundsOnPages(new List <dynamic>() { new { Url = soundLink, Index = url.Index } }, dataSource, currentSearch, searchResultList, LogMessage, maxDepthToFollow - 1); } else { LogMessage(string.Format("No more drilling down, we're as low as we can go")); } } } else { LogMessage("Won't process: already had sound from that url, or the domain is unprocessable!"); } #if MULTITHREADED });