예제 #1
0
        /// <summary>
        /// Searches the site data table for pages indexed under the keywords in
        /// <paramref name="query"/> and returns the matches as a JSON array.
        /// </summary>
        /// <param name="query">
        /// Whitespace-separated keywords. Case and surrounding whitespace are ignored;
        /// <c>null</c> is treated as an empty query.
        /// </param>
        /// <returns>
        /// A JSON-serialized list of strings of the form <c>URL$Title</c>, ordered by the
        /// number of distinct keywords each URL matched (most matches first). An empty
        /// query produces an empty list.
        /// </returns>
        public string GetSearchResults(string query)
        {
            SiteDataTable = CloudConfiguration.GetSiteDataTable();

            // Normalize so that equivalent queries share one cache entry. The invariant
            // culture keeps the lookup key independent of the server's locale.
            query = (query ?? string.Empty).Trim().ToLowerInvariant();

            if (cache.ContainsKey(query))
            {
                return new JavaScriptSerializer().Serialize(cache[query]);
            }

            // Drop empty tokens (runs of whitespace, empty query) so we never query an empty
            // partition key, and de-duplicate so a repeated word does not inflate the ranking.
            // Partition keys are the Base64 encoding of the keyword.
            var keywords = query.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries)
                                .Distinct()
                                .Select(x => Base64.Base64Encode(x));

            var results = new List<URLEntity>();

            foreach (string keyword in keywords)
            {
                TableQuery<URLEntity> partitionQuery = new TableQuery<URLEntity>()
                    .Where(TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, keyword));

                results.AddRange(SiteDataTable.ExecuteQuery(partitionQuery));
            }

            // One entry per URL: (url, number of keyword hits, title of the first hit),
            // ranked by hit count.
            var siteMatches = results.GroupBy(x => x.URL)
                                     .Select(group => new Tuple<string, int, string>(group.Key, group.Count(), group.First().Title))
                                     .OrderByDescending(tuple => tuple.Item2);

            var links = siteMatches.Select(x => x.Item1 + "$" + x.Item3).ToList<string>();

            // Indexer assignment instead of Add: Add throws if another request stored the
            // same query between the ContainsKey check above and this point.
            // NOTE(review): if cache is a plain Dictionary, concurrent requests still need
            // external synchronization or a ConcurrentDictionary - confirm at the declaration.
            cache[query] = links;

            return new JavaScriptSerializer().Serialize(links);
        }
예제 #2
0
        /// <summary>
        /// Worker entry point. Initializes storage handles, performance counters and the
        /// crawler, then loops forever polling the stop, load and crawl queues.
        /// </summary>
        /// <remarks>
        /// State transitions: the worker starts "Idle"; a message on the load queue moves it
        /// to "Loading"; while "Loading" or "Crawling" it drains the crawl queue ("Crawling");
        /// a message on the stop queue returns it to "Idle". Crawl messages are only consumed
        /// after at least one load message has been received since the last stop.
        /// </remarks>
        public override void Run()
        {
            Storage = new AzureStorage();

            LoadQueue        = CloudConfiguration.GetLoadingQueue();
            CrawlQueue       = CloudConfiguration.GetCrawlingQueue();
            StopQueue        = CloudConfiguration.GetStopQueue();
            SiteDataTable    = CloudConfiguration.GetSiteDataTable();
            AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
            StateQueue       = CloudConfiguration.GetStateQueue();

            State = "Idle";

            CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
            MemCount = new PerformanceCounter("Memory", "Available MBytes");

            Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

            string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
            Crawler = new WebCrawler(robots, Storage);

            // NOTE(review): purpose of this 10 s pause is not visible here; presumably it
            // lets the role/storage finish starting up - confirm before changing.
            Thread.Sleep(10000);

            // Last URL handled; passed to the dashboard on every polling iteration.
            string url = "";

            while (true)
            {
                CloudQueueMessage stopMessage = StopQueue.GetMessage();

                while (stopMessage == null)
                {
                    // Load requests take priority over crawl work.
                    CloudQueueMessage loadMessage = LoadQueue.GetMessage();
                    if (loadMessage != null)
                    {
                        // Only enter "Loading" when there is actually something to load.
                        // Setting it unconditionally would make the state check below
                        // always true and hide the "Idle" state from the dashboard.
                        State = "Loading";
                        url   = loadMessage.AsString;
                        if (url.Contains("robots.txt"))
                        {
                            // A single message may carry several whitespace-separated
                            // robots.txt URLs; skip empty tokens from repeated whitespace.
                            string[] robotLinks = url.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
                            foreach (string link in robotLinks)
                            {
                                Crawler.ProcessURL(link);
                            }
                        }
                        else
                        {
                            Crawler.ProcessURL(url);
                        }

                        // Delete in both cases: an undeleted message becomes visible again
                        // after its visibility timeout and would be processed repeatedly.
                        LoadQueue.DeleteMessage(loadMessage);
                    }
                    else if (State.Equals("Loading") || State.Equals("Crawling"))
                    {
                        // Nothing left to load: work through the crawl queue.
                        CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                        if (crawlMessage != null)
                        {
                            State = "Crawling";
                            url   = crawlMessage.AsString;
                            Crawler.ProcessURL(url);
                            CrawlQueue.DeleteMessage(crawlMessage);
                        }
                    }

                    stopMessage = StopQueue.GetMessage();
                    UpdateDashboard(url);
                }

                // NOTE(review): the stop message is never deleted here, so it will become
                // visible again after its visibility timeout - confirm whether something
                // else clears the stop queue.
                State = "Idle";
            }
        }