Example #1
0
        /// <summary>
        /// Searches the "crawltable" table for pages matching the words of <paramref name="title"/>
        /// and returns up to 20 entries, best match first.
        /// </summary>
        /// <remarks>
        /// Each word of the cleaned title is used as a PartitionKey filter; every matching entity
        /// contributes one vote to the string "Title Site", and entries are ranked by vote count.
        /// Results are cached per cleaned title for 30 minutes; the whole cache is dropped once it
        /// holds more than 100 entries. Failed lookups are returned to the caller but never cached,
        /// so a transient storage error is not replayed for the next 30 minutes.
        /// </remarks>
        /// <param name="title">Raw search text; it is normalised with <c>Robotom.CleanWord</c> first.</param>
        /// <returns>
        /// The ranked "Title Site" strings, or a single-element list carrying either
        /// "Could not find any results..." or a "Could not access table: ..." error message.
        /// </returns>
        public List <string> getUrl(string title)
        {
            title = Robotom.CleanWord(title);

            if (searchcache == null || searchcache.Count > 100)
            {
                searchcache = new Dictionary <string, Tuple <List <string>, DateTime> >();
            }

            // Single lookup instead of ContainsKey + repeated indexer access.
            Tuple <List <string>, DateTime> cached;
            if (searchcache.TryGetValue(title, out cached))
            {
                if (cached.Item2.AddMinutes(30) > DateTime.Now)
                {
                    return(cached.Item1);
                }

                // Entry is older than 30 minutes: drop it and query again.
                searchcache.Remove(title);
            }

            List <string> returnthis   = new List <string>();
            bool          lookupFailed = false;

            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
                ConfigurationManager.AppSettings["StorageConnectionString"]);
            CloudTableClient tableClient = storageAccount.CreateCloudTableClient();
            CloudTable       statsTable  = tableClient.GetTableReference("crawltable");

            try
            {
                var results = new Dictionary <string, int>();

                // RemoveEmptyEntries avoids issuing a query with an empty PartitionKey when the
                // cleaned title is empty or contains consecutive spaces.
                string[] searchWords = title.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string word in searchWords)
                {
                    TableQuery <UriEntity> query = new TableQuery <UriEntity>()
                                                   .Where(
                        TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, word));

                    foreach (UriEntity match in statsTable.ExecuteQuery(query))
                    {
                        string key = match.Title + " " + match.Site;
                        int    votes;
                        results.TryGetValue(key, out votes);   // votes stays 0 when the key is new
                        results[key] = votes + 1;
                    }
                }

                returnthis = results.OrderByDescending(x => x.Value).Select(x => x.Key).Take(20).ToList();
            }
            catch (Exception e)
            {
                lookupFailed = true;
                returnthis.Add("Could not access table: " + e.Message);
            }

            if (returnthis.Count == 0)
            {
                returnthis.Add("Could not find any results...");
            }

            // Only successful lookups are cached. The indexer (rather than Add) cannot throw if the
            // key is already present.
            if (!lookupFailed)
            {
                searchcache[title] = new Tuple <List <string>, DateTime>(returnthis, DateTime.Now);
            }

            return(returnthis);
        }
Example #2
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, queues the links on it that are in scope,
        /// and stores one index entity per word of the page title.
        /// </summary>
        /// <remarks>
        /// A link is in scope when its host is "cnn.com" or "www.cnn.com", or when its absolute
        /// address starts with "http://bleacherreport.com/articles". Links already present in
        /// <c>visited</c> or <c>parsed</c> are not queued again.
        /// Queue and table operations are started with the *Async methods and are not awaited here,
        /// so failures reported through the returned tasks are not observed by this method.
        /// </remarks>
        /// <param name="uri">Absolute address of the page to crawl.</param>
        /// <returns>
        /// 1 for success; 0 for fail (the URI is disallowed, the download threw, or the page
        /// contains no <c>a[href]</c> nodes).
        /// </returns>
        public int ParseHtml(Uri uri)
        {
            if (isDisallowed(uri))
            {
                queueCount--;
                return(0);
            }

            long   start    = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
            string sitedata = "";

            try
            {
                // WebClient is IDisposable; release it as soon as the download completes.
                using (WebClient downloader = new WebClient())
                {
                    sitedata = downloader.DownloadString(uri);
                }
            }
            catch (Exception e)
            {
                // Record the failure and mark the page as handled so it is not queued again.
                UriEntity error = new UriEntity(uri, e.Message, DateTime.Now, e.Message);
                errorTable.ExecuteAsync(TableOperation.Insert(error));
                parsed.Add(uri.AbsoluteUri);
                visited.Remove(uri.AbsoluteUri);
                queueCount--;
                return(0);
            }

            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(sitedata);

            HtmlNodeCollection hrefs = doc.DocumentNode.SelectNodes("//a[@href]");

            if (hrefs == null)
            {
                // NOTE(review): a page without links is reported as a failure and is neither added
                // to 'parsed' nor removed from 'visited' - confirm this is intended.
                queueCount--;
                return(0);
            }

            foreach (HtmlNode node in hrefs)
            {
                string url = node.Attributes["href"].Value;

                try
                {
                    // Resolving against the page address handles relative and absolute hrefs alike.
                    Uri    newsite = new Uri(uri, url);
                    string host    = newsite.Host;
                    bool   inScope = host.Equals("cnn.com") ||
                                     host.Equals("www.cnn.com") ||
                                     newsite.AbsoluteUri.StartsWith("http://bleacherreport.com/articles");

                    if (inScope && !visited.Contains(newsite.AbsoluteUri) && !parsed.Contains(newsite.AbsoluteUri))
                    {
                        htmlQ.AddMessageAsync(new CloudQueueMessage(newsite.AbsoluteUri));
                        visited.Add(newsite.AbsoluteUri);
                        queueCount++;
                    }
                }
                catch (UriFormatException)
                {
                    // Malformed href: expected on real-world pages, skip this link.
                }
                catch (Exception e)
                {
                    // Link extraction is best-effort: one bad link must not abort the page, but
                    // unexpected failures should at least be visible in the trace output.
                    System.Diagnostics.Trace.TraceWarning(
                        "ParseHtml: could not queue link '" + url + "' found on " + uri.AbsoluteUri + ": " + e.Message);
                }
            }

            long stop = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;

            // Elapsed milliseconds for download plus link extraction.
            timer = stop - start;

            //get title
            HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
            string   title     = "";

            if (titleNode != null)
            {
                title = titleNode.InnerText;
            }

            //get date: bleacherreport.com pages use the 'pubdate' meta tag, others use 'lastmod'
            HtmlNode lastmod = doc.DocumentNode.SelectSingleNode("//meta[@name='lastmod']");

            if (uri.Host.Equals("bleacherreport.com"))
            {
                lastmod = doc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']");
            }
            string date = "";

            if (lastmod != null)
            {
                date = lastmod.GetAttributeValue("content", "");
            }

            // TryParse instead of Convert.ToDateTime: a malformed date in the page metadata must not
            // throw out of this method and skip the parsed/visited/queueCount bookkeeping below.
            // A missing or unparsable date falls back to the default DateTime value.
            DateTime converteddate;
            if (!DateTime.TryParse(date, out converteddate))
            {
                converteddate = new DateTime();
            }

            HashSet <UriEntity> words = new HashSet <UriEntity>();

            foreach (string word in Robotom.CleanWord(title).Split(' '))
            {
                if (!word.Trim().Equals(""))
                {
                    words.Add(new UriEntity(uri, title, converteddate, word));
                }
            }

            try
            {
                if (!parsed.Contains(uri.AbsoluteUri))
                {
                    foreach (UriEntity add in words)
                    {
                        resultTable.ExecuteAsync(TableOperation.Insert(add));
                        tableCount++;
                    }

                    // Keep only the ten most recently indexed pages.
                    lastTen.Enqueue(uri + " - \"" + title + "\"");
                    if (lastTen.Count > 10)
                    {
                        lastTen.Dequeue();
                    }
                }
            }
            catch (Exception e)
            {
                // Indexing is best-effort, but the failure is traced instead of silently dropped.
                System.Diagnostics.Trace.TraceWarning(
                    "ParseHtml: could not index " + uri.AbsoluteUri + ": " + e.Message);
            }

            parsed.Add(uri.AbsoluteUri);
            visited.Remove(uri.AbsoluteUri);
            queueCount--;

            return(1);
        }