Example #1
0
        /// <summary>
        /// Builds a <c>UriEntity</c> from the supplied model and hands it to the URI unit of work
        /// to be created for the given client.
        /// </summary>
        /// <param name="clientId">Identifier of the client the URI is created for.</param>
        /// <param name="uri">Model supplying the URI value and the requested URI type values.</param>
        /// <returns>
        /// A success result wrapping the value returned by the unit of work; an error result with
        /// <c>DataResultErrorCode.ClientNotExistId</c> when a <c>NoResultException&lt;ClientEntity&gt;</c>
        /// is caught; an error result carrying the exception message when a <c>DatabaseException</c> is caught.
        /// </returns>
        public DataResult<int> CreateUriForClient(int clientId, UriModel uri)
        {
            var knownTypes = m_uriTypeUoW.GetAllUriTypes();

            var entity = new UriEntity();
            entity.Uri = uri.Value;

            // Map every requested type value onto the first known type carrying the same value.
            // A value without a match contributes whatever FirstOrDefault yields for "not found".
            var resolvedTypes = uri.UriTypes.Select(
                requested => knownTypes.FirstOrDefault(known => known.Value == requested.UriTypeValue));
            entity.UriTypes = new HashSet<UriTypeEntity>(resolvedTypes);

            try
            {
                return Success(m_uriUoW.CreateUriForClient(clientId, entity));
            }
            catch (NoResultException<ClientEntity> missingClient)
            {
                m_logger.LogWarning(missingClient);

                var message = m_translator.Translate("invalid-client-id");
                return Error<int>(message, DataResultErrorCode.ClientNotExistId);
            }
            catch (DatabaseException databaseFailure)
            {
                m_logger.LogWarning(databaseFailure);
                return Error<int>(databaseFailure.Message);
            }
        }
Example #2
0
        /// <summary>
        /// Attaches the client with the given id to <paramref name="uriEntity"/> and creates the
        /// entity through the URI repository.
        /// </summary>
        /// <param name="clientId">Identifier used to look the client up in the client repository.</param>
        /// <param name="uriEntity">Entity to create; its <c>Client</c> property is overwritten.</param>
        /// <returns>The value returned by the repository's <c>Create</c> call, cast to <c>int</c>.</returns>
        /// <exception cref="NoResultException{T}">Thrown when the client lookup returns null.</exception>
        public virtual int CreateUriForClient(int clientId, UriEntity uriEntity)
        {
            var owner = m_clientRepository.FindById<ClientEntity>(clientId);
            if (owner is null)
            {
                throw new NoResultException<ClientEntity>();
            }

            uriEntity.Client = owner;

            return (int)m_uriRepository.Create(uriEntity);
        }
Example #3
0
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, enqueues the in-scope links found on it,
        /// and issues one result-table insert per entry built from the words of the page title.
        /// </summary>
        /// <param name="uri">Address of the page to download and parse.</param>
        /// <returns>
        /// 1 when the page was downloaded and processed; 0 when the URI is disallowed, the download
        /// failed, or the document contains no <c>a[@href]</c> nodes.
        /// </returns>
        /// <remarks>
        /// Every exit path decrements <c>queueCount</c>. Queue and table operations are started with
        /// their <c>*Async</c> methods and are not awaited here.
        /// </remarks>
        public int ParseHtml(Uri uri)
        {
            if (isDisallowed(uri))
            {
                queueCount--;
                return 0;
            }

            // UTC avoids bogus durations when the local clock shifts (DST, time-zone change).
            long start = DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond;
            string sitedata;

            try
            {
                // WebClient is IDisposable; release it as soon as the download finishes.
                using (WebClient downloader = new WebClient())
                {
                    sitedata = downloader.DownloadString(uri);
                }
            }
            catch (Exception e)
            {
                // Record the failure, mark the page as handled and give up on it.
                UriEntity error = new UriEntity(uri, e.Message, DateTime.Now, e.Message);
                errorTable.ExecuteAsync(TableOperation.Insert(error));
                parsed.Add(uri.AbsoluteUri);
                visited.Remove(uri.AbsoluteUri);
                queueCount--;
                return 0;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(sitedata);

            HtmlNodeCollection hrefs = doc.DocumentNode.SelectNodes("//a[@href]");

            if (hrefs == null)
            {
                // NOTE(review): on this path the page is neither added to 'parsed' nor removed
                // from 'visited', and its title is not indexed - confirm this is intended.
                queueCount--;
                return 0;
            }

            foreach (HtmlNode node in hrefs)
            {
                string url = node.Attributes["href"].Value;

                // Best effort per link: one bad link must not abort the crawl of the whole page,
                // so anything thrown while handling it is deliberately ignored.
                try
                {
                    Uri newsite;
                    if (!Uri.TryCreate(uri, url, out newsite))
                    {
                        // Malformed href - skip it without paying for an exception.
                        continue;
                    }

                    string host = newsite.Host;
                    string target = newsite.AbsoluteUri;

                    bool inScope = host.Equals("cnn.com")
                                   || host.Equals("www.cnn.com")
                                   || target.StartsWith("http://bleacherreport.com/articles", StringComparison.Ordinal);

                    if (inScope && !visited.Contains(target) && !parsed.Contains(target))
                    {
                        htmlQ.AddMessageAsync(new CloudQueueMessage(target));
                        visited.Add(target);
                        queueCount++;
                    }
                }
                catch (Exception)
                {
                    // Intentionally ignored (see comment above the try).
                }
            }

            long stop = DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond;

            // Milliseconds spent downloading the page and scanning its links.
            timer = stop - start;

            // Title: empty when the document has no <title> node.
            HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
            string title = titleNode != null ? titleNode.InnerText : "";

            // Date: bleacherreport.com pages are read from 'pubdate', all others from 'lastmod'.
            HtmlNode lastmod = uri.Host.Equals("bleacherreport.com")
                ? doc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']")
                : doc.DocumentNode.SelectSingleNode("//meta[@name='lastmod']");

            string date = lastmod != null ? lastmod.GetAttributeValue("content", "") : "";

            // A missing or unparsable date falls back to default(DateTime) instead of throwing,
            // so a malformed meta tag cannot skip the bookkeeping at the end of this method.
            DateTime converteddate;
            if (!DateTime.TryParse(date, out converteddate))
            {
                converteddate = new DateTime();
            }

            HashSet<UriEntity> words = new HashSet<UriEntity>();

            foreach (string word in Robotom.CleanWord(title).Split(' '))
            {
                if (!string.IsNullOrWhiteSpace(word))
                {
                    words.Add(new UriEntity(uri, title, converteddate, word));
                }
            }

            // Best effort: a failure while starting the inserts or updating the recent-pages
            // queue must not prevent the bookkeeping below from running.
            try
            {
                if (!parsed.Contains(uri.AbsoluteUri))
                {
                    foreach (UriEntity add in words)
                    {
                        resultTable.ExecuteAsync(TableOperation.Insert(add));
                        tableCount++;
                    }

                    // Keep only the ten most recently indexed pages.
                    lastTen.Enqueue(uri + " - \"" + title + "\"");
                    if (lastTen.Count > 10)
                    {
                        lastTen.Dequeue();
                    }
                }
            }
            catch (Exception)
            {
                // Intentionally ignored (see comment above the try).
            }

            parsed.Add(uri.AbsoluteUri);
            visited.Remove(uri.AbsoluteUri);
            queueCount--;

            return 1;
        }