/// <summary>
/// Serializes <paramref name="siteContents"/> to JSON and uploads it to the block blob
/// named after the domain (domain + <c>_siteFileExt</c>).
/// </summary>
/// <param name="domain">Domain the contents belong to; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <param name="siteContents">Contents to serialize and upload.</param>
/// <param name="overwrite">When false, an already existing blob is left alone and the call fails.</param>
/// <returns>
/// True when the upload finished; false for an invalid domain, for an existing blob when
/// <paramref name="overwrite"/> is false, or when serialization/upload throws.
/// </returns>
private bool SaveSiteContents(string domain, SiteContents siteContents, bool overwrite)
        {
            bool domainIsValid = SiteInfo.IsValidDomain(domain);

            if (!domainIsValid)
            {
                return false;
            }

            var targetBlob = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);

            // The existence probe is only made when overwriting is not allowed.
            bool writeAllowed = overwrite || !targetBlob.Exists();

            if (!writeAllowed)
            {
                return false;
            }

            bool uploaded;

            try
            {
                string json = siteContents.SerializeToJson();
                targetBlob.UploadText(json);
                uploaded = true;
            }
            catch
            {
                // Any failure while serializing or uploading is reported as "not saved".
                uploaded = false;
            }

            return uploaded;
        }
        /// <summary>
        /// Creates a crawler bound to a single domain and initializes its default crawl limits.
        /// </summary>
        /// <param name="domain">Domain to crawl; must pass <c>SiteInfo.IsValidDomain</c>.</param>
        /// <param name="siteRepository">Repository stored for later use by the crawler.</param>
        /// <param name="cancelToken">Token stored for later use by the crawler.</param>
        /// <exception cref="ArgumentException">The domain is not valid.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="siteRepository"/> is null.</exception>
        public SiteCrawler(string domain, ISiteRepository siteRepository, CancellationToken cancelToken)
        {
            Trace.TraceInformation("Crawler: Starting processing domain " + domain);

            if (!SiteInfo.IsValidDomain(domain))
            {
                throw new ArgumentException("Invalid domain");
            }

            if (siteRepository == null)
            {
                // The single-string constructor expects the parameter NAME, not a sentence.
                throw new ArgumentNullException("siteRepository");
            }

            // No null check for cancelToken: CancellationToken is a struct, so comparing it
            // with null is always false and the former check could never fire.

            _domain = domain;

            // Host names are identifiers, so the "www." prefix is matched ordinally rather
            // than with the current culture's comparison rules.
            _rawDomain = _domain.StartsWith("www.", StringComparison.Ordinal) ? _domain.Substring(4) : _domain;

            _siteRepository = siteRepository;
            _cancelToken    = cancelToken;

            DesiredNumberOfPages    = 220;
            CrawlDelay              = 100;
            MaxSimultaneousRequests = 20;

            // Pads the extension list with a space on both ends.
            // NOTE(review): if _binaryExtensions is a static field, every new instance adds
            // another pair of spaces - confirm it is an instance field.
            _binaryExtensions = " " + _binaryExtensions + " ";
        }
        /// <summary>
        /// Retrieves the <c>SiteInfoEntity</c> stored for the domain and converts it to a <c>SiteInfo</c>.
        /// </summary>
        /// <param name="domain">Domain used as the lookup key next to <c>SiteInfoEntity.StaticPartitionKey</c>.</param>
        /// <returns>
        /// The converted site info, or null when the domain is invalid, no matching entity is
        /// returned, or the lookup/conversion throws.
        /// </returns>
        private SiteInfo LoadSiteInfo(string domain)
        {
            bool domainIsValid = SiteInfo.IsValidDomain(domain);

            if (!domainIsValid)
            {
                return null;
            }

            var infoTable = GetInfoTable();
            var retrieve  = TableOperation.Retrieve <SiteInfoEntity>(SiteInfoEntity.StaticPartitionKey, domain);

            try
            {
                var stored = infoTable.Execute(retrieve).Result as SiteInfoEntity;

                // A missing (or differently typed) result is reported as "not found".
                return (stored != null) ? stored.ToSiteInfo() : null;
            }
            catch
            {
                // Failures are deliberately collapsed into "not found".
                return null;
            }
        }
        /// <summary>
        /// Downloads the blob named after the domain (domain + <c>_siteFileExt</c>) and
        /// deserializes its text as JSON into a <c>SiteContents</c>.
        /// </summary>
        /// <param name="domain">Domain whose contents are requested.</param>
        /// <returns>
        /// The deserialized contents, or null when the domain is invalid, the blob does not
        /// exist, or downloading/deserializing throws.
        /// </returns>
        private SiteContents LoadSiteContents(string domain)
        {
            bool domainIsValid = SiteInfo.IsValidDomain(domain);

            if (!domainIsValid)
            {
                return null;
            }

            var sourceBlob = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);
            bool blobFound = sourceBlob.Exists();

            if (!blobFound)
            {
                return null;
            }

            try
            {
                return SiteContents.DeserializeFromJson(sourceBlob.DownloadText());
            }
            catch
            {
                // Download and parse failures are both reported as "no contents".
                return null;
            }
        }
        /// <summary>
        /// Tells whether site info can be loaded for the given domain.
        /// </summary>
        /// <param name="domain">Domain to look up.</param>
        /// <returns>True when the domain is valid and <c>LoadSiteInfo</c> returns a non-null value.</returns>
        public bool SiteExists(string domain)
        {
            // Short-circuits: the lookup only runs for a valid domain.
            return SiteInfo.IsValidDomain(domain) && (LoadSiteInfo(domain) != null);
        }
        /// <summary>
        /// Tells whether the site-info file for the given domain is present on disk.
        /// </summary>
        /// <param name="domain">Domain to look up.</param>
        /// <returns>True when the domain is valid and the file named by <c>GetFullSiteInfoFileName</c> exists.</returns>
        public bool SiteExists(string domain)
        {
            // Short-circuits: the file system is only touched for a valid domain.
            return SiteInfo.IsValidDomain(domain) && File.Exists(GetFullSiteInfoFileName(domain));
        }
        /// <summary>
        /// Normalizes the domain and hands it to the crawler for processing.
        /// </summary>
        /// <param name="domain">Domain as supplied by the caller; normalized before validation.</param>
        /// <returns>The value returned by <c>_crawler.ProcessSite</c>.</returns>
        /// <exception cref="ArgumentException">The normalized domain is not valid.</exception>
        public bool ProcessSite(string domain)
        {
            var normalizedDomain = SiteInfo.NormalizeDomain(domain);

            if (SiteInfo.IsValidDomain(normalizedDomain))
            {
                return _crawler.ProcessSite(normalizedDomain);
            }

            throw new ArgumentException("Invalid domain");
        }
        /// <summary>
        /// Writes the JSON form of <paramref name="siteInfo"/> to the site-info file of its domain,
        /// trying up to three times when the write fails with an <see cref="IOException"/>.
        /// </summary>
        /// <param name="siteInfo">Info to persist; its <c>Domain</c> selects the target file.</param>
        /// <param name="overwrite">When false, an already existing file is left alone and the call fails.</param>
        /// <returns>
        /// True when the file was written; false for null input, an invalid domain, an existing
        /// file when <paramref name="overwrite"/> is false, three consecutive I/O failures, or any
        /// other exception.
        /// </returns>
        private bool SaveSiteInfoToFile(SiteInfo siteInfo, bool overwrite)
        {
            if ((siteInfo == null) || !SiteInfo.IsValidDomain(siteInfo.Domain))
            {
                return false;
            }

            string targetPath = GetFullSiteInfoFileName(siteInfo.Domain);

            // The existence probe is only made when overwriting is not allowed.
            bool blockedByExistingFile = !overwrite && File.Exists(targetPath);

            if (blockedByExistingFile)
            {
                return false;
            }

            const int maxAttempts = 3;

            for (int attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    using (var output = new FileStream(targetPath, FileMode.Create, FileAccess.Write, FileShare.None))
                    using (var jsonWriter = new StreamWriter(output))
                    {
                        jsonWriter.Write(siteInfo.SerializeToJson());
                    }

                    return true;
                }
                catch (IOException)
                {
                    // Only I/O failures are retried; the last attempt gives up.
                    if (attempt == maxAttempts)
                    {
                        return false;
                    }

                    // Short pause (2 ms) before the next attempt.
                    Thread.Sleep(2);
                }
                catch
                {
                    // Any other failure is not retried.
                    return false;
                }
            }

            // Not reached: every iteration either returns or moves on to the next attempt.
            return true;
        }
        // Example #9
        // 0
        /// <summary>
        /// Validates the submitted site address and, when it is a usable domain, redirects to
        /// the <c>ShowMap</c> action of the <c>Map</c> controller with the normalized domain.
        /// </summary>
        /// <param name="siteAddress">Posted model carrying the address typed by the user.</param>
        /// <returns>
        /// A redirect for a valid address; otherwise the same view again (with an
        /// "Invalid domain" model error when the address itself is the problem).
        /// </returns>
        public ActionResult Index(SiteAddressModel siteAddress)
        {
            if (!ModelState.IsValid)
            {
                return View(siteAddress);
            }

            // A null address is rejected before it reaches NormalizeDomain.
            bool addressIsUsable = (siteAddress.Address != null) &&
                                   SiteInfo.IsValidDomain(SiteInfo.NormalizeDomain(siteAddress.Address));

            if (!addressIsUsable)
            {
                ModelState.AddModelError("", "Invalid domain");
                return View(siteAddress);
            }

            var normalizedDomain = SiteInfo.NormalizeDomain(siteAddress.Address);

            return RedirectToAction("ShowMap", "Map", new { domain = normalizedDomain });
        }
        /// <summary>
        /// Fetches a site from the repository and, depending on its state, re-creates it and/or
        /// asks the crawler to process it.
        /// </summary>
        /// <param name="domain">Domain of the site; normalized before use.</param>
        /// <param name="includeContents">Passed through to the repository lookup.</param>
        /// <param name="contentsTimeStamp">Passed through to the repository lookup.</param>
        /// <param name="createIfNecessary">When true, a site that is not in the repository is created.</param>
        /// <returns>
        /// The stored site, a freshly created site when (re)creation was triggered and saved, or
        /// null when the site is missing and not created, or when saving the new site failed.
        /// </returns>
        /// <exception cref="ArgumentException">The normalized domain is not valid.</exception>
        public Site GetSite(string domain, bool includeContents, long? contentsTimeStamp = null, bool createIfNecessary = false)
        {
            domain = SiteInfo.NormalizeDomain(domain);

            if (!SiteInfo.IsValidDomain(domain))
            {
                throw new ArgumentException("Invalid domain");
            }

            var site = _siteRepository.GetSite(domain, includeContents, contentsTimeStamp);

            bool recreate;
            bool resumeProcessing = false;

            if (site == null)
            {
                // Unknown site: create it only on request.
                recreate = createIfNecessary;
            }
            else
            {
                // Known site: re-create it once its status is older than the refresh period...
                recreate = DateTime.UtcNow - site.Info.StatusTime > TimeSpan.FromDays(RefreshPeriodInDays);

                if (!recreate)
                {
                    // ...or when it ended in a connection/robots.txt problem more than 10 minutes ago.
                    bool endedWithProblem = (site.Info.Status == SiteStatus.ConnectionProblem) ||
                                            (site.Info.Status == SiteStatus.RobotsTxtProblem);

                    recreate = endedWithProblem &&
                               (DateTime.UtcNow - site.Info.StatusTime > TimeSpan.FromMinutes(10));
                }

                // A site still marked Added/Processing is handed to the crawler again,
                // unless refreshing is disabled for it.
                bool notFinished    = (site.Info.Status == SiteStatus.Added) || (site.Info.Status == SiteStatus.Processing);
                bool refreshAllowed = site.Info.RefreshEnabled;

                resumeProcessing = notFinished && refreshAllowed;
            }

            if (recreate)
            {
                site             = new Site();
                site.Info.Domain = domain;

                if (!_siteRepository.SaveSite(site, true))
                {
                    // The new record could not be stored, so nothing is returned or crawled.
                    return null;
                }

                _crawler.ProcessSite(domain);
            }
            else if (resumeProcessing)
            {
                _crawler.ProcessSite(domain);
            }

            return site;
        }
        /// <summary>
        /// Reads the site-contents file of the domain and deserializes its text as JSON, trying
        /// up to three times when reading fails with an <see cref="IOException"/>.
        /// </summary>
        /// <param name="domain">Domain whose contents file is read.</param>
        /// <returns>
        /// The deserialized contents, or null for an invalid domain, a missing file, three
        /// consecutive I/O failures, or any other exception.
        /// </returns>
        private SiteContents LoadSiteContentsFromFile(string domain)
        {
            if (!SiteInfo.IsValidDomain(domain))
            {
                return null;
            }

            string sourcePath = GetFullSiteContentsFileName(domain);

            if (!File.Exists(sourcePath))
            {
                return null;
            }

            const int maxAttempts = 3;

            for (int attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    // The file is opened without sharing, as in every attempt.
                    using (var input = new FileStream(sourcePath, FileMode.Open, FileAccess.Read, FileShare.None))
                    using (var jsonReader = new StreamReader(input))
                    {
                        string json = jsonReader.ReadToEnd();
                        return SiteContents.DeserializeFromJson(json);
                    }
                }
                catch (IOException)
                {
                    // Only I/O failures are retried; the last attempt gives up.
                    if (attempt == maxAttempts)
                    {
                        return null;
                    }

                    // Short pause (2 ms) before the next attempt.
                    Thread.Sleep(2);
                }
                catch
                {
                    // Any other failure is not retried.
                    return null;
                }
            }

            return null;
        }
        /// <summary>
        /// Removes a site: first its info record, then its stored contents.
        /// </summary>
        /// <param name="domain">Domain of the site to remove.</param>
        /// <returns>
        /// True when the domain is valid and <c>DeleteSiteInfo</c> succeeded; the outcome of
        /// <c>DeleteSiteContents</c> does not influence the result.
        /// </returns>
        public bool RemoveSite(string domain)
        {
            // Short-circuits: the info record is only touched for a valid domain.
            bool infoRemoved = SiteInfo.IsValidDomain(domain) && DeleteSiteInfo(domain);

            if (infoRemoved)
            {
                // The return value is intentionally not inspected here.
                DeleteSiteContents(domain);
            }

            return infoRemoved;
        }
        /// <summary>
        /// Removes a site by deleting its info file and then its contents file.
        /// </summary>
        /// <param name="domain">Domain of the site to remove.</param>
        /// <returns>True when the domain is valid and neither delete threw; false otherwise.</returns>
        public bool RemoveSite(string domain)
        {
            if (!SiteInfo.IsValidDomain(domain))
            {
                return false;
            }

            try
            {
                string infoPath = GetFullSiteInfoFileName(domain);
                File.Delete(infoPath);

                string contentsPath = GetFullSiteContentsFileName(domain);
                File.Delete(contentsPath);

                return true;
            }
            catch
            {
                // Any failure (in either step) is reported as "not removed".
                return false;
            }
        }
        /// <summary>
        /// Deletes the blob named after the domain (domain + <c>_siteFileExt</c>).
        /// </summary>
        /// <param name="domain">Domain whose contents blob is deleted.</param>
        /// <returns>True when the delete call completed; false for an invalid domain or when the delete throws.</returns>
        private bool DeleteSiteContents(string domain)
        {
            if (!SiteInfo.IsValidDomain(domain))
            {
                return false;
            }

            var doomedBlob = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);

            try
            {
                doomedBlob.Delete();
                return true;
            }
            catch
            {
                // A failed delete is reported through the return value only.
                return false;
            }
        }
        /// <summary>
        /// Looks up the <c>SiteInfoEntity</c> stored for the domain and deletes it from the info table.
        /// </summary>
        /// <param name="domain">Domain used as the lookup key next to <c>SiteInfoEntity.StaticPartitionKey</c>.</param>
        /// <returns>
        /// True when an entity was found and the delete executed; false for an invalid domain,
        /// a missing entity, or any exception.
        /// </returns>
        private bool DeleteSiteInfo(string domain)
        {
            if (!SiteInfo.IsValidDomain(domain))
            {
                return false;
            }

            var infoTable = GetInfoTable();

            try
            {
                // Fetch the stored entity first; the delete operation is built from it.
                var lookup = TableOperation.Retrieve <SiteInfoEntity>(SiteInfoEntity.StaticPartitionKey, domain);
                var found  = (SiteInfoEntity)infoTable.Execute(lookup).Result;

                if (found == null)
                {
                    // Nothing stored for this domain.
                    return false;
                }

                infoTable.Execute(TableOperation.Delete(found));

                return true;
            }
            catch
            {
                // Lookup, cast and delete failures are all reported as "not deleted".
                return false;
            }
        }
        /// <summary>
        /// Stores <paramref name="siteInfo"/> in the info table as a <c>SiteInfoEntity</c>.
        /// </summary>
        /// <param name="siteInfo">Info to persist; its <c>Domain</c> must pass <c>SiteInfo.IsValidDomain</c>.</param>
        /// <param name="overwrite">
        /// True selects an insert-or-replace operation; false selects a plain insert.
        /// </param>
        /// <returns>True when the table operation executed without throwing; false otherwise.</returns>
        private bool SaveSiteInfo(SiteInfo siteInfo, bool overwrite)
        {
            if ((siteInfo == null) || !SiteInfo.IsValidDomain(siteInfo.Domain))
            {
                return false;
            }

            var infoTable = GetInfoTable();
            var record    = new SiteInfoEntity(siteInfo);

            TableOperation store = overwrite
                                   ? TableOperation.InsertOrReplace(record)
                                   : TableOperation.Insert(record);

            try
            {
                infoTable.Execute(store);
                return true;
            }
            catch
            {
                // A failed execution is reported as "not saved".
                return false;
            }
        }