/// <summary>
/// Serializes <paramref name="siteContents"/> to JSON and uploads it as the text of the
/// block blob named <c>domain + _siteFileExt</c> in the contents container.
/// </summary>
/// <param name="domain">Domain used to build the blob name; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <param name="siteContents">Contents to serialize and upload.</param>
/// <param name="overwrite">When false, an already existing blob is left untouched and the call fails.</param>
/// <returns>
/// True when the upload completed; false for an invalid domain, for an existing blob while
/// <paramref name="overwrite"/> is false, or when serializing/uploading throws.
/// </returns>
private bool SaveSiteContents(string domain, SiteContents siteContents, bool overwrite)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return false;
    }

    var target = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);

    // The existence probe runs outside the try block, so an exception from it reaches the caller.
    bool blockedByExistingBlob = !overwrite && target.Exists();
    if (blockedByExistingBlob)
    {
        return false;
    }

    try
    {
        var json = siteContents.SerializeToJson();
        target.UploadText(json);
        return true;
    }
    catch
    {
        // Any failure while serializing or uploading is reported as "not saved".
        return false;
    }
}
/// <summary>
/// Creates a crawler bound to one domain, a site repository and a cancellation token, and
/// sets the crawl defaults (220 desired pages, crawl delay 100, 20 simultaneous requests).
/// </summary>
/// <param name="domain">Domain to crawl; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <param name="siteRepository">Repository stored for later use by the crawler; must not be null.</param>
/// <param name="cancelToken">Token stored for later use by the crawler.</param>
/// <exception cref="ArgumentException">The domain is not valid.</exception>
/// <exception cref="ArgumentNullException"><paramref name="siteRepository"/> is null.</exception>
public SiteCrawler(string domain, ISiteRepository siteRepository, CancellationToken cancelToken)
{
    Trace.TraceInformation("Crawler: Starting processing domain " + domain);

    if (!SiteInfo.IsValidDomain(domain))
    {
        throw new ArgumentException("Invalid domain");
    }

    if (siteRepository == null)
    {
        // The single-string ArgumentNullException constructor takes the parameter NAME,
        // not a free-form message, so pass the name to get a correct ParamName/Message.
        throw new ArgumentNullException("siteRepository");
    }

    // No null check for cancelToken: System.Threading.CancellationToken is a value type,
    // so a comparison with null is always false and the old guard could never fire.

    _domain = domain;

    // Ordinal comparison: domain names are identifiers, not culture-sensitive text.
    _rawDomain = _domain.StartsWith("www.", StringComparison.Ordinal) ? _domain.Substring(4) : _domain;

    _siteRepository = siteRepository;
    _cancelToken = cancelToken;

    DesiredNumberOfPages = 220;
    CrawlDelay = 100;
    MaxSimultaneousRequests = 20;

    // Surround the extension list with spaces.
    // NOTE(review): presumably so a lookup can match " ext " as a whole token - confirm against
    // the code that searches this field. If the field is static, every constructed crawler
    // adds another pair of spaces - confirm it is an instance field.
    _binaryExtensions = " " + _binaryExtensions + " ";
}
/// <summary>
/// Retrieves the <c>SiteInfoEntity</c> stored under <c>SiteInfoEntity.StaticPartitionKey</c>
/// with <paramref name="domain"/> as the row key and converts it to a <c>SiteInfo</c>.
/// </summary>
/// <param name="domain">Domain to look up; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// The converted site info, or null when the domain is invalid, no matching entity comes back,
/// or executing/converting throws.
/// </returns>
private SiteInfo LoadSiteInfo(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return null;
    }

    var infoTable = GetInfoTable();
    var lookup = TableOperation.Retrieve<SiteInfoEntity>(SiteInfoEntity.StaticPartitionKey, domain);

    try
    {
        var row = infoTable.Execute(lookup).Result as SiteInfoEntity;
        return row == null ? null : row.ToSiteInfo();
    }
    catch
    {
        // Failures during the lookup are treated the same as "not found".
        return null;
    }
}
/// <summary>
/// Downloads the text of the block blob named <c>domain + _siteFileExt</c> and deserializes
/// it from JSON into a <c>SiteContents</c>.
/// </summary>
/// <param name="domain">Domain used to build the blob name; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// The deserialized contents, or null when the domain is invalid, the blob does not exist,
/// or downloading/deserializing throws.
/// </returns>
private SiteContents LoadSiteContents(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return null;
    }

    var source = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);

    // The existence probe is not guarded by the try block; an exception from it reaches the caller.
    if (!source.Exists())
    {
        return null;
    }

    try
    {
        return SiteContents.DeserializeFromJson(source.DownloadText());
    }
    catch
    {
        return null;
    }
}
/// <summary>
/// Reports whether site info can be loaded for <paramref name="domain"/>.
/// </summary>
/// <param name="domain">Domain to check.</param>
/// <returns>
/// True when the domain passes <c>SiteInfo.IsValidDomain</c> and <c>LoadSiteInfo</c> returns
/// a non-null value; otherwise false.
/// </returns>
public bool SiteExists(string domain)
{
    // Short-circuit keeps LoadSiteInfo from being called for an invalid domain.
    return SiteInfo.IsValidDomain(domain) && LoadSiteInfo(domain) != null;
}
/// <summary>
/// Reports whether the site-info file for <paramref name="domain"/> exists on disk.
/// </summary>
/// <param name="domain">Domain to check.</param>
/// <returns>
/// True when the domain passes <c>SiteInfo.IsValidDomain</c> and the file named by
/// <c>GetFullSiteInfoFileName</c> exists; otherwise false.
/// </returns>
public bool SiteExists(string domain)
{
    // Short-circuit keeps the file name from being built for an invalid domain.
    return SiteInfo.IsValidDomain(domain) && File.Exists(GetFullSiteInfoFileName(domain));
}
/// <summary>
/// Normalizes <paramref name="domain"/>, validates it and hands it to the crawler.
/// </summary>
/// <param name="domain">Domain to process; normalized with <c>SiteInfo.NormalizeDomain</c> before validation.</param>
/// <returns>Whatever <c>_crawler.ProcessSite</c> returns for the normalized domain.</returns>
/// <exception cref="ArgumentException">The normalized domain fails <c>SiteInfo.IsValidDomain</c>.</exception>
public bool ProcessSite(string domain)
{
    string normalized = SiteInfo.NormalizeDomain(domain);

    if (SiteInfo.IsValidDomain(normalized))
    {
        return _crawler.ProcessSite(normalized);
    }

    throw new ArgumentException("Invalid domain");
}
/// <summary>
/// Writes the JSON form of <paramref name="siteInfo"/> to the site-info file for its domain,
/// making up to three attempts when the write fails with an <see cref="IOException"/>.
/// </summary>
/// <param name="siteInfo">Info to persist; its <c>Domain</c> must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <param name="overwrite">When false, an already existing file is left untouched and the call fails.</param>
/// <returns>
/// True when the file was written; false for null/invalid input, for an existing file while
/// <paramref name="overwrite"/> is false, after three I/O failures, or on any other exception.
/// </returns>
private bool SaveSiteInfoToFile(SiteInfo siteInfo, bool overwrite)
{
    if (siteInfo == null || !SiteInfo.IsValidDomain(siteInfo.Domain))
    {
        return false;
    }

    string path = GetFullSiteInfoFileName(siteInfo.Domain);
    if (!overwrite && File.Exists(path))
    {
        return false;
    }

    for (int attemptsLeft = 3; attemptsLeft > 0; attemptsLeft--)
    {
        try
        {
            // FileShare.None: the file is opened exclusively while it is being rewritten.
            using (var stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None))
            using (var writer = new StreamWriter(stream))
            {
                writer.Write(siteInfo.SerializeToJson());
            }

            return true;
        }
        catch (IOException)
        {
            // Pause briefly before the next attempt; no pause after the final failure.
            if (attemptsLeft > 1)
            {
                Thread.Sleep(2);
            }
        }
        catch
        {
            // Anything other than an I/O error is not retried.
            return false;
        }
    }

    return false;
}
/// <summary>
/// Handles the submitted site address: redirects to the map for a valid domain, otherwise
/// redisplays the form.
/// </summary>
/// <param name="siteAddress">Bound model whose <c>Address</c> holds the entered domain.</param>
/// <returns>
/// A redirect to action "ShowMap" on controller "Map" carrying the normalized domain when the
/// model state is valid and the normalized address passes <c>SiteInfo.IsValidDomain</c>;
/// otherwise the view for <paramref name="siteAddress"/> (with an "Invalid domain" model error
/// added when the address itself was rejected).
/// </returns>
public ActionResult Index(SiteAddressModel siteAddress)
{
    if (!ModelState.IsValid)
    {
        return View(siteAddress);
    }

    // The null test comes first so NormalizeDomain is never handed a null address.
    bool addressAccepted = (siteAddress.Address != null)
        && SiteInfo.IsValidDomain(SiteInfo.NormalizeDomain(siteAddress.Address));

    if (!addressAccepted)
    {
        ModelState.AddModelError("", "Invalid domain");
        return View(siteAddress);
    }

    return RedirectToAction("ShowMap", "Map", new { domain = SiteInfo.NormalizeDomain(siteAddress.Address) });
}
/// <summary>
/// Fetches the site for <paramref name="domain"/> from the repository and, depending on its
/// state, either recreates it or asks the crawler to process it.
/// </summary>
/// <remarks>
/// A fresh <c>Site</c> is created (and saved with overwrite) when:
/// the repository has no site and <paramref name="createIfNecessary"/> is true; or the stored
/// status time is older than <c>RefreshPeriodInDays</c> days; or the status is
/// <c>ConnectionProblem</c>/<c>RobotsTxtProblem</c> and older than 10 minutes.
/// Otherwise an existing site in status <c>Added</c> or <c>Processing</c> with
/// <c>RefreshEnabled</c> set is passed to the crawler again.
/// </remarks>
/// <param name="domain">Domain to look up; normalized with <c>SiteInfo.NormalizeDomain</c> first.</param>
/// <param name="includeContents">Forwarded to the repository lookup.</param>
/// <param name="contentsTimeStamp">Forwarded to the repository lookup.</param>
/// <param name="createIfNecessary">Whether a missing site should be created.</param>
/// <returns>
/// The stored site, the newly created site, or null when nothing is stored and nothing was
/// created, or when saving the newly created site failed.
/// </returns>
/// <exception cref="ArgumentException">The normalized domain fails <c>SiteInfo.IsValidDomain</c>.</exception>
public Site GetSite(string domain, bool includeContents, long? contentsTimeStamp = null, bool createIfNecessary = false)
{
    domain = SiteInfo.NormalizeDomain(domain);
    if (!SiteInfo.IsValidDomain(domain))
    {
        throw new ArgumentException("Invalid domain");
    }

    var site = _siteRepository.GetSite(domain, includeContents, contentsTimeStamp);

    bool needToCreate;
    bool needToProcess;

    if (site == null)
    {
        // Nothing stored: only the caller's flag can trigger creation.
        needToCreate = createIfNecessary;
        needToProcess = false;
    }
    else
    {
        bool refreshPeriodElapsed =
            DateTime.UtcNow - site.Info.StatusTime > TimeSpan.FromDays(RefreshPeriodInDays);

        if (refreshPeriodElapsed)
        {
            needToCreate = true;
        }
        else
        {
            bool endedWithProblem = (site.Info.Status == SiteStatus.ConnectionProblem)
                || (site.Info.Status == SiteStatus.RobotsTxtProblem);

            // Problem states are retried once they are more than ten minutes old.
            needToCreate = endedWithProblem
                && (DateTime.UtcNow - site.Info.StatusTime > TimeSpan.FromMinutes(10));
        }

        bool unfinished = (site.Info.Status == SiteStatus.Added)
            || (site.Info.Status == SiteStatus.Processing);

        // RefreshEnabled only gates re-processing; it does not affect the creation decision.
        needToProcess = unfinished && site.Info.RefreshEnabled;
    }

    if (needToCreate)
    {
        site = new Site();
        site.Info.Domain = domain;

        if (_siteRepository.SaveSite(site, true))
        {
            _crawler.ProcessSite(domain);
        }
        else
        {
            // The new site could not be saved, so nothing is returned.
            site = null;
        }
    }
    else if (needToProcess)
    {
        _crawler.ProcessSite(domain);
    }

    return site;
}
/// <summary>
/// Reads the site-contents file for <paramref name="domain"/> and deserializes it from JSON,
/// making up to three attempts when the read fails with an <see cref="IOException"/>.
/// </summary>
/// <param name="domain">Domain whose contents file is read; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// The deserialized contents, or null when the domain is invalid, the file does not exist,
/// three attempts failed with I/O errors, or any other exception occurred.
/// </returns>
private SiteContents LoadSiteContentsFromFile(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return null;
    }

    string fileName = GetFullSiteContentsFileName(domain);
    if (!File.Exists(fileName))
    {
        return null;
    }

    for (int attemptsLeft = 3; attemptsLeft > 0; attemptsLeft--)
    {
        try
        {
            // FileShare.Read (not None): this is a read-only open, so other readers may share
            // the file. An exclusive open made concurrent loads fail with sharing violations
            // and, after the retries, report existing contents as missing. Write access is
            // still not shared, so a writer cannot modify the file while it is being read.
            using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (var reader = new StreamReader(stream))
            {
                return SiteContents.DeserializeFromJson(reader.ReadToEnd());
            }
        }
        catch (IOException)
        {
            // Pause briefly before the next attempt; no pause after the final failure.
            if (attemptsLeft > 1)
            {
                Thread.Sleep(2);
            }
        }
        catch
        {
            // Anything other than an I/O error (e.g. a deserialization failure) is not retried.
            return null;
        }
    }

    return null;
}
/// <summary>
/// Removes the stored info for <paramref name="domain"/> and then its stored contents.
/// </summary>
/// <param name="domain">Domain to remove; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// False when the domain is invalid or <c>DeleteSiteInfo</c> fails; true otherwise. The result
/// of <c>DeleteSiteContents</c> does not influence the return value.
/// </returns>
public bool RemoveSite(string domain)
{
    bool infoRemoved = SiteInfo.IsValidDomain(domain) && DeleteSiteInfo(domain);
    if (!infoRemoved)
    {
        return false;
    }

    // Contents removal is attempted only after the info is gone; its outcome is ignored.
    DeleteSiteContents(domain);
    return true;
}
/// <summary>
/// Deletes the site-info file and then the site-contents file for <paramref name="domain"/>.
/// </summary>
/// <param name="domain">Domain to remove; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// True when both delete calls completed without throwing; false when the domain is invalid
/// or an exception occurred (in which case the second file may not have been attempted).
/// </returns>
public bool RemoveSite(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return false;
    }

    try
    {
        var infoPath = GetFullSiteInfoFileName(domain);
        File.Delete(infoPath);

        var contentsPath = GetFullSiteContentsFileName(domain);
        File.Delete(contentsPath);

        return true;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Deletes the block blob named <c>domain + _siteFileExt</c> from the contents container.
/// </summary>
/// <param name="domain">Domain used to build the blob name; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// True when the delete call completed without throwing; false when the domain is invalid
/// or the delete call threw.
/// </returns>
private bool DeleteSiteContents(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return false;
    }

    var doomed = GetContentsBlobContainer().GetBlockBlobReference(domain + _siteFileExt);

    try
    {
        doomed.Delete();
        return true;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Deletes the <c>SiteInfoEntity</c> stored under <c>SiteInfoEntity.StaticPartitionKey</c>
/// with <paramref name="domain"/> as the row key.
/// </summary>
/// <param name="domain">Domain whose entity is removed; must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <returns>
/// True when an entity was found and the delete operation executed; false when the domain is
/// invalid, no entity was found, or any table call threw.
/// </returns>
private bool DeleteSiteInfo(string domain)
{
    if (!SiteInfo.IsValidDomain(domain))
    {
        return false;
    }

    var infoTable = GetInfoTable();

    try
    {
        // Fetch the stored entity; it is what gets passed to the delete operation.
        var lookup = TableOperation.Retrieve<SiteInfoEntity>(SiteInfoEntity.StaticPartitionKey, domain);
        var existing = (SiteInfoEntity)infoTable.Execute(lookup).Result;

        if (existing == null)
        {
            // Nothing stored for this domain.
            return false;
        }

        infoTable.Execute(TableOperation.Delete(existing));
        return true;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Stores <paramref name="siteInfo"/> in the info table as a <c>SiteInfoEntity</c>.
/// </summary>
/// <param name="siteInfo">Info to persist; its <c>Domain</c> must pass <c>SiteInfo.IsValidDomain</c>.</param>
/// <param name="overwrite">
/// True to use an insert-or-replace operation; false to use a plain insert operation.
/// </param>
/// <returns>
/// True when the table operation executed without throwing; false for null/invalid input
/// or when execution threw.
/// </returns>
private bool SaveSiteInfo(SiteInfo siteInfo, bool overwrite)
{
    if (siteInfo == null || !SiteInfo.IsValidDomain(siteInfo.Domain))
    {
        return false;
    }

    var infoTable = GetInfoTable();
    var row = new SiteInfoEntity(siteInfo);

    TableOperation write = overwrite
        ? TableOperation.InsertOrReplace(row)
        : TableOperation.Insert(row);

    try
    {
        infoTable.Execute(write);
        return true;
    }
    catch
    {
        return false;
    }
}