/// <summary>
/// Produces one <c>BrokenLinkWarning</c> occurrence for every link of the page
/// whose status is <see cref="HttpStatusCode.NotFound"/>.
/// </summary>
/// <param name="sqlRepository">Repository queried for the links of the page.</param>
/// <param name="urlDto">The page being analyzed; pages whose status is not OK yield no warnings.</param>
/// <returns>The warning occurrences found (an empty list when there are none).</returns>
public override IEnumerable<WarningOccurrenceDto> Analyze(ISqlRepository sqlRepository, UrlDto urlDto)
{
    var occurrences = new List<WarningOccurrenceDto>();

    if (urlDto.Status == HttpStatusCode.OK)
    {
        // Read once: the same id is used for the lookup and for every occurrence.
        long pageId = urlDto.UrlId.Value;

        foreach (var pageLink in sqlRepository.GetWebPageLinks(pageId))
        {
            if (link_IsNotBroken(pageLink.Status))
            {
                continue;
            }

            IWarning brokenLinkWarning = GetWarning(typeof(BrokenLinkWarning));
            var occurrence = new WarningOccurrenceDto();
            occurrence.UrlId = pageId;
            occurrence.Message = String.Format(brokenLinkWarning.Description, pageLink.Url);
            occurrence.WarningType = brokenLinkWarning;
            occurrences.Add(occurrence);
        }
    }

    return occurrences;
}

// A link counts as broken only when its status is 404 NotFound.
private static bool link_IsNotBroken(HttpStatusCode linkStatus)
{
    return linkStatus != HttpStatusCode.NotFound;
}
/// <summary>
/// Checks the title of a page: reports an <c>EmptyTitleWarning</c> when the page
/// has no title (or an empty one), and a <c>DuplicatedTitleWarning</c> when
/// another URL stored in the repository carries the same title text.
/// </summary>
/// <param name="sqlRepository">Repository used to look up contents with the same title and their URL.</param>
/// <param name="urlDto">The page being analyzed. Nothing is reported when it has no contents or its status is not OK.</param>
/// <returns>The warning occurrences found (an empty list when there are none).</returns>
/// <remarks>
/// <c>SingleOrDefault</c> is used to pick the title, so a page holding more than
/// one title content makes this method throw <see cref="InvalidOperationException"/>.
/// </remarks>
public override IEnumerable<WarningOccurrenceDto> Analyze(ISqlRepository sqlRepository, UrlDto urlDto)
{
    var occurrences = new List<WarningOccurrenceDto>();

    if (urlDto.Contents == null)
    {
        _log.Debug("urlDto is empty for url:" + urlDto.Url);
        return occurrences;
    }

    if (urlDto.Status != HttpStatusCode.OK)
    {
        return occurrences;
    }

    var titleContent = urlDto.Contents.SingleOrDefault(element => element.ContentType == ContentTypesEnum.Title);
    var titleText = titleContent == null ? null : titleContent.Element;

    _log.Debug(String.Format("Analyze title '{0}' for url '{1}'", titleText == null ? "" : titleText, urlDto.Url));

    // Missing or empty title: report it and stop, there is nothing to compare.
    if (String.IsNullOrEmpty(titleText))
    {
        IWarning emptyTitleWarning = GetWarning(typeof(EmptyTitleWarning));
        occurrences.Add(new WarningOccurrenceDto()
        {
            UrlId = (long)urlDto.UrlId,
            Message = emptyTitleWarning.Description,
            WarningType = emptyTitleWarning
        });
        return occurrences;
    }

    var sameTitleContents = sqlRepository.GetContentForElementText(ContentTypesEnum.Title, titleText);
    if (!sameTitleContents.Any())
    {
        return occurrences;
    }

    // The page itself is stored with this title too, so look for a different URL.
    ContentDto otherContent = sameTitleContents.FirstOrDefault(x => x.UrlId != urlDto.UrlId);
    if (otherContent == null)
    {
        return occurrences;
    }

    UrlDto otherUrlDto;
    sqlRepository.FindUrl(otherContent.UrlId, out otherUrlDto);

    IWarning duplicatedTitleWarning = GetWarning(typeof(DuplicatedTitleWarning));
    occurrences.Add(new WarningOccurrenceDto()
    {
        UrlId = (long)urlDto.UrlId,
        Message = String.Format(duplicatedTitleWarning.Description, titleText, otherUrlDto.Url),
        WarningType = duplicatedTitleWarning
    });

    return occurrences;
}
/// <summary>
/// Runs every rule in <c>_rules</c> against the page and appends the warning
/// occurrences each rule returns to <paramref name="report"/>.
/// </summary>
/// <param name="sqlRepository">Repository passed through to each rule.</param>
/// <param name="urlDto">The page to analyze.</param>
/// <param name="report">Report whose <c>WarningOccurrences</c> collection receives the results.</param>
public void ProcessPage(ISqlRepository sqlRepository, UrlDto urlDto, Report report)
{
    foreach (var currentRule in _rules)
    {
        // Analyze first, then append, so a rule runs before the report is touched.
        var found = currentRule.Analyze(sqlRepository, urlDto);
        report.WarningOccurrences.AddRange(found);
    }
}
/// <summary>
/// Looks up a row of the <c>Urls</c> table by its <c>url_id</c>.
/// </summary>
/// <param name="urlId">Identifier of the URL row to load.</param>
/// <param name="urlDto">
/// Receives the mapped row when one is found; otherwise a freshly constructed <c>UrlDto</c>.
/// </param>
/// <returns><c>true</c> when a row was read, <c>false</c> otherwise.</returns>
internal bool FindUrl(long urlId, out UrlDto urlDto)
{
    // Callers always get a non-null DTO, even when no row matches.
    urlDto = new UrlDto();

    string query = "SELECT * FROM Urls where url_id='" + urlId + "'";

    using (var command = new SQLiteCommand(query, _sqliteCon))
    using (var reader = command.ExecuteReader())
    {
        if (!reader.Read())
        {
            return false;
        }

        MapDataReaderToUrlDto(reader, out urlDto);
        return true;
    }
}
/// <summary>
/// Loads all rows of the <c>Contents</c> table whose <c>url_id</c> matches the
/// given URL and maps each one to a <c>ContentDto</c>.
/// </summary>
/// <param name="urlDto">The URL whose <c>UrlId</c> is used to select the content rows.</param>
/// <returns>The mapped contents; an empty array when no row matches.</returns>
internal ContentDto[] GetContentsForUrlDto(UrlDto urlDto)
{
    var contents = new List<ContentDto>();
    var selectSql = "SELECT * FROM Contents where url_id='" + urlDto.UrlId + "'";

    using (var selectCommand = new SQLiteCommand(selectSql, _sqliteCon))
    using (var dataReader = selectCommand.ExecuteReader())
    {
        // A URL can own several content rows, so consume the whole result set
        // (the previous `if` stopped after the first row).
        while (dataReader.Read())
        {
            ContentDto contentDto;
            MapDataReaderToContentDto(dataReader, out contentDto);
            contents.Add(contentDto);
        }
    }

    return contents.ToArray();
}
/// <summary>
/// Records a crawled page in the repository: updates the existing URL row (or
/// creates a new one), attaches the links and contents parsed from the page and
/// saves the result.
/// </summary>
/// <param name="crawledPage">
/// The page reported by the crawler. Pages whose absolute URI does not start
/// with the absolute URI of <c>_uriToCrawl</c> are ignored.
/// </param>
static void AddNewPage(CrawledPage crawledPage)
{
    if (crawledPage.Uri.AbsoluteUri.StartsWith(_uriToCrawl.AbsoluteUri) == false)
    {
        return;
    }

    lock (_sqlRepository)
    {
        // The response is empty (null) for pages that do not respond.
        var response = crawledPage.HttpWebResponse;

        UrlDto urlDto;
        if (_sqlRepository.FindUrl(crawledPage.Uri.AbsoluteUri, out urlDto))
        {
            if (response != null)
            {
                urlDto.Status = response.StatusCode;

                // This can be text/html or text/html;charset=utf-8
                if (response.ContentType.Contains("text/html"))
                {
                    urlDto.IsWebPage = true;
                }
            }
        }
        else
        {
            urlDto = new UrlDto
            {
                Url = crawledPage.Uri.AbsoluteUri,
                IsWebPage = true
            };

            // Same guard as above: a page that did not respond has no status to
            // copy, so Status keeps the value a new UrlDto starts with instead
            // of failing with a NullReferenceException.
            if (response != null)
            {
                urlDto.Status = response.StatusCode;
            }
        }

        var webPageParsing = new WebPageParsing(crawledPage);
        var list = GetLinks(webPageParsing, urlDto);
        urlDto.Links = list.ToArray();
        urlDto.Contents = GetContent(webPageParsing);

        _sqlRepository.InsertOrUpdateUrl(urlDto);
    }
}
/// <summary>
/// Test helper: stores a URL with the given status, then stores a second URL
/// with status OK that has a single link targeting the first one.
/// </summary>
/// <param name="repository">Repository both URLs are inserted into.</param>
/// <param name="httpStatusCode">Status given to the link target.</param>
/// <returns>The value returned by the repository for the linking (OK) URL.</returns>
static UrlDto InsertUrlDtoWithOneStatusCodeOKLink(ISqlRepository repository, HttpStatusCode httpStatusCode)
{
    // The link target must be stored first so that it has an id to point at.
    var target = new UrlDto();
    target.Status = httpStatusCode;
    repository.InsertOrUpdateUrl(target);

    var linkToTarget = new LinkDto();
    linkToTarget.TargetUrlId = (long)target.UrlId;

    var page = new UrlDto();
    page.Status = HttpStatusCode.OK;
    page.Links = new[] { linkToTarget };

    return repository.InsertOrUpdateUrl(page);
}
/// <summary>
/// Analysis entry point that concrete rules must implement: given a repository
/// and a URL record, returns zero or more <c>WarningOccurrenceDto</c> items.
/// </summary>
/// <param name="sqlRepository">Repository made available to the rule for additional lookups.</param>
/// <param name="urlDto">The URL record to analyze.</param>
/// <returns>The warning occurrences produced for <paramref name="urlDto"/>.</returns>
public abstract IEnumerable<WarningOccurrenceDto> Analyze(ISqlRepository sqlRepository, UrlDto urlDto);
/// <summary>
/// Builds a <c>UrlDto</c> from the row the reader is currently positioned on.
/// Column positions are taken from the <c>UrlsField</c> enumeration.
/// </summary>
/// <param name="dataReader">Reader positioned on a row of the <c>Urls</c> table.</param>
/// <param name="urlDto">Receives the mapped DTO.</param>
private static void MapDataReaderToUrlDto(SQLiteDataReader dataReader, out UrlDto urlDto)
{
    const int idColumn = (int)UrlsField.UrlId;
    const int urlColumn = (int)UrlsField.Url;
    const int statusColumn = (int)UrlsField.Status;
    const int isWebPageColumn = (int)UrlsField.IsWebPage;

    urlDto = new UrlDto();
    urlDto.UrlId = dataReader.GetInt64(idColumn);

    // The url column holds the URL-encoded form; expose the decoded form.
    string storedUrl = dataReader.GetString(urlColumn);
    urlDto.Url = HttpUtility.UrlDecode(storedUrl);

    int statusCode = dataReader.GetInt32(statusColumn);
    urlDto.Status = (HttpStatusCode)statusCode;

    // is_webpage is stored as an integer flag: any positive value means true.
    int webPageFlag = dataReader.GetInt32(isWebPageColumn);
    urlDto.IsWebPage = webPageFlag > 0;
}
/// <summary>
/// Saves a URL row and its attached contents and links.
/// When <c>UrlId</c> is already set the row is written with
/// <c>INSERT OR REPLACE</c>; otherwise a new row is inserted and the DTO
/// receives the id returned by <c>GetLastInsertedRowId</c>.
/// </summary>
/// <param name="urlDto">The URL to store. Its <c>Url</c> is URL-encoded before being written.</param>
/// <returns>The same <paramref name="urlDto"/> instance, with <c>UrlId</c> populated.</returns>
internal UrlDto InsertOrUpdateUrl(UrlDto urlDto)
{
    string encodedUrl = HttpUtility.UrlEncode(urlDto.Url);

    if (!urlDto.UrlId.HasValue)
    {
        // No id yet: plain insert, then pick up the generated row id.
        string insertSql = String.Format(
            "INSERT INTO Urls (url, status, is_webpage) VALUES ('{0}', '{1}', '{2}')",
            encodedUrl,
            urlDto.StatusRaw,
            urlDto.IsWebPage ? 1 : 0);
        ExecuteSqlCommand(insertSql);
        urlDto.UrlId = GetLastInsertedRowId();
    }
    else
    {
        string upsertSql = String.Format(
            "INSERT OR REPLACE INTO Urls (url_id, url, status, is_webpage) VALUES ('{0}', '{1}', '{2}', '{3}')",
            urlDto.UrlId,
            encodedUrl,
            urlDto.StatusRaw,
            urlDto.IsWebPage ? 1 : 0);
        ExecuteSqlCommand(upsertSql);
    }

    // Children are stamped with the id of the row written above (0 if still unset).
    long ownerId = urlDto.UrlId ?? 0;

    if (urlDto.Contents != null)
    {
        foreach (var childContent in urlDto.Contents)
        {
            childContent.UrlId = ownerId;
            _contentDao.InsertOrUpdateContent(childContent);
        }
    }

    if (urlDto.Links != null)
    {
        foreach (var childLink in urlDto.Links)
        {
            childLink.SourceUrlId = ownerId;
            _linkDao.AddLink(childLink);
        }
    }

    return urlDto;
}
/// <summary>
/// Converts the links parsed from a page into <c>LinkDto</c> items, inserting a
/// URL row for every link target the repository does not know yet.
/// </summary>
/// <param name="webPageParsing">Parser that supplies the links of the page.</param>
/// <param name="urlDto">The page the links belong to (not read by this method).</param>
/// <returns>One <c>LinkDto</c> per parsed link, in the order the parser returned them.</returns>
private static List<LinkDto> GetLinks(WebPageParsing webPageParsing, UrlDto urlDto)
{
    var parsedLinks = webPageParsing.GetLinks();
    var result = new List<LinkDto>();

    foreach (Uri target in parsedLinks)
    {
        UrlDto targetDto;
        bool alreadyStored = _sqlRepository.FindUrl(target.AbsoluteUri, out targetDto);

        if (!alreadyStored)
        {
            // Unknown target: store it now so the link has an id to reference.
            targetDto = _sqlRepository.InsertOrUpdateUrl(new UrlDto { Url = target.AbsoluteUri });
        }

        var linkDto = new LinkDto();
        linkDto.TargetUrlId = targetDto.UrlId ?? 0;
        result.Add(linkDto);
    }

    return result;
}
/// <summary>
/// Looks up a URL by its id by delegating to <c>_urlDao</c>.
/// </summary>
/// <param name="urlId">Identifier of the URL to find.</param>
/// <param name="urlDto">Receives the DTO produced by the DAO.</param>
/// <returns>The result reported by the DAO.</returns>
public bool FindUrl(long urlId, out UrlDto urlDto)
{
    bool found = _urlDao.FindUrl(urlId, out urlDto);
    return found;
}
/// <summary>
/// Looks up a URL by its address by delegating to <c>_urlDao</c>.
/// </summary>
/// <param name="url">Address of the URL to find.</param>
/// <param name="urlDto">Receives the DTO produced by the DAO.</param>
/// <returns>The result reported by the DAO.</returns>
public bool FindUrl(string url, out UrlDto urlDto)
{
    bool found = _urlDao.FindUrl(url, out urlDto);
    return found;
}
/// <summary>
/// Stores the given URL by delegating to <c>_urlDao</c>.
/// </summary>
/// <param name="urlDto">The URL to insert or update.</param>
/// <returns>The DTO returned by the DAO.</returns>
public UrlDto InsertOrUpdateUrl(UrlDto urlDto)
{
    UrlDto stored = _urlDao.InsertOrUpdateUrl(urlDto);
    return stored;
}
/// <summary>
/// Test helper: builds a <c>UrlDto</c> with status OK whose only content is a
/// title element carrying the given text.
/// </summary>
/// <param name="title">Text stored in the title content.</param>
/// <returns>The newly built DTO (it is not saved anywhere).</returns>
static UrlDto GetValidTitle(string title)
{
    var titleContent = new ContentDto();
    titleContent.ContentType = ContentTypesEnum.Title;
    titleContent.Element = title;

    var page = new UrlDto();
    page.Status = HttpStatusCode.OK;
    page.Contents = new[] { titleContent };

    return page;
}