/// <summary>
/// Runs the stored content through a <c>ContentParser</c> and writes every processed record
/// that has an entry in <c>GetFriendlyNames()</c> to a file beneath <paramref name="directory"/>.
/// Same-host links that have a friendly name are rewritten to that name.
/// </summary>
/// <param name="directory">
/// Output folder. It is also parsed as an absolute URI; when <c>RebaseLinks</c> is set, that URI
/// is the base for rewritten links, otherwise the site base URI is used.
/// </param>
public void Export(string directory)
{
    Dictionary<string, string> friendlyNames = GetFriendlyNames();
    Uri exportRoot = new Uri(directory, UriKind.Absolute);

    ContentParser parser = new ContentParser(_content, _baseUri);

    // Identity handler: every element is handed back unchanged.
    parser.RewriteElement += e => e;

    parser.RewriteUri += uri =>
    {
        string friendly;
        if (!uri.IsSameHost(_baseUri) ||
            !friendlyNames.TryGetValue(uri.NormalizedPathAndQuery(), out friendly))
        {
            // Foreign host, or no friendly name registered: leave the link alone.
            return uri;
        }

        Uri linkBase = RebaseLinks ? exportRoot : _baseUri;
        return new Uri(linkBase, friendly.TrimStart('/', '\\'));
    };

    parser.RewriteAll = true;
    parser.RelativeUri = RebaseLinks;
    parser.Reformat = Reformat;
    parser.IndentChars = " ";

    parser.ProcessAll(
        (record, bytes) =>
        {
            string relative;
            if (!friendlyNames.TryGetValue(record.ContentUri, out relative))
            {
                return;
            }

            // Friendly names use '/' separators; convert to a rooted-relative file path.
            string target = Path.Combine(directory, relative.Replace('/', '\\').TrimStart('\\'));
            string folder = Path.GetDirectoryName(target);
            if (!Directory.Exists(folder))
            {
                Directory.CreateDirectory(folder);
            }

            File.WriteAllBytes(target, bytes);
        }
    );
}
/// <summary>
/// Calls <c>CopyTo</c> with <paramref name="writer"/> and the <c>ProcessFile</c> method of a
/// <c>ContentParser</c> built over that same storage. When <c>RebaseLinks</c> is set, URIs on
/// the same host as the site base URI are rewritten onto <paramref name="target"/>.
/// </summary>
/// <param name="target">Absolute URI used as the new base for same-host links.</param>
/// <param name="writer">Storage passed to both the parser and <c>CopyTo</c>.</param>
public void ConvertTo(string target, ContentStorage writer)
{
    Uri targetRoot = new Uri(target, UriKind.Absolute);

    // Left-hand names are parser members; the right-hand Reformat is this instance's setting.
    ContentParser rewriter = new ContentParser(writer, _baseUri)
    {
        Reformat = Reformat,
        RelativeUri = false
    };

    if (RebaseLinks)
    {
        rewriter.RewriteUri += uri =>
            uri.IsSameHost(_baseUri) ? new Uri(targetRoot, uri.PathAndQuery) : uri;
    }

    CopyTo(writer, rewriter.ProcessFile);
}
/// <summary>
/// For every configured document type that declares optimizations, builds a
/// <c>ContentParser</c>, attaches the XML and/or regex rewriters for that type, and processes
/// the records whose MIME type belongs to the type or one of its aliases.
/// </summary>
/// <param name="filter">
/// Optional predicate; when non-null only records it accepts are processed.
/// </param>
private void RunOptimizer(Predicate<ContentRecord> filter)
{
    // Substitution values handed to the rewriters; the page.* entries are refreshed
    // each time the parser reports a context change.
    Dictionary<string, string> variables =
        new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    variables.Add("site.uri", _baseUri.AbsoluteUri);

    foreach (var docType in _config.DocumentTypes)
    {
        if (docType.Optimizations == null)
        {
            continue;
        }

        // MIME types covered by this document type: its own plus those of its aliases.
        var family = new[] { docType }.Concat(docType.Aliases.SafeEnumeration());
        ILookup<string, HttpCloneFileType> byMime = family.ToLookup(a => a.MimeType);

        ContentParser parser = new ContentParser(_content, _baseUri);
        parser.Reformat = CondenseHtml;
        parser.RelativeUri = true;
        parser.IndentChars = String.Empty;

        bool isHtml = docType.Type == ContentFormat.Html;
        bool isXml = docType.Type == ContentFormat.Xml;

        if (isHtml && _config.Searching != null && _config.Searching.FormXPath != null)
        {
            parser.RewriteXmlDocument += RewriteSearchForm;
        }

        // The rewriters are constructed only for their effect on the parser they are given;
        // the instances themselves are not kept.
        if (isHtml || isXml)
        {
            new XmlRewriter(parser, docType.Optimizations, variables);
        }
        new RegexRewriter(parser, docType.Optimizations, variables);

        parser.ContextChanged += current =>
        {
            variables["page.path"] = current.ContentUri;
            variables["page.uri"] = new Uri(_baseUri, current.ContentUri).AbsoluteUri;
            variables["page.mime"] = current.MimeType;
        };

        parser.Process(
            candidate => (filter == null || filter(candidate)) && byMime.Contains(candidate.MimeType));
    }
}
/// <summary>
/// Creates a collector that stores its configuration, work queue and content under
/// <paramref name="directory"/> and queues <paramref name="uriStart"/> as the first URI.
/// </summary>
/// <param name="directory">Working folder; created if it does not exist.</param>
/// <param name="uriStart">
/// Absolute start URI. The site base URI is derived from it by resolving "/" against it.
/// </param>
public SiteCollector(string directory, string uriStart)
{
    CrawlTime = DateTime.Now;
    _instanceId = Guid.NewGuid().ToUInt64();

    // Parsed once and reused for both the base URI and the initial queue entry.
    Uri startUri = new Uri(uriStart, UriKind.Absolute);
    _baseUri = new Uri(startUri, "/");

    bool directoryMissing = !Directory.Exists(directory);
    if (directoryMissing)
    {
        Directory.CreateDirectory(directory);
    }

    _config = Config.ReadConfig(_baseUri, directory);
    _excluded = new PathExclusionList();

    string queueFile = Path.Combine(directory, "workqueue.txt");
    _queue = new TextQueue(queueFile, true);

    _data = new ContentStorage(directory, false);
    _parser = new ContentParser(_data, _baseUri);
    _parser.VisitUri += AddUri;

    // Defaults are assigned before the first AddUri call below.
    AddUrlsFound = true;
    UpdateSearchTemplate = true;
    MaxCrawlAge = TimeSpan.MaxValue;

    AddUri(startUri);
}
/// <summary>
/// Regenerates the record stored at <c>TemplatePath</c> from the page configured in
/// <c>_config.Searching.TemplateUri</c>, then makes sure a record exists at
/// <c>SearchCssPath</c>.
/// </summary>
/// <param name="forced">
/// When <c>true</c> the template is regenerated even if the source page's original hash
/// equals the one already recorded; otherwise only when the two hashes differ.
/// </param>
private void UpdateTemplate(bool forced)
{
    string tempPath = new Uri(_baseUri, _config.Searching.TemplateUri).NormalizedPathAndQuery();

    // Start from the existing template record if there is one, otherwise from a new record.
    ContentRecord record;
    ContentRecord.Builder update;
    if (_data.TryGetValue(TemplatePath, out record))
    {
        update = record.ToBuilder();
    }
    else
    {
        update = _data.New(TemplatePath, DateTime.Now);
    }

    ContentRecord template;
    if (_data.TryGetValue(tempPath, out template)
        && template.HasContentStoreId
        && (forced || template.HashOriginal != update.HashOriginal))
    {
        // Mirror the source page's metadata onto the template record.
        update.SetContentType(template.ContentType);
        update.SetHashOriginal(template.HashOriginal);
        update.SetLastCrawled(template.LastCrawled);
        update.SetLastValid(template.LastValid);
        update.SetDateModified(DateTime.Now);
        update.SetHttpStatus(template.HttpStatus);

        update.ClearContentRedirect();
        if (template.HasContentRedirect)
        {
            // Fix: copy the redirect from the source page. The previous code read
            // update.ContentRedirect, which had just been cleared on the line above.
            update.SetContentRedirect(template.ContentRedirect);
        }

        ContentParser parser = new ContentParser(_data, _baseUri);
        parser.RelativeUri = true;
        parser.RewriteUri += uri => new Uri(uri.OriginalString);

        // Relative links are computed against the search page's URI instead of the
        // source argument the parser supplies.
        Uri templateUri = new Uri(_baseUri, SearchTemplate.SearchPath);
        parser.MakeRelativeUri = (s, d) => templateUri.MakeRelativeUri(d);

        byte[] mapped = parser.ProcessFile(template, _data.ReadContent(template, true));
        string templateHtml = CreateTemplate(Encoding.UTF8.GetString(mapped));

        using (ITransactable trans = _data.WriteContent(update, Encoding.UTF8.GetBytes(templateHtml)))
        {
            _data.AddOrUpdate(TemplatePath, update.Build());
            trans.Commit();
        }
    }

    if (!_data.TryGetValue(SearchCssPath, out record))
    {
        ContentRecord cssRecord = _data.New(SearchCssPath, DateTime.Now)
            .SetContentType("text/css")
            .SetHttpStatus(200)
            .Build();
        _data.Add(cssRecord.ContentUri, cssRecord);

        // NOTE(review): the template write above commits an ITransactable returned by
        // WriteContent; here a built record is passed and the result (if any) is ignored.
        // Confirm this overload does not require a Commit.
        _data.WriteContent(cssRecord, Encoding.UTF8.GetBytes(Properties.Resources.search_css));
    }
}