/// <summary>
/// Sets up a converter for the site identified by <paramref name="baseUri"/>, using
/// <paramref name="storagePath"/> for configuration lookup, the MIME map and content storage.
/// </summary>
/// <param name="storagePath">Storage directory handed to <c>Config.ReadConfig</c>, <c>MimeInfoMap</c> and <c>ContentStorage</c>.</param>
/// <param name="baseUri">Site address; parsed as an absolute URI.</param>
public SiteConverter(string storagePath, string baseUri)
{
    RebaseLinks = true;
    _baseUri = new Uri(baseUri, UriKind.Absolute);

    HttpCloneConfig settings = Config.ReadConfig(_baseUri, storagePath);
    _mime = new MimeInfoMap(_baseUri, storagePath);

    // Use the configured expression when present; otherwise match any run of non-word characters.
    var badNamePattern = settings.BadNameCharsExpression ?? @"[^\w]+";
    const RegexOptions cleanupOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    CleanupRegex = new Regex(badNamePattern, cleanupOptions);

    // NOTE(review): the 'true' flag presumably opens the storage read-only — confirm against ContentStorage.
    _content = new ContentStorage(storagePath, true);
}
/// <summary>
/// Reads the configuration for <paramref name="baseUri"/> / <paramref name="storageDir"/> and registers
/// every document type under its own MIME type and under the MIME type of each of its aliases.
/// Entries with a null or empty MIME type are skipped.
/// </summary>
/// <param name="baseUri">URI passed to <c>Config.ReadConfig</c>.</param>
/// <param name="storageDir">Storage directory passed to <c>Config.ReadConfig</c>.</param>
public MimeInfoMap(Uri baseUri, string storageDir)
{
    _config = Config.ReadConfig(baseUri, storageDir);

    var typesWithMime = _config.DocumentTypes.Where(d => !String.IsNullOrEmpty(d.MimeType));
    foreach (HttpCloneDocType docType in typesWithMime)
    {
        // NOTE(review): if _map is a Dictionary, a MIME type repeated across types or aliases makes Add throw — confirm that is intended.
        _map.Add(docType.MimeType, docType);

        if (docType.Aliases != null)
        {
            foreach (var alias in docType.Aliases)
            {
                if (!String.IsNullOrEmpty(alias.MimeType))
                {
                    // Aliases resolve to the owning document type, not to the alias entry itself.
                    _map.Add(alias.MimeType, docType);
                }
            }
        }
    }
}
/// <summary>
/// Builds the MIME lookup from the configuration read for <paramref name="baseUri"/> and
/// <paramref name="storageDir"/>: each document type with a non-empty MIME type is added under that
/// MIME type, and again under the non-empty MIME type of every alias it declares.
/// </summary>
/// <param name="baseUri">URI passed to <c>Config.ReadConfig</c>.</param>
/// <param name="storageDir">Storage directory passed to <c>Config.ReadConfig</c>.</param>
public MimeInfoMap(Uri baseUri, string storageDir)
{
    _config = Config.ReadConfig(baseUri, storageDir);

    var primaryTypes = _config.DocumentTypes.Where(candidate => !String.IsNullOrEmpty(candidate.MimeType));
    foreach (HttpCloneDocType primary in primaryTypes)
    {
        _map.Add(primary.MimeType, primary);

        // A type without an alias list contributes only its own MIME type.
        if (primary.Aliases == null)
        {
            continue;
        }

        var namedAliases = primary.Aliases.Where(candidate => !String.IsNullOrEmpty(candidate.MimeType));
        foreach (var alias in namedAliases)
        {
            // The alias MIME type maps to the primary document type.
            _map.Add(alias.MimeType, primary);
        }
    }
}
/// <summary>
/// Creates a parser over <paramref name="content"/> and builds two case-insensitive lookups from the
/// configuration: document types keyed by MIME type and file extension (including aliases), and
/// document tags keyed by "MIME type + <c>MimeTagDivide</c> + tag name".
/// </summary>
/// <param name="content">Content storage; its <c>StorageDirectory</c> is used to locate the configuration.</param>
/// <param name="configUri">URI passed to <c>Config.ReadConfig</c>.</param>
/// <param name="baseUri">Base URI stored for later use by the parser.</param>
public ContentParser(ContentStorage content, Uri configUri, Uri baseUri)
{
    _baseUri = baseUri;
    _config = Config.ReadConfig(configUri, content.StorageDirectory);
    _content = content;

    // Pass 1: index every document type by MIME type and, when present, by file extension.
    _documentTypes = new Dictionary<string, HttpCloneDocType>(StringComparer.OrdinalIgnoreCase);
    foreach (HttpCloneDocType docType in _config.DocumentTypes)
    {
        _documentTypes.Add(docType.MimeType, docType);
        if (docType.FileExtension != null)
        {
            _documentTypes.Add(docType.FileExtension, docType);
        }

        // Alias keys point at the owning document type.
        foreach (var alias in docType.Aliases.SafeEnumeration())
        {
            _documentTypes.Add(alias.MimeType, docType);
            if (alias.FileExtension != null)
            {
                _documentTypes.Add(alias.FileExtension, docType);
            }
        }
    }

    // Pass 2: group tags per (MIME type, tag name); aliases share the same list instance as the primary key.
    _documentTags = new Dictionary<string, List<HttpCloneDocumentTag>>(StringComparer.OrdinalIgnoreCase);
    foreach (HttpCloneDocType docType in _config.DocumentTypes)
    {
        if (docType.DocumentTags != null)
        {
            foreach (HttpCloneDocumentTag tag in docType.DocumentTags)
            {
                string primaryKey = docType.MimeType + MimeTagDivide + tag.TagName;

                List<HttpCloneDocumentTag> bucket;
                if (!_documentTags.TryGetValue(primaryKey, out bucket))
                {
                    // First tag with this name for the type: create the list and register it under every alias too.
                    bucket = new List<HttpCloneDocumentTag>();
                    _documentTags.Add(primaryKey, bucket);

                    foreach (var alias in docType.Aliases.SafeEnumeration())
                    {
                        _documentTags.Add(alias.MimeType + MimeTagDivide + tag.TagName, bucket);
                    }
                }

                bucket.Add(tag);
            }
        }
    }
}
/// <summary>
/// Prepares a crawl of the site containing <paramref name="uriStart"/>, storing its state under
/// <paramref name="directory"/> (created if it does not exist), and queues the start URI.
/// </summary>
/// <param name="directory">Working directory for the configuration, the "workqueue.txt" queue file and content storage.</param>
/// <param name="uriStart">Absolute URI string of the first page to visit; the site root ("/") of this URI becomes the base URI.</param>
public SiteCollector(string directory, string uriStart)
{
    CrawlTime = DateTime.Now;
    _instanceId = Guid.NewGuid().ToUInt64();

    // Parse the start address once; the base URI is the root path of the same host.
    Uri startUri = new Uri(uriStart, UriKind.Absolute);
    _baseUri = new Uri(startUri, "/");

    bool directoryMissing = !Directory.Exists(directory);
    if (directoryMissing)
    {
        Directory.CreateDirectory(directory);
    }

    _config = Config.ReadConfig(_baseUri, directory);
    _excluded = new PathExclusionList();

    string queueFile = Path.Combine(directory, "workqueue.txt");
    _queue = new TextQueue(queueFile, true);

    _data = new ContentStorage(directory, false);
    _parser = new ContentParser(_data, _baseUri);
    // URIs reported by the parser are routed through AddUri.
    _parser.VisitUri += AddUri;

    AddUrlsFound = true;
    UpdateSearchTemplate = true;
    MaxCrawlAge = TimeSpan.MaxValue;

    AddUri(startUri);
}
/// <summary>
/// Initializes a collector that crawls from <paramref name="uriStart"/> and keeps its queue,
/// configuration and downloaded content in <paramref name="directory"/>.
/// </summary>
/// <param name="directory">Working directory; created when it does not already exist.</param>
/// <param name="uriStart">Absolute URI string to start from; also determines the site's base URI (its "/" root).</param>
public SiteCollector(string directory, string uriStart)
{
    CrawlTime = DateTime.Now;
    _instanceId = Guid.NewGuid().ToUInt64();

    Uri entryPoint = new Uri(uriStart, UriKind.Absolute);
    _baseUri = new Uri(entryPoint, "/");

    if (Directory.Exists(directory) == false)
    {
        Directory.CreateDirectory(directory);
    }

    _config = Config.ReadConfig(_baseUri, directory);
    _excluded = new PathExclusionList();
    _queue = new TextQueue(Path.Combine(directory, "workqueue.txt"), true);
    _data = new ContentStorage(directory, false);

    // The parser reports discovered URIs through VisitUri; AddUri handles them.
    _parser = new ContentParser(_data, _baseUri);
    _parser.VisitUri += AddUri;

    // Defaults: follow discovered URLs, refresh the search template, no age limit on crawled content.
    AddUrlsFound = true;
    UpdateSearchTemplate = true;
    MaxCrawlAge = TimeSpan.MaxValue;

    // Seed the crawl with the start page itself.
    Uri firstPage = new Uri(uriStart, UriKind.Absolute);
    AddUri(firstPage);
}
/// <summary>
/// Creates an optimizer for the site at <paramref name="baseUri"/>, loading its configuration and
/// opening the content storage located at <paramref name="storagePath"/>.
/// </summary>
/// <param name="storagePath">Storage directory used for both configuration lookup and content storage.</param>
/// <param name="baseUri">Site address; parsed as an absolute URI.</param>
public ContentOptimizier(string storagePath, string baseUri)
{
    Uri siteUri = new Uri(baseUri, UriKind.Absolute);

    _baseUri = siteUri;
    _config = Config.ReadConfig(siteUri, storagePath);

    // NOTE(review): the 'false' flag presumably opens the storage writable — confirm against ContentStorage.
    _content = new ContentStorage(storagePath, false);
}
/// <summary>
/// Creates a search-template builder bound to <paramref name="data"/> and <paramref name="baseUri"/>,
/// reading the configuration from the storage directory of <paramref name="data"/>.
/// </summary>
/// <param name="data">Content storage; its <c>StorageDirectory</c> locates the configuration.</param>
/// <param name="baseUri">Base URI of the site; also passed to <c>Config.ReadConfig</c>.</param>
public SearchTemplateBuilder(ContentStorage data, Uri baseUri)
{
    _data = data;
    _baseUri = baseUri;

    var storageDirectory = data.StorageDirectory;
    _config = Config.ReadConfig(baseUri, storageDirectory);
}