/// <summary>
/// Copies every entry of <c>_content</c> into <paramref name="writer"/>, optionally passing
/// stored content through <paramref name="fnprocess"/> before it is written.
/// </summary>
/// <param name="writer">Destination storage that receives the records and their content.</param>
/// <param name="fnprocess">
/// Optional callback invoked with the source record and its content bytes; the bytes it returns
/// are what gets written. May be null, in which case content is copied unchanged.
/// </param>
public void CopyTo(ContentStorage writer, Func<ContentRecord, byte[], byte[]> fnprocess)
{
    foreach (KeyValuePair<string, ContentRecord> entry in _content)
    {
        string path = entry.Key;
        ContentRecord source = entry.Value;
        ContentRecord.Builder target = source.ToBuilder();
        bool stored;

        if (!source.HasContentStoreId)
        {
            // No content store id on this record: only the record itself is written.
            if (Overwrite)
                stored = writer.AddOrUpdate(path, target.Build());
            else
                stored = writer.Add(path, target.Build());
        }
        else
        {
            byte[] bytes = _content.ReadContent(source, true);
            if (fnprocess != null)
                bytes = fnprocess(source, bytes);

            using (ITransactable pending = writer.WriteContent(target, bytes))
            {
                if (Overwrite)
                    stored = writer.AddOrUpdate(path, target.Build());
                else
                    stored = writer.Add(path, target.Build());

                // The content write is committed only when the record was accepted.
                if (stored)
                    pending.Commit();
            }
        }

        if (!stored)
            Console.Error.WriteLine("Path already exists " + path);
    }
}
/// <summary>
/// Looks up the record for <paramref name="uri"/> in <paramref name="content"/> and derives the
/// response status from the outcome.
/// </summary>
/// <param name="content">Storage to look the path up in.</param>
/// <param name="uri">Requested URI; its normalized path and query is the lookup key.</param>
/// <remarks>
/// The status starts as <see cref="HttpStatusCode.InternalServerError"/> and stays that way if
/// the lookup throws; the exception is logged rather than propagated.
/// </remarks>
public ContentResponse(ContentStorage content, Uri uri)
{
    _content = content;
    _status = HttpStatusCode.InternalServerError;
    _record = ContentRecord.DefaultInstance;

    try
    {
        string path = uri.NormalizedPathAndQuery();
        if (!_content.TryGetValue(path, out _record))
        {
            // Unknown path: fall back to the default record and report 404.
            _record = ContentRecord.DefaultInstance;
            _status = HttpStatusCode.NotFound;
            Log.Warning("404 - {0}", path);
        }
        else
        {
            _status = _record.HasContentRedirect
                ? HttpStatusCode.Redirect
                : HttpStatusCode.OK;
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Exception on {0}", uri);
    }
}
/// <summary>
/// Creates a converter for the site stored at <paramref name="storagePath"/>.
/// </summary>
/// <param name="storagePath">Directory holding the site's content storage and configuration.</param>
/// <param name="baseUri">Absolute URI of the site; must parse as an absolute URI.</param>
public SiteConverter(string storagePath, string baseUri)
{
    RebaseLinks = true;
    _baseUri = new Uri(baseUri, UriKind.Absolute);

    HttpCloneConfig settings = Config.ReadConfig(_baseUri, storagePath);
    _mime = new MimeInfoMap(_baseUri, storagePath);

    // Use the configured expression when present, otherwise match runs of non-word characters.
    string cleanupPattern = settings.BadNameCharsExpression ?? @"[^\w]+";
    const RegexOptions cleanupOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    CleanupRegex = new Regex(cleanupPattern, cleanupOptions);

    _content = new ContentStorage(storagePath, true);
}
/// <summary>
/// Starts a web service host on <c>http://localhost:{port}/</c> that serves content from
/// <paramref name="storage"/> through a <c>SimpleHttpHandler</c>.
/// </summary>
/// <param name="storage">Content storage handed to the request handler.</param>
/// <param name="port">TCP port used to build the listening URI.</param>
/// <remarks>
/// If configuring or opening the host fails, the host is aborted before the exception is
/// rethrown. A failed constructor gives the caller no instance to dispose, so without this
/// the partially-initialized host would never be cleaned up.
/// </remarks>
public WcfHttpHost(ContentStorage storage, int port)
{
    _uri = new Uri(String.Format("http://localhost:{0}/", port));
    _handler = new SimpleHttpHandler(storage);
    _host = new WebServiceHost(this);

    try
    {
        ServiceEndpoint pages = _host.AddServiceEndpoint(GetType(), new WebHttpBinding(), _uri);
        pages.Behaviors.Add(new WebHttpBehavior());
        _host.Open();
    }
    catch
    {
        // Abort is valid in any communication state and releases whatever was acquired.
        _host.Abort();
        throw;
    }
}
/// <summary>
/// Opens the storage for <paramref name="url"/>, finds the record for that URL's normalized
/// path and query, and hands both to <paramref name="action"/>.
/// </summary>
/// <param name="readOnly">Passed as the second argument when opening the storage.</param>
/// <param name="url">Absolute URL identifying both the storage and the record.</param>
/// <param name="action">Callback invoked with the open storage and the record found.</param>
/// <exception cref="ApplicationException">No record exists for the URL's path.</exception>
private void With(bool readOnly, string url, Action<ContentStorage, ContentRecord> action)
{
    using (ContentStorage storage = new ContentStorage(StoragePath(url), readOnly))
    {
        Uri absolute = new Uri(url, UriKind.Absolute);
        string relpath = absolute.NormalizedPathAndQuery();

        ContentRecord rec;
        if (!storage.TryGetValue(relpath, out rec))
            throw new ApplicationException("Path not found: " + relpath);

        action(storage, rec);
    }
}
/// <summary>
/// Releases the content storage and the RSA key pair.
/// </summary>
/// <remarks>
/// Does nothing when <c>_content</c> is already null. Otherwise the work happens under the
/// write lock: <c>_content</c> is re-checked, disposed and cleared, then <c>_rsaKeyPair</c> is
/// disposed if present.
/// </remarks>
public void Dispose()
{
    if (_content == null)
        return;

    using (ExecutionLock.Write(LockTimeout))
    {
        // Re-check under the lock; the field may have been cleared while waiting for it.
        if (_content != null)
            _content.Dispose();
        _content = null;

        if (_rsaKeyPair != null)
            _rsaKeyPair.Dispose();
    }
}
/// <summary>
/// Initializes the state with the given storage (or the result of <c>ReadCurrent()</c> when
/// <paramref name="content"/> is null) and starts listening on the IPC event channel.
/// </summary>
/// <param name="content">Storage to serve from; may be null.</param>
public ContentState(ContentStorage content)
{
    // IgnoreLocking is used here; the SimpleReadWriteLocking alternative is left disabled.
    _executionLock = IgnoreLocking.Instance;
    _rsaKeyPair = ReadKeyFile();
    _content = content ?? ReadCurrent();

    // The channel instance name is the MD5 of the UTF-8 storage path, formatted by BitConverter.
    string channelPath = Path.Combine(Settings.RegistryPath, "IISChannel");
    byte[] storagePathBytes = Encoding.UTF8.GetBytes(StoragePath);
    string channelName = BitConverter.ToString(Hash.MD5(storagePathBytes).ToArray());
    _channel = new IpcEventChannel(channelPath, channelName);

    _channel.OnError += (sender, args) => Log.Error(args.GetException());
    _channel[Events.ContentUpdate].OnEvent += OnContentUpdated;
    // Completion acknowledgements are subscribed to but deliberately ignored.
    _channel[Events.CompletionAck].OnEvent += (sender, args) => { };
    _channel.StartListening();
}
/// <summary>
/// Creates a publisher for the site stored at <paramref name="storagePath"/>.
/// </summary>
/// <param name="storagePath">Directory holding the content storage and, optionally, the
/// <c>client-publishing.key</c> file.</param>
/// <param name="site">Absolute URI of the site to publish to.</param>
/// <remarks>
/// The key pair and publish URI are only set when the key file exists.
/// </remarks>
public SitePublisher(string storagePath, string site)
{
    _storagePath = storagePath;
    _siteUri = new Uri(site, UriKind.Absolute);
    _content = new ContentStorage(storagePath, true);
    _keyfile = Path.Combine(storagePath, "client-publishing.key");

    if (!File.Exists(_keyfile))
        return;

    _rsaKeyPair = new RSAKeyPair(_keyfile, true);

    // We publish on the hash of both client and server keys so that, if the handler is invoked,
    // there is already a high probability that the keyset will match.
    string keyPairHash = Safe64Encoding.EncodeBytes(_rsaKeyPair.KeyPairHash.ToArray());
    _publishUri = "/api/publish/" + keyPairHash + "/";
}
/// <summary>
/// Opens the content storage at <paramref name="storagePath"/> and creates a fresh index
/// writer over the storage's index directory.
/// </summary>
/// <param name="storagePath">Directory holding the content storage and configuration.</param>
/// <param name="baseUri">Absolute URI of the site.</param>
/// <remarks>
/// Any existing index directory is deleted recursively before the writer is created.
/// </remarks>
public ContentIndexing(string storagePath, string baseUri)
{
    _baseUri = new Uri(baseUri, UriKind.Absolute);
    _config = Config.ReadConfig(_baseUri, storagePath).Searching;
    _mimeInfo = new MimeInfoMap(_baseUri, storagePath);
    _content = new ContentStorage(storagePath, false);

    // Remove whatever index is already on disk so the writer starts from nothing.
    DirectoryInfo indexFolder = new DirectoryInfo(_content.IndexDirectory);
    if (indexFolder.Exists)
        indexFolder.Delete(true);

    FSDirectory indexStore = FSDirectory.Open(indexFolder);
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
    _writer = new IndexWriter(indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

    BlurbLength = (uint)_config.BlubXPath.MaxLength;
}
/// <summary>
/// Copies this site's content into <paramref name="writer"/>, running each file through a
/// <c>ContentParser</c> and, when <c>RebaseLinks</c> is set, rewriting same-host links so they
/// point at <paramref name="target"/>.
/// </summary>
/// <param name="target">Absolute URI that same-host links are rebased onto.</param>
/// <param name="writer">Destination storage.</param>
public void ConvertTo(string target, ContentStorage writer)
{
    Uri tgt = new Uri(target, UriKind.Absolute);

    ContentParser processor = new ContentParser(writer, _baseUri)
    {
        Reformat = Reformat,
        RelativeUri = false
    };

    if (RebaseLinks)
    {
        // Links on the original host keep their path and query but move to the target host;
        // everything else is returned untouched.
        processor.RewriteUri += link =>
            link.IsSameHost(_baseUri) ? new Uri(tgt, link.PathAndQuery) : link;
    }

    CopyTo(writer, processor.ProcessFile);
}
/// <summary>
/// Prepares a collector that stores content under <paramref name="directory"/> and seeds it
/// with <paramref name="uriStart"/>.
/// </summary>
/// <param name="directory">Working directory; created if it does not exist.</param>
/// <param name="uriStart">Absolute URI to start from. The base URI is this URI's root ("/").</param>
public SiteCollector(string directory, string uriStart)
{
    CrawlTime = DateTime.Now;
    _instanceId = Guid.NewGuid().ToUInt64();

    Uri startUri = new Uri(uriStart, UriKind.Absolute);
    _baseUri = new Uri(startUri, "/");

    if (!Directory.Exists(directory))
        Directory.CreateDirectory(directory);

    _config = Config.ReadConfig(_baseUri, directory);
    _excluded = new PathExclusionList();

    string queueFile = Path.Combine(directory, "workqueue.txt");
    _queue = new TextQueue(queueFile, true);

    _data = new ContentStorage(directory, false);
    _parser = new ContentParser(_data, _baseUri);
    _parser.VisitUri += AddUri;

    AddUrlsFound = true;
    UpdateSearchTemplate = true;
    MaxCrawlAge = TimeSpan.MaxValue;

    // Seed with the starting URI itself.
    AddUri(startUri);
}
/// <summary>
/// Replaces the current content storage with <paramref name="contentStorage"/>, notifying
/// other listeners of the update.
/// </summary>
/// <param name="contentStorage">The storage to switch to.</param>
/// <remarks>
/// The read lock is released before the swap and requested again afterwards, whether or not
/// the swap succeeded.
/// </remarks>
/// <exception cref="CorruptApplicationDomainException">
/// The read lock could not be taken again after the swap. Because this is thrown from the
/// <c>finally</c> block, it replaces any exception raised by the swap itself.
/// </exception>
public void ChangeStorage(ContentStorage contentStorage)
{
    ExecutionLock.ReleaseRead(); //they already have a read-lock
    try
    {
        // true => bNotify: broadcast the content update while swapping.
        SwapStorage(contentStorage, true);
    }
    finally
    {
        // Take the read lock again; the value returned by Read is not kept here.
        try
        {
            ExecutionLock.Read(LockTimeout);
        }
        catch (Exception e)
        {
            throw new CorruptApplicationDomainException(e);
        }
    }
}
/// <summary>
/// Installs <paramref name="contentStorage"/> as the current storage under the write lock and
/// disposes the storage it replaced.
/// </summary>
/// <param name="contentStorage">The storage to install.</param>
/// <param name="bNotify">When true, <c>Events.ContentUpdate</c> is broadcast before the swap.</param>
private void SwapStorage(ContentStorage contentStorage, bool bNotify)
{
    ContentStorage previous;

    using (ExecutionLock.Write(LockTimeout))
    {
        if (bNotify)
            BroadcastToOthers(Events.ContentUpdate);

        previous = _content;
        _content = contentStorage;
    }

    // The replaced storage is disposed only after the write lock has been released.
    if (previous == null)
        return;

    previous.Dispose();
}
/// <summary>
/// Creates a builder over <paramref name="data"/> and reads the configuration for
/// <paramref name="baseUri"/> from the storage's directory.
/// </summary>
/// <param name="data">Content storage to work against.</param>
/// <param name="baseUri">Base URI of the site.</param>
public SearchTemplateBuilder(ContentStorage data, Uri baseUri)
{
    _data = data;
    _baseUri = baseUri;

    string storageDirectory = data.StorageDirectory;
    _config = Config.ReadConfig(_baseUri, storageDirectory);
}
/// <summary>
/// Verifies that, after optimizing and relinking, the "?like=1" variant of the hello-world page
/// has the same content as the plain page, and that deduplication removes the variant and all
/// links to it.
/// </summary>
public void TestDeduplicate()
{
    const string pagePath = "/2011/09/23/hello-world-2-0/";
    const string duplicatePath = "/2011/09/23/hello-world-2-0/?like=1";

    SetupW3Example();
    VerifyLinks(W3ExampleDirectory, KnownLinks);

    // The html wordpress generates is full of user-tracking junk, need to strip it out.
    new CommandLine().Optimize(W3ExampleUrl.AbsoluteUri, false);
    // Then one last piece to strip is to clean up the .css user tracking links.
    new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^(.*?\.css)\?.*$", "{1}");

    string pageText;
    string duplicateText;
    using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
    {
        pageText = Encoding.UTF8.GetString(store.ReadContent(store[pagePath], true));
        duplicateText = Encoding.UTF8.GetString(store.ReadContent(store[duplicatePath], true));
    }

    // Make sure we now have a duplicate.
    Assert.AreEqual(pageText, duplicateText);

    // Remove the duplicate.
    new CommandLine().Deduplicate(W3ExampleUrl.AbsoluteUri, true, true);

    VerifyLinks(W3ExampleDirectory, KnownLinks.Where(u => u != duplicatePath));
    Assert.AreEqual(0, CountLinks(W3ExampleDirectory, u => u.PathAndQuery == duplicatePath));
}
/// <summary>
/// Serves the example site from a local <c>WcfHttpHost</c>, crawls it, and verifies that the
/// crawl created the expected directory containing every known link.
/// </summary>
public void TestCrawlSite()
{
    // The site URI must be a valid absolute URI: an empty string makes the Uri constructor
    // throw UriFormatException before the test does anything. WcfHttpHost listens on
    // http://localhost:{port}/, so the crawl target is that address on the port handed to it.
    Uri siteUri = new Uri("http://localhost:11080/", UriKind.Absolute);
    string path = Path.Combine(TempDirectory.TempPath, siteUri.Authority.Replace(':', '.'));

    SetupW3Example();
    new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "{1}");

    using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
    using (new WcfHttpHost(store, siteUri.Port))
    {
        Assert.IsFalse(Directory.Exists(path));
        new CommandLine().CrawlSite(siteUri.AbsoluteUri);
        Assert.IsTrue(Directory.Exists(path));

        VerifyLinks(path, KnownLinks);
    }
}
/// <summary>
/// Creates a search template bound to the given content storage.
/// </summary>
/// <param name="data">Content storage kept for later use; stored as-is, without validation.</param>
public SearchTemplate(ContentStorage data)
{
    _data = data;
}
/// <summary>
/// Asserts that the storage at <paramref name="directory"/> contains exactly the given links:
/// each one must be present, and the storage's total count must equal the number of links.
/// </summary>
/// <param name="directory">Directory of the content storage to inspect.</param>
/// <param name="knownLinks">Expected keys. The sequence is enumerated twice.</param>
private void VerifyLinks(string directory, IEnumerable<string> knownLinks)
{
    using (ContentStorage store = new ContentStorage(directory, true))
    {
        foreach (string link in knownLinks)
        {
            bool present = store.ContainsKey(link);
            Assert.IsTrue(present, "missing link " + link);
        }

        int expectedTotal = knownLinks.Count();
        Assert.AreEqual(expectedTotal, store.Count, "incorrect total links");
    }
}
/// <summary>
/// Hosts the example site locally, runs a recursive import into a temporary site under
/// "/copy-of-root/", and verifies the resulting set of links.
/// </summary>
public void TestImportAll()
{
    SetupW3Example();
    new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "{1}");

    using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
    using (new WcfHttpHost(store, 11080))
    {
        Uri tempUri = new Uri("http://w3example.localhost.test", UriKind.Absolute);
        string siteFolder = tempUri.Authority.Replace(':', '.');
        string path = Path.Combine(TempDirectory.TempPath, siteFolder);
        Assert.IsFalse(Directory.Exists(path));

        string importTarget = new Uri(tempUri, "/copy-of-root/").AbsoluteUri;
        // NOTE(review): the second argument is an empty string. Presumably it should be the URL
        // of the host started above (http://localhost:11080/) - confirm against Import.
        new CommandLine().Import(importTarget, "", /* Recursive: */ true, true);
        Assert.IsTrue(Directory.Exists(path));

        List<string> expect = new List<string>(KnownLinks);

        // Not linked from any page; these entries are added by search.
        foreach (string searchOnly in new[] { "/?s=%00", "/search.css", "/search/" })
            expect.Remove(searchOnly);

        // The root was imported under a new name.
        expect.Remove("/");
        expect.Add("/copy-of-root/");

        VerifyLinks(path, expect);
    }
}
/// <summary>
/// Hosts the example site locally, runs a non-recursive import into a temporary site under
/// "/copy-of-root/", and verifies that only that one path was stored.
/// </summary>
public void TestImportOne()
{
    SetupW3Example();
    new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "{1}");

    using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
    using (new WcfHttpHost(store, 11080))
    {
        Uri tempUri = new Uri("http://w3example.localhost.test", UriKind.Absolute);
        string siteFolder = tempUri.Authority.Replace(':', '.');
        string path = Path.Combine(TempDirectory.TempPath, siteFolder);
        Assert.IsFalse(Directory.Exists(path));

        string importTarget = new Uri(tempUri, "/copy-of-root/").AbsoluteUri;
        // NOTE(review): the second argument is an empty string. Presumably it should be the URL
        // of the host started above (http://localhost:11080/) - confirm against Import.
        new CommandLine().Import(importTarget, "", false, true);
        Assert.IsTrue(Directory.Exists(path));

        string[] expected = new[] { "/copy-of-root/" };
        VerifyLinks(path, expected);
    }
}
/// <summary>
/// Creates a handler over <paramref name="storage"/> by wrapping it in a new
/// <c>ContentState</c> and delegating to the constructor that accepts one.
/// </summary>
/// <param name="storage">Content storage passed to the new <c>ContentState</c>.</param>
public SimpleHttpHandler(ContentStorage storage)
    : this(new ContentState(storage))
{
}
/// <summary>
/// Parses all content in the storage at <paramref name="directory"/> and counts the visited
/// URIs for which <paramref name="test"/> returns true.
/// </summary>
/// <param name="directory">Directory of the content storage to parse.</param>
/// <param name="test">Predicate applied to every URI the parser reports.</param>
/// <returns>The number of visited URIs that satisfied <paramref name="test"/>.</returns>
private int CountLinks(string directory, Predicate<Uri> test)
{
    int matches = 0;

    using (ContentStorage store = new ContentStorage(directory, true))
    {
        ContentParser parser = new ContentParser(store, W3ExampleUrl);
        parser.VisitUri += visited =>
        {
            if (test(visited))
                matches++;
        };
        parser.ProcessAll();
    }

    return matches;
}
/// <summary>
/// Creates an optimizer for the site stored at <paramref name="storagePath"/>.
/// </summary>
/// <param name="storagePath">Directory holding the content storage and configuration.</param>
/// <param name="baseUri">Absolute URI of the site; must parse as an absolute URI.</param>
public ContentOptimizier(string storagePath, string baseUri)
{
    Uri siteRoot = new Uri(baseUri, UriKind.Absolute);

    _baseUri = siteRoot;
    _config = Config.ReadConfig(siteRoot, storagePath);
    _content = new ContentStorage(storagePath, false);
}
/// <summary>
/// Sets up the example site and asserts that its storage contains every known link.
/// </summary>
public void TestExampleSite()
{
    SetupW3Example();

    using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
    {
        foreach (string link in KnownLinks)
        {
            bool found = store.ContainsKey(link);
            Assert.IsTrue(found);
        }
    }
}