Beispiel #1
0
        /// <summary>
        /// Copies each record in <c>_content</c> into <paramref name="writer"/>. Records that
        /// carry a content store id also have their content bytes read, optionally passed
        /// through <paramref name="fnprocess"/>, and written to the destination.
        /// </summary>
        /// <param name="writer">Destination storage receiving the records and content.</param>
        /// <param name="fnprocess">Optional transform applied to the content bytes; skipped when null.</param>
        public void CopyTo(ContentStorage writer, Func<ContentRecord, byte[], byte[]> fnprocess)
        {
            foreach (KeyValuePair<string, ContentRecord> entry in _content)
            {
                string path = entry.Key;
                ContentRecord source = entry.Value;
                ContentRecord.Builder builder = source.ToBuilder();
                bool stored;

                if (!source.HasContentStoreId)
                {
                    // No stored content: only the record itself is copied.
                    if (Overwrite)
                        stored = writer.AddOrUpdate(path, builder.Build());
                    else
                        stored = writer.Add(path, builder.Build());
                }
                else
                {
                    byte[] bytes = _content.ReadContent(source, true);
                    if (fnprocess != null)
                        bytes = fnprocess(source, bytes);

                    // The content write is committed only once the record itself was stored.
                    using (ITransactable transaction = writer.WriteContent(builder, bytes))
                    {
                        if (Overwrite)
                            stored = writer.AddOrUpdate(path, builder.Build());
                        else
                            stored = writer.Add(path, builder.Build());

                        if (stored)
                            transaction.Commit();
                    }
                }

                if (!stored)
                    Console.Error.WriteLine("Path already exists " + path);
            }
        }
 /// <summary>
 /// Looks up the record for the normalized path and query of <paramref name="uri"/> in
 /// <paramref name="content"/> and sets the response status accordingly: Redirect or OK
 /// when found, NotFound otherwise.
 /// </summary>
 public ContentResponse(ContentStorage content, Uri uri)
 {
     _content = content;
     // These defaults stay in place if the lookup below throws.
     _status = HttpStatusCode.InternalServerError;
     _record = ContentRecord.DefaultInstance;
     try
     {
         string requestPath = uri.NormalizedPathAndQuery();
         bool found = _content.TryGetValue(requestPath, out _record);
         if (!found)
         {
             // The failed lookup overwrote _record; restore the default instance.
             _record = ContentRecord.DefaultInstance;
             _status = HttpStatusCode.NotFound;
             Log.Warning("404 - {0}", requestPath);
         }
         else
         {
             _status = _record.HasContentRedirect
                 ? HttpStatusCode.Redirect
                 : HttpStatusCode.OK;
         }
     }
     catch (Exception error)
     {
         Log.Error(error, "Exception on {0}", uri);
     }
 }
Beispiel #3
0
 /// <summary>
 /// Creates a converter for the site at <paramref name="baseUri"/> over the content
 /// storage located at <paramref name="storagePath"/>.
 /// </summary>
 /// <param name="storagePath">Path passed to the configuration reader, MIME map and content storage.</param>
 /// <param name="baseUri">Site address; parsed with <see cref="UriKind.Absolute"/>.</param>
 public SiteConverter(string storagePath, string baseUri)
 {
     RebaseLinks = true;
     _baseUri = new Uri(baseUri, UriKind.Absolute);

     HttpCloneConfig settings = Config.ReadConfig(_baseUri, storagePath);
     _mime = new MimeInfoMap(_baseUri, storagePath);

     // Use the configured expression, or any run of non-word characters when none is set.
     string badNamePattern = settings.BadNameCharsExpression ?? @"[^\w]+";
     const RegexOptions cleanupOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
     CleanupRegex = new Regex(badNamePattern, cleanupOptions);

     _content = new ContentStorage(storagePath, true);
 }
Beispiel #4
0
 /// <summary>
 /// Creates a <see cref="SimpleHttpHandler"/> over <paramref name="storage"/> and opens a
 /// <see cref="WebServiceHost"/> for this instance with a <see cref="WebHttpBinding"/>
 /// endpoint at http://localhost:<paramref name="port"/>/.
 /// </summary>
 public WcfHttpHost(ContentStorage storage, int port)
 {
     string address = String.Format("http://localhost:{0}/", port);
     _uri = new Uri(address);
     _handler = new SimpleHttpHandler(storage);

     // This instance itself is handed to the host as the service object.
     WebServiceHost host = new WebServiceHost(this);
     _host = host;

     ServiceEndpoint endpoint = host.AddServiceEndpoint(GetType(), new WebHttpBinding(), _uri);
     endpoint.Behaviors.Add(new WebHttpBehavior());
     host.Open();
 }
Beispiel #5
0
 /// <summary>
 /// Opens the content storage for <paramref name="url"/>, looks up the record for the
 /// URL's normalized path and query, and passes storage and record to
 /// <paramref name="action"/>. The storage is disposed when the action returns.
 /// </summary>
 /// <param name="readOnly">Forwarded to the <see cref="ContentStorage"/> constructor.</param>
 /// <param name="url">Absolute URL of the record to operate on.</param>
 /// <param name="action">Callback invoked with the open storage and the record found.</param>
 /// <exception cref="ApplicationException">No record exists for the path.</exception>
 private void With(bool readOnly, string url, Action<ContentStorage, ContentRecord> action)
 {
     using (ContentStorage store = new ContentStorage(StoragePath(url), readOnly))
     {
         Uri absolute = new Uri(url, UriKind.Absolute);
         string relativePath = absolute.NormalizedPathAndQuery();

         ContentRecord record;
         if (!store.TryGetValue(relativePath, out record))
             throw new ApplicationException("Path not found: " + relativePath);

         action(store, record);
     }
 }
Beispiel #6
0
        /// <summary>
        /// Disposes the content storage and, if present, the RSA key pair while holding the
        /// write lock. Returns immediately when <c>_content</c> is already null.
        /// </summary>
        public void Dispose()
        {
            if (_content == null)
                return;

            using (ExecutionLock.Write(LockTimeout))
            {
                // Checked again now that the write lock is held.
                if (_content != null)
                {
                    _content.Dispose();
                }
                _content = null;

                if (_rsaKeyPair != null)
                {
                    _rsaKeyPair.Dispose();
                }
            }
        }
Beispiel #7
0
        /// <summary>
        /// Initializes the state over <paramref name="content"/>, or over the result of
        /// <c>ReadCurrent()</c> when it is null, then creates the IPC event channel, wires
        /// its handlers and starts listening.
        /// </summary>
        public ContentState(ContentStorage content)
        {
            //_executionLock = new SimpleReadWriteLocking();
            _executionLock = IgnoreLocking.Instance;
            _rsaKeyPair = ReadKeyFile();
            _content = content != null ? content : ReadCurrent();

            // The channel name is the MD5 hash of the UTF-8 bytes of StoragePath.
            string channelPath = Path.Combine(Settings.RegistryPath, "IISChannel");
            byte[] storagePathBytes = Encoding.UTF8.GetBytes(StoragePath);
            string channelName = BitConverter.ToString(Hash.MD5(storagePathBytes).ToArray());
            _channel = new IpcEventChannel(channelPath, channelName);

            _channel.OnError += (sender, args) => Log.Error(args.GetException());
            _channel[Events.ContentUpdate].OnEvent += OnContentUpdated;
            // An empty handler is registered for completion acknowledgements.
            _channel[Events.CompletionAck].OnEvent += (sender, args) => { };
            _channel.StartListening();
        }
Beispiel #8
0
        /// <summary>
        /// Creates a publisher for <paramref name="site"/> over the content storage at
        /// <paramref name="storagePath"/>. When "client-publishing.key" exists under the
        /// storage path, the key pair is loaded and the publish URI is derived from its hash.
        /// </summary>
        public SitePublisher(string storagePath, string site)
        {
            _storagePath = storagePath;
            _siteUri = new Uri(site, UriKind.Absolute);
            _content = new ContentStorage(storagePath, true);
            _keyfile = Path.Combine(storagePath, "client-publishing.key");

            // Without a key file the key pair and publish URI are not assigned here.
            if (!File.Exists(_keyfile))
                return;

            _rsaKeyPair = new RSAKeyPair(_keyfile, true);

            // we publish on the hash of both client and server keys so that if the handler is invoked there is already
            // a high-probability that the keyset will match.
            string keyHash = Safe64Encoding.EncodeBytes(_rsaKeyPair.KeyPairHash.ToArray());
            _publishUri = "/api/publish/" + keyHash + "/";
        }
        /// <summary>
        /// Opens the content storage at <paramref name="storagePath"/>, deletes any existing
        /// index directory, and creates a new index writer over that directory.
        /// </summary>
        /// <param name="storagePath">Path passed to the configuration reader, MIME map and content storage.</param>
        /// <param name="baseUri">Site address; parsed with <see cref="UriKind.Absolute"/>.</param>
        public ContentIndexing(string storagePath, string baseUri)
        {
            _baseUri = new Uri(baseUri, UriKind.Absolute);
            _config = Config.ReadConfig(_baseUri, storagePath).Searching;
            _mimeInfo = new MimeInfoMap(_baseUri, storagePath);
            _content = new ContentStorage(storagePath, false);

            // Remove a previous index, including its contents, before creating the writer.
            DirectoryInfo indexDir = new DirectoryInfo(_content.IndexDirectory);
            if (indexDir.Exists)
            {
                indexDir.Delete(true);
            }

            FSDirectory luceneDirectory = FSDirectory.Open(indexDir);
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
            _writer = new IndexWriter(luceneDirectory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            BlurbLength = (uint)_config.BlubXPath.MaxLength;
        }
Beispiel #10
0
        /// <summary>
        /// Copies the content into <paramref name="writer"/>, running each file through a
        /// <see cref="ContentParser"/>. When <c>RebaseLinks</c> is set, URIs on the same host
        /// as the base URI are rewritten relative to <paramref name="target"/>.
        /// </summary>
        /// <param name="target">Absolute URI that rewritten links are based on.</param>
        /// <param name="writer">Destination storage.</param>
        public void ConvertTo(string target, ContentStorage writer)
        {
            Uri targetRoot = new Uri(target, UriKind.Absolute);

            ContentParser parser = new ContentParser(writer, _baseUri);
            parser.Reformat = Reformat;
            parser.RelativeUri = false;

            if (RebaseLinks)
            {
                // Same-host links keep their path and query but move to the target; others pass through.
                parser.RewriteUri += link =>
                    link.IsSameHost(_baseUri) ? new Uri(targetRoot, link.PathAndQuery) : link;
            }

            CopyTo(writer, parser.ProcessFile);
        }
Beispiel #11
0
        /// <summary>
        /// Prepares a crawl starting at <paramref name="uriStart"/>, storing its work queue
        /// and content under <paramref name="directory"/> (created if missing), and queues
        /// the start URI.
        /// </summary>
        /// <param name="directory">Directory holding "workqueue.txt" and the content storage.</param>
        /// <param name="uriStart">Absolute URI to start from; the base URI is its root ("/").</param>
        public SiteCollector(string directory, string uriStart)
        {
            CrawlTime = DateTime.Now;
            _instanceId = Guid.NewGuid().ToUInt64();

            Uri startUri = new Uri(uriStart, UriKind.Absolute);
            _baseUri = new Uri(startUri, "/");

            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            _config = Config.ReadConfig(_baseUri, directory);
            _excluded = new PathExclusionList();

            string queueFile = Path.Combine(directory, "workqueue.txt");
            _queue = new TextQueue(queueFile, true);

            _data = new ContentStorage(directory, false);
            _parser = new ContentParser(_data, _baseUri);
            _parser.VisitUri += AddUri;

            AddUrlsFound = true;
            UpdateSearchTemplate = true;
            MaxCrawlAge = TimeSpan.MaxValue;

            // The starting address is queued last, after the options above are set.
            AddUri(startUri);
        }
Beispiel #12
0
        /// <summary>
        /// Replaces the current content storage with <paramref name="contentStorage"/> by
        /// calling <c>SwapStorage</c> with notification enabled.
        /// </summary>
        /// <remarks>
        /// The read lock is released before the swap and requested again afterwards,
        /// whether or not the swap succeeded.
        /// </remarks>
        /// <param name="contentStorage">The storage to install.</param>
        /// <exception cref="CorruptApplicationDomainException">Requesting the read lock again failed.</exception>
        public void ChangeStorage(ContentStorage contentStorage)
        {
            ExecutionLock.ReleaseRead(); //they already have a read-lock
            try
            {
                SwapStorage(contentStorage, true);
            }
            finally
            {
                try
                {
                    // NOTE(review): any value returned by Read() is discarded here; presumably the
                    // caller releases this read lock later — confirm.
                    ExecutionLock.Read(LockTimeout);
                }
                catch (Exception e)
                {
                    // Thrown from inside finally, so it replaces any exception raised by SwapStorage.
                    throw new CorruptApplicationDomainException(e);
                }
            }

        }
Beispiel #13
0
 /// <summary>
 /// Installs <paramref name="contentStorage"/> as the current storage while holding the
 /// write lock, then disposes the storage it replaced after the lock is released.
 /// </summary>
 /// <param name="contentStorage">The storage to install.</param>
 /// <param name="bNotify">When true, <c>Events.ContentUpdate</c> is broadcast before the swap.</param>
 private void SwapStorage(ContentStorage contentStorage, bool bNotify)
 {
     ContentStorage previous;
     using (ExecutionLock.Write(LockTimeout))
     {
         if (bNotify)
             BroadcastToOthers(Events.ContentUpdate);

         previous = _content;
         _content = contentStorage;
     }

     // Nothing to clean up when there was no earlier storage.
     if (previous == null)
         return;

     previous.Dispose();
 }
 /// <summary>
 /// Creates a builder over <paramref name="data"/> and reads the configuration for
 /// <paramref name="baseUri"/> from the storage's directory.
 /// </summary>
 public SearchTemplateBuilder(ContentStorage data, Uri baseUri)
 {
     _baseUri = baseUri;
     _data = data;
     _config = Config.ReadConfig(baseUri, data.StorageDirectory);
 }
Beispiel #15
0
        /// <summary>
        /// Optimizes and relinks the example site so that two pages have identical content,
        /// then checks that Deduplicate removes the "?like=1" record and every link to it.
        /// </summary>
        public void TestDeduplicate()
        {
            const string originalLink = "/2011/09/23/hello-world-2-0/";
            const string duplicateLink = "/2011/09/23/hello-world-2-0/?like=1";

            SetupW3Example();
            VerifyLinks(W3ExampleDirectory, KnownLinks);

            string siteUrl = W3ExampleUrl.AbsoluteUri;
            // the html wordpress generates is full of user-tracking junk, need to strip it out
            new CommandLine().Optimize(siteUrl, false);
            // then one last piece to strip is to clean up the .css user tracking links
            new CommandLine().RelinkEx(siteUrl, @"^(.*?\.css)\?.*$", "{1}");

            string originalText;
            string duplicateText;
            using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
            {
                byte[] originalBytes = store.ReadContent(store[originalLink], true);
                originalText = Encoding.UTF8.GetString(originalBytes);

                byte[] duplicateBytes = store.ReadContent(store[duplicateLink], true);
                duplicateText = Encoding.UTF8.GetString(duplicateBytes);
            }

            // make sure we now have a duplicate
            Assert.AreEqual(originalText, duplicateText);

            // Remove the duplicate
            new CommandLine().Deduplicate(siteUrl, true, true);

            VerifyLinks(W3ExampleDirectory, KnownLinks.Where(link => link != duplicateLink));
            Assert.AreEqual(0, CountLinks(W3ExampleDirectory, link => link.PathAndQuery == duplicateLink));
        }
Beispiel #16
0
        /// <summary>
        /// Serves the example site from a local WCF host and verifies that crawling it
        /// creates the expected directory containing all known links.
        /// </summary>
        public void TestCrawlSite()
        {
            Uri localSite = new Uri("http://127.0.0.1:11080", UriKind.Absolute);
            string tempRoot = TempDirectory.TempPath;
            // The crawl directory is named after the authority with ':' replaced by '.'.
            string folderName = localSite.Authority.Replace(':', '.');
            string crawlDirectory = Path.Combine(tempRoot, folderName);

            SetupW3Example();
            new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "http://127.0.0.1:11080{1}");

            using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
            {
                using (new WcfHttpHost(store, localSite.Port))
                {
                    Assert.IsFalse(Directory.Exists(crawlDirectory));

                    new CommandLine().CrawlSite(localSite.AbsoluteUri);

                    Assert.IsTrue(Directory.Exists(crawlDirectory));
                    VerifyLinks(crawlDirectory, KnownLinks);
                }
            }
        }
Beispiel #17
0
 /// <summary>
 /// Creates a search template that keeps a reference to <paramref name="data"/>.
 /// </summary>
 /// <param name="data">The content storage stored in <c>_data</c>.</param>
 public SearchTemplate(ContentStorage data)
 {
     this._data = data;
 }
Beispiel #18
0
 /// <summary>
 /// Asserts that the content storage in <paramref name="directory"/> contains every link
 /// in <paramref name="knownLinks"/> and that its record count equals the number of links.
 /// </summary>
 /// <param name="directory">Directory of the content storage to open.</param>
 /// <param name="knownLinks">The links expected to be present; enumerated exactly once.</param>
 private void VerifyLinks(string directory, IEnumerable<string> knownLinks)
 {
     // Materialize once: callers may pass a deferred LINQ query, which would otherwise be
     // evaluated twice (by the loop and again by Count()).
     string[] expected = knownLinks.ToArray();

     using (ContentStorage store = new ContentStorage(directory, true))
     {
         foreach (string url in expected)
             Assert.IsTrue(store.ContainsKey(url), "missing link " + url);
         Assert.AreEqual(expected.Length, store.Count, "incorrect total links");
     }
 }
Beispiel #19
0
        /// <summary>
        /// Serves the example site locally, recursively imports it under "/copy-of-root/"
        /// on a temporary host name, and verifies the resulting set of links.
        /// </summary>
        public void TestImportAll()
        {
            SetupW3Example();
            new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "http://127.0.0.1:11080{1}");

            using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
            {
                using (new WcfHttpHost(store, 11080))
                {
                    Uri tempSite = new Uri("http://w3example.localhost.test", UriKind.Absolute);
                    string importDirectory = Path.Combine(TempDirectory.TempPath, tempSite.Authority.Replace(':', '.'));
                    Assert.IsFalse(Directory.Exists(importDirectory));

                    string importTarget = new Uri(tempSite, "/copy-of-root/").AbsoluteUri;
                    new CommandLine().Import(importTarget, "http://127.0.0.1:11080", /* Recursive: */ true, true);
                    Assert.IsTrue(Directory.Exists(importDirectory));

                    List<string> expected = new List<string>(KnownLinks);
                    // not linked, added by search
                    foreach (string searchOnly in new[] { "/?s=%00", "/search.css", "/search/" })
                    {
                        expected.Remove(searchOnly);
                    }
                    // root was renamed
                    expected.Remove("/");
                    // new rename of root path
                    expected.Add("/copy-of-root/");

                    VerifyLinks(importDirectory, expected);
                }
            }
        }
Beispiel #20
0
        /// <summary>
        /// Serves the example site locally, imports only the root page (non-recursive) as
        /// "/copy-of-root/" on a temporary host name, and verifies that single link exists.
        /// </summary>
        public void TestImportOne()
        {
            SetupW3Example();
            new CommandLine().RelinkEx(W3ExampleUrl.AbsoluteUri, @"^http://w3example\.wordpress\.com(.*)$", "http://127.0.0.1:11080{1}");

            using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
            {
                using (new WcfHttpHost(store, 11080))
                {
                    Uri tempSite = new Uri("http://w3example.localhost.test", UriKind.Absolute);
                    string importDirectory = Path.Combine(TempDirectory.TempPath, tempSite.Authority.Replace(':', '.'));
                    Assert.IsFalse(Directory.Exists(importDirectory));

                    string importTarget = new Uri(tempSite, "/copy-of-root/").AbsoluteUri;
                    new CommandLine().Import(importTarget, "http://127.0.0.1:11080", false, true);
                    Assert.IsTrue(Directory.Exists(importDirectory));

                    string[] expected = new[] {"/copy-of-root/"};
                    VerifyLinks(importDirectory, expected);
                }
            }
        }
 /// <summary>
 /// Creates a handler over <paramref name="storage"/> by wrapping it in a new
 /// <see cref="ContentState"/> and delegating to the constructor that accepts that state.
 /// </summary>
 /// <param name="storage">The content storage passed to the <see cref="ContentState"/> constructor.</param>
 public SimpleHttpHandler(ContentStorage storage)
     : this(new ContentState(storage))
 {
 }
Beispiel #22
0
 /// <summary>
 /// Parses all content in the storage at <paramref name="directory"/> and counts the
 /// visited URIs for which <paramref name="test"/> returns true.
 /// </summary>
 /// <param name="directory">Directory of the content storage to open.</param>
 /// <param name="test">Predicate applied to every URI the parser visits.</param>
 /// <returns>The number of visited URIs that satisfied the predicate.</returns>
 private int CountLinks(string directory, Predicate<Uri> test)
 {
     using (ContentStorage store = new ContentStorage(directory, true))
     {
         int matches = 0;
         ContentParser parser = new ContentParser(store, W3ExampleUrl);
         parser.VisitUri += visited =>
         {
             if (!test(visited))
                 return;
             matches++;
         };

         parser.ProcessAll();
         return matches;
     }
 }
 /// <summary>
 /// Creates an optimizer for the site at <paramref name="baseUri"/>, reading its
 /// configuration and opening the content storage at <paramref name="storagePath"/>.
 /// </summary>
 /// <param name="storagePath">Path passed to the configuration reader and content storage.</param>
 /// <param name="baseUri">Site address; parsed with <see cref="UriKind.Absolute"/>.</param>
 public ContentOptimizier(string storagePath, string baseUri)
 {
     Uri siteUri = new Uri(baseUri, UriKind.Absolute);
     _baseUri = siteUri;
     _config = Config.ReadConfig(siteUri, storagePath);
     _content = new ContentStorage(storagePath, false);
 }
Beispiel #24
0
 /// <summary>
 /// Sets up the example site and asserts that its storage contains every known link.
 /// </summary>
 public void TestExampleSite()
 {
     SetupW3Example();

     using (ContentStorage store = new ContentStorage(W3ExampleDirectory, true))
     {
         foreach (string link in KnownLinks)
         {
             bool present = store.ContainsKey(link);
             Assert.IsTrue(present);
         }
     }
 }