Exemplo n.º 1
0
        /// <summary>
        /// Creates a converter for the site rooted at <paramref name="baseUri"/>, using the
        /// configuration and content found under <paramref name="storagePath"/>.
        /// </summary>
        /// <param name="storagePath">Directory handed to the config reader, the MIME map and the content storage.</param>
        /// <param name="baseUri">Absolute URI of the site; parsed with <see cref="UriKind.Absolute"/>.</param>
        public SiteConverter(string storagePath, string baseUri)
        {
            RebaseLinks = true;
            _baseUri = new Uri(baseUri, UriKind.Absolute);

            HttpCloneConfig settings = Config.ReadConfig(_baseUri, storagePath);
            _mime = new MimeInfoMap(_baseUri, storagePath);

            // Fall back to "any run of non-word characters" when the config supplies no expression.
            string cleanupPattern = settings.BadNameCharsExpression ?? @"[^\w]+";
            const RegexOptions cleanupOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
            CleanupRegex = new Regex(cleanupPattern, cleanupOptions);

            // NOTE(review): the meaning of the 'true' flag is defined by ContentStorage - presumably read-only; confirm.
            _content = new ContentStorage(storagePath, true);
        }
Exemplo n.º 2
0
 /// <summary>
 /// Reads the configuration for <paramref name="baseUri"/> from <paramref name="storageDir"/>
 /// and registers every document type, and each of its aliases, under its MIME type.
 /// Entries whose MIME type is null or empty are skipped.
 /// </summary>
 /// <param name="baseUri">URI passed to the configuration reader.</param>
 /// <param name="storageDir">Directory passed to the configuration reader.</param>
 public MimeInfoMap(Uri baseUri, string storageDir)
 {
     _config = Config.ReadConfig(baseUri, storageDir);

     var typesWithMime = from entry in _config.DocumentTypes
                         where !String.IsNullOrEmpty(entry.MimeType)
                         select entry;

     foreach (HttpCloneDocType docType in typesWithMime)
     {
         _map.Add(docType.MimeType, docType);

         if (docType.Aliases == null)
         {
             continue;
         }

         var aliasesWithMime = from aliasEntry in docType.Aliases
                               where !String.IsNullOrEmpty(aliasEntry.MimeType)
                               select aliasEntry;

         // Every alias resolves to the owning document type, not to the alias entry itself.
         foreach (var alias in aliasesWithMime)
         {
             _map.Add(alias.MimeType, docType);
         }
     }
 }
Exemplo n.º 3
0
 /// <summary>
 /// Builds the MIME-type lookup from the configuration read for <paramref name="baseUri"/>
 /// in <paramref name="storageDir"/>. Each document type with a non-empty MIME type is added,
 /// followed by its aliases that also carry a non-empty MIME type.
 /// </summary>
 /// <param name="baseUri">URI passed to the configuration reader.</param>
 /// <param name="storageDir">Directory passed to the configuration reader.</param>
 public MimeInfoMap(Uri baseUri, string storageDir)
 {
     _config = Config.ReadConfig(baseUri, storageDir);

     var namedTypes = _config.DocumentTypes.Where(candidate => !String.IsNullOrEmpty(candidate.MimeType));
     foreach (HttpCloneDocType docType in namedTypes)
     {
         _map.Add(docType.MimeType, docType);

         // A type without an alias list contributes only its own MIME type.
         if (docType.Aliases == null)
         {
             continue;
         }

         var namedAliases = docType.Aliases.Where(candidate => !String.IsNullOrEmpty(candidate.MimeType));
         foreach (var alias in namedAliases)
         {
             // Aliases map back to the owning document type.
             _map.Add(alias.MimeType, docType);
         }
     }
 }
Exemplo n.º 4
0
        /// <summary>
        /// Creates a parser over <paramref name="content"/> and builds two case-insensitive
        /// lookup tables from the configuration read for <paramref name="configUri"/>:
        /// document types keyed by MIME type and file extension (aliases included), and
        /// document tags keyed by "MIME type + MimeTagDivide + tag name".
        /// </summary>
        /// <param name="content">Content storage; its StorageDirectory is where the configuration is read from.</param>
        /// <param name="configUri">URI passed to the configuration reader.</param>
        /// <param name="baseUri">Base URI stored for later use by the parser.</param>
        public ContentParser(ContentStorage content, Uri configUri, Uri baseUri)
        {
            _baseUri = baseUri;
            _config  = Config.ReadConfig(configUri, content.StorageDirectory);
            _content = content;

            // Pass 1: MIME type / file extension -> document type.
            // Dictionary.Add throws on a repeated key, so the configuration must not reuse one.
            _documentTypes = new Dictionary<string, HttpCloneDocType>(StringComparer.OrdinalIgnoreCase);
            foreach (HttpCloneDocType docType in _config.DocumentTypes)
            {
                _documentTypes.Add(docType.MimeType, docType);
                if (docType.FileExtension != null)
                    _documentTypes.Add(docType.FileExtension, docType);

                foreach (var aliasEntry in docType.Aliases.SafeEnumeration())
                {
                    _documentTypes.Add(aliasEntry.MimeType, docType);
                    if (aliasEntry.FileExtension != null)
                        _documentTypes.Add(aliasEntry.FileExtension, docType);
                }
            }

            // Pass 2: "mime + divider + tag name" -> tag definitions for that tag.
            _documentTags = new Dictionary<string, List<HttpCloneDocumentTag>>(StringComparer.OrdinalIgnoreCase);
            foreach (HttpCloneDocType docType in _config.DocumentTypes)
            {
                if (docType.DocumentTags != null)
                {
                    foreach (HttpCloneDocumentTag docTag in docType.DocumentTags)
                    {
                        string primaryKey = docType.MimeType + MimeTagDivide + docTag.TagName;

                        List<HttpCloneDocumentTag> bucket;
                        bool alreadyKnown = _documentTags.TryGetValue(primaryKey, out bucket);
                        if (!alreadyKnown)
                        {
                            bucket = new List<HttpCloneDocumentTag>();
                            _documentTags.Add(primaryKey, bucket);

                            // Alias keys share the very same list instance, so tags added
                            // later under the primary key are visible through every alias.
                            foreach (var aliasEntry in docType.Aliases.SafeEnumeration())
                            {
                                string aliasKey = aliasEntry.MimeType + MimeTagDivide + docTag.TagName;
                                _documentTags.Add(aliasKey, bucket);
                            }
                        }

                        bucket.Add(docTag);
                    }
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Creates a collector that stores its data under <paramref name="directory"/> and
        /// starts from <paramref name="uriStart"/>. The base URI is the "/" path resolved
        /// against the start URI.
        /// </summary>
        /// <param name="directory">Working directory; created when it does not exist yet.</param>
        /// <param name="uriStart">Absolute URI of the first page to add.</param>
        public SiteCollector(string directory, string uriStart)
        {
            CrawlTime = DateTime.Now;
            _instanceId = Guid.NewGuid().ToUInt64();

            // Parse the start URI once; it yields the base URI now and is queued at the end.
            Uri startUri = new Uri(uriStart, UriKind.Absolute);
            _baseUri = new Uri(startUri, "/");

            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            _config = Config.ReadConfig(_baseUri, directory);
            _excluded = new PathExclusionList();

            string queueFile = Path.Combine(directory, "workqueue.txt");
            _queue = new TextQueue(queueFile, true);

            _data = new ContentStorage(directory, false);
            _parser = new ContentParser(_data, _baseUri);
            // URIs reported by the parser are routed to AddUri.
            _parser.VisitUri += AddUri;

            AddUrlsFound = true;
            UpdateSearchTemplate = true;
            MaxCrawlAge = TimeSpan.MaxValue;

            AddUri(startUri);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Sets up a collector rooted at <paramref name="directory"/>: reads the configuration,
        /// opens the work queue file "workqueue.txt" and the content storage, wires the parser's
        /// VisitUri event to AddUri, and finally adds <paramref name="uriStart"/> itself.
        /// </summary>
        /// <param name="directory">Working directory; created when it does not exist yet.</param>
        /// <param name="uriStart">Absolute URI to start from; its "/" path becomes the base URI.</param>
        public SiteCollector(string directory, string uriStart)
        {
            CrawlTime   = DateTime.Now;
            _instanceId = Guid.NewGuid().ToUInt64();

            Uri seed = new Uri(uriStart, UriKind.Absolute);
            _baseUri = new Uri(seed, "/");

            bool directoryMissing = !Directory.Exists(directory);
            if (directoryMissing)
            {
                Directory.CreateDirectory(directory);
            }

            _config   = Config.ReadConfig(_baseUri, directory);
            _excluded = new PathExclusionList();

            string workQueuePath = Path.Combine(directory, "workqueue.txt");
            _queue = new TextQueue(workQueuePath, true);
            _data  = new ContentStorage(directory, false);

            _parser = new ContentParser(_data, _baseUri);
            _parser.VisitUri += AddUri;

            // Defaults: add discovered URLs, refresh the search template, no crawl-age limit.
            AddUrlsFound         = true;
            UpdateSearchTemplate = true;
            MaxCrawlAge          = TimeSpan.MaxValue;

            // Uri instances are immutable, so the already-parsed seed can be queued directly.
            AddUri(seed);
        }
Exemplo n.º 7
0
 /// <summary>
 /// Creates an optimizer for the site at <paramref name="baseUri"/>, reading its
 /// configuration from, and opening content storage in, <paramref name="storagePath"/>.
 /// </summary>
 /// <param name="storagePath">Directory passed to the configuration reader and the content storage.</param>
 /// <param name="baseUri">Absolute URI of the site; parsed with <see cref="UriKind.Absolute"/>.</param>
 public ContentOptimizier(string storagePath, string baseUri)
 {
     Uri siteRoot = new Uri(baseUri, UriKind.Absolute);

     _baseUri = siteRoot;
     _config = Config.ReadConfig(siteRoot, storagePath);
     _content = new ContentStorage(storagePath, false);
 }
Exemplo n.º 8
0
 /// <summary>
 /// Creates a builder over <paramref name="data"/> and reads the configuration for
 /// <paramref name="baseUri"/> from the storage's directory.
 /// </summary>
 /// <param name="data">Content storage; its StorageDirectory locates the configuration.</param>
 /// <param name="baseUri">Base URI stored on the builder and passed to the configuration reader.</param>
 public SearchTemplateBuilder(ContentStorage data, Uri baseUri)
 {
     var storageDirectory = data.StorageDirectory;

     _data = data;
     _baseUri = baseUri;
     _config = Config.ReadConfig(baseUri, storageDirectory);
 }
Exemplo n.º 9
0
 /// <summary>
 /// Initializes the optimizer: parses <paramref name="baseUri"/> as an absolute URI,
 /// reads the matching configuration from <paramref name="storagePath"/>, and opens the
 /// content storage located there.
 /// </summary>
 /// <param name="storagePath">Directory holding the configuration and the stored content.</param>
 /// <param name="baseUri">Absolute site URI, given as a string.</param>
 public ContentOptimizier(string storagePath, string baseUri)
 {
     Uri absoluteBase = new Uri(baseUri, UriKind.Absolute);
     _baseUri = absoluteBase;

     _config = Config.ReadConfig(absoluteBase, storagePath);

     // NOTE(review): the meaning of the 'false' flag is defined by ContentStorage - confirm there.
     _content = new ContentStorage(storagePath, false);
 }
Exemplo n.º 10
0
 /// <summary>
 /// Initializes the builder with its content storage and base URI, then reads the
 /// configuration for that URI from <c>data.StorageDirectory</c>.
 /// </summary>
 /// <param name="data">Content storage the builder works on.</param>
 /// <param name="baseUri">Base URI of the site.</param>
 public SearchTemplateBuilder(ContentStorage data, Uri baseUri)
 {
     _baseUri = baseUri;
     _data = data;

     // The configuration is stored alongside the content it describes.
     _config = Config.ReadConfig(baseUri, data.StorageDirectory);
 }